 /**
 * Marlin 3D Printer Firmware
 * Copyright (C) 2016 MarlinFirmware [https://github.com/MarlinFirmware/Marlin]
 *
 * Based on Sprinter and grbl.
 * Copyright (C) 2011 Camiel Gubbels / Erik van der Zalm
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */

 /**
 * stepper.cpp  A singleton object to execute motion plans using stepper motors
 * Marlin Firmware
 *
 * Derived from Grbl
 * Copyright (c) 20092011 Simen Svale Skogsrud
 *
 * Grbl is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Grbl is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Grbl. If not, see <http://www.gnu.org/licenses/>.
 */

 /**
 * Timer calculations informed by the 'RepRap cartesian firmware' by Zack Smith
 * and Philipp Tiefenbacher.
 */

 /**
 * __________________________
 * / \ _________________ ^
 * /   \ / \ 
 * /   \ /   \ s
 * /      \ p
 * /      \ e
 * +++++++ e
 *  BLOCK 1  BLOCK 2  d
 *
 * time >
 *
 * The trapezoid is the shape the speed curve over time. It starts at block>initial_rate, accelerates
 * first block>accelerate_until step_events_completed, then keeps going at constant speed until
 * step_events_completed reaches block>decelerate_after after which it decelerates until the trapezoid generator is reset.
 * The slope of acceleration is calculated using v = u + at where t is the accumulated timer values of the steps so far.
 */

 /**
 * Marlin uses the Bresenham algorithm. For a detailed explanation of theory and
 * method see https://www.cs.helsinki.fi/group/goa/mallinnus/lines/bresenh.html
 */

 /**
 * Jerk controlled movements planner added Apr 2018 by Eduardo José Tagle.
 * Equations based on Synthethos TinyG2 sources, but the fixedpoint
 * implementation is new, as we are running the ISR with a variable period.
 * Also implemented the Bézier velocity curve evaluation in ARM assembler,
 * to avoid impacting ISR speed.
 */

 #include "stepper.h"

 #ifdef __AVR__
 #include "speed_lookuptable.h"
 #endif

 #include "endstops.h"
 #include "planner.h"
 #include "motion.h"

 #include "../module/temperature.h"
 #include "../lcd/ultralcd.h"
 #include "../core/language.h"
 #include "../gcode/queue.h"
 #include "../sd/cardreader.h"
 #include "../Marlin.h"
 #include "../HAL/Delay.h"

 #if MB(ALLIGATOR)
 #include "../feature/dac/dac_dac084s085.h"
 #endif

 #if HAS_DIGIPOTSS
 #include <SPI.h>
 #endif

 Stepper stepper; // Singleton

 // public:

 block_t* Stepper::current_block = NULL; // A pointer to the block currently being traced

 #if ENABLED(X_DUAL_ENDSTOPS)  ENABLED(Y_DUAL_ENDSTOPS)  ENABLED(Z_DUAL_ENDSTOPS)
 bool Stepper::homing_dual_axis = false;
 #endif

 #if HAS_MOTOR_CURRENT_PWM
 uint32_t Stepper::motor_current_setting[3]; // Initialized by settings.load()
 #endif

 // private:

 uint8_t Stepper::last_direction_bits = 0,
 Stepper::axis_did_move;

 bool Stepper::abort_current_block;

 #if DISABLED(MIXING_EXTRUDER)
 uint8_t Stepper::last_moved_extruder = 0xFF;
 #endif

 #if ENABLED(X_DUAL_ENDSTOPS)
 bool Stepper::locked_X_motor = false, Stepper::locked_X2_motor = false;
 #endif
 #if ENABLED(Y_DUAL_ENDSTOPS)
 bool Stepper::locked_Y_motor = false, Stepper::locked_Y2_motor = false;
 #endif
 #if ENABLED(Z_DUAL_ENDSTOPS)
 bool Stepper::locked_Z_motor = false, Stepper::locked_Z2_motor = false;
 #endif

 uint32_t Stepper::acceleration_time, Stepper::deceleration_time;
 uint8_t Stepper::steps_per_isr;

 #if DISABLED(ADAPTIVE_STEP_SMOOTHING)
 constexpr
 #endif
 uint8_t Stepper::oversampling_factor;

 int32_t Stepper::delta_error[XYZE] = { 0 };

 uint32_t Stepper::advance_dividend[XYZE] = { 0 },
 Stepper::advance_divisor = 0,
 Stepper::step_events_completed = 0, // The number of step events executed in the current block
 Stepper::accelerate_until, // The point from where we need to stop acceleration
 Stepper::decelerate_after, // The point from where we need to start decelerating
 Stepper::step_event_count; // The total event count for the current block

 #if ENABLED(MIXING_EXTRUDER)
 int32_t Stepper::delta_error_m[MIXING_STEPPERS];
 uint32_t Stepper::advance_dividend_m[MIXING_STEPPERS],
 Stepper::advance_divisor_m;
 #else
 int8_t Stepper::active_extruder; // Active extruder
 #endif

 #if ENABLED(S_CURVE_ACCELERATION)
 int32_t __attribute__((used)) Stepper::bezier_A __asm__("bezier_A"); // A coefficient in Bézier speed curve with alias for assembler
 int32_t __attribute__((used)) Stepper::bezier_B __asm__("bezier_B"); // B coefficient in Bézier speed curve with alias for assembler
 int32_t __attribute__((used)) Stepper::bezier_C __asm__("bezier_C"); // C coefficient in Bézier speed curve with alias for assembler
 uint32_t __attribute__((used)) Stepper::bezier_F __asm__("bezier_F"); // F coefficient in Bézier speed curve with alias for assembler
 uint32_t __attribute__((used)) Stepper::bezier_AV __asm__("bezier_AV"); // AV coefficient in Bézier speed curve with alias for assembler
 #ifdef __AVR__
 bool __attribute__((used)) Stepper::A_negative __asm__("A_negative"); // If A coefficient was negative
 #endif
 bool Stepper::bezier_2nd_half; // =false If Bézier curve has been initialized or not
 #endif

 uint32_t Stepper::nextMainISR = 0;

 #if ENABLED(LIN_ADVANCE)

 constexpr uint32_t LA_ADV_NEVER = 0xFFFFFFFF;
 uint32_t Stepper::nextAdvanceISR = LA_ADV_NEVER,
 Stepper::LA_isr_rate = LA_ADV_NEVER;
 uint16_t Stepper::LA_current_adv_steps = 0,
 Stepper::LA_final_adv_steps,
 Stepper::LA_max_adv_steps;

 int8_t Stepper::LA_steps = 0;

 bool Stepper::LA_use_advance_lead;

 #endif // LIN_ADVANCE

 int32_t Stepper::ticks_nominal = 1;
 #if DISABLED(S_CURVE_ACCELERATION)
 uint32_t Stepper::acc_step_rate; // needed for deceleration start point
 #endif

 volatile int32_t Stepper::endstops_trigsteps[XYZ];

 volatile int32_t Stepper::count_position[NUM_AXIS] = { 0 };
 int8_t Stepper::count_direction[NUM_AXIS] = { 0, 0, 0, 0 };

 #if ENABLED(X_DUAL_ENDSTOPS)  ENABLED(Y_DUAL_ENDSTOPS)  ENABLED(Z_DUAL_ENDSTOPS)
 #define DUAL_ENDSTOP_APPLY_STEP(A,V) \
 if (homing_dual_axis) { \
 if (A##_HOME_DIR < 0) { \
 if (!(TEST(endstops.state(), A##_MIN) && count_direction[_AXIS(A)] < 0) && !locked_##A##_motor) A##_STEP_WRITE(V); \
 if (!(TEST(endstops.state(), A##2_MIN) && count_direction[_AXIS(A)] < 0) && !locked_##A##2_motor) A##2_STEP_WRITE(V); \
 } \
 else { \
 if (!(TEST(endstops.state(), A##_MAX) && count_direction[_AXIS(A)] > 0) && !locked_##A##_motor) A##_STEP_WRITE(V); \
 if (!(TEST(endstops.state(), A##2_MAX) && count_direction[_AXIS(A)] > 0) && !locked_##A##2_motor) A##2_STEP_WRITE(V); \
 } \
 } \
 else { \
 A##_STEP_WRITE(V); \
 A##2_STEP_WRITE(V); \
 }
 #endif

 #if ENABLED(X_DUAL_STEPPER_DRIVERS)
 #define X_APPLY_DIR(v,Q) do{ X_DIR_WRITE(v); X2_DIR_WRITE((v) != INVERT_X2_VS_X_DIR); }while(0)
 #if ENABLED(X_DUAL_ENDSTOPS)
 #define X_APPLY_STEP(v,Q) DUAL_ENDSTOP_APPLY_STEP(X,v)
 #else
 #define X_APPLY_STEP(v,Q) do{ X_STEP_WRITE(v); X2_STEP_WRITE(v); }while(0)
 #endif
 #elif ENABLED(DUAL_X_CARRIAGE)
 #define X_APPLY_DIR(v,ALWAYS) \
 if (extruder_duplication_enabled  ALWAYS) { \
 X_DIR_WRITE(v); \
 X2_DIR_WRITE(v); \
 } \
 else { \
 if (movement_extruder()) X2_DIR_WRITE(v); else X_DIR_WRITE(v); \
 }
 #define X_APPLY_STEP(v,ALWAYS) \
 if (extruder_duplication_enabled  ALWAYS) { \
 X_STEP_WRITE(v); \
 X2_STEP_WRITE(v); \
 } \
 else { \
 if (movement_extruder()) X2_STEP_WRITE(v); else X_STEP_WRITE(v); \
 }
 #else
 #define X_APPLY_DIR(v,Q) X_DIR_WRITE(v)
 #define X_APPLY_STEP(v,Q) X_STEP_WRITE(v)
 #endif

 #if ENABLED(Y_DUAL_STEPPER_DRIVERS)
 #define Y_APPLY_DIR(v,Q) do{ Y_DIR_WRITE(v); Y2_DIR_WRITE((v) != INVERT_Y2_VS_Y_DIR); }while(0)
 #if ENABLED(Y_DUAL_ENDSTOPS)
 #define Y_APPLY_STEP(v,Q) DUAL_ENDSTOP_APPLY_STEP(Y,v)
 #else
 #define Y_APPLY_STEP(v,Q) do{ Y_STEP_WRITE(v); Y2_STEP_WRITE(v); }while(0)
 #endif
 #else
 #define Y_APPLY_DIR(v,Q) Y_DIR_WRITE(v)
 #define Y_APPLY_STEP(v,Q) Y_STEP_WRITE(v)
 #endif

 #if ENABLED(Z_DUAL_STEPPER_DRIVERS)
 #define Z_APPLY_DIR(v,Q) do{ Z_DIR_WRITE(v); Z2_DIR_WRITE(v); }while(0)
 #if ENABLED(Z_DUAL_ENDSTOPS)
 #define Z_APPLY_STEP(v,Q) DUAL_ENDSTOP_APPLY_STEP(Z,v)
 #else
 #define Z_APPLY_STEP(v,Q) do{ Z_STEP_WRITE(v); Z2_STEP_WRITE(v); }while(0)
 #endif
 #else
 #define Z_APPLY_DIR(v,Q) Z_DIR_WRITE(v)
 #define Z_APPLY_STEP(v,Q) Z_STEP_WRITE(v)
 #endif

 #if DISABLED(MIXING_EXTRUDER)
 #define E_APPLY_STEP(v,Q) E_STEP_WRITE(active_extruder, v)
 #endif

 void Stepper::wake_up() {
 // TCNT1 = 0;
 ENABLE_STEPPER_DRIVER_INTERRUPT();
 }

 /**
 * Set the stepper direction of each axis
 *
 * COREXY: X_AXIS=A_AXIS and Y_AXIS=B_AXIS
 * COREXZ: X_AXIS=A_AXIS and Z_AXIS=C_AXIS
 * COREYZ: Y_AXIS=B_AXIS and Z_AXIS=C_AXIS
 */
 void Stepper::set_directions() {

 #define SET_STEP_DIR(A) \
 if (motor_direction(_AXIS(A))) { \
 A##_APPLY_DIR(INVERT_## A##_DIR, false); \
 count_direction[_AXIS(A)] = 1; \
 } \
 else { \
 A##_APPLY_DIR(!INVERT_## A##_DIR, false); \
 count_direction[_AXIS(A)] = 1; \
 }

 #if HAS_X_DIR
 SET_STEP_DIR(X); // A
 #endif
 #if HAS_Y_DIR
 SET_STEP_DIR(Y); // B
 #endif
 #if HAS_Z_DIR
 SET_STEP_DIR(Z); // C
 #endif

 #if DISABLED(LIN_ADVANCE)
 #if ENABLED(MIXING_EXTRUDER)
 if (motor_direction(E_AXIS)) {
 MIXING_STEPPERS_LOOP(j) REV_E_DIR(j);
 count_direction[E_AXIS] = 1;
 }
 else {
 MIXING_STEPPERS_LOOP(j) NORM_E_DIR(j);
 count_direction[E_AXIS] = 1;
 }
 #else
 if (motor_direction(E_AXIS)) {
 REV_E_DIR(active_extruder);
 count_direction[E_AXIS] = 1;
 }
 else {
 NORM_E_DIR(active_extruder);
 count_direction[E_AXIS] = 1;
 }
 #endif
 #endif // !LIN_ADVANCE
 }

 #if ENABLED(S_CURVE_ACCELERATION)
 /**
 * This uses a quintic (fifthdegree) Bézier polynomial for the velocity curve, giving
 * a "linear pop" velocity curve; with pop being the sixth derivative of position:
 * velocity  1st, acceleration  2nd, jerk  3rd, snap  4th, crackle  5th, pop  6th
 *
 * The Bézier curve takes the form:
 *
 * V(t) = P_0 * B_0(t) + P_1 * B_1(t) + P_2 * B_2(t) + P_3 * B_3(t) + P_4 * B_4(t) + P_5 * B_5(t)
 *
 * Where 0 <= t <= 1, and V(t) is the velocity. P_0 through P_5 are the control points, and B_0(t)
 * through B_5(t) are the Bernstein basis as follows:
 *
 * B_0(t) = (1t)^5 = t^5 + 5t^4  10t^3 + 10t^2  5t + 1
 * B_1(t) = 5(1t)^4 * t = 5t^5  20t^4 + 30t^3  20t^2 + 5t
 * B_2(t) = 10(1t)^3 * t^2 = 10t^5 + 30t^4  30t^3 + 10t^2
 * B_3(t) = 10(1t)^2 * t^3 = 10t^5  20t^4 + 10t^3
 * B_4(t) = 5(1t) * t^4 = 5t^5 + 5t^4
 * B_5(t) = t^5 = t^5
 * ^ ^ ^ ^ ^ ^
 *      
 * A B C D E F
 *
 * Unfortunately, we cannot use forwarddifferencing to calculate each position through
 * the curve, as Marlin uses variable timer periods. So, we require a formula of the form:
 *
 * V_f(t) = A*t^5 + B*t^4 + C*t^3 + D*t^2 + E*t + F
 *
 * Looking at the above B_0(t) through B_5(t) expanded forms, if we take the coefficients of t^5
 * through t of the Bézier form of V(t), we can determine that:
 *
 * A = P_0 + 5*P_1  10*P_2 + 10*P_3  5*P_4 + P_5
 * B = 5*P_0  20*P_1 + 30*P_2  20*P_3 + 5*P_4
 * C = 10*P_0 + 30*P_1  30*P_2 + 10*P_3
 * D = 10*P_0  20*P_1 + 10*P_2
 * E =  5*P_0 + 5*P_1
 * F = P_0
 *
 * Now, since we will (currently) *always* want the initial acceleration and jerk values to be 0,
 * We set P_i = P_0 = P_1 = P_2 (initial velocity), and P_t = P_3 = P_4 = P_5 (target velocity),
 * which, after simplification, resolves to:
 *
 * A =  6*P_i + 6*P_t = 6*(P_t  P_i)
 * B = 15*P_i  15*P_t = 15*(P_i  P_t)
 * C = 10*P_i + 10*P_t = 10*(P_t  P_i)
 * D = 0
 * E = 0
 * F = P_i
 *
 * As the t is evaluated in non uniform steps here, there is no other way rather than evaluating
 * the Bézier curve at each point:
 *
 * V_f(t) = A*t^5 + B*t^4 + C*t^3 + F [0 <= t <= 1]
 *
 * Floating point arithmetic execution time cost is prohibitive, so we will transform the math to
 * use fixed point values to be able to evaluate it in realtime. Assuming a maximum of 250000 steps
 * per second (driver pulses should at least be 2µS hi/2µS lo), and allocating 2 bits to avoid
 * overflows on the evaluation of the Bézier curve, means we can use
 *
 * t: unsigned Q0.32 (0 <= t < 1) range 0 to 0xFFFFFFFF unsigned
 * A: signed Q24.7 , range = +/ 250000 * 6 * 128 = +/ 192000000 = 0x0B71B000  28 bits + sign
 * B: signed Q24.7 , range = +/ 250000 *15 * 128 = +/ 480000000 = 0x1C9C3800  29 bits + sign
 * C: signed Q24.7 , range = +/ 250000 *10 * 128 = +/ 320000000 = 0x1312D000  29 bits + sign
 * F: signed Q24.7 , range = +/ 250000 * 128 = 32000000 = 0x01E84800  25 bits + sign
 *
 * The trapezoid generator state contains the following information, that we will use to create and evaluate
 * the Bézier curve:
 *
 * blk>step_event_count [TS] = The total count of steps for this movement. (=distance)
 * blk>initial_rate [VI] = The initial steps per second (=velocity)
 * blk>final_rate [VF] = The ending steps per second (=velocity)
 * and the count of events completed (step_events_completed) [CS] (=distance until now)
 *
 * Note the abbreviations we use in the following formulae are between []s
 *
 * For Any 32bit CPU:
 *
 * At the start of each trapezoid, calculate the coefficients A,B,C,F and Advance [AV], as follows:
 *
 * A = 6*128*(VF  VI) = 768*(VF  VI)
 * B = 15*128*(VI  VF) = 1920*(VI  VF)
 * C = 10*128*(VF  VI) = 1280*(VF  VI)
 * F = 128*VI = 128*VI
 * AV = (1<<32)/TS ~= 0xFFFFFFFF / TS (To use ARM UDIV, that is 32 bits) (this is computed at the planner, to offload expensive calculations from the ISR)
 *
 * And for each point, evaluate the curve with the following sequence:
 *
 * void lsrs(uint32_t& d, uint32_t s, int cnt) {
 * d = s >> cnt;
 * }
 * void lsls(uint32_t& d, uint32_t s, int cnt) {
 * d = s << cnt;
 * }
 * void lsrs(int32_t& d, uint32_t s, int cnt) {
 * d = uint32_t(s) >> cnt;
 * }
 * void lsls(int32_t& d, uint32_t s, int cnt) {
 * d = uint32_t(s) << cnt;
 * }
 * void umull(uint32_t& rlo, uint32_t& rhi, uint32_t op1, uint32_t op2) {
 * uint64_t res = uint64_t(op1) * op2;
 * rlo = uint32_t(res & 0xFFFFFFFF);
 * rhi = uint32_t((res >> 32) & 0xFFFFFFFF);
 * }
 * void smlal(int32_t& rlo, int32_t& rhi, int32_t op1, int32_t op2) {
 * int64_t mul = int64_t(op1) * op2;
 * int64_t s = int64_t(uint32_t(rlo)  ((uint64_t(uint32_t(rhi)) << 32U)));
 * mul += s;
 * rlo = int32_t(mul & 0xFFFFFFFF);
 * rhi = int32_t((mul >> 32) & 0xFFFFFFFF);
 * }
 * int32_t _eval_bezier_curve_arm(uint32_t curr_step) {
 * register uint32_t flo = 0;
 * register uint32_t fhi = bezier_AV * curr_step;
 * register uint32_t t = fhi;
 * register int32_t alo = bezier_F;
 * register int32_t ahi = 0;
 * register int32_t A = bezier_A;
 * register int32_t B = bezier_B;
 * register int32_t C = bezier_C;
 *
 * lsrs(ahi, alo, 1); // a = F << 31
 * lsls(alo, alo, 31); //
 * umull(flo, fhi, fhi, t); // f *= t
 * umull(flo, fhi, fhi, t); // f>>=32; f*=t
 * lsrs(flo, fhi, 1); //
 * smlal(alo, ahi, flo, C); // a+=(f>>33)*C
 * umull(flo, fhi, fhi, t); // f>>=32; f*=t
 * lsrs(flo, fhi, 1); //
 * smlal(alo, ahi, flo, B); // a+=(f>>33)*B
 * umull(flo, fhi, fhi, t); // f>>=32; f*=t
 * lsrs(flo, fhi, 1); // f>>=33;
 * smlal(alo, ahi, flo, A); // a+=(f>>33)*A;
 * lsrs(alo, ahi, 6); // a>>=38
 *
 * return alo;
 * }
 *
 * This is rewritten in ARM assembly for optimal performance (43 cycles to execute).
 *
 * For AVR, the precision of coefficients is scaled so the Bézier curve can be evaluated in realtime:
 * Let's reduce precision as much as possible. After some experimentation we found that:
 *
 * Assume t and AV with 24 bits is enough
 * A = 6*(VF  VI)
 * B = 15*(VI  VF)
 * C = 10*(VF  VI)
 * F = VI
 * AV = (1<<24)/TS (this is computed at the planner, to offload expensive calculations from the ISR)
 *
 * Instead of storing sign for each coefficient, we will store its absolute value,
 * and flag the sign of the A coefficient, so we can save to store the sign bit.
 * It always holds that sign(A) =  sign(B) = sign(C)
 *
 * So, the resulting range of the coefficients are:
 *
 * t: unsigned (0 <= t < 1) range 0 to 0xFFFFFF unsigned
 * A: signed Q24 , range = 250000 * 6 = 1500000 = 0x16E360  21 bits
 * B: signed Q24 , range = 250000 *15 = 3750000 = 0x393870  22 bits
 * C: signed Q24 , range = 250000 *10 = 2500000 = 0x1312D0  21 bits
 * F: signed Q24 , range = 250000 = 250000 = 0x0ED090  20 bits
 *
 * And for each curve, estimate its coefficients with:
 *
 * void _calc_bezier_curve_coeffs(int32_t v0, int32_t v1, uint32_t av) {
 * // Calculate the Bézier coefficients
 * if (v1 < v0) {
 * A_negative = true;
 * bezier_A = 6 * (v0  v1);
 * bezier_B = 15 * (v0  v1);
 * bezier_C = 10 * (v0  v1);
 * }
 * else {
 * A_negative = false;
 * bezier_A = 6 * (v1  v0);
 * bezier_B = 15 * (v1  v0);
 * bezier_C = 10 * (v1  v0);
 * }
 * bezier_F = v0;
 * }
 *
 * And for each point, evaluate the curve with the following sequence:
 *
 * // unsigned multiplication of 24 bits x 24bits, return upper 16 bits
 * void umul24x24to16hi(uint16_t& r, uint24_t op1, uint24_t op2) {
 * r = (uint64_t(op1) * op2) >> 8;
 * }
 * // unsigned multiplication of 16 bits x 16bits, return upper 16 bits
 * void umul16x16to16hi(uint16_t& r, uint16_t op1, uint16_t op2) {
 * r = (uint32_t(op1) * op2) >> 16;
 * }
 * // unsigned multiplication of 16 bits x 24bits, return upper 24 bits
 * void umul16x24to24hi(uint24_t& r, uint16_t op1, uint24_t op2) {
 * r = uint24_t((uint64_t(op1) * op2) >> 16);
 * }
 *
 * int32_t _eval_bezier_curve(uint32_t curr_step) {
 * // To save computing, the first step is always the initial speed
 * if (!curr_step)
 * return bezier_F;
 *
 * uint16_t t;
 * umul24x24to16hi(t, bezier_AV, curr_step); // t: Range 0  1^16 = 16 bits
 * uint16_t f = t;
 * umul16x16to16hi(f, f, t); // Range 16 bits (unsigned)
 * umul16x16to16hi(f, f, t); // Range 16 bits : f = t^3 (unsigned)
 * uint24_t acc = bezier_F; // Range 20 bits (unsigned)
 * if (A_negative) {
 * uint24_t v;
 * umul16x24to24hi(v, f, bezier_C); // Range 21bits
 * acc = v;
 * umul16x16to16hi(f, f, t); // Range 16 bits : f = t^4 (unsigned)
 * umul16x24to24hi(v, f, bezier_B); // Range 22bits
 * acc += v;
 * umul16x16to16hi(f, f, t); // Range 16 bits : f = t^5 (unsigned)
 * umul16x24to24hi(v, f, bezier_A); // Range 21bits + 15 = 36bits (plus sign)
 * acc = v;
 * }
 * else {
 * uint24_t v;
 * umul16x24to24hi(v, f, bezier_C); // Range 21bits
 * acc += v;
 * umul16x16to16hi(f, f, t); // Range 16 bits : f = t^4 (unsigned)
 * umul16x24to24hi(v, f, bezier_B); // Range 22bits
 * acc = v;
 * umul16x16to16hi(f, f, t); // Range 16 bits : f = t^5 (unsigned)
 * umul16x24to24hi(v, f, bezier_A); // Range 21bits + 15 = 36bits (plus sign)
 * acc += v;
 * }
 * return acc;
 * }
 * These functions are translated to assembler for optimal performance.
 * Coefficient calculation takes 70 cycles. Bezier point evaluation takes 150 cycles.
 */

 #ifdef __AVR__

 // For AVR we use assembly to maximize speed
 void Stepper::_calc_bezier_curve_coeffs(const int32_t v0, const int32_t v1, const uint32_t av) {

 // Store advance
 bezier_AV = av;

 // Calculate the rest of the coefficients
 register uint8_t r2 = v0 & 0xFF;
 register uint8_t r3 = (v0 >> 8) & 0xFF;
 register uint8_t r12 = (v0 >> 16) & 0xFF;
 register uint8_t r5 = v1 & 0xFF;
 register uint8_t r6 = (v1 >> 8) & 0xFF;
 register uint8_t r7 = (v1 >> 16) & 0xFF;
 register uint8_t r4,r8,r9,r10,r11;

 __asm__ __volatile__(
 /* Calculate the Bézier coefficients */
 /* %10:%1:%0 = v0*/
 /* %5:%4:%3 = v1*/
 /* %7:%6:%10 = temporary*/
 /* %9 = val (must be high register!)*/
 /* %10 (must be high register!)*/

 /* Store initial velocity*/
 A("sts bezier_F, %0")
 A("sts bezier_F+1, %1")
 A("sts bezier_F+2, %10") /* bezier_F = %10:%1:%0 = v0 */

 /* Get delta speed */
 A("ldi %2,1") /* %2 = 0xFF, means A_negative = true */
 A("clr %8") /* %8 = 0 */
 A("sub %0,%3")
 A("sbc %1,%4")
 A("sbc %10,%5") /* v0 = v1, C=1 if result is negative */
 A("brcc 1f") /* branch if result is positive (C=0), that means v0 >= v1 */

 /* Result was negative, get the absolute value*/
 A("com %10")
 A("com %1")
 A("neg %0")
 A("sbc %1,%2")
 A("sbc %10,%2") /* %10:%1:%0 +1 > %10:%1:%0 = (v0  v1) = (v1  v0) */
 A("clr %2") /* %2 = 0, means A_negative = false */

 /* Store negative flag*/
 L("1")
 A("sts A_negative, %2") /* Store negative flag */

 /* Compute coefficients A,B and C [20 cycles worst case]*/
 A("ldi %9,6") /* %9 = 6 */
 A("mul %0,%9") /* r1:r0 = 6*LO(v0v1) */
 A("sts bezier_A, r0")
 A("mov %6,r1")
 A("clr %7") /* %7:%6:r0 = 6*LO(v0v1) */
 A("mul %1,%9") /* r1:r0 = 6*MI(v0v1) */
 A("add %6,r0")
 A("adc %7,r1") /* %7:%6:?? += 6*MI(v0v1) << 8 */
 A("mul %10,%9") /* r1:r0 = 6*HI(v0v1) */
 A("add %7,r0") /* %7:%6:?? += 6*HI(v0v1) << 16 */
 A("sts bezier_A+1, %6")
 A("sts bezier_A+2, %7") /* bezier_A = %7:%6:?? = 6*(v0v1) [35 cycles worst] */

 A("ldi %9,15") /* %9 = 15 */
 A("mul %0,%9") /* r1:r0 = 5*LO(v0v1) */
 A("sts bezier_B, r0")
 A("mov %6,r1")
 A("clr %7") /* %7:%6:?? = 5*LO(v0v1) */
 A("mul %1,%9") /* r1:r0 = 5*MI(v0v1) */
 A("add %6,r0")
 A("adc %7,r1") /* %7:%6:?? += 5*MI(v0v1) << 8 */
 A("mul %10,%9") /* r1:r0 = 5*HI(v0v1) */
 A("add %7,r0") /* %7:%6:?? += 5*HI(v0v1) << 16 */
 A("sts bezier_B+1, %6")
 A("sts bezier_B+2, %7") /* bezier_B = %7:%6:?? = 5*(v0v1) [50 cycles worst] */

 A("ldi %9,10") /* %9 = 10 */
 A("mul %0,%9") /* r1:r0 = 10*LO(v0v1) */
 A("sts bezier_C, r0")
 A("mov %6,r1")
 A("clr %7") /* %7:%6:?? = 10*LO(v0v1) */
 A("mul %1,%9") /* r1:r0 = 10*MI(v0v1) */
 A("add %6,r0")
 A("adc %7,r1") /* %7:%6:?? += 10*MI(v0v1) << 8 */
 A("mul %10,%9") /* r1:r0 = 10*HI(v0v1) */
 A("add %7,r0") /* %7:%6:?? += 10*HI(v0v1) << 16 */
 A("sts bezier_C+1, %6")
 " sts bezier_C+2, %7" /* bezier_C = %7:%6:?? = 10*(v0v1) [65 cycles worst] */
 : "+r" (r2),
 "+d" (r3),
 "=r" (r4),
 "+r" (r5),
 "+r" (r6),
 "+r" (r7),
 "=r" (r8),
 "=r" (r9),
 "=r" (r10),
 "=d" (r11),
 "+r" (r12)
 :
 : "r0", "r1", "cc", "memory"
 );
 }

 FORCE_INLINE int32_t Stepper::_eval_bezier_curve(const uint32_t curr_step) {

 // If dealing with the first step, save expensive computing and return the initial speed
 if (!curr_step)
 return bezier_F;

 register uint8_t r0 = 0; /* Zero register */
 register uint8_t r2 = (curr_step) & 0xFF;
 register uint8_t r3 = (curr_step >> 8) & 0xFF;
 register uint8_t r4 = (curr_step >> 16) & 0xFF;
 register uint8_t r1,r5,r6,r7,r8,r9,r10,r11; /* Temporary registers */

 __asm__ __volatile(
 /* umul24x24to16hi(t, bezier_AV, curr_step); t: Range 0  1^16 = 16 bits*/
 A("lds %9,bezier_AV") /* %9 = LO(AV)*/
 A("mul %9,%2") /* r1:r0 = LO(bezier_AV)*LO(curr_step)*/
 A("mov %7,r1") /* %7 = LO(bezier_AV)*LO(curr_step) >> 8*/
 A("clr %8") /* %8:%7 = LO(bezier_AV)*LO(curr_step) >> 8*/
 A("lds %10,bezier_AV+1") /* %10 = MI(AV)*/
 A("mul %10,%2") /* r1:r0 = MI(bezier_AV)*LO(curr_step)*/
 A("add %7,r0")
 A("adc %8,r1") /* %8:%7 += MI(bezier_AV)*LO(curr_step)*/
 A("lds r1,bezier_AV+2") /* r11 = HI(AV)*/
 A("mul r1,%2") /* r1:r0 = HI(bezier_AV)*LO(curr_step)*/
 A("add %8,r0") /* %8:%7 += HI(bezier_AV)*LO(curr_step) << 8*/
 A("mul %9,%3") /* r1:r0 = LO(bezier_AV)*MI(curr_step)*/
 A("add %7,r0")
 A("adc %8,r1") /* %8:%7 += LO(bezier_AV)*MI(curr_step)*/
 A("mul %10,%3") /* r1:r0 = MI(bezier_AV)*MI(curr_step)*/
 A("add %8,r0") /* %8:%7 += LO(bezier_AV)*MI(curr_step) << 8*/
 A("mul %9,%4") /* r1:r0 = LO(bezier_AV)*HI(curr_step)*/
 A("add %8,r0") /* %8:%7 += LO(bezier_AV)*HI(curr_step) << 8*/
 /* %8:%7 = t*/

 /* uint16_t f = t;*/
 A("mov %5,%7") /* %6:%5 = f*/
 A("mov %6,%8")
 /* %6:%5 = f*/

 /* umul16x16to16hi(f, f, t); / Range 16 bits (unsigned) [17] */
 A("mul %5,%7") /* r1:r0 = LO(f) * LO(t)*/
 A("mov %9,r1") /* store MIL(LO(f) * LO(t)) in %9, we need it for rounding*/
 A("clr %10") /* %10 = 0*/
 A("clr %11") /* %11 = 0*/
 A("mul %5,%8") /* r1:r0 = LO(f) * HI(t)*/
 A("add %9,r0") /* %9 += LO(LO(f) * HI(t))*/
 A("adc %10,r1") /* %10 = HI(LO(f) * HI(t))*/
 A("adc %11,%0") /* %11 += carry*/
 A("mul %6,%7") /* r1:r0 = HI(f) * LO(t)*/
 A("add %9,r0") /* %9 += LO(HI(f) * LO(t))*/
 A("adc %10,r1") /* %10 += HI(HI(f) * LO(t)) */
 A("adc %11,%0") /* %11 += carry*/
 A("mul %6,%8") /* r1:r0 = HI(f) * HI(t)*/
 A("add %10,r0") /* %10 += LO(HI(f) * HI(t))*/
 A("adc %11,r1") /* %11 += HI(HI(f) * HI(t))*/
 A("mov %5,%10") /* %6:%5 = */
 A("mov %6,%11") /* f = %10:%11*/

 /* umul16x16to16hi(f, f, t); / Range 16 bits : f = t^3 (unsigned) [17]*/
 A("mul %5,%7") /* r1:r0 = LO(f) * LO(t)*/
 A("mov %1,r1") /* store MIL(LO(f) * LO(t)) in %1, we need it for rounding*/
 A("clr %10") /* %10 = 0*/
 A("clr %11") /* %11 = 0*/
 A("mul %5,%8") /* r1:r0 = LO(f) * HI(t)*/
 A("add %1,r0") /* %1 += LO(LO(f) * HI(t))*/
 A("adc %10,r1") /* %10 = HI(LO(f) * HI(t))*/
 A("adc %11,%0") /* %11 += carry*/
 A("mul %6,%7") /* r1:r0 = HI(f) * LO(t)*/
 A("add %1,r0") /* %1 += LO(HI(f) * LO(t))*/
 A("adc %10,r1") /* %10 += HI(HI(f) * LO(t))*/
 A("adc %11,%0") /* %11 += carry*/
 A("mul %6,%8") /* r1:r0 = HI(f) * HI(t)*/
 A("add %10,r0") /* %10 += LO(HI(f) * HI(t))*/
 A("adc %11,r1") /* %11 += HI(HI(f) * HI(t))*/
 A("mov %5,%10") /* %6:%5 =*/
 A("mov %6,%11") /* f = %10:%11*/
 /* [15 +17*2] = [49]*/

 /* %4:%3:%2 will be acc from now on*/

 /* uint24_t acc = bezier_F; / Range 20 bits (unsigned)*/
 A("clr %9") /* "decimal place we get for free"*/
 A("lds %2,bezier_F")
 A("lds %3,bezier_F+1")
 A("lds %4,bezier_F+2") /* %4:%3:%2 = acc*/

 /* if (A_negative) {*/
 A("lds r0,A_negative")
 A("or r0,%0") /* Is flag signalling negative? */
 A("brne 3f") /* If yes, Skip next instruction if A was negative*/
 A("rjmp 1f") /* Otherwise, jump */

 /* uint24_t v; */
 /* umul16x24to24hi(v, f, bezier_C); / Range 21bits [29] */
 /* acc = v; */
 L("3")
 A("lds %10, bezier_C") /* %10 = LO(bezier_C)*/
 A("mul %10,%5") /* r1:r0 = LO(bezier_C) * LO(f)*/
 A("sub %9,r1")
 A("sbc %2,%0")
 A("sbc %3,%0")
 A("sbc %4,%0") /* %4:%3:%2:%9 = HI(LO(bezier_C) * LO(f))*/
 A("lds %11, bezier_C+1") /* %11 = MI(bezier_C)*/
 A("mul %11,%5") /* r1:r0 = MI(bezier_C) * LO(f)*/
 A("sub %9,r0")
 A("sbc %2,r1")
 A("sbc %3,%0")
 A("sbc %4,%0") /* %4:%3:%2:%9 = MI(bezier_C) * LO(f)*/
 A("lds %1, bezier_C+2") /* %1 = HI(bezier_C)*/
 A("mul %1,%5") /* r1:r0 = MI(bezier_C) * LO(f)*/
 A("sub %2,r0")
 A("sbc %3,r1")
 A("sbc %4,%0") /* %4:%3:%2:%9 = HI(bezier_C) * LO(f) << 8*/
 A("mul %10,%6") /* r1:r0 = LO(bezier_C) * MI(f)*/
 A("sub %9,r0")
 A("sbc %2,r1")
 A("sbc %3,%0")
 A("sbc %4,%0") /* %4:%3:%2:%9 = LO(bezier_C) * MI(f)*/
 A("mul %11,%6") /* r1:r0 = MI(bezier_C) * MI(f)*/
 A("sub %2,r0")
 A("sbc %3,r1")
 A("sbc %4,%0") /* %4:%3:%2:%9 = MI(bezier_C) * MI(f) << 8*/
 A("mul %1,%6") /* r1:r0 = HI(bezier_C) * LO(f)*/
 A("sub %3,r0")
 A("sbc %4,r1") /* %4:%3:%2:%9 = HI(bezier_C) * LO(f) << 16*/

 /* umul16x16to16hi(f, f, t); / Range 16 bits : f = t^3 (unsigned) [17]*/
 A("mul %5,%7") /* r1:r0 = LO(f) * LO(t)*/
 A("mov %1,r1") /* store MIL(LO(f) * LO(t)) in %1, we need it for rounding*/
 A("clr %10") /* %10 = 0*/
 A("clr %11") /* %11 = 0*/
 A("mul %5,%8") /* r1:r0 = LO(f) * HI(t)*/
 A("add %1,r0") /* %1 += LO(LO(f) * HI(t))*/
 A("adc %10,r1") /* %10 = HI(LO(f) * HI(t))*/
 A("adc %11,%0") /* %11 += carry*/
 A("mul %6,%7") /* r1:r0 = HI(f) * LO(t)*/
 A("add %1,r0") /* %1 += LO(HI(f) * LO(t))*/
 A("adc %10,r1") /* %10 += HI(HI(f) * LO(t))*/
 A("adc %11,%0") /* %11 += carry*/
 A("mul %6,%8") /* r1:r0 = HI(f) * HI(t)*/
 A("add %10,r0") /* %10 += LO(HI(f) * HI(t))*/
 A("adc %11,r1") /* %11 += HI(HI(f) * HI(t))*/
 A("mov %5,%10") /* %6:%5 =*/
 A("mov %6,%11") /* f = %10:%11*/

 /* umul16x24to24hi(v, f, bezier_B); / Range 22bits [29]*/
 /* acc += v; */
 A("lds %10, bezier_B") /* %10 = LO(bezier_B)*/
 A("mul %10,%5") /* r1:r0 = LO(bezier_B) * LO(f)*/
 A("add %9,r1")
 A("adc %2,%0")
 A("adc %3,%0")
 A("adc %4,%0") /* %4:%3:%2:%9 += HI(LO(bezier_B) * LO(f))*/
 A("lds %11, bezier_B+1") /* %11 = MI(bezier_B)*/
 A("mul %11,%5") /* r1:r0 = MI(bezier_B) * LO(f)*/
 A("add %9,r0")
 A("adc %2,r1")
 A("adc %3,%0")
 A("adc %4,%0") /* %4:%3:%2:%9 += MI(bezier_B) * LO(f)*/
 A("lds %1, bezier_B+2") /* %1 = HI(bezier_B)*/
 A("mul %1,%5") /* r1:r0 = MI(bezier_B) * LO(f)*/
 A("add %2,r0")
 A("adc %3,r1")
 A("adc %4,%0") /* %4:%3:%2:%9 += HI(bezier_B) * LO(f) << 8*/
 A("mul %10,%6") /* r1:r0 = LO(bezier_B) * MI(f)*/
 A("add %9,r0")
 A("adc %2,r1")
 A("adc %3,%0")
 A("adc %4,%0") /* %4:%3:%2:%9 += LO(bezier_B) * MI(f)*/
 A("mul %11,%6") /* r1:r0 = MI(bezier_B) * MI(f)*/
 A("add %2,r0")
 A("adc %3,r1")
 A("adc %4,%0") /* %4:%3:%2:%9 += MI(bezier_B) * MI(f) << 8*/
 A("mul %1,%6") /* r1:r0 = HI(bezier_B) * LO(f)*/
 A("add %3,r0")
 A("adc %4,r1") /* %4:%3:%2:%9 += HI(bezier_B) * LO(f) << 16*/

 /* umul16x16to16hi(f, f, t); / Range 16 bits : f = t^5 (unsigned) [17]*/
 A("mul %5,%7") /* r1:r0 = LO(f) * LO(t)*/
 A("mov %1,r1") /* store MIL(LO(f) * LO(t)) in %1, we need it for rounding*/
 A("clr %10") /* %10 = 0*/
 A("clr %11") /* %11 = 0*/
 A("mul %5,%8") /* r1:r0 = LO(f) * HI(t)*/
 A("add %1,r0") /* %1 += LO(LO(f) * HI(t))*/
 A("adc %10,r1") /* %10 = HI(LO(f) * HI(t))*/
 A("adc %11,%0") /* %11 += carry*/
 A("mul %6,%7") /* r1:r0 = HI(f) * LO(t)*/
 A("add %1,r0") /* %1 += LO(HI(f) * LO(t))*/
 A("adc %10,r1") /* %10 += HI(HI(f) * LO(t))*/
 A("adc %11,%0") /* %11 += carry*/
 A("mul %6,%8") /* r1:r0 = HI(f) * HI(t)*/
 A("add %10,r0") /* %10 += LO(HI(f) * HI(t))*/
 A("adc %11,r1") /* %11 += HI(HI(f) * HI(t))*/
 A("mov %5,%10") /* %6:%5 =*/
 A("mov %6,%11") /* f = %10:%11*/

 /* umul16x24to24hi(v, f, bezier_A); / Range 21bits [29]*/
 /* acc = v; */
 A("lds %10, bezier_A") /* %10 = LO(bezier_A)*/
 A("mul %10,%5") /* r1:r0 = LO(bezier_A) * LO(f)*/
 A("sub %9,r1")
 A("sbc %2,%0")
 A("sbc %3,%0")
 A("sbc %4,%0") /* %4:%3:%2:%9 = HI(LO(bezier_A) * LO(f))*/
 A("lds %11, bezier_A+1") /* %11 = MI(bezier_A)*/
 A("mul %11,%5") /* r1:r0 = MI(bezier_A) * LO(f)*/
 A("sub %9,r0")
 A("sbc %2,r1")
 A("sbc %3,%0")
 A("sbc %4,%0") /* %4:%3:%2:%9 = MI(bezier_A) * LO(f)*/
 A("lds %1, bezier_A+2") /* %1 = HI(bezier_A)*/
 A("mul %1,%5") /* r1:r0 = MI(bezier_A) * LO(f)*/
 A("sub %2,r0")
 A("sbc %3,r1")
 A("sbc %4,%0") /* %4:%3:%2:%9 = HI(bezier_A) * LO(f) << 8*/
 A("mul %10,%6") /* r1:r0 = LO(bezier_A) * MI(f)*/
 A("sub %9,r0")
 A("sbc %2,r1")
 A("sbc %3,%0")
 A("sbc %4,%0") /* %4:%3:%2:%9 = LO(bezier_A) * MI(f)*/
 A("mul %11,%6") /* r1:r0 = MI(bezier_A) * MI(f)*/
 A("sub %2,r0")
 A("sbc %3,r1")
 A("sbc %4,%0") /* %4:%3:%2:%9 = MI(bezier_A) * MI(f) << 8*/
 A("mul %1,%6") /* r1:r0 = HI(bezier_A) * LO(f)*/
 A("sub %3,r0")
 A("sbc %4,r1") /* %4:%3:%2:%9 = HI(bezier_A) * LO(f) << 16*/
 A("jmp 2f") /* Done!*/

 L("1")

 /* uint24_t v; */
 /* umul16x24to24hi(v, f, bezier_C); / Range 21bits [29]*/
 /* acc += v; */
 A("lds %10, bezier_C") /* %10 = LO(bezier_C)*/
 A("mul %10,%5") /* r1:r0 = LO(bezier_C) * LO(f)*/
 A("add %9,r1")
 A("adc %2,%0")
 A("adc %3,%0")
 A("adc %4,%0") /* %4:%3:%2:%9 += HI(LO(bezier_C) * LO(f))*/
 A("lds %11, bezier_C+1") /* %11 = MI(bezier_C)*/
 A("mul %11,%5") /* r1:r0 = MI(bezier_C) * LO(f)*/
 A("add %9,r0")
 A("adc %2,r1")
 A("adc %3,%0")
 A("adc %4,%0") /* %4:%3:%2:%9 += MI(bezier_C) * LO(f)*/
 A("lds %1, bezier_C+2") /* %1 = HI(bezier_C)*/
 A("mul %1,%5") /* r1:r0 = MI(bezier_C) * LO(f)*/
 A("add %2,r0")
 A("adc %3,r1")
 A("adc %4,%0") /* %4:%3:%2:%9 += HI(bezier_C) * LO(f) << 8*/
 A("mul %10,%6") /* r1:r0 = LO(bezier_C) * MI(f)*/
 A("add %9,r0")
 A("adc %2,r1")
 A("adc %3,%0")
 A("adc %4,%0") /* %4:%3:%2:%9 += LO(bezier_C) * MI(f)*/
 A("mul %11,%6") /* r1:r0 = MI(bezier_C) * MI(f)*/
 A("add %2,r0")
 A("adc %3,r1")
 A("adc %4,%0") /* %4:%3:%2:%9 += MI(bezier_C) * MI(f) << 8*/
 A("mul %1,%6") /* r1:r0 = HI(bezier_C) * LO(f)*/
 A("add %3,r0")
 A("adc %4,r1") /* %4:%3:%2:%9 += HI(bezier_C) * LO(f) << 16*/

 /* umul16x16to16hi(f, f, t); / Range 16 bits : f = t^3 (unsigned) [17]*/
 A("mul %5,%7") /* r1:r0 = LO(f) * LO(t)*/
 A("mov %1,r1") /* store MIL(LO(f) * LO(t)) in %1, we need it for rounding*/
 A("clr %10") /* %10 = 0*/
 A("clr %11") /* %11 = 0*/
 A("mul %5,%8") /* r1:r0 = LO(f) * HI(t)*/
 A("add %1,r0") /* %1 += LO(LO(f) * HI(t))*/
 A("adc %10,r1") /* %10 = HI(LO(f) * HI(t))*/
 A("adc %11,%0") /* %11 += carry*/
 A("mul %6,%7") /* r1:r0 = HI(f) * LO(t)*/
 A("add %1,r0") /* %1 += LO(HI(f) * LO(t))*/
 A("adc %10,r1") /* %10 += HI(HI(f) * LO(t))*/
 A("adc %11,%0") /* %11 += carry*/
 A("mul %6,%8") /* r1:r0 = HI(f) * HI(t)*/
 A("add %10,r0") /* %10 += LO(HI(f) * HI(t))*/
 A("adc %11,r1") /* %11 += HI(HI(f) * HI(t))*/
 A("mov %5,%10") /* %6:%5 =*/
 A("mov %6,%11") /* f = %10:%11*/

 /* umul16x24to24hi(v, f, bezier_B); / Range 22bits [29]*/
 /* acc = v;*/
 A("lds %10, bezier_B") /* %10 = LO(bezier_B)*/
 A("mul %10,%5") /* r1:r0 = LO(bezier_B) * LO(f)*/
 A("sub %9,r1")
 A("sbc %2,%0")
 A("sbc %3,%0")
 A("sbc %4,%0") /* %4:%3:%2:%9 = HI(LO(bezier_B) * LO(f))*/
 A("lds %11, bezier_B+1") /* %11 = MI(bezier_B)*/
 A("mul %11,%5") /* r1:r0 = MI(bezier_B) * LO(f)*/
 A("sub %9,r0")
 A("sbc %2,r1")
 A("sbc %3,%0")
 A("sbc %4,%0") /* %4:%3:%2:%9 = MI(bezier_B) * LO(f)*/
 A("lds %1, bezier_B+2") /* %1 = HI(bezier_B)*/
 A("mul %1,%5") /* r1:r0 = MI(bezier_B) * LO(f)*/
 A("sub %2,r0")
 A("sbc %3,r1")
 A("sbc %4,%0") /* %4:%3:%2:%9 = HI(bezier_B) * LO(f) << 8*/
 A("mul %10,%6") /* r1:r0 = LO(bezier_B) * MI(f)*/
 A("sub %9,r0")
 A("sbc %2,r1")
 A("sbc %3,%0")
 A("sbc %4,%0") /* %4:%3:%2:%9 = LO(bezier_B) * MI(f)*/
 A("mul %11,%6") /* r1:r0 = MI(bezier_B) * MI(f)*/
 A("sub %2,r0")
 A("sbc %3,r1")
 A("sbc %4,%0") /* %4:%3:%2:%9 = MI(bezier_B) * MI(f) << 8*/
 A("mul %1,%6") /* r1:r0 = HI(bezier_B) * LO(f)*/
 A("sub %3,r0")
 A("sbc %4,r1") /* %4:%3:%2:%9 = HI(bezier_B) * LO(f) << 16*/

 /* umul16x16to16hi(f, f, t); / Range 16 bits : f = t^5 (unsigned) [17]*/
 A("mul %5,%7") /* r1:r0 = LO(f) * LO(t)*/
 A("mov %1,r1") /* store MIL(LO(f) * LO(t)) in %1, we need it for rounding*/
 A("clr %10") /* %10 = 0*/
 A("clr %11") /* %11 = 0*/
 A("mul %5,%8") /* r1:r0 = LO(f) * HI(t)*/
 A("add %1,r0") /* %1 += LO(LO(f) * HI(t))*/
 A("adc %10,r1") /* %10 = HI(LO(f) * HI(t))*/
 A("adc %11,%0") /* %11 += carry*/
 A("mul %6,%7") /* r1:r0 = HI(f) * LO(t)*/
 A("add %1,r0") /* %1 += LO(HI(f) * LO(t))*/
 A("adc %10,r1") /* %10 += HI(HI(f) * LO(t))*/
 A("adc %11,%0") /* %11 += carry*/
 A("mul %6,%8") /* r1:r0 = HI(f) * HI(t)*/
 A("add %10,r0") /* %10 += LO(HI(f) * HI(t))*/
 A("adc %11,r1") /* %11 += HI(HI(f) * HI(t))*/
 A("mov %5,%10") /* %6:%5 =*/
 A("mov %6,%11") /* f = %10:%11*/

 /* umul16x24to24hi(v, f, bezier_A); / Range 21bits [29]*/
 /* acc += v; */
 A("lds %10, bezier_A") /* %10 = LO(bezier_A)*/
 A("mul %10,%5") /* r1:r0 = LO(bezier_A) * LO(f)*/
 A("add %9,r1")
 A("adc %2,%0")
 A("adc %3,%0")
 A("adc %4,%0") /* %4:%3:%2:%9 += HI(LO(bezier_A) * LO(f))*/
 A("lds %11, bezier_A+1") /* %11 = MI(bezier_A)*/
 A("mul %11,%5") /* r1:r0 = MI(bezier_A) * LO(f)*/
 A("add %9,r0")
 A("adc %2,r1")
 A("adc %3,%0")
 A("adc %4,%0") /* %4:%3:%2:%9 += MI(bezier_A) * LO(f)*/
 A("lds %1, bezier_A+2") /* %1 = HI(bezier_A)*/
 A("mul %1,%5") /* r1:r0 = MI(bezier_A) * LO(f)*/
 A("add %2,r0")
 A("adc %3,r1")<
