You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2554 lines
94 KiB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
5 years ago
4 years ago
4 years ago
  1. /**
  2. * Marlin 3D Printer Firmware
  3. * Copyright (C) 2016 MarlinFirmware [https://github.com/MarlinFirmware/Marlin]
  4. *
  5. * Based on Sprinter and grbl.
  6. * Copyright (C) 2011 Camiel Gubbels / Erik van der Zalm
  7. *
  8. * This program is free software: you can redistribute it and/or modify
  9. * it under the terms of the GNU General Public License as published by
  10. * the Free Software Foundation, either version 3 of the License, or
  11. * (at your option) any later version.
  12. *
  13. * This program is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. * GNU General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU General Public License
  19. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  20. *
  21. */
  22. /**
  23. * stepper.cpp - A singleton object to execute motion plans using stepper motors
  24. * Marlin Firmware
  25. *
  26. * Derived from Grbl
  27. * Copyright (c) 2009-2011 Simen Svale Skogsrud
  28. *
  29. * Grbl is free software: you can redistribute it and/or modify
  30. * it under the terms of the GNU General Public License as published by
  31. * the Free Software Foundation, either version 3 of the License, or
  32. * (at your option) any later version.
  33. *
  34. * Grbl is distributed in the hope that it will be useful,
  35. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  36. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  37. * GNU General Public License for more details.
  38. *
  39. * You should have received a copy of the GNU General Public License
  40. * along with Grbl. If not, see <http://www.gnu.org/licenses/>.
  41. */
  42. /**
  43. * Timer calculations informed by the 'RepRap cartesian firmware' by Zack Smith
  44. * and Philipp Tiefenbacher.
  45. */
  46. /**
  47. * __________________________
  48. * /| |\ _________________ ^
  49. * / | | \ /| |\ |
  50. * / | | \ / | | \ s
  51. * / | | | | | \ p
  52. * / | | | | | \ e
  53. * +-----+------------------------+---+--+---------------+----+ e
  54. * | BLOCK 1 | BLOCK 2 | d
  55. *
  56. * time ----->
  57. *
  58. * The trapezoid is the shape of the speed curve over time. It starts at block->initial_rate, accelerates
  59. * for the first block->accelerate_until step events, then keeps going at constant speed until
  60. * step_events_completed reaches block->decelerate_after, after which it decelerates until the trapezoid generator is reset.
  61. * The slope of acceleration is calculated using v = u + at where t is the accumulated timer values of the steps so far.
  62. */
  63. /**
  64. * Marlin uses the Bresenham algorithm. For a detailed explanation of theory and
  65. * method see https://www.cs.helsinki.fi/group/goa/mallinnus/lines/bresenh.html
  66. */
  67. /**
  68. * Jerk controlled movements planner added Apr 2018 by Eduardo José Tagle.
  69. * Equations based on Synthetos TinyG2 sources, but the fixed-point
  70. * implementation is new, as we are running the ISR with a variable period.
  71. * Also implemented the Bézier velocity curve evaluation in ARM assembler,
  72. * to avoid impacting ISR speed.
  73. */
  74. #include "stepper.h"
  75. #ifdef __AVR__
  76. #include "speed_lookuptable.h"
  77. #endif
  78. #include "endstops.h"
  79. #include "planner.h"
  80. #include "motion.h"
  81. #include "../module/temperature.h"
  82. #include "../lcd/ultralcd.h"
  83. #include "../core/language.h"
  84. #include "../gcode/queue.h"
  85. #include "../sd/cardreader.h"
  86. #include "../Marlin.h"
  87. #include "../HAL/Delay.h"
  88. #if MB(ALLIGATOR)
  89. #include "../feature/dac/dac_dac084s085.h"
  90. #endif
  91. #if HAS_DIGIPOTSS
  92. #include <SPI.h>
  93. #endif
  94. Stepper stepper; // Singleton
  95. // public:
  96. block_t* Stepper::current_block = NULL; // A pointer to the block currently being traced
  97. #if ENABLED(X_DUAL_ENDSTOPS) || ENABLED(Y_DUAL_ENDSTOPS) || ENABLED(Z_DUAL_ENDSTOPS)
  98. bool Stepper::homing_dual_axis = false;
  99. #endif
  100. #if HAS_MOTOR_CURRENT_PWM
  101. uint32_t Stepper::motor_current_setting[3]; // Initialized by settings.load()
  102. #endif
  103. // private:
  104. uint8_t Stepper::last_direction_bits = 0,
  105. Stepper::axis_did_move;
  106. bool Stepper::abort_current_block;
  107. #if DISABLED(MIXING_EXTRUDER)
  108. uint8_t Stepper::last_moved_extruder = 0xFF;
  109. #endif
  110. #if ENABLED(X_DUAL_ENDSTOPS)
  111. bool Stepper::locked_X_motor = false, Stepper::locked_X2_motor = false;
  112. #endif
  113. #if ENABLED(Y_DUAL_ENDSTOPS)
  114. bool Stepper::locked_Y_motor = false, Stepper::locked_Y2_motor = false;
  115. #endif
  116. #if ENABLED(Z_DUAL_ENDSTOPS)
  117. bool Stepper::locked_Z_motor = false, Stepper::locked_Z2_motor = false;
  118. #endif
  119. uint32_t Stepper::acceleration_time, Stepper::deceleration_time;
  120. uint8_t Stepper::steps_per_isr;
  121. #if DISABLED(ADAPTIVE_STEP_SMOOTHING)
  122. constexpr
  123. #endif
  124. uint8_t Stepper::oversampling_factor;
  125. int32_t Stepper::delta_error[XYZE] = { 0 };
  126. uint32_t Stepper::advance_dividend[XYZE] = { 0 },
  127. Stepper::advance_divisor = 0,
  128. Stepper::step_events_completed = 0, // The number of step events executed in the current block
  129. Stepper::accelerate_until, // The point from where we need to stop acceleration
  130. Stepper::decelerate_after, // The point from where we need to start decelerating
  131. Stepper::step_event_count; // The total event count for the current block
  132. #if ENABLED(MIXING_EXTRUDER)
  133. int32_t Stepper::delta_error_m[MIXING_STEPPERS];
  134. uint32_t Stepper::advance_dividend_m[MIXING_STEPPERS],
  135. Stepper::advance_divisor_m;
  136. #else
  137. int8_t Stepper::active_extruder; // Active extruder
  138. #endif
  139. #if ENABLED(S_CURVE_ACCELERATION)
  140. int32_t __attribute__((used)) Stepper::bezier_A __asm__("bezier_A"); // A coefficient in Bézier speed curve with alias for assembler
  141. int32_t __attribute__((used)) Stepper::bezier_B __asm__("bezier_B"); // B coefficient in Bézier speed curve with alias for assembler
  142. int32_t __attribute__((used)) Stepper::bezier_C __asm__("bezier_C"); // C coefficient in Bézier speed curve with alias for assembler
  143. uint32_t __attribute__((used)) Stepper::bezier_F __asm__("bezier_F"); // F coefficient in Bézier speed curve with alias for assembler
  144. uint32_t __attribute__((used)) Stepper::bezier_AV __asm__("bezier_AV"); // AV coefficient in Bézier speed curve with alias for assembler
  145. #ifdef __AVR__
  146. bool __attribute__((used)) Stepper::A_negative __asm__("A_negative"); // If A coefficient was negative
  147. #endif
  148. bool Stepper::bezier_2nd_half; // =false If Bézier curve has been initialized or not
  149. #endif
  150. uint32_t Stepper::nextMainISR = 0;
  151. #if ENABLED(LIN_ADVANCE)
  152. constexpr uint32_t LA_ADV_NEVER = 0xFFFFFFFF;
  153. uint32_t Stepper::nextAdvanceISR = LA_ADV_NEVER,
  154. Stepper::LA_isr_rate = LA_ADV_NEVER;
  155. uint16_t Stepper::LA_current_adv_steps = 0,
  156. Stepper::LA_final_adv_steps,
  157. Stepper::LA_max_adv_steps;
  158. int8_t Stepper::LA_steps = 0;
  159. bool Stepper::LA_use_advance_lead;
  160. #endif // LIN_ADVANCE
  161. int32_t Stepper::ticks_nominal = -1;
  162. #if DISABLED(S_CURVE_ACCELERATION)
  163. uint32_t Stepper::acc_step_rate; // needed for deceleration start point
  164. #endif
  165. volatile int32_t Stepper::endstops_trigsteps[XYZ];
  166. volatile int32_t Stepper::count_position[NUM_AXIS] = { 0 };
  167. int8_t Stepper::count_direction[NUM_AXIS] = { 0, 0, 0, 0 };
  168. #if ENABLED(X_DUAL_ENDSTOPS) || ENABLED(Y_DUAL_ENDSTOPS) || ENABLED(Z_DUAL_ENDSTOPS)
  169. #define DUAL_ENDSTOP_APPLY_STEP(A,V) \
  170. if (homing_dual_axis) { \
  171. if (A##_HOME_DIR < 0) { \
  172. if (!(TEST(endstops.state(), A##_MIN) && count_direction[_AXIS(A)] < 0) && !locked_##A##_motor) A##_STEP_WRITE(V); \
  173. if (!(TEST(endstops.state(), A##2_MIN) && count_direction[_AXIS(A)] < 0) && !locked_##A##2_motor) A##2_STEP_WRITE(V); \
  174. } \
  175. else { \
  176. if (!(TEST(endstops.state(), A##_MAX) && count_direction[_AXIS(A)] > 0) && !locked_##A##_motor) A##_STEP_WRITE(V); \
  177. if (!(TEST(endstops.state(), A##2_MAX) && count_direction[_AXIS(A)] > 0) && !locked_##A##2_motor) A##2_STEP_WRITE(V); \
  178. } \
  179. } \
  180. else { \
  181. A##_STEP_WRITE(V); \
  182. A##2_STEP_WRITE(V); \
  183. }
  184. #endif
  185. #if ENABLED(X_DUAL_STEPPER_DRIVERS)
  186. #define X_APPLY_DIR(v,Q) do{ X_DIR_WRITE(v); X2_DIR_WRITE((v) != INVERT_X2_VS_X_DIR); }while(0)
  187. #if ENABLED(X_DUAL_ENDSTOPS)
  188. #define X_APPLY_STEP(v,Q) DUAL_ENDSTOP_APPLY_STEP(X,v)
  189. #else
  190. #define X_APPLY_STEP(v,Q) do{ X_STEP_WRITE(v); X2_STEP_WRITE(v); }while(0)
  191. #endif
  192. #elif ENABLED(DUAL_X_CARRIAGE)
  193. #define X_APPLY_DIR(v,ALWAYS) \
  194. if (extruder_duplication_enabled || ALWAYS) { \
  195. X_DIR_WRITE(v); \
  196. X2_DIR_WRITE(v); \
  197. } \
  198. else { \
  199. if (movement_extruder()) X2_DIR_WRITE(v); else X_DIR_WRITE(v); \
  200. }
  201. #define X_APPLY_STEP(v,ALWAYS) \
  202. if (extruder_duplication_enabled || ALWAYS) { \
  203. X_STEP_WRITE(v); \
  204. X2_STEP_WRITE(v); \
  205. } \
  206. else { \
  207. if (movement_extruder()) X2_STEP_WRITE(v); else X_STEP_WRITE(v); \
  208. }
  209. #else
  210. #define X_APPLY_DIR(v,Q) X_DIR_WRITE(v)
  211. #define X_APPLY_STEP(v,Q) X_STEP_WRITE(v)
  212. #endif
  213. #if ENABLED(Y_DUAL_STEPPER_DRIVERS)
  214. #define Y_APPLY_DIR(v,Q) do{ Y_DIR_WRITE(v); Y2_DIR_WRITE((v) != INVERT_Y2_VS_Y_DIR); }while(0)
  215. #if ENABLED(Y_DUAL_ENDSTOPS)
  216. #define Y_APPLY_STEP(v,Q) DUAL_ENDSTOP_APPLY_STEP(Y,v)
  217. #else
  218. #define Y_APPLY_STEP(v,Q) do{ Y_STEP_WRITE(v); Y2_STEP_WRITE(v); }while(0)
  219. #endif
  220. #else
  221. #define Y_APPLY_DIR(v,Q) Y_DIR_WRITE(v)
  222. #define Y_APPLY_STEP(v,Q) Y_STEP_WRITE(v)
  223. #endif
  224. #if ENABLED(Z_DUAL_STEPPER_DRIVERS)
  225. #define Z_APPLY_DIR(v,Q) do{ Z_DIR_WRITE(v); Z2_DIR_WRITE(v); }while(0)
  226. #if ENABLED(Z_DUAL_ENDSTOPS)
  227. #define Z_APPLY_STEP(v,Q) DUAL_ENDSTOP_APPLY_STEP(Z,v)
  228. #else
  229. #define Z_APPLY_STEP(v,Q) do{ Z_STEP_WRITE(v); Z2_STEP_WRITE(v); }while(0)
  230. #endif
  231. #else
  232. #define Z_APPLY_DIR(v,Q) Z_DIR_WRITE(v)
  233. #define Z_APPLY_STEP(v,Q) Z_STEP_WRITE(v)
  234. #endif
  235. #if DISABLED(MIXING_EXTRUDER)
  236. #define E_APPLY_STEP(v,Q) E_STEP_WRITE(active_extruder, v)
  237. #endif
  238. void Stepper::wake_up() {
  239. // TCNT1 = 0;
  240. ENABLE_STEPPER_DRIVER_INTERRUPT();
  241. }
  242. /**
  243. * Set the stepper direction of each axis
  244. *
  245. * COREXY: X_AXIS=A_AXIS and Y_AXIS=B_AXIS
  246. * COREXZ: X_AXIS=A_AXIS and Z_AXIS=C_AXIS
  247. * COREYZ: Y_AXIS=B_AXIS and Z_AXIS=C_AXIS
  248. */
  249. void Stepper::set_directions() {
  250. #define SET_STEP_DIR(A) \
  251. if (motor_direction(_AXIS(A))) { \
  252. A##_APPLY_DIR(INVERT_## A##_DIR, false); \
  253. count_direction[_AXIS(A)] = -1; \
  254. } \
  255. else { \
  256. A##_APPLY_DIR(!INVERT_## A##_DIR, false); \
  257. count_direction[_AXIS(A)] = 1; \
  258. }
  259. #if HAS_X_DIR
  260. SET_STEP_DIR(X); // A
  261. #endif
  262. #if HAS_Y_DIR
  263. SET_STEP_DIR(Y); // B
  264. #endif
  265. #if HAS_Z_DIR
  266. SET_STEP_DIR(Z); // C
  267. #endif
  268. #if DISABLED(LIN_ADVANCE)
  269. #if ENABLED(MIXING_EXTRUDER)
  270. if (motor_direction(E_AXIS)) {
  271. MIXING_STEPPERS_LOOP(j) REV_E_DIR(j);
  272. count_direction[E_AXIS] = -1;
  273. }
  274. else {
  275. MIXING_STEPPERS_LOOP(j) NORM_E_DIR(j);
  276. count_direction[E_AXIS] = 1;
  277. }
  278. #else
  279. if (motor_direction(E_AXIS)) {
  280. REV_E_DIR(active_extruder);
  281. count_direction[E_AXIS] = -1;
  282. }
  283. else {
  284. NORM_E_DIR(active_extruder);
  285. count_direction[E_AXIS] = 1;
  286. }
  287. #endif
  288. #endif // !LIN_ADVANCE
  289. }
  290. #if ENABLED(S_CURVE_ACCELERATION)
  291. /**
  292. * This uses a quintic (fifth-degree) Bézier polynomial for the velocity curve, giving
  293. * a "linear pop" velocity curve; with pop being the sixth derivative of position:
  294. * velocity - 1st, acceleration - 2nd, jerk - 3rd, snap - 4th, crackle - 5th, pop - 6th
  295. *
  296. * The Bézier curve takes the form:
  297. *
  298. * V(t) = P_0 * B_0(t) + P_1 * B_1(t) + P_2 * B_2(t) + P_3 * B_3(t) + P_4 * B_4(t) + P_5 * B_5(t)
  299. *
  300. * Where 0 <= t <= 1, and V(t) is the velocity. P_0 through P_5 are the control points, and B_0(t)
  301. * through B_5(t) are the Bernstein basis as follows:
  302. *
  303. * B_0(t) = (1-t)^5 = -t^5 + 5t^4 - 10t^3 + 10t^2 - 5t + 1
  304. * B_1(t) = 5(1-t)^4 * t = 5t^5 - 20t^4 + 30t^3 - 20t^2 + 5t
  305. * B_2(t) = 10(1-t)^3 * t^2 = -10t^5 + 30t^4 - 30t^3 + 10t^2
  306. * B_3(t) = 10(1-t)^2 * t^3 = 10t^5 - 20t^4 + 10t^3
  307. * B_4(t) = 5(1-t) * t^4 = -5t^5 + 5t^4
  308. * B_5(t) = t^5 = t^5
  309. * ^ ^ ^ ^ ^ ^
  310. * | | | | | |
  311. * A B C D E F
  312. *
  313. * Unfortunately, we cannot use forward-differencing to calculate each position through
  314. * the curve, as Marlin uses variable timer periods. So, we require a formula of the form:
  315. *
  316. * V_f(t) = A*t^5 + B*t^4 + C*t^3 + D*t^2 + E*t + F
  317. *
  318. * Looking at the above B_0(t) through B_5(t) expanded forms, if we take the coefficients of t^5
  319. * through t of the Bézier form of V(t), we can determine that:
  320. *
  321. * A = -P_0 + 5*P_1 - 10*P_2 + 10*P_3 - 5*P_4 + P_5
  322. * B = 5*P_0 - 20*P_1 + 30*P_2 - 20*P_3 + 5*P_4
  323. * C = -10*P_0 + 30*P_1 - 30*P_2 + 10*P_3
  324. * D = 10*P_0 - 20*P_1 + 10*P_2
  325. * E = - 5*P_0 + 5*P_1
  326. * F = P_0
  327. *
  328. * Now, since we will (currently) *always* want the initial acceleration and jerk values to be 0,
  329. * We set P_i = P_0 = P_1 = P_2 (initial velocity), and P_t = P_3 = P_4 = P_5 (target velocity),
  330. * which, after simplification, resolves to:
  331. *
  332. * A = - 6*P_i + 6*P_t = 6*(P_t - P_i)
  333. * B = 15*P_i - 15*P_t = 15*(P_i - P_t)
  334. * C = -10*P_i + 10*P_t = 10*(P_t - P_i)
  335. * D = 0
  336. * E = 0
  337. * F = P_i
  338. *
  339. * As t is evaluated in non-uniform steps here, there is no alternative to evaluating
  340. * the Bézier curve at each point:
  341. *
  342. * V_f(t) = A*t^5 + B*t^4 + C*t^3 + F [0 <= t <= 1]
  343. *
  344. * Floating point arithmetic execution time cost is prohibitive, so we will transform the math to
  345. * use fixed point values to be able to evaluate it in realtime. Assuming a maximum of 250000 steps
  346. * per second (driver pulses should at least be 2µS hi/2µS lo), and allocating 2 bits to avoid
  347. * overflows on the evaluation of the Bézier curve, means we can use
  348. *
  349. * t: unsigned Q0.32 (0 <= t < 1) |range 0 to 0xFFFFFFFF unsigned
  350. * A: signed Q24.7 , |range = +/- 250000 * 6 * 128 = +/- 192000000 = 0x0B71B000 | 28 bits + sign
  351. * B: signed Q24.7 , |range = +/- 250000 *15 * 128 = +/- 480000000 = 0x1C9C3800 | 29 bits + sign
  352. * C: signed Q24.7 , |range = +/- 250000 *10 * 128 = +/- 320000000 = 0x1312D000 | 29 bits + sign
  353. * F: signed Q24.7 , |range = +/- 250000 * 128 = 32000000 = 0x01E84800 | 25 bits + sign
  354. *
  355. * The trapezoid generator state contains the following information, that we will use to create and evaluate
  356. * the Bézier curve:
  357. *
  358. * blk->step_event_count [TS] = The total count of steps for this movement. (=distance)
  359. * blk->initial_rate [VI] = The initial steps per second (=velocity)
  360. * blk->final_rate [VF] = The ending steps per second (=velocity)
  361. * and the count of events completed (step_events_completed) [CS] (=distance until now)
  362. *
  363. * Note the abbreviations we use in the following formulae are between []s
  364. *
  365. * For Any 32bit CPU:
  366. *
  367. * At the start of each trapezoid, calculate the coefficients A,B,C,F and Advance [AV], as follows:
  368. *
  369. * A = 6*128*(VF - VI) = 768*(VF - VI)
  370. * B = 15*128*(VI - VF) = 1920*(VI - VF)
  371. * C = 10*128*(VF - VI) = 1280*(VF - VI)
  372. * F = 128*VI = 128*VI
  373. * AV = (1<<32)/TS ~= 0xFFFFFFFF / TS (To use ARM UDIV, that is 32 bits) (this is computed at the planner, to offload expensive calculations from the ISR)
  374. *
  375. * And for each point, evaluate the curve with the following sequence:
  376. *
  377. * void lsrs(uint32_t& d, uint32_t s, int cnt) {
  378. * d = s >> cnt;
  379. * }
  380. * void lsls(uint32_t& d, uint32_t s, int cnt) {
  381. * d = s << cnt;
  382. * }
  383. * void lsrs(int32_t& d, uint32_t s, int cnt) {
  384. * d = uint32_t(s) >> cnt;
  385. * }
  386. * void lsls(int32_t& d, uint32_t s, int cnt) {
  387. * d = uint32_t(s) << cnt;
  388. * }
  389. * void umull(uint32_t& rlo, uint32_t& rhi, uint32_t op1, uint32_t op2) {
  390. * uint64_t res = uint64_t(op1) * op2;
  391. * rlo = uint32_t(res & 0xFFFFFFFF);
  392. * rhi = uint32_t((res >> 32) & 0xFFFFFFFF);
  393. * }
  394. * void smlal(int32_t& rlo, int32_t& rhi, int32_t op1, int32_t op2) {
  395. * int64_t mul = int64_t(op1) * op2;
  396. * int64_t s = int64_t(uint32_t(rlo) | ((uint64_t(uint32_t(rhi)) << 32U)));
  397. * mul += s;
  398. * rlo = int32_t(mul & 0xFFFFFFFF);
  399. * rhi = int32_t((mul >> 32) & 0xFFFFFFFF);
  400. * }
  401. * int32_t _eval_bezier_curve_arm(uint32_t curr_step) {
  402. * register uint32_t flo = 0;
  403. * register uint32_t fhi = bezier_AV * curr_step;
  404. * register uint32_t t = fhi;
  405. * register int32_t alo = bezier_F;
  406. * register int32_t ahi = 0;
  407. * register int32_t A = bezier_A;
  408. * register int32_t B = bezier_B;
  409. * register int32_t C = bezier_C;
  410. *
  411. * lsrs(ahi, alo, 1); // a = F << 31
  412. * lsls(alo, alo, 31); //
  413. * umull(flo, fhi, fhi, t); // f *= t
  414. * umull(flo, fhi, fhi, t); // f>>=32; f*=t
  415. * lsrs(flo, fhi, 1); //
  416. * smlal(alo, ahi, flo, C); // a+=(f>>33)*C
  417. * umull(flo, fhi, fhi, t); // f>>=32; f*=t
  418. * lsrs(flo, fhi, 1); //
  419. * smlal(alo, ahi, flo, B); // a+=(f>>33)*B
  420. * umull(flo, fhi, fhi, t); // f>>=32; f*=t
  421. * lsrs(flo, fhi, 1); // f>>=33;
  422. * smlal(alo, ahi, flo, A); // a+=(f>>33)*A;
  423. * lsrs(alo, ahi, 6); // a>>=38
  424. *
  425. * return alo;
  426. * }
  427. *
  428. * This is rewritten in ARM assembly for optimal performance (43 cycles to execute).
  429. *
  430. * For AVR, the precision of coefficients is scaled so the Bézier curve can be evaluated in real-time:
  431. * Let's reduce precision as much as possible. After some experimentation we found that:
  432. *
  433. * Assume t and AV with 24 bits is enough
  434. * A = 6*(VF - VI)
  435. * B = 15*(VI - VF)
  436. * C = 10*(VF - VI)
  437. * F = VI
  438. * AV = (1<<24)/TS (this is computed at the planner, to offload expensive calculations from the ISR)
  439. *
  440. * Instead of storing a sign for each coefficient, we store its absolute value
  441. * and flag only the sign of the A coefficient, so no per-coefficient sign bit is needed.
  442. * It always holds that sign(A) = - sign(B) = sign(C)
  443. *
  444. * So, the resulting range of the coefficients are:
  445. *
  446. * t: unsigned (0 <= t < 1) |range 0 to 0xFFFFFF unsigned
  447. * A: signed Q24 , range = 250000 * 6 = 1500000 = 0x16E360 | 21 bits
  448. * B: signed Q24 , range = 250000 *15 = 3750000 = 0x393870 | 22 bits
  449. * C: signed Q24 , range = 250000 *10 = 2500000 = 0x2625A0 | 22 bits
  450. * F: signed Q24 , range = 250000 = 250000 = 0x03D090 | 18 bits
  451. *
  452. * And for each curve, estimate its coefficients with:
  453. *
  454. * void _calc_bezier_curve_coeffs(int32_t v0, int32_t v1, uint32_t av) {
  455. * // Calculate the Bézier coefficients
  456. * if (v1 < v0) {
  457. * A_negative = true;
  458. * bezier_A = 6 * (v0 - v1);
  459. * bezier_B = 15 * (v0 - v1);
  460. * bezier_C = 10 * (v0 - v1);
  461. * }
  462. * else {
  463. * A_negative = false;
  464. * bezier_A = 6 * (v1 - v0);
  465. * bezier_B = 15 * (v1 - v0);
  466. * bezier_C = 10 * (v1 - v0);
  467. * }
  468. * bezier_F = v0;
  469. * }
  470. *
  471. * And for each point, evaluate the curve with the following sequence:
  472. *
  473. * // unsigned multiplication of 24 bits x 24bits, return upper 16 bits
  474. * void umul24x24to16hi(uint16_t& r, uint24_t op1, uint24_t op2) {
  475. * r = (uint64_t(op1) * op2) >> 8;
  476. * }
  477. * // unsigned multiplication of 16 bits x 16bits, return upper 16 bits
  478. * void umul16x16to16hi(uint16_t& r, uint16_t op1, uint16_t op2) {
  479. * r = (uint32_t(op1) * op2) >> 16;
  480. * }
  481. * // unsigned multiplication of 16 bits x 24bits, return upper 24 bits
  482. * void umul16x24to24hi(uint24_t& r, uint16_t op1, uint24_t op2) {
  483. * r = uint24_t((uint64_t(op1) * op2) >> 16);
  484. * }
  485. *
  486. * int32_t _eval_bezier_curve(uint32_t curr_step) {
  487. * // To save computing, the first step is always the initial speed
  488. * if (!curr_step)
  489. * return bezier_F;
  490. *
  491. * uint16_t t;
  492. * umul24x24to16hi(t, bezier_AV, curr_step); // t: Range 0 - 1^16 = 16 bits
  493. * uint16_t f = t;
  494. * umul16x16to16hi(f, f, t); // Range 16 bits (unsigned)
  495. * umul16x16to16hi(f, f, t); // Range 16 bits : f = t^3 (unsigned)
  496. * uint24_t acc = bezier_F; // Range 20 bits (unsigned)
  497. * if (A_negative) {
  498. * uint24_t v;
  499. * umul16x24to24hi(v, f, bezier_C); // Range 21bits
  500. * acc -= v;
  501. * umul16x16to16hi(f, f, t); // Range 16 bits : f = t^4 (unsigned)
  502. * umul16x24to24hi(v, f, bezier_B); // Range 22bits
  503. * acc += v;
  504. * umul16x16to16hi(f, f, t); // Range 16 bits : f = t^5 (unsigned)
  505. * umul16x24to24hi(v, f, bezier_A); // Range 21bits + 15 = 36bits (plus sign)
  506. * acc -= v;
  507. * }
  508. * else {
  509. * uint24_t v;
  510. * umul16x24to24hi(v, f, bezier_C); // Range 21bits
  511. * acc += v;
  512. * umul16x16to16hi(f, f, t); // Range 16 bits : f = t^4 (unsigned)
  513. * umul16x24to24hi(v, f, bezier_B); // Range 22bits
  514. * acc -= v;
  515. * umul16x16to16hi(f, f, t); // Range 16 bits : f = t^5 (unsigned)
  516. * umul16x24to24hi(v, f, bezier_A); // Range 21bits + 15 = 36bits (plus sign)
  517. * acc += v;
  518. * }
  519. * return acc;
  520. * }
  521. * These functions are translated to assembler for optimal performance.
  522. * Coefficient calculation takes 70 cycles. Bezier point evaluation takes 150 cycles.
  523. */
  524. #ifdef __AVR__
  525. // For AVR we use assembly to maximize speed
  526. void Stepper::_calc_bezier_curve_coeffs(const int32_t v0, const int32_t v1, const uint32_t av) {
  527. // Store advance
  528. bezier_AV = av;
  529. // Calculate the rest of the coefficients
  530. register uint8_t r2 = v0 & 0xFF;
  531. register uint8_t r3 = (v0 >> 8) & 0xFF;
  532. register uint8_t r12 = (v0 >> 16) & 0xFF;
  533. register uint8_t r5 = v1 & 0xFF;
  534. register uint8_t r6 = (v1 >> 8) & 0xFF;
  535. register uint8_t r7 = (v1 >> 16) & 0xFF;
  536. register uint8_t r4,r8,r9,r10,r11;
  537. __asm__ __volatile__(
  538. /* Calculate the Bézier coefficients */
  539. /* %10:%1:%0 = v0*/
  540. /* %5:%4:%3 = v1*/
  541. /* %7:%6:%10 = temporary*/
  542. /* %9 = val (must be high register!)*/
  543. /* %10 (must be high register!)*/
  544. /* Store initial velocity*/
  545. A("sts bezier_F, %0")
  546. A("sts bezier_F+1, %1")
  547. A("sts bezier_F+2, %10") /* bezier_F = %10:%1:%0 = v0 */
  548. /* Get delta speed */
  549. A("ldi %2,-1") /* %2 = 0xFF, means A_negative = true */
  550. A("clr %8") /* %8 = 0 */
  551. A("sub %0,%3")
  552. A("sbc %1,%4")
  553. A("sbc %10,%5") /* v0 -= v1, C=1 if result is negative */
  554. A("brcc 1f") /* branch if result is positive (C=0), that means v0 >= v1 */
  555. /* Result was negative, get the absolute value*/
  556. A("com %10")
  557. A("com %1")
  558. A("neg %0")
  559. A("sbc %1,%2")
  560. A("sbc %10,%2") /* %10:%1:%0 +1 -> %10:%1:%0 = -(v0 - v1) = (v1 - v0) */
  561. A("clr %2") /* %2 = 0, means A_negative = false */
  562. /* Store negative flag*/
  563. L("1")
  564. A("sts A_negative, %2") /* Store negative flag */
  565. /* Compute coefficients A,B and C [20 cycles worst case]*/
  566. A("ldi %9,6") /* %9 = 6 */
  567. A("mul %0,%9") /* r1:r0 = 6*LO(v0-v1) */
  568. A("sts bezier_A, r0")
  569. A("mov %6,r1")
  570. A("clr %7") /* %7:%6:r0 = 6*LO(v0-v1) */
  571. A("mul %1,%9") /* r1:r0 = 6*MI(v0-v1) */
  572. A("add %6,r0")
  573. A("adc %7,r1") /* %7:%6:?? += 6*MI(v0-v1) << 8 */
  574. A("mul %10,%9") /* r1:r0 = 6*HI(v0-v1) */
  575. A("add %7,r0") /* %7:%6:?? += 6*HI(v0-v1) << 16 */
  576. A("sts bezier_A+1, %6")
  577. A("sts bezier_A+2, %7") /* bezier_A = %7:%6:?? = 6*(v0-v1) [35 cycles worst] */
  578. A("ldi %9,15") /* %9 = 15 */
  579. A("mul %0,%9") /* r1:r0 = 5*LO(v0-v1) */
  580. A("sts bezier_B, r0")
  581. A("mov %6,r1")
  582. A("clr %7") /* %7:%6:?? = 5*LO(v0-v1) */
  583. A("mul %1,%9") /* r1:r0 = 5*MI(v0-v1) */
  584. A("add %6,r0")
  585. A("adc %7,r1") /* %7:%6:?? += 5*MI(v0-v1) << 8 */
  586. A("mul %10,%9") /* r1:r0 = 5*HI(v0-v1) */
  587. A("add %7,r0") /* %7:%6:?? += 5*HI(v0-v1) << 16 */
  588. A("sts bezier_B+1, %6")
  589. A("sts bezier_B+2, %7") /* bezier_B = %7:%6:?? = 5*(v0-v1) [50 cycles worst] */
  590. A("ldi %9,10") /* %9 = 10 */
  591. A("mul %0,%9") /* r1:r0 = 10*LO(v0-v1) */
  592. A("sts bezier_C, r0")
  593. A("mov %6,r1")
  594. A("clr %7") /* %7:%6:?? = 10*LO(v0-v1) */
  595. A("mul %1,%9") /* r1:r0 = 10*MI(v0-v1) */
  596. A("add %6,r0")
  597. A("adc %7,r1") /* %7:%6:?? += 10*MI(v0-v1) << 8 */
  598. A("mul %10,%9") /* r1:r0 = 10*HI(v0-v1) */
  599. A("add %7,r0") /* %7:%6:?? += 10*HI(v0-v1) << 16 */
  600. A("sts bezier_C+1, %6")
  601. " sts bezier_C+2, %7" /* bezier_C = %7:%6:?? = 10*(v0-v1) [65 cycles worst] */
  602. : "+r" (r2),
  603. "+d" (r3),
  604. "=r" (r4),
  605. "+r" (r5),
  606. "+r" (r6),
  607. "+r" (r7),
  608. "=r" (r8),
  609. "=r" (r9),
  610. "=r" (r10),
  611. "=d" (r11),
  612. "+r" (r12)
  613. :
  614. : "r0", "r1", "cc", "memory"
  615. );
  616. }
  617. FORCE_INLINE int32_t Stepper::_eval_bezier_curve(const uint32_t curr_step) {
  618. // If dealing with the first step, save expensive computing and return the initial speed
  619. if (!curr_step)
  620. return bezier_F;
  621. register uint8_t r0 = 0; /* Zero register */
  622. register uint8_t r2 = (curr_step) & 0xFF;
  623. register uint8_t r3 = (curr_step >> 8) & 0xFF;
  624. register uint8_t r4 = (curr_step >> 16) & 0xFF;
  625. register uint8_t r1,r5,r6,r7,r8,r9,r10,r11; /* Temporary registers */
  626. __asm__ __volatile(
  627. /* umul24x24to16hi(t, bezier_AV, curr_step); t: Range 0 - 1^16 = 16 bits*/
  628. A("lds %9,bezier_AV") /* %9 = LO(AV)*/
  629. A("mul %9,%2") /* r1:r0 = LO(bezier_AV)*LO(curr_step)*/
  630. A("mov %7,r1") /* %7 = LO(bezier_AV)*LO(curr_step) >> 8*/
  631. A("clr %8") /* %8:%7 = LO(bezier_AV)*LO(curr_step) >> 8*/
  632. A("lds %10,bezier_AV+1") /* %10 = MI(AV)*/
  633. A("mul %10,%2") /* r1:r0 = MI(bezier_AV)*LO(curr_step)*/
  634. A("add %7,r0")
  635. A("adc %8,r1") /* %8:%7 += MI(bezier_AV)*LO(curr_step)*/
  636. A("lds r1,bezier_AV+2") /* r11 = HI(AV)*/
  637. A("mul r1,%2") /* r1:r0 = HI(bezier_AV)*LO(curr_step)*/
  638. A("add %8,r0") /* %8:%7 += HI(bezier_AV)*LO(curr_step) << 8*/
  639. A("mul %9,%3") /* r1:r0 = LO(bezier_AV)*MI(curr_step)*/
  640. A("add %7,r0")
  641. A("adc %8,r1") /* %8:%7 += LO(bezier_AV)*MI(curr_step)*/
  642. A("mul %10,%3") /* r1:r0 = MI(bezier_AV)*MI(curr_step)*/
  643. A("add %8,r0") /* %8:%7 += LO(bezier_AV)*MI(curr_step) << 8*/
  644. A("mul %9,%4") /* r1:r0 = LO(bezier_AV)*HI(curr_step)*/
  645. A("add %8,r0") /* %8:%7 += LO(bezier_AV)*HI(curr_step) << 8*/
  646. /* %8:%7 = t*/
  647. /* uint16_t f = t;*/
  648. A("mov %5,%7") /* %6:%5 = f*/
  649. A("mov %6,%8")
  650. /* %6:%5 = f*/
  651. /* umul16x16to16hi(f, f, t); / Range 16 bits (unsigned) [17] */
  652. A("mul %5,%7") /* r1:r0 = LO(f) * LO(t)*/
  653. A("mov %9,r1") /* store MIL(LO(f) * LO(t)) in %9, we need it for rounding*/
  654. A("clr %10") /* %10 = 0*/
  655. A("clr %11") /* %11 = 0*/
  656. A("mul %5,%8") /* r1:r0 = LO(f) * HI(t)*/
  657. A("add %9,r0") /* %9 += LO(LO(f) * HI(t))*/
  658. A("adc %10,r1") /* %10 = HI(LO(f) * HI(t))*/
  659. A("adc %11,%0") /* %11 += carry*/
  660. A("mul %6,%7") /* r1:r0 = HI(f) * LO(t)*/
  661. A("add %9,r0") /* %9 += LO(HI(f) * LO(t))*/
  662. A("adc %10,r1") /* %10 += HI(HI(f) * LO(t)) */
  663. A("adc %11,%0") /* %11 += carry*/
  664. A("mul %6,%8") /* r1:r0 = HI(f) * HI(t)*/
  665. A("add %10,r0") /* %10 += LO(HI(f) * HI(t))*/
  666. A("adc %11,r1") /* %11 += HI(HI(f) * HI(t))*/
  667. A("mov %5,%10") /* %6:%5 = */
  668. A("mov %6,%11") /* f = %10:%11*/
  669. /* umul16x16to16hi(f, f, t); / Range 16 bits : f = t^3 (unsigned) [17]*/
  670. A("mul %5,%7") /* r1:r0 = LO(f) * LO(t)*/
  671. A("mov %1,r1") /* store MIL(LO(f) * LO(t)) in %1, we need it for rounding*/
  672. A("clr %10") /* %10 = 0*/
  673. A("clr %11") /* %11 = 0*/
  674. A("mul %5,%8") /* r1:r0 = LO(f) * HI(t)*/
  675. A("add %1,r0") /* %1 += LO(LO(f) * HI(t))*/
  676. A("adc %10,r1") /* %10 = HI(LO(f) * HI(t))*/
  677. A("adc %11,%0") /* %11 += carry*/
  678. A("mul %6,%7") /* r1:r0 = HI(f) * LO(t)*/
  679. A("add %1,r0") /* %1 += LO(HI(f) * LO(t))*/
  680. A("adc %10,r1") /* %10 += HI(HI(f) * LO(t))*/
  681. A("adc %11,%0") /* %11 += carry*/
  682. A("mul %6,%8") /* r1:r0 = HI(f) * HI(t)*/
  683. A("add %10,r0") /* %10 += LO(HI(f) * HI(t))*/
  684. A("adc %11,r1") /* %11 += HI(HI(f) * HI(t))*/
  685. A("mov %5,%10") /* %6:%5 =*/
  686. A("mov %6,%11") /* f = %10:%11*/
  687. /* [15 +17*2] = [49]*/
  688. /* %4:%3:%2 will be acc from now on*/
  689. /* uint24_t acc = bezier_F; / Range 20 bits (unsigned)*/
  690. A("clr %9") /* "decimal place we get for free"*/
  691. A("lds %2,bezier_F")
  692. A("lds %3,bezier_F+1")
  693. A("lds %4,bezier_F+2") /* %4:%3:%2 = acc*/
  694. /* if (A_negative) {*/
  695. A("lds r0,A_negative")
  696. A("or r0,%0") /* Is flag signalling negative? */
  697. A("brne 3f") /* If yes, Skip next instruction if A was negative*/
  698. A("rjmp 1f") /* Otherwise, jump */
  699. /* uint24_t v; */
  700. /* umul16x24to24hi(v, f, bezier_C); / Range 21bits [29] */
  701. /* acc -= v; */
  702. L("3")
  703. A("lds %10, bezier_C") /* %10 = LO(bezier_C)*/
  704. A("mul %10,%5") /* r1:r0 = LO(bezier_C) * LO(f)*/
  705. A("sub %9,r1")
  706. A("sbc %2,%0")
  707. A("sbc %3,%0")
  708. A("sbc %4,%0") /* %4:%3:%2:%9 -= HI(LO(bezier_C) * LO(f))*/
  709. A("lds %11, bezier_C+1") /* %11 = MI(bezier_C)*/
  710. A("mul %11,%5") /* r1:r0 = MI(bezier_C) * LO(f)*/
  711. A("sub %9,r0")
  712. A("sbc %2,r1")
  713. A("sbc %3,%0")
  714. A("sbc %4,%0") /* %4:%3:%2:%9 -= MI(bezier_C) * LO(f)*/
  715. A("lds %1, bezier_C+2") /* %1 = HI(bezier_C)*/
  716. A("mul %1,%5") /* r1:r0 = MI(bezier_C) * LO(f)*/
  717. A("sub %2,r0")
  718. A("sbc %3,r1")
  719. A("sbc %4,%0") /* %4:%3:%2:%9 -= HI(bezier_C) * LO(f) << 8*/
  720. A("mul %10,%6") /* r1:r0 = LO(bezier_C) * MI(f)*/
  721. A("sub %9,r0")
  722. A("sbc %2,r1")
  723. A("sbc %3,%0")
  724. A("sbc %4,%0") /* %4:%3:%2:%9 -= LO(bezier_C) * MI(f)*/
  725. A("mul %11,%6") /* r1:r0 = MI(bezier_C) * MI(f)*/
  726. A("sub %2,r0")
  727. A("sbc %3,r1")
  728. A("sbc %4,%0") /* %4:%3:%2:%9 -= MI(bezier_C) * MI(f) << 8*/
  729. A("mul %1,%6") /* r1:r0 = HI(bezier_C) * LO(f)*/
  730. A("sub %3,r0")
  731. A("sbc %4,r1") /* %4:%3:%2:%9 -= HI(bezier_C) * LO(f) << 16*/
  732. /* umul16x16to16hi(f, f, t); / Range 16 bits : f = t^4 (unsigned) [17]*/
  733. A("mul %5,%7") /* r1:r0 = LO(f) * LO(t)*/
  734. A("mov %1,r1") /* store MIL(LO(f) * LO(t)) in %1, we need it for rounding*/
  735. A("clr %10") /* %10 = 0*/
  736. A("clr %11") /* %11 = 0*/
  737. A("mul %5,%8") /* r1:r0 = LO(f) * HI(t)*/
  738. A("add %1,r0") /* %1 += LO(LO(f) * HI(t))*/
  739. A("adc %10,r1") /* %10 = HI(LO(f) * HI(t))*/
  740. A("adc %11,%0") /* %11 += carry*/
  741. A("mul %6,%7") /* r1:r0 = HI(f) * LO(t)*/
  742. A("add %1,r0") /* %1 += LO(HI(f) * LO(t))*/
  743. A("adc %10,r1") /* %10 += HI(HI(f) * LO(t))*/
  744. A("adc %11,%0") /* %11 += carry*/
  745. A("mul %6,%8") /* r1:r0 = HI(f) * HI(t)*/
  746. A("add %10,r0") /* %10 += LO(HI(f) * HI(t))*/
  747. A("adc %11,r1") /* %11 += HI(HI(f) * HI(t))*/
  748. A("mov %5,%10") /* %6:%5 =*/
  749. A("mov %6,%11") /* f = %10:%11*/
  750. /* umul16x24to24hi(v, f, bezier_B); / Range 22bits [29]*/
  751. /* acc += v; */
  752. A("lds %10, bezier_B") /* %10 = LO(bezier_B)*/
  753. A("mul %10,%5") /* r1:r0 = LO(bezier_B) * LO(f)*/
  754. A("add %9,r1")
  755. A("adc %2,%0")
  756. A("adc %3,%0")
  757. A("adc %4,%0") /* %4:%3:%2:%9 += HI(LO(bezier_B) * LO(f))*/
  758. A("lds %11, bezier_B+1") /* %11 = MI(bezier_B)*/
  759. A("mul %11,%5") /* r1:r0 = MI(bezier_B) * LO(f)*/
  760. A("add %9,r0")
  761. A("adc %2,r1")
  762. A("adc %3,%0")
  763. A("adc %4,%0") /* %4:%3:%2:%9 += MI(bezier_B) * LO(f)*/
  764. A("lds %1, bezier_B+2") /* %1 = HI(bezier_B)*/
  765. A("mul %1,%5") /* r1:r0 = MI(bezier_B) * LO(f)*/
  766. A("add %2,r0")
  767. A("adc %3,r1")
  768. A("adc %4,%0") /* %4:%3:%2:%9 += HI(bezier_B) * LO(f) << 8*/
  769. A("mul %10,%6") /* r1:r0 = LO(bezier_B) * MI(f)*/
  770. A("add %9,r0")
  771. A("adc %2,r1")
  772. A("adc %3,%0")
  773. A("adc %4,%0") /* %4:%3:%2:%9 += LO(bezier_B) * MI(f)*/
  774. A("mul %11,%6") /* r1:r0 = MI(bezier_B) * MI(f)*/
  775. A("add %2,r0")
  776. A("adc %3,r1")
  777. A("adc %4,%0") /* %4:%3:%2:%9 += MI(bezier_B) * MI(f) << 8*/
  778. A("mul %1,%6") /* r1:r0 = HI(bezier_B) * LO(f)*/
  779. A("add %3,r0")
  780. A("adc %4,r1") /* %4:%3:%2:%9 += HI(bezier_B) * LO(f) << 16*/
  781. /* umul16x16to16hi(f, f, t); / Range 16 bits : f = t^5 (unsigned) [17]*/
  782. A("mul %5,%7") /* r1:r0 = LO(f) * LO(t)*/
  783. A("mov %1,r1") /* store MIL(LO(f) * LO(t)) in %1, we need it for rounding*/
  784. A("clr %10") /* %10 = 0*/
  785. A("clr %11") /* %11 = 0*/
  786. A("mul %5,%8") /* r1:r0 = LO(f) * HI(t)*/
  787. A("add %1,r0") /* %1 += LO(LO(f) * HI(t))*/
  788. A("adc %10,r1") /* %10 = HI(LO(f) * HI(t))*/
  789. A("adc %11,%0") /* %11 += carry*/
  790. A("mul %6,%7") /* r1:r0 = HI(f) * LO(t)*/
  791. A("add %1,r0") /* %1 += LO(HI(f) * LO(t))*/
  792. A("adc %10,r1") /* %10 += HI(HI(f) * LO(t))*/
  793. A("adc %11,%0") /* %11 += carry*/
  794. A("mul %6,%8") /* r1:r0 = HI(f) * HI(t)*/
  795. A("add %10,r0") /* %10 += LO(HI(f) * HI(t))*/
  796. A("adc %11,r1") /* %11 += HI(HI(f) * HI(t))*/
  797. A("mov %5,%10") /* %6:%5 =*/
  798. A("mov %6,%11") /* f = %10:%11*/
  799. /* umul16x24to24hi(v, f, bezier_A); / Range 21bits [29]*/
  800. /* acc -= v; */
  801. A("lds %10, bezier_A") /* %10 = LO(bezier_A)*/
  802. A("mul %10,%5") /* r1:r0 = LO(bezier_A) * LO(f)*/
  803. A("sub %9,r1")
  804. A("sbc %2,%0")
  805. A("sbc %3,%0")
  806. A("sbc %4,%0") /* %4:%3:%2:%9 -= HI(LO(bezier_A) * LO(f))*/
  807. A("lds %11, bezier_A+1") /* %11 = MI(bezier_A)*/
  808. A("mul %11,%5") /* r1:r0 = MI(bezier_A) * LO(f)*/
  809. A("sub %9,r0")
  810. A("sbc %2,r1")
  811. A("sbc %3,%0")
  812. A("sbc %4,%0") /* %4:%3:%2:%9 -= MI(bezier_A) * LO(f)*/
  813. A("lds %1, bezier_A+2") /* %1 = HI(bezier_A)*/
  814. A("mul %1,%5") /* r1:r0 = MI(bezier_A) * LO(f)*/
  815. A("sub %2,r0")
  816. A("sbc %3,r1")
  817. A("sbc %4,%0") /* %4:%3:%2:%9 -= HI(bezier_A) * LO(f) << 8*/
  818. A("mul %10,%6") /* r1:r0 = LO(bezier_A) * MI(f)*/
  819. A("sub %9,r0")
  820. A("sbc %2,r1")
  821. A("sbc %3,%0")
  822. A("sbc %4,%0") /* %4:%3:%2:%9 -= LO(bezier_A) * MI(f)*/
  823. A("mul %11,%6") /* r1:r0 = MI(bezier_A) * MI(f)*/
  824. A("sub %2,r0")
  825. A("sbc %3,r1")
  826. A("sbc %4,%0") /* %4:%3:%2:%9 -= MI(bezier_A) * MI(f) << 8*/
  827. A("mul %1,%6") /* r1:r0 = HI(bezier_A) * LO(f)*/
  828. A("sub %3,r0")
  829. A("sbc %4,r1") /* %4:%3:%2:%9 -= HI(bezier_A) * LO(f) << 16*/
  830. A("jmp 2f") /* Done!*/
  831. L("1")
  832. /* uint24_t v; */
  833. /* umul16x24to24hi(v, f, bezier_C); / Range 21bits [29]*/
  834. /* acc += v; */
  835. A("lds %10, bezier_C") /* %10 = LO(bezier_C)*/
  836. A("mul %10,%5") /* r1:r0 = LO(bezier_C) * LO(f)*/
  837. A("add %9,r1")
  838. A("adc %2,%0")
  839. A("adc %3,%0")
  840. A("adc %4,%0") /* %4:%3:%2:%9 += HI(LO(bezier_C) * LO(f))*/
  841. A("lds %11, bezier_C+1") /* %11 = MI(bezier_C)*/
  842. A("mul %11,%5") /* r1:r0 = MI(bezier_C) * LO(f)*/
  843. A("add %9,r0")
  844. A("adc %2,r1")
  845. A("adc %3,%0")
  846. A("adc %4,%0") /* %4:%3:%2:%9 += MI(bezier_C) * LO(f)*/
  847. A("lds %1, bezier_C+2") /* %1 = HI(bezier_C)*/
  848. A("mul %1,%5") /* r1:r0 = MI(bezier_C) * LO(f)*/
  849. A("add %2,r0")
  850. A("adc %3,r1")
  851. A("adc %4,%0") /* %4:%3:%2:%9 += HI(bezier_C) * LO(f) << 8*/
  852. A("mul %10,%6") /* r1:r0 = LO(bezier_C) * MI(f)*/
  853. A("add %9,r0")
  854. A("adc %2,r1")
  855. A("adc %3,%0")
  856. A("adc %4,%0") /* %4:%3:%2:%9 += LO(bezier_C) * MI(f)*/
  857. A("mul %11,%6") /* r1:r0 = MI(bezier_C) * MI(f)*/
  858. A("add %2,r0")
  859. A("adc %3,r1")
  860. A("adc %4,%0") /* %4:%3:%2:%9 += MI(bezier_C) * MI(f) << 8*/
  861. A("mul %1,%6") /* r1:r0 = HI(bezier_C) * LO(f)*/
  862. A("add %3,r0")
  863. A("adc %4,r1") /* %4:%3:%2:%9 += HI(bezier_C) * LO(f) << 16*/
  864. /* umul16x16to16hi(f, f, t); / Range 16 bits : f = t^4 (unsigned) [17]*/
  865. A("mul %5,%7") /* r1:r0 = LO(f) * LO(t)*/
  866. A("mov %1,r1") /* store MIL(LO(f) * LO(t)) in %1, we need it for rounding*/
  867. A("clr %10") /* %10 = 0*/
  868. A("clr %11") /* %11 = 0*/
  869. A("mul %5,%8") /* r1:r0 = LO(f) * HI(t)*/
  870. A("add %1,r0") /* %1 += LO(LO(f) * HI(t))*/
  871. A("adc %10,r1") /* %10 = HI(LO(f) * HI(t))*/
  872. A("adc %11,%0") /* %11 += carry*/
  873. A("mul %6,%7") /* r1:r0 = HI(f) * LO(t)*/
  874. A("add %1,r0") /* %1 += LO(HI(f) * LO(t))*/
  875. A("adc %10,r1") /* %10 += HI(HI(f) * LO(t))*/
  876. A("adc %11,%0") /* %11 += carry*/
  877. A("mul %6,%8") /* r1:r0 = HI(f) * HI(t)*/
  878. A("add %10,r0") /* %10 += LO(HI(f) * HI(t))*/
  879. A("adc %11,r1") /* %11 += HI(HI(f) * HI(t))*/
  880. A("mov %5,%10") /* %6:%5 =*/
  881. A("mov %6,%11") /* f = %10:%11*/
  882. /* umul16x24to24hi(v, f, bezier_B); / Range 22bits [29]*/
  883. /* acc -= v;*/
  884. A("lds %10, bezier_B") /* %10 = LO(bezier_B)*/
  885. A("mul %10,%5") /* r1:r0 = LO(bezier_B) * LO(f)*/
  886. A("sub %9,r1")
  887. A("sbc %2,%0")
  888. A("sbc %3,%0")
  889. A("sbc %4,%0") /* %4:%3:%2:%9 -= HI(LO(bezier_B) * LO(f))*/
  890. A("lds %11, bezier_B+1") /* %11 = MI(bezier_B)*/
  891. A("mul %11,%5") /* r1:r0 = MI(bezier_B) * LO(f)*/
  892. A("sub %9,r0")
  893. A("sbc %2,r1")
  894. A("sbc %3,%0")
  895. A("sbc %4,%0") /* %4:%3:%2:%9 -= MI(bezier_B) * LO(f)*/
  896. A("lds %1, bezier_B+2") /* %1 = HI(bezier_B)*/
  897. A("mul %1,%5") /* r1:r0 = MI(bezier_B) * LO(f)*/
  898. A("sub %2,r0")
  899. A("sbc %3,r1")
  900. A("sbc %4,%0") /* %4:%3:%2:%9 -= HI(bezier_B) * LO(f) << 8*/
  901. A("mul %10,%6") /* r1:r0 = LO(bezier_B) * MI(f)*/
  902. A("sub %9,r0")
  903. A("sbc %2,r1")
  904. A("sbc %3,%0")
  905. A("sbc %4,%0") /* %4:%3:%2:%9 -= LO(bezier_B) * MI(f)*/
  906. A("mul %11,%6") /* r1:r0 = MI(bezier_B) * MI(f)*/
  907. A("sub %2,r0")
  908. A("sbc %3,r1")
  909. A("sbc %4,%0") /* %4:%3:%2:%9 -= MI(bezier_B) * MI(f) << 8*/
  910. A("mul %1,%6") /* r1:r0 = HI(bezier_B) * LO(f)*/
  911. A("sub %3,r0")
  912. A("sbc %4,r1") /* %4:%3:%2:%9 -= HI(bezier_B) * LO(f) << 16*/
  913. /* umul16x16to16hi(f, f, t); / Range 16 bits : f = t^5 (unsigned) [17]*/
  914. A("mul %5,%7") /* r1:r0 = LO(f) * LO(t)*/
  915. A("mov %1,r1") /* store MIL(LO(f) * LO(t)) in %1, we need it for rounding*/
  916. A("clr %10") /* %10 = 0*/
  917. A("clr %11") /* %11 = 0*/
  918. A("mul %5,%8") /* r1:r0 = LO(f) * HI(t)*/
  919. A("add %1,r0") /* %1 += LO(LO(f) * HI(t))*/
  920. A("adc %10,r1") /* %10 = HI(LO(f) * HI(t))*/
  921. A("adc %11,%0") /* %11 += carry*/
  922. A("mul %6,%7") /* r1:r0 = HI(f) * LO(t)*/
  923. A("add %1,r0") /* %1 += LO(HI(f) * LO(t))*/
  924. A("adc %10,r1") /* %10 += HI(HI(f) * LO(t))*/
  925. A("adc %11,%0") /* %11 += carry*/
  926. A("mul %6,%8") /* r1:r0 = HI(f) * HI(t)*/
  927. A("add %10,r0") /* %10 += LO(HI(f) * HI(t))*/
  928. A("adc %11,r1") /* %11 += HI(HI(f) * HI(t))*/
  929. A("mov %5,%10") /* %6:%5 =*/
  930. A("mov %6,%11") /* f = %10:%11*/
  931. /* umul16x24to24hi(v, f, bezier_A); / Range 21bits [29]*/
  932. /* acc += v; */
  933. A("lds %10, bezier_A") /* %10 = LO(bezier_A)*/
  934. A("mul %10,%5") /* r1:r0 = LO(bezier_A) * LO(f)*/
  935. A("add %9,r1")
  936. A("adc %2,%0")
  937. A("adc %3,%0")
  938. A("adc %4,%0") /* %4:%3:%2:%9 += HI(LO(bezier_A) * LO(f))*/
  939. A("lds %11, bezier_A+1") /* %11 = MI(bezier_A)*/
  940. A("mul %11,%5") /* r1:r0 = MI(bezier_A) * LO(f)*/
  941. A("add %9,r0")
  942. A("adc %2,r1")
  943. A("adc %3,%0")
  944. A("adc %4,%0") /* %4:%3:%2:%9 += MI(bezier_A) * LO(f)*/
  945. A("lds %1, bezier_A+2") /* %1 = HI(bezier_A)*/
  946. A("mul %1,%5") /* r1:r0 = MI(bezier_A) * LO(f)*/
  947. A("add %2,r0")
  948. A("adc %3,r1")<