Here is another variant of atan2i(y,x) that gives the same answers as round(atan2(y, x)*128/Pi) for all x=-255..+255 and y=-255..+255 (ignoring x=y=0):
// SPDX-License-Identifier: CC0-1.0
#include <stdint.h>
/*
 * Octant boundary points, ordered by increasing angle.
 *
 * Per the accompanying description, entry k is a point (x, y) with
 * 0 <= y <= x <= 255 whose angle is as large as possible while still
 * rounding to k, in units of 1/256 of a full turn.  Storing a point instead
 * of an angle lets a lookup compare slopes with integer cross products
 * only (no division, no floating point).
 */
static const struct {
    uint8_t x;
    uint8_t y;
} upper_limit[] = {
    { 163,   2 },  /*  0 */
    { 163,   6 },  /*  1 */
    { 114,   7 },  /*  2 */
    { 151,  13 },  /*  3 */
    { 253,  28 },  /*  4 */
    {  81,  11 },  /*  5 */
    { 230,  37 },  /*  6 */
    { 188,  35 },  /*  7 */
    { 137,  29 },  /*  8 */
    { 219,  52 },  /*  9 */
    { 129,  34 },  /* 10 */
    { 169,  49 },  /* 11 */
    { 161,  51 },  /* 12 */
    { 125,  43 },  /* 13 */
    { 113,  42 },  /* 14 */
    { 253, 101 },  /* 15 */
    {   7,   3 },  /* 16 */
    { 131,  60 },  /* 17 */
    { 209, 102 },  /* 18 */
    { 239, 124 },  /* 19 */
    { 129,  71 },  /* 20 */
    { 151,  88 },  /* 21 */
    {  99,  61 },  /* 22 */
    { 186, 121 },  /* 23 */
    {  86,  59 },  /* 24 */
    { 101,  73 },  /* 25 */
    { 255, 194 },  /* 26 */
    {   5,   4 },  /* 27 */
    { 227, 191 },  /* 28 */
    {  95,  84 },  /* 29 */
    { 197, 183 },  /* 30 */
    { 206, 201 },  /* 31 */
};

/* Number of entries in upper_limit[]. */
#define upper_limits (sizeof upper_limit / sizeof upper_limit[0])
/*
 * Angle of the point (x, y) inside one octant, in units of 1/256 of a turn.
 *
 * The callers in this file always pass the larger magnitude as x, so the
 * point lies between the x axis and the diagonal.  The result is 0..32: the
 * index of the first upper_limit[] entry whose angle is not below the
 * point's angle, or 32 when the point lies above every entry.
 *
 * Slopes are compared by cross-multiplying: y/x < ly/lx  <=>  y*lx < x*ly.
 * Each product is at most 255 * 255 = 65025, so it fits in 16 bits.
 */
static inline int8_t within_octant(uint8_t x, uint8_t y) {
    uint8_t lo = 0;
    uint8_t hi = upper_limits;

    for (;;) {
        const uint8_t mid = (lo + hi) / 2;
        const uint16_t point_side = (uint16_t)y * upper_limit[mid].x;
        const uint16_t limit_side = (uint16_t)x * upper_limit[mid].y;

        /* Exactly on the boundary line: the boundary belongs to bucket mid. */
        if (point_side == limit_side) {
            return mid;
        }

        if (point_side < limit_side) {
            /* Point is below limit mid; stop once hi cannot move any lower. */
            if (hi == mid) {
                return mid;
            }
            hi = mid;
        } else {
            /* Point is above limit mid; stop once lo cannot move any higher. */
            if (lo == mid) {
                return mid + 1;
            }
            lo = mid;
        }
    }
}
/*
 * Integer atan2: direction of the vector (x, y) in units of 1/256 of a full
 * turn (64 = +90 degrees, -64 = -90 degrees).
 *
 * For x and y in -255..+255 the result is meant to equal
 * round(atan2(y, x) * 128 / Pi).  The negative x axis, which would be +128,
 * is reported as -128 so that it fits in int8_t.  atan2i(0, 0) returns 0,
 * like atan2(0, 0).  Arguments outside -255..+255 are not supported.
 */
int8_t atan2i(int_fast16_t y, int_fast16_t x) {
    /* Magnitudes; both fit in 8 bits for the supported argument range. */
    const uint8_t ay = (uint8_t)(y < 0 ? -y : y);
    const uint8_t ax = (uint8_t)(x < 0 ? -x : x);
    int_fast16_t angle;

    /* The origin has no direction; without this check the table search
       would return an arbitrary mid-table index. */
    if (x == 0 && y == 0) {
        return 0;
    }

    /* Angle for the upper half plane (as if y >= 0), in 0..128. */
    if (ay < ax) {
        /* Closer to the x axis: measure from +x, or back from -x. */
        const int8_t w = within_octant(ax, ay);
        angle = (x < 0) ? 128 - w : w;
    } else {
        /* Closer to the y axis: measure away from +y on either side. */
        const int8_t w = within_octant(ay, ax);
        angle = (x < 0) ? 64 + w : 64 - w;
    }

    /* The lower half plane is the mirror image of the upper one. */
    if (y < 0) {
        angle = -angle;
    }

    /* +128 and -128 are the same direction; wrap explicitly rather than
       relying on an implementation-defined out-of-range conversion. */
    if (angle == 128) {
        angle = -128;
    }
    return (int8_t)angle;
}
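This one is based on finding, for each result 0, 1, 2, .., 31, the point in (0,0)-(255,255) whose angle is closest to but does not exceed the largest angle that still rounds to that result; these points have been saved to the upper_limit[] array. Any point above the last limit (up to and including the 45-degree diagonal) yields 32.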
If we have P0(x0,y0) and P1(x1,y1) in the first quadrant (nonnegative coordinates), and x0*y1 > y0*x1, then P1 is at a bigger angle than P0. If x0*y1 < y0*x1, then P0 is at a bigger angle than P1. If x0*y1 = y0*x1, then P0 and P1 are on the same line passing through origin.
While this does up to 14 8×8=16-bit multiplications (at most 7 binary-search probes with two products each), they only take 2 clocks each on AVR. avr-gcc 5.4.0 -Os generates 242 bytes of code, and the table takes an additional 64 bytes.
I haven't microbenchmarked any of the atan2i() functions –– should time all (-200..+200)×(-200..+200) = 160,801 cases –– so I don't really know which ones are "fast". I do have an ATmega32u4 and an AT90USB1286 somewhere, if I could just find them...