Goldschmidt division
Feed
date: 2026-05-21 19:55:36
categories: programming
firstPublishDate: 2026-05-21 19:55:36
The Goldschmidt division algorithm is a fast division using the Newton-Raphson approximation.
Goldschmidt division on wikipedia
It is an algorithm computing the quotient `c` of `n` (numerator) divided by `d` (divider).
The Newton-Raphson approximation estimates `1/d` and the quotient is then:
`1/d` is estimated by iterating this:
The quotient `c` is estimated with:
Goldschmidt division for 16bit signed integers
We replace `d` with a value `a` so that `a` is in the range [0.5, 1[ and `1/a` is in the range ]1, 2]. We choose the range [0.5, 1[ for `a` because the iteration `x * (2 - a * x)` converges fast for this range.
`a` is `d` divided by a power of 2 number so that `a` is in the expected range:
a = d / 2<<(msb(d)+1)
1/a = 2<<(msb(d)+1) / d
x is an approximation of `1/a`, we get the code:
x = initial value in range [1,2]
loop:
x = x * (2 - x * a)
goto loop (4 times)
c ~= x * n / 2<<(msb(d)+1)
The computations are done using fixed point integers with 15 bit for the fractional number to fit `x * (2 - x * a)` into a 32 bit integer.
- Range 0.5 to 1 maps to 16384 to 32768 (bit 15 is 0, bit 14 is 1)
- Range 1 to 2 maps to 32768 to 65536 (bit 15 is 1)
- 2 is 0x10000 in fixed point integer with 15bit fractional number (bit 16 is 1)
d is a 16bit signed integer in the range 0 to 32767, we compute `a` like this:
clz is a function (or the `lzcnt` instruction) counting the leading 0s in `d` (from 1 to 15), bit 15 is 0 and bit 14 is 1 so `a` in range 16384 to 32768.
The initial value for `x` is 1.5 which in fixed point is: `0xc000`. We get the code:
u16 div(u16 n, u16 d) {
// clz input is 32 bit integer so d needs to be shifted
u16 shift = clz(((u32)d)<<16)-1;
u32 a = d << shift;
u32 x = 0xc000;
// 4 iterations
x = (x*(0x10000 - ((x*a)>>15)))>>15;
x = (x*(0x10000 - ((x*a)>>15)))>>15;
x = (x*(0x10000 - ((x*a)>>15)))>>15;
x = (x*(0x10000 - ((x*a)>>15)))>>15;
r = ((((u32)n * x) >> (15-shift)) >> 15);
return r;
}
`n * 1/a` needs to be adjusted to become `n * 1/d` with shift right by 15-shift positions and the 15bit fractional number is removed with shift right by 15 positions. The fractional number in the result is an estimation of the remainder and is often not accurate.
Signed division is implemented by computing the sign of the result and dividing abs(n) with abs(d).
We can choose an initial value for x to make x converge faster.
range(i, 32) {
(u32)((1/((f32)(0x4000+(i<<9))/((f32)32768)))*32768)
}
// initial value bit 9 to 13 in a are index in initialValuesForx
u32 x = initialValuesForx[(a & 0x3e00) >> 9];
The `clz` function is from the book `Henry S. Warren - Hacker's Delight (2nd Edition)-Addison-Wesley Professional (2012)`:
/* u is for unused elements */
#define u 99
u32 clz(u32 v) {
static char table[64] =
{32,20,19, u, u,18, u, 7, 10,17, u, u,14, u, 6, u,
u, 9, u,16, u, u, 1,26, u,13, u, u,24, 5, u, u,
u,21, u, 8,11, u,15, u, u, u, u, 2,27, 0,25, u,
22, u,12, u, u, 3,28, u, 23, u, 4,29, u, u,30,31};
v = v | (v >> 1); // Propagate leftmost
v = v | (v >> 2); // 1-bit to the right.
v = v | (v >> 4);
v = v | (v >> 8);
v = v & ~(v >> 16);
v = v*0xFD7049FF;
return table[v >> 26];
}
Implementation in C16 assembly
C16 CPU
The clz function is implemented like this:
clz:
; count leading zeros
; input r1 output r1
; r0, r1, r2 are used
; v = v | (v >> 1);
mv r2, r1
shri r2, 1
or r1, r2
; v = v | (v >> 2);
mv r2, r1
shri r2, 2
or r1, r2
; v = v | (v >> 4);
mv r2, r1
shri r2, 4
or r1, r2
; v = v | (v >> 8);
mv r2, r1
shri r2, 8
or r1, r2
; v = v & ~(v >> 16);
mv r2, r1
xor r0, r0
loadi 0, 0x10
shr r2 ,r0
not r2
and r1, r2
; load 26 in r2
loadi 0, 0x1a
mv r2, r0
; v = v*0xFD7049FF;
; load 0xFD7049FF in r0
loadi 3, 0xFD
loadi 2, 0x70
loadi 1, 0x49
loadi 0, 0xFF
mul r1, r0
shr r1, r2
; return table[v >> 26];
; table address is 0
xor r2, r2
mula r1, 2
load r1, r1
ret
The nrdiv function is the implementation of the newton-raphson method for the goldschmidt division approximation:
nrdiv:
; divide function the Goldschmidt algorithm
; input i16 r4 n , r5 d output r4 quotient
; r0-r8 are modified
; handle division by 0
xor r0, r0
cmp r5, r0
jnz nrdivByR5
; division by 0
; skip
inc r0
xor r4, r4
ret
nrdivByR5:
; store sign of the result
; r8 sign = 1
xor r8, r8
inc r8
; convert both dividend and divisor to positive
; if (n < 0 and d >= 0) {
; n >= 0
cmp r4, r0
jge endNNegDPos
; d < 0
cmp r5, r0
jl endNNegDPos
; r8 sign = -1
dec r8
dec r8
; n = -n
neg r4
jmp unsignedNrdiv
endNNegDPos:
; else if (n >= 0 and d < 0) {
; n < 0
cmp r4, r0
jl endNPosDNeg
; n >= 0
cmp r5, r0
jge endNPosDNeg
; r8 sign = -1
dec r8
dec r8
; d = -d
neg r5
jmp unsignedNrdiv
endNPosDNeg:
; else if (n < 0 and d < 0) {
; n >= 0
cmp r4, r0
jge endNNegDNeg
; d >= 0
cmp r5, r0
jge endNNegDNeg
dec r0
; n = -n
neg r4
; d = -d
neg r5
endNNegDNeg:
unsignedNrdiv:
; r3 is shift
; u16 shift = __builtin_clz(((u32)d)<<16)-1;
; count leading zeros
mv r1, r5
xor r0, r0
loadi 0, 0x10
shl r1, r0
lzcnt r1, r1
dec r1
mv r3, r1
; r5 is d and becomes a after shl
; u32 a = d << shift;
shl r5, r3
; r2 is x
; u32 x = initialValuesForx[(a & 0x3e00) >> 9];
; initialValuesForx array has address 0x100
xor r0, r0
loadi 1, 0x1
mv r2, r0
mv r1, r5
xor r0, r0
loadi 1, 0x3e
and r1, r0
xor r0, r0
loadi 0, 0x9
shr r1, r0
mula r2, 2
load r2, r2
; r6 is 15
xor r0, r0
loadi 0, 0xf
mv r6, r0
; r7 is 0x10000
xor r0, r0
loadi 2, 0x1
mv r7, r0
; x = (x*(0x10000 - ((x*a)>>15)))>>15;
mv r1, r2 ; r1 = x
mul r1, r5 ; r1 = x * a
shr r1, r6
mv r0, r7
sub r0, r1 ; r0 = 0x10000 - ((x*a)>>15)
mul r0, r2 ; r0 = r0 * x
shr r0, r6
mv r2, r0 ; r2 = (x*(0x10000 - ((x*a)>>15)))>>15
; iteration 2
mv r1, r2 ; r1 = x
mul r1, r5 ; r1 = x * a
shr r1, r6
mv r0, r7
sub r0, r1 ; r0 = 0x10000 - ((x*a)>>15)
mul r0, r2 ; r0 = r0 * x
shr r0, r6
mv r2, r0 ; r2 = (x*(0x10000 - ((x*a)>>15)))>>15
; iteration 3
mv r1, r2 ; r1 = x
mul r1, r5 ; r1 = x * a
shr r1, r6
mv r0, r7
sub r0, r1 ; r0 = 0x10000 - ((x*a)>>15)
mul r0, r2 ; r0 = r0 * x
shr r0, r6
mv r2, r0 ; r2 = (x*(0x10000 - ((x*a)>>15)))>>15
; iteration 4
mv r1, r2 ; r1 = x
mul r1, r5 ; r1 = x * a
shr r1, r6
mv r0, r7
sub r0, r1 ; r0 = 0x10000 - ((x*a)>>15)
mul r0, r2 ; r0 = r0 * x
shr r0, r6
; no more iterations - mv r2, r0 ; r2 = (x*(0x10000 - ((x*a)>>15)))>>15
; (((u32)n * x) >> (15-shift)) >> 15
mul r4, r0 ; r4 = n * x
mv r0, r6
sub r0, r3 ; r0 = 15-shift
shr r4, r0 ; r4 = ((u32)n * x) >> (15-shift)
shr r4, r6 ; r4 = (((u32)n * x) >> (15-shift)) >> 15
; sign
; r4 = ((((u32)n * x) >> (15-shift)) >> 15) * sign
mul r4, r8
ret
It is not very efficient to use the clz function because it takes 50 clock cycles to run and the `lzcnt` instruction gives the result in 1 clock cycle.
Long division
To compare the performance of the goldschmidt division, I implemented the traditional long division in assembly, it is the slowest:
longdiv:
; divide function using the long division algorithm
; input u32 r4, r5 output r2 quotient r3 remainder
; r4 N r5 D
; r2 Q r3 R
; r2 Q = 0
xor r2, r2
; 34 R = 0
xor r3, r3
xor r0, r0
; r1 i 31
loadi 0, 0x1f
mv r1, r0
; r6 31 for shift 31
mv r6, r0
; r8 1 for Q = Q | 1
xor r8, r8
inc r8
; for (int i = 31; i >= 0; --i) {
longdivLoop:
; Q = Q << 1
shli r2, 1
; R = R << 1;
shli r3, 1
; R |= (N >> 31);
mv r7, r4
shr r7, r6
or r3, r7
shli r4, 1
; if (D <= R) {
cmp r5, r3
ja endQuotient
; R = R - D
sub r3, r5
; Q = Q | 1
or r2, r8
endQuotient:
dec r1
jns longdivLoop
ret
Division by 10
There is an efficient divide by 10 function in the book `Henry S. Warren - Hacker's Delight (2nd Edition)-Addison-Wesley Professional (2012)`:
unsigned divu10(unsigned n) {
unsigned q, r;
q = (n >> 1) + (n >> 2);
q = q + (q >> 4);
q = q + (q >> 8);
q = q + (q >> 16);
q = q >> 3;
r = n - (((q << 2) + q) << 1);
return q + (r > 9);
}
This divide by 10 function is useful for displaying a binary number is base 10.
I implemented it in C16 assembly:
div10:
; div10 function
; input u32 r0 n output r1
; q = (n >> 1) + (n >> 2);
mv r1,r0
shri r1, 1
mv r2,r0
shri r2, 2
add r1, r2
; q = q + (q >> 4);
mv r2, r1
shri r2, 4
add r1, r2
; q = q + (q >> 8);
mv r2, r1
shri r2, 8
add r1, r2
; q = q + (q >> 16);
mv r2, r1
shri r2, 8
shri r2, 8
add r1, r2
; q = q >> 3;
shri r1, 3
; r = n - (((q << 2) + q) << 1);
mv r2, r1
mv r3, r1
shli r2, 2
add r3, r2
shli r3, 1
sub r0, r3
; return q + (r > 9);
; r3 r
mv r3, r0
xor r0, r0
loadi 0, 0x9
cmp r3, r0
jbe endDiv10
inc r1
endDiv10:
; result in r1
ret
Hardware implementation in verilog
The divide by 10 function implemented in verilog is:
div10q = {1'b0, regfile_rdata1[31:1]} + {2'b0, regfile_rdata1[31:2]};
div10q = div10q + {4'b0, div10q[31:4]};
div10q = div10q + {8'b0, div10q[31:8]};
div10q = div10q + {16'b0, div10q[31:16]};
div10q = {3'b0, div10q[31:3]};
div10r = regfile_rdata1 - {{{div10q[28:0],2'b0}+div10q[30:0]}, 1'b0};
regfile_wdata1 <= div10q + (div10r > 9 ? 1 : 0);
I added a `div10` instruction to my C16 cpu and it takes 1 clock cycle.
I implemented a 32 bit goldschmidt division in verilog, the iterations need to be computed with 66 bit values.
For the goldschmidt division, we need to count the number of leading zero bits in the dividend to shift the dividend and compute an approximation of 1/dividend.
In verilog, leading zero are counted like this:
divshift[4] = (sdivd[31:16] == 16'b0);
divshift16 = divshift[4] ? sdivd[15:0] : sdivd[31:16];
divshift[3] = (divshift16[15:8] == 8'b0);
divshift8 = divshift[3] ? divshift16[7:0] : divshift16[15:8];
divshift[2] = (divshift8[7:4] == 4'b0);
divshift4 = divshift[2] ? divshift8[3:0] : divshift8[7:4];
divshift[1] = (divshift4[3:2] == 2'b0);
divshift[0] = divshift[1] ? ~divshift4[1] : ~divshift4[3];
`sdivd` is the dividend, it is never equal to 0 because divide by 0 is invalid. It sets bit 4 in the result, then bit 3, 2, 1 and 0. For bit 0, it doesn't need to test if the bit is 1 because we know the dividend is not 0.
`diva` is the shifted dividend in the range 2^31 to 2^32-1. `divx` initial value is in the range 2^32 to 2^33, it selected by using 4 bits (bit 27 to 30, bit 31 is always 1) in `diva`.
diva = {34'b0, sdivd} << divshift;
// initial value
if (diva[30:27] == 0) begin
divx = 66'h200000000;
end
else if (diva[30:27] == 1) begin
divx = 66'h1e1e1e1e1;
end
else if (diva[30:27] == 2) begin
divx = 66'h1c71c71c7;
end
else if (diva[30:27] == 3) begin
divx = 66'h1af286bca;
end
else if (diva[30:27] == 4) begin
divx = 66'h199999999;
end
else if (diva[30:27] == 5) begin
divx = 66'h186186186;
end
else if (diva[30:27] == 6) begin
divx = 66'h1745d1745;
end
else if (diva[30:27] == 7) begin
divx = 66'h1642c8590;
end
else if (diva[30:27] == 8) begin
divx = 66'h155555555;
end
else if (diva[30:27] == 9) begin
divx = 66'h147ae147a;
end
else if (diva[30:27] == 10) begin
divx = 66'h13b13b13b;
end
else if (diva[30:27] == 11) begin
divx = 66'h12f684bda;
end
else if (diva[30:27] == 12) begin
divx = 66'h124924924;
end
else if (diva[30:27] == 13) begin
divx = 66'h11a7b9611;
end
else if (diva[30:27] == 14) begin
divx = 66'h111111111;
end
else if (diva[30:27] == 15) begin
divx = 66'h108421084;
end
// Newton-Raphson method
divx = (divx * (66'h200000000 - ((divx * diva)>>32))) >> 32;
divx = (divx * (66'h200000000 - ((divx * diva)>>32))) >> 32;
divx = (divx * (66'h200000000 - ((divx * diva)>>32))) >> 32;
divx = (divx * (66'h200000000 - ((divx * diva)>>32))) >> 32;
// Shift divx * numerator to get the division result
diva = (((sdivn * divx) >> (32-{1'b0,divshift})) >> 32);
The execution stage of the division takes 1 clock cycle at 100Mhz, I don't see any timing violation in the logs from vivado (vivado is the tool from amd/xilinx for compiling designs for FPGAs) so it should be ok.
I have been running 50 million divisions with random numbers in a testbench and the division approximation is correct.
I first tried with 32 initial values and 3 iterations, it didn't work. So I increased to 64 and 128 initial values and the approximation was still incorrect sometimes. Maybe it works with 256 initial values and 3 iterations but it takes less ressources to have 16 initial values and 4 iterations.
Values are clock cycles.
Function/Implemation│ SW │ HW
────────────────────┼──────┼─────
division by 10 │ 91 │ 3
────────────────────┼──────┼─────
goldschmidt division│ 232 │ 3
────────────────────┼──────┼─────
long division │ 943 │ 32
The software implementation of the goldschmidt division is 4 times faster than the long division and the divide by 10 function is ten times faster than the long division. In the hardware column, I wrote 3 clock cycle for the hw division and the hw divide by 10 because the C16 CPU is currently not pipelined and the execution stage takes 1 clock cycle.
Feed
Guestbook