BaseN added Base58 explainer

2025-03-23 16:09:34 -07:00 · 2025-03-23 16:09:34 -07:00 · 09f6a49111
commit 09f6a49111
parent 739e86d04a
1 changed files with 519 additions and 3 deletions
--- a/BaseN.md
+++ b/BaseN.md
@ -73,8 +73,7 @@ It is more accurate to think of the basic unit in computing as a byte,
 which is a number between 0 (`2#0000_0000`) and 255 (`2#1111_1111`).

 (If you're totally bewildered by binary notation, you might want to read
-[\[s:qr-algorithm\]](#s:qr-algorithm){reference-type="ref"
-reference="s:qr-algorithm"} and come back).
+[§ Long Division](#long-division) and come back).

 Base64 processes a binary string 3 bytes at a time (which is 24 bits).
 It regroups those 3 groups of 8 bits into 4 groups of 6 bits.
@ -86,7 +85,7 @@ We now have 4 numbers ranging between 0 (`2#00_0000`) and 63
 (`2#11_1111`), hence the name "Base64".

 Each number in the range 0..63 is assigned to a character from the
-alphabet (see [Base64 Alphabet](#base64-alphabet))
+alphabet (see [§ Base64 Alphabet](#base64-alphabet))

 ```erlang
 % abbreviated
@ -223,6 +222,401 @@ This is marginally trickier because
 1.  it needs an accumulator
 2.  the order of the cases matters

+# Long Division
+
+As mentioned above, Base58 is the "general" BaseN algorithm applied to
+the case `N=58`. This general algorithm I am calling the
+quotient-remainder (QR) algorithm. You might know it as the long division
+algorithm you learned in school.
+
+Consider an integer, say `8763`. To understand the QR algorithm you need
+to make a conceptual distinction between the "conceptual integer" 8763
+and its representation as the sequence of symbols `10#8763`.
+
+The way our notation for numbers works is as follows:
+
+        10#8763 = 8 * 10^3 (1000)
+                + 7 * 10^2 (100)
+                + 6 * 10^1 (10)
+                + 3 * 10^0 (1)
+
+So in each "slot" there can exist one of 10 symbols: `0123456789`. Each
+slot corresponds to a power of 10. You multiply the number in each
+slot by the corresponding power of 10 and then take the sum to
+"obtain" the pure integer.
+
+Binary notation works exactly the same way, except each slot corresponds
+to a power of $2$, and consequently each slot contains one of the two
+symbols `01`.
+
+For instance, to convert `2#1010_0111` to its "conceptual integer", we
+do exactly the same process:
+
+        2#1010\_0111 = 1 * 2^7 (128)
+                     + 0 * 2^6 (64)
+                     + 1 * 2^5 (32)
+                     + 0 * 2^4 (16)
+                     + 0 * 2^3 (8)
+                     + 1 * 2^2 (4)
+                     + 1 * 2^1 (2)
+                     + 1 * 2^0 (1)
+                     = 128 + 32 + 4 + 2 + 1
+                     = 167
+
+Exercise for the reader: take this binary counting concept and figure
+out how you could use your fingers to count up to 1023.
+
+All we have done so far is convert things to base 10. This isn't hard
+because it's already familiar.
+
+If you are a native English speaker and kind of know French, it's much
+easier to translate French into English than it is to translate English
+into French. Expressing things in an unfamiliar context is significantly
+more challenging than comprehending things in the unfamiliar context.
+
+How might we take our integer 8763 and convert it into base 2?
+
+It's pretty simple: divide by 2 over and over using the long division
+you learned in elementary school (you *do* remember how to do long
+division, right?)
+
+            quotients  remainders
+              |----|    |-|
+        8763 = 4381 *2 + 1
+        4381 = 2190 *2 + 1
+        2190 = 1095 *2 + 0
+        1095 =  547 *2 + 1
+         547 =  273 *2 + 1
+         273 =  136 *2 + 1
+         136 =   68 *2 + 0
+          68 =   34 *2 + 0
+          34 =   17 *2 + 0
+          17 =    8 *2 + 1
+           8 =    4 *2 + 0
+           4 =    2 *2 + 0
+           2 =    1 *2 + 0
+           1 =    0 *2 + 1
+           0 =    DONE
+
+The sequence of remainders *in reverse order* is the binary
+representation of 8763. Equivalently, the sequence of remainders
+"grows to the left".
+
+```erlang
+16> 2#1000_1000_1110_11.
+8763
+```
+
+Notice that if we delete digits off the right end, we get back the
+sequence of quotients:
+
+```erlang
+16> 2#1000_1000_1110_11.
+8763
+17> 2#1000_1000_1110_1.
+4381
+18> 2#1000_1000_1110.
+2190
+19> 2#1000_1000_111.
+1095
+20> 2#1000_1000_11.
+547
+```
+
+Take a minute and figure out for yourself why this makes sense.
+
+At any rate, the way to code this is
+
+```erlang
+-module(ntb).
+
+-export([
+    n_to_base/2
+]).
+
+
+n_to_base(N, Base) when N >= 0, Base >= 2 ->
+    n_to_base(N, Base, []).
+
+n_to_base(0, _Base, Digits) ->
+    Digits;
+n_to_base(N, Base, Digits) ->
+    NewN      = N div Base,
+    NewDigit  = N rem Base,
+    NewDigits = [NewDigit | Digits],
+    n_to_base(NewN, Base, NewDigits).
+```
+
+```erlang
+3> ntb:n_to_base(8763, 2).
+[1,0,0,0,1,0,0,0,1,1,1,0,1,1]
+```
+
+Let us do the calculation manually in Base58 to get our digits:
+
+    8763 = 151*58 +  5
+     151 =   2*58 + 35
+       2 =   0*58 +  2
+       0 = DONE
+
+So in "Base58", the sequence of digits is `[2, 35, 5]`. Our program
+agrees:
+
+```erlang
+8> ntb:n_to_base(8763, 58).
+[2,35,5]
+```
+
+Our Base58 code is going to contain exactly this idea modulo an
+unpleasant volume of edge case stupidity.
+
+# Base58 algorithm
+
+Conceptually, we are doing the following:
+
+    binary data <-> integer <-> sequence of numbers 0-57 <-> text characters
+
+The unpleasantry comes from the fact that positional notation for
+numbers makes no distinction between the sequence of symbols `8763` and
+`00008763`.
+
+That is, if we have a bytestring with a bunch of leading 0 bytes, those
+bytes are not going to change the integer that bytestring corresponds
+to. That is, all of the following byte arrays point to the same integer:
+
+    ABCD_EFGH
+    0000_0000 ABCD_EFGH
+    0000_0000 0000_0000 ABCD_EFGH
+    0000_0000 0000_0000 0000_0000 ABCD_EFGH
+
+So when we're going from integers back to binary, we have an ambiguity.
+Because we could in theory have a billion leading zero bytes that are
+just totally unaccounted for.
+
+The hack to deal with this is that leading 0 bytes are represented as
+leading `$1`s in the text representation (`$0` is excluded from the Base58
+alphabet to avoid visual collision with `$O`).
+
+This ends up meaning we have to do an extra preprocessing step
+
+1.  on decode (text -> binary): pre-strip all the leading `$1`
+    characters off the text representation and account for them as
+    leading `0` bytes
+
+2.  on encode (binary -> text): and conversely pre-strip all the leading `0`
+    bytes and account for them as leading `$1` characters.
+
+The result is as follows:
+
+```erlang
+-module(b58).
+
+-export([
+    enc/1,
+    dec/1,
+    chars58/2,
+    unchars58/2,
+    int2char/1,
+    char2int/1
+]).
+
+
+%% consume all the leading 0 bytes
+enc(<<0:8, Rest/binary>>) ->
+    [int2char(0) | enc(Rest)];
+enc(Bytes) when is_binary(Bytes) ->
+    %% Binary -> integer
+    N = binary:decode_unsigned(Bytes),
+    %% Integer -> chars
+    chars58(N, []).
+
+%% combining steps of converting to numbers 0..57 and then converting those
+%% numbers to ascii characters just to avoid traversing the list twice
+chars58(N, Acc) when N > 0 ->
+    Q = N div 58,
+    R = N rem 58,
+    NewAcc = [int2char(R) | Acc],
+    chars58(Q, NewAcc);
+chars58(0, Acc) ->
+    Acc.
+
+
+%% consume all the leading 1 characters
+dec([$1 | Rest]) ->
+    %io:format("dec([$1 | ~p])~n", [Rest]),
+    <<0:8, (dec(Rest))/binary>>;
+%% this case corresponds to the bytestring being entirely 0 bytes
+%% in this case, our usual approach (seen below) would compute N=0 and then
+%% binary:encode_unsigned/1 encodes 0 to <<0>>, thus incorrectly adding an
+%% extra zero byte. this case preempts that
+dec([]) ->
+    %io:format("dec([])~n", []),
+    <<>>;
+dec(Chars) when is_list(Chars) ->
+    %io:format("dec(~p)~n", [Chars]),
+    %% chars -> integer
+    N = unchars58(Chars, 0),
+    %% integer -> binary
+    binary:encode_unsigned(N).
+
+unchars58([Char | Rest], AccN) ->
+    %io:format("unchars58([~p | ~p], ~p)~n", [Char, Rest, AccN]),
+    %% Char corresponds to a number in 0..57
+    Digit057 = char2int(Char),
+    % You can imagine the entire original digit as
+    %      AccN ++ Char ++ Rest
+    % multiplying AccN by 58 corresponds to "shifting" AccN 1 space to the
+    % left. Then we "++ Char" at the right end, then continue on.
+    NewAccN  = AccN*58 + Digit057,
+    unchars58(Rest, NewAccN);
+unchars58([], AccN) ->
+    %io:format("unchars58([], ~p)~n", [AccN]),
+    AccN.
+
+
+int2char( 0) -> $1;
+int2char( 1) -> $2;
+int2char( 2) -> $3;
+int2char( 3) -> $4;
+int2char( 4) -> $5;
+int2char( 5) -> $6;
+int2char( 6) -> $7;
+int2char( 7) -> $8;
+int2char( 8) -> $9;
+int2char( 9) -> $A;
+int2char(10) -> $B;
+int2char(11) -> $C;
+int2char(12) -> $D;
+int2char(13) -> $E;
+int2char(14) -> $F;
+int2char(15) -> $G;
+int2char(16) -> $H;
+int2char(17) -> $J;
+int2char(18) -> $K;
+int2char(19) -> $L;
+int2char(20) -> $M;
+int2char(21) -> $N;
+int2char(22) -> $P;
+int2char(23) -> $Q;
+int2char(24) -> $R;
+int2char(25) -> $S;
+int2char(26) -> $T;
+int2char(27) -> $U;
+int2char(28) -> $V;
+int2char(29) -> $W;
+int2char(30) -> $X;
+int2char(31) -> $Y;
+int2char(32) -> $Z;
+int2char(33) -> $a;
+int2char(34) -> $b;
+int2char(35) -> $c;
+int2char(36) -> $d;
+int2char(37) -> $e;
+int2char(38) -> $f;
+int2char(39) -> $g;
+int2char(40) -> $h;
+int2char(41) -> $i;
+int2char(42) -> $j;
+int2char(43) -> $k;
+int2char(44) -> $m;
+int2char(45) -> $n;
+int2char(46) -> $o;
+int2char(47) -> $p;
+int2char(48) -> $q;
+int2char(49) -> $r;
+int2char(50) -> $s;
+int2char(51) -> $t;
+int2char(52) -> $u;
+int2char(53) -> $v;
+int2char(54) -> $w;
+int2char(55) -> $x;
+int2char(56) -> $y;
+int2char(57) -> $z.
+
+char2int($1) ->  0;
+char2int($2) ->  1;
+char2int($3) ->  2;
+char2int($4) ->  3;
+char2int($5) ->  4;
+char2int($6) ->  5;
+char2int($7) ->  6;
+char2int($8) ->  7;
+char2int($9) ->  8;
+char2int($A) ->  9;
+char2int($B) -> 10;
+char2int($C) -> 11;
+char2int($D) -> 12;
+char2int($E) -> 13;
+char2int($F) -> 14;
+char2int($G) -> 15;
+char2int($H) -> 16;
+char2int($J) -> 17;
+char2int($K) -> 18;
+char2int($L) -> 19;
+char2int($M) -> 20;
+char2int($N) -> 21;
+char2int($P) -> 22;
+char2int($Q) -> 23;
+char2int($R) -> 24;
+char2int($S) -> 25;
+char2int($T) -> 26;
+char2int($U) -> 27;
+char2int($V) -> 28;
+char2int($W) -> 29;
+char2int($X) -> 30;
+char2int($Y) -> 31;
+char2int($Z) -> 32;
+char2int($a) -> 33;
+char2int($b) -> 34;
+char2int($c) -> 35;
+char2int($d) -> 36;
+char2int($e) -> 37;
+char2int($f) -> 38;
+char2int($g) -> 39;
+char2int($h) -> 40;
+char2int($i) -> 41;
+char2int($j) -> 42;
+char2int($k) -> 43;
+char2int($m) -> 44;
+char2int($n) -> 45;
+char2int($o) -> 46;
+char2int($p) -> 47;
+char2int($q) -> 48;
+char2int($r) -> 49;
+char2int($s) -> 50;
+char2int($t) -> 51;
+char2int($u) -> 52;
+char2int($v) -> 53;
+char2int($w) -> 54;
+char2int($x) -> 55;
+char2int($y) -> 56;
+char2int($z) -> 57.
+```
+
+```erlang
+4> b58:chars58(8763, []).
+"3c6"
+6> b58:char2int($3).
+2
+7> b58:char2int($c).
+35
+8> b58:char2int($6).
+5
+43> binary:encode_unsigned(8763).
+<<"\";">>
+44> b58:enc(<<"\";">>).
+"3c6"
+45> b58:enc(<<0, 0, 0, "\";">>).
+"1113c6"
+46> b58:dec("3c6").
+<<"\";">>
+47> b58:dec("13c6").
+<<0,34,59>>
+48> b58:dec("11113c6").
+<<0,0,0,0,34,59>>
+```
+
+

 ## Tables etc

@ -359,3 +753,125 @@ char2int($9) -> 61;
 char2int($+) -> 62;
 char2int($/) -> 63.
 ```
+
+### Base58 Alphabet
+
+```erlang
+int2char( 0) -> $1;
+int2char( 1) -> $2;
+int2char( 2) -> $3;
+int2char( 3) -> $4;
+int2char( 4) -> $5;
+int2char( 5) -> $6;
+int2char( 6) -> $7;
+int2char( 7) -> $8;
+int2char( 8) -> $9;
+int2char( 9) -> $A;
+int2char(10) -> $B;
+int2char(11) -> $C;
+int2char(12) -> $D;
+int2char(13) -> $E;
+int2char(14) -> $F;
+int2char(15) -> $G;
+int2char(16) -> $H;
+int2char(17) -> $J;
+int2char(18) -> $K;
+int2char(19) -> $L;
+int2char(20) -> $M;
+int2char(21) -> $N;
+int2char(22) -> $P;
+int2char(23) -> $Q;
+int2char(24) -> $R;
+int2char(25) -> $S;
+int2char(26) -> $T;
+int2char(27) -> $U;
+int2char(28) -> $V;
+int2char(29) -> $W;
+int2char(30) -> $X;
+int2char(31) -> $Y;
+int2char(32) -> $Z;
+int2char(33) -> $a;
+int2char(34) -> $b;
+int2char(35) -> $c;
+int2char(36) -> $d;
+int2char(37) -> $e;
+int2char(38) -> $f;
+int2char(39) -> $g;
+int2char(40) -> $h;
+int2char(41) -> $i;
+int2char(42) -> $j;
+int2char(43) -> $k;
+int2char(44) -> $m;
+int2char(45) -> $n;
+int2char(46) -> $o;
+int2char(47) -> $p;
+int2char(48) -> $q;
+int2char(49) -> $r;
+int2char(50) -> $s;
+int2char(51) -> $t;
+int2char(52) -> $u;
+int2char(53) -> $v;
+int2char(54) -> $w;
+int2char(55) -> $x;
+int2char(56) -> $y;
+int2char(57) -> $z.
+
+char2int($1) ->  0;
+char2int($2) ->  1;
+char2int($3) ->  2;
+char2int($4) ->  3;
+char2int($5) ->  4;
+char2int($6) ->  5;
+char2int($7) ->  6;
+char2int($8) ->  7;
+char2int($9) ->  8;
+char2int($A) ->  9;
+char2int($B) -> 10;
+char2int($C) -> 11;
+char2int($D) -> 12;
+char2int($E) -> 13;
+char2int($F) -> 14;
+char2int($G) -> 15;
+char2int($H) -> 16;
+char2int($J) -> 17;
+char2int($K) -> 18;
+char2int($L) -> 19;
+char2int($M) -> 20;
+char2int($N) -> 21;
+char2int($P) -> 22;
+char2int($Q) -> 23;
+char2int($R) -> 24;
+char2int($S) -> 25;
+char2int($T) -> 26;
+char2int($U) -> 27;
+char2int($V) -> 28;
+char2int($W) -> 29;
+char2int($X) -> 30;
+char2int($Y) -> 31;
+char2int($Z) -> 32;
+char2int($a) -> 33;
+char2int($b) -> 34;
+char2int($c) -> 35;
+char2int($d) -> 36;
+char2int($e) -> 37;
+char2int($f) -> 38;
+char2int($g) -> 39;
+char2int($h) -> 40;
+char2int($i) -> 41;
+char2int($j) -> 42;
+char2int($k) -> 43;
+char2int($m) -> 44;
+char2int($n) -> 45;
+char2int($o) -> 46;
+char2int($p) -> 47;
+char2int($q) -> 48;
+char2int($r) -> 49;
+char2int($s) -> 50;
+char2int($t) -> 51;
+char2int($u) -> 52;
+char2int($v) -> 53;
+char2int($w) -> 54;
+char2int($x) -> 55;
+char2int($y) -> 56;
+char2int($z) -> 57.
+```