BaseN drafting WIP

2025-03-23 14:18:42 -07:00 · 2025-03-23 14:18:42 -07:00 · ef7c6be734
commit ef7c6be734
parent 2c63bdd744
1 changed files with 373 additions and 0 deletions
--- a/BaseN.md
+++ b/BaseN.md
@ -1,3 +1,376 @@
 # Base64 vs Base58 Explained

 ![BaseN diagram](./uploads/baseN-thumb-original-1024x576.png)
+
+## Quick Reference
+
+1.  Original article: <https://zxq9.com/archives/2688>
+2.  Base64 Erlang:
+    <https://github.com/aeternity/Vanillae/blob/829dd2930ff20ea0473cf2ad562e0a1c2aba0411/utils/vw/src/vb64.erl>
+3.  Base58 Erlang:
+    <https://github.com/aeternity/Vanillae/blob/829dd2930ff20ea0473cf2ad562e0a1c2aba0411/utils/vw/src/vb58.erl>
+4.  Base64 TypeScript:
+    <https://github.com/aeternity/Vanillae/blob/829dd2930ff20ea0473cf2ad562e0a1c2aba0411/bindings/typescript/src/b64.ts>
+5.  Base58 TypeScript:
+    <https://github.com/aeternity/Vanillae/blob/829dd2930ff20ea0473cf2ad562e0a1c2aba0411/bindings/typescript/src/b58.ts>
+
+## tl;dr
+
+Base64 and Base58 are two schemes for taking binary data and encoding it
+to and from plain text.
+
+1.  Conceptually:
+
+    1.  These are **NOT** two instances of a general "base N" concept.
+
+    2.  Base64 thinks of the binary data as a long stream of bytes.
+
+        Base64 converts bytes to/from text 3 bytes at a time in a
+        "fire-and-forget" manner.
+
+    3.  Base58 thinks of the binary data as a really really long
+        integer.
+
+        Base58 requires processing the entire bytestring all at once as
+        a singular unit.
+
+    4.  Base58 *is* the general BaseN algorithm. Base64 is simpler
+        because 64 and 256 are both powers of 2 and computers use
+        binary.
+
+2.  Terminologically:
+
+    1.  The terms "encode" and "decode" are meant *from the perspective
+        of the computer program*. From the program's perspective, binary
+        data is what makes sense and plain text is gobbledygook. So
+
+        1.  we *decode* plain text (gahbage) into binary data (what
+            makes sense)
+
+        2.  we *encode* binary data (what makes sense) to plain text
+            (gahbage)
+
+3.  Practically:
+
+    1.  Almost every language (including Erlang) has base64 in its
+        standard library, and you should probably just use that.
+
+    2.  You probably have to code Base58 yourself.
+
+    3.  If you are implementing Base58 in a language that does not have
+        bignum arithmetic, you have to implement bignums yourself. Thankfully
+        nobody will ever need any language other than Erlang so we can
+        ignore this problem in the context of this wiki.
+
+    4.  Base58 is *super* inefficient both space-wise and time-wise, because it
+        requires processing the entirety of the binary string as a single
+        monolithic piece of data.
+
+    5.  Base58 exists to preempt `Il10O`-type problems (visual
+        ambiguity) and doesn't involve `=+/` characters (the idea being
+        email clients are likely to break long lines at these
+        characters, increasing the likelihood of input errors).
+
+    6.  Consequently, Base58 is only suitable for binary data that is
+        **both**
+
+        1.  short in length
+
+        2.  likely to be entered manually (e.g. wallet/contract
+            addresses)
+# Base64
+
+The term "binary" is misleading, because it leads people to think that
+the basic unit in computing is a bit (a 1 or a 0).
+
+It is more accurate to think of the basic unit in computing as a byte,
+which is a number between 0 (`2#0000_0000`) and 255 (`2#1111_1111`).
+
+(If you're totally bewildered by binary notation, you might want to read
+[\[s:qr-algorithm\]](#s:qr-algorithm){reference-type="ref"
+reference="s:qr-algorithm"} and come back).
+
+Base64 processes a binary string 3 bytes at a time (which is 24 bits).
+It regroups those 3 groups of 8 bits into 4 groups of 6 bits.
+
+    ABCDEFGH 12345678 ABCDEFGH
+    ABCDEF GH1234 5678AB CDEFGH
+
+We now have 4 numbers ranging between 0 (`2#00_0000`) and 63
+(`2#11_1111`), hence the name "Base64".
+
+Each number in the range 0..63 is assigned to a character from the
+alphabet (see [Base64 Alphabet](#base64-alphabet)).
+
+```erlang
+% abbreviated
+% int2char/1: turn a 6-bit number (0..63) into its Base64 character.
+% Anything outside that range matches no clause.
+%
+% the two punctuation characters sit at the very end of the alphabet
+int2char(63) -> $/;
+int2char(62) -> $+;
+% digits $0..$9 occupy 52..61
+int2char(N) when N >= 52, 61 >= N -> (N - 52) + $0;
+% lowercase $a..$z occupy 26..51
+int2char(N) when N >= 26, 51 >= N -> (N - 26) + $a;
+% uppercase $A..$Z occupy 0..25
+int2char(N) when N >= 0, 25 >= N  -> N + $A.
+
+% char2int/1: inverse of int2char/1. Maps a Base64 alphabet character back
+% to its 6-bit number (0..63). Any other character (including the `=`
+% padding sign) matches no clause.
+%
+% $A..$Z map to 0..25
+char2int(C) when $A =< C, C =< $Z -> C - $A;
+% $a..$z map to 26..51 (26 uppercase letters come before them)
+char2int(C) when $a =< C, C =< $z -> C - $a + 26;
+% $0..$9 map to 52..61 (52 letters come before them)
+char2int(C) when $0 =< C, C =< $9 -> C - $0 + 52;
+% special cases
+char2int($+) -> 62;
+char2int($/) -> 63.
+```
+
+The only stupid cases arise when the number of bytes in the binary data
+is not a multiple of 3. In this case there are two padding rules:
+
+``` {.Erlang language="Erlang"}
+% enc/1: Base64-encode a binary into a list of characters.
+%
+% The input is consumed 3 bytes (24 bits) at a time; each group of 24 bits
+% is re-cut into four 6-bit numbers and each number is looked up with
+% int2char/1. When fewer than 3 bytes are left, the leftover bits are
+% shifted left to fill a 6-bit slot and the output is padded with `=`.
+%
+% nothing left to encode
+enc(<<>>) ->
+    [];
+% 1 byte left over (8 bits = 6+2)
+%
+% 12345678 ->
+% 123456   78____
+%  Hi6     Lo2 bsl 4
+% convert to chars, then pad with two equal signs ->
+%  char     char     =    =
+enc(<<Hi6:6, Lo2:2>>) ->
+    [int2char(Hi6), int2char(Lo2 bsl 4), $=, $=];
+% 2 bytes left over (16 bits = 6+6+4)
+%
+% 12345678 abcdefgh
+% 123456 78abcd   efgh__
+%   N1     N2    Lo4 bsl 2
+% convert to chars, then pad with one equal sign ->
+%  char   char     char     =
+enc(<<N1:6, N2:6, Lo4:4>>) ->
+    [int2char(N1), int2char(N2), int2char(Lo4 bsl 2), $=];
+% 3 or more bytes left (24 bits = 6+6+6+6): emit 4 chars, then recurse
+%
+% 12345678 abcdefgh 12345678    ...
+% 123456 78abcd efgh12 345678   ...
+%   N1     N2     N3     N4     Tail
+enc(<<N1:6, N2:6, N3:6, N4:6, Tail/binary>>) ->
+    Quad = [int2char(N1), int2char(N2), int2char(N3), int2char(N4)],
+    Quad ++ enc(Tail).
+```
+
+By the way, that's the *entire* encode procedure right there.
+
+The decode procedure is similarly simple but slightly trickier:
+
+``` {.Erlang language="Erlang"}
+% dec/1: decode a Base64 string (a list of characters) into a binary.
+% Entry point: starts dec/2 with an empty binary as the accumulator.
+dec(Base64_String) ->
+    dec(Base64_String, <<>>).
+
+
+% dec/2: consume the input 4 characters at a time, appending the decoded
+% bytes to Acc. The padded (terminal) clauses come BEFORE the general
+% clause: the general clause would also match a padded group of four
+% characters and then call char2int/1 on `=`.
+%
+% terminal case: two equal signs at the end = 1 byte (8 bits = 6+2) remaining
+% input (characters) ->
+%         W       X     =    =
+% convert to numbers ->
+%       abcdef gh____   =    =
+%         NW      NX
+% regroup ->
+%          abcdefgh  ____        abcdef   gh____
+%       <<LastByte:8, 0:4>> = <<  NW:6,    NX:6   >>
+dec([W, X, $=, $=], Acc) ->
+    NW = char2int(W),
+    NX = char2int(X),
+    % matching against 0:4 also asserts that the 4 padding bits are zero
+    <<LastByte:8, 0:4>> = <<NW:6, NX:6>>,
+    <<Acc/binary, LastByte:8>>;
+% terminal case: one equal sign at the end = 2 bytes remaining
+%
+% input (characters) ->
+%         W       X     Y    =
+% convert to numbers ->
+%       abcdef gh1234  5678__  =
+%         NW      NX    NY
+% regroup ->
+%          abcdefgh  12345678   __          abcdef    gh1234   5678__
+%       <<   B1:8,     B2:8,   0:2  >> = <<  NW:6,     NX:6,    NY:6   >>
+dec([W, X, Y, $=], Acc) ->
+    NW = char2int(W),
+    NX = char2int(X),
+    NY = char2int(Y),
+    % matching against 0:2 also asserts that the 2 padding bits are zero
+    <<B1:8, B2:8, 0:2>> = <<NW:6, NX:6, NY:6>>,
+    <<Acc/binary, B1:8, B2:8>>;
+% terminal case: 0 bytes remaining
+% nothing to do
+dec([], Acc) ->
+    Acc;
+% general case: no equal signs = 3 or more bytes remaining
+%
+% input (characters) ->
+%         W       X      Y      Z
+% convert to numbers ->
+%       abcdef gh1234  5678ab cdefgh
+%         NW      NX    NY      NZ
+% decompose ->
+%          abcdefgh  12345678   abcdefgh          abcdef    gh1234   5678ab   cdefgh
+%       <<   B1:8,     B2:8,      B3:8   >> = <<  NW:6,     NX:6,    NY:6,    NZ:6   >>
+dec([W, X, Y, Z | Rest], Acc) ->
+    NW = char2int(W),
+    NX = char2int(X),
+    NY = char2int(Y),
+    NZ = char2int(Z),
+    % 4 x 6 bits = 24 bits = 3 whole bytes, so they can be appended directly
+    NewAcc = <<Acc/binary, NW:6, NX:6, NY:6, NZ:6>>,
+    dec(Rest, NewAcc).
+```
+
+This is marginally trickier because
+
+1.  it needs an accumulator
+
+2.  the order of the cases matters
+
+
+## Tables etc
+
+### Base64 Alphabet
+
+``` {.Erlang language="Erlang"}
+% int2char/1, written out in full: one clause per 6-bit value.
+% Maps each integer 0..63 to its Base64 alphabet character:
+%    0..25 -> $A..$Z,  26..51 -> $a..$z,  52..61 -> $0..$9,
+%   62 -> $+,  63 -> $/
+% Any other argument matches no clause.
+int2char( 0) -> $A;
+int2char( 1) -> $B;
+int2char( 2) -> $C;
+int2char( 3) -> $D;
+int2char( 4) -> $E;
+int2char( 5) -> $F;
+int2char( 6) -> $G;
+int2char( 7) -> $H;
+int2char( 8) -> $I;
+int2char( 9) -> $J;
+int2char(10) -> $K;
+int2char(11) -> $L;
+int2char(12) -> $M;
+int2char(13) -> $N;
+int2char(14) -> $O;
+int2char(15) -> $P;
+int2char(16) -> $Q;
+int2char(17) -> $R;
+int2char(18) -> $S;
+int2char(19) -> $T;
+int2char(20) -> $U;
+int2char(21) -> $V;
+int2char(22) -> $W;
+int2char(23) -> $X;
+int2char(24) -> $Y;
+int2char(25) -> $Z;
+int2char(26) -> $a;
+int2char(27) -> $b;
+int2char(28) -> $c;
+int2char(29) -> $d;
+int2char(30) -> $e;
+int2char(31) -> $f;
+int2char(32) -> $g;
+int2char(33) -> $h;
+int2char(34) -> $i;
+int2char(35) -> $j;
+int2char(36) -> $k;
+int2char(37) -> $l;
+int2char(38) -> $m;
+int2char(39) -> $n;
+int2char(40) -> $o;
+int2char(41) -> $p;
+int2char(42) -> $q;
+int2char(43) -> $r;
+int2char(44) -> $s;
+int2char(45) -> $t;
+int2char(46) -> $u;
+int2char(47) -> $v;
+int2char(48) -> $w;
+int2char(49) -> $x;
+int2char(50) -> $y;
+int2char(51) -> $z;
+int2char(52) -> $0;
+int2char(53) -> $1;
+int2char(54) -> $2;
+int2char(55) -> $3;
+int2char(56) -> $4;
+int2char(57) -> $5;
+int2char(58) -> $6;
+int2char(59) -> $7;
+int2char(60) -> $8;
+int2char(61) -> $9;
+int2char(62) -> $+;
+int2char(63) -> $/.
+
+% char2int/1, written out in full: one clause per alphabet character.
+% Maps each Base64 character back to its 6-bit value:
+%   $A..$Z -> 0..25,  $a..$z -> 26..51,  $0..$9 -> 52..61,
+%   $+ -> 62,  $/ -> 63
+% Any other character (including the `=` padding sign) matches no clause.
+char2int($A) ->  0;
+char2int($B) ->  1;
+char2int($C) ->  2;
+char2int($D) ->  3;
+char2int($E) ->  4;
+char2int($F) ->  5;
+char2int($G) ->  6;
+char2int($H) ->  7;
+char2int($I) ->  8;
+char2int($J) ->  9;
+char2int($K) -> 10;
+char2int($L) -> 11;
+char2int($M) -> 12;
+char2int($N) -> 13;
+char2int($O) -> 14;
+char2int($P) -> 15;
+char2int($Q) -> 16;
+char2int($R) -> 17;
+char2int($S) -> 18;
+char2int($T) -> 19;
+char2int($U) -> 20;
+char2int($V) -> 21;
+char2int($W) -> 22;
+char2int($X) -> 23;
+char2int($Y) -> 24;
+char2int($Z) -> 25;
+char2int($a) -> 26;
+char2int($b) -> 27;
+char2int($c) -> 28;
+char2int($d) -> 29;
+char2int($e) -> 30;
+char2int($f) -> 31;
+char2int($g) -> 32;
+char2int($h) -> 33;
+char2int($i) -> 34;
+char2int($j) -> 35;
+char2int($k) -> 36;
+char2int($l) -> 37;
+char2int($m) -> 38;
+char2int($n) -> 39;
+char2int($o) -> 40;
+char2int($p) -> 41;
+char2int($q) -> 42;
+char2int($r) -> 43;
+char2int($s) -> 44;
+char2int($t) -> 45;
+char2int($u) -> 46;
+char2int($v) -> 47;
+char2int($w) -> 48;
+char2int($x) -> 49;
+char2int($y) -> 50;
+char2int($z) -> 51;
+char2int($0) -> 52;
+char2int($1) -> 53;
+char2int($2) -> 54;
+char2int($3) -> 55;
+char2int($4) -> 56;
+char2int($5) -> 57;
+char2int($6) -> 58;
+char2int($7) -> 59;
+char2int($8) -> 60;
+char2int($9) -> 61;
+char2int($+) -> 62;
+char2int($/) -> 63.
+```