commit d9d541dce82e83d1d70e4427eb310a1f72189884 Author: SUZUKI Tetsuya Date: Wed Oct 3 15:05:19 2012 +0900 initial import diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..81fb1a5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +.DS_Store +.eunit +ebin +deps +priv +*.o +*.beam +*.plt +*.swp +*.html +*.png +edoc-info +stylesheet.css + diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..cfd50bf --- /dev/null +++ b/.travis.yml @@ -0,0 +1,12 @@ +language: erlang +notifications: + disabled: true +branches: + only: + - develop + - 0.1.0 +otp_release: + - R15B02 + - R15B01 + - R15B + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..3e8b822 --- /dev/null +++ b/Makefile @@ -0,0 +1,23 @@ +.PHONY: doc + +all: + ./rebar compile + ./rebar doc + ./rebar xref + ./rebar eunit + +compile: + ./rebar compile + +doc: + ./rebar doc + +xref: compile + ./rebar xref + +clean: + ./rebar clean + +test: xref + ./rebar eunit + diff --git a/README.md b/README.md new file mode 100644 index 0000000..9e95ae5 --- /dev/null +++ b/README.md @@ -0,0 +1,21 @@ +erlang-sha3 +=========== + +[![Build Status](https://secure.travis-ci.org/szktty/erlang-sha3.png?branch=develop)](http://travis-ci.org/szktty/erlang-sha3) + +SHA3 for Erlang + + +Licenses +-------- + +This program is distributed under Apache License 2.0. + +Keccak source files are distributed under CC0 1.0 Universal (CC0 1.0) Public Domain Dedication license. + + +Author +------ + +SUZUKI Tetsuya + diff --git a/c_src/AVR8-rotate64.h b/c_src/AVR8-rotate64.h new file mode 100755 index 0000000..4f921b9 --- /dev/null +++ b/c_src/AVR8-rotate64.h @@ -0,0 +1,27 @@ +/* +File: AVR8-rotate64.h + +This code is originally by Daniel Otte (daniel.otte@rub.de) in 2006-2010 as part of the AVR-Crypto-Lib, and was then improved by Ronny Van Keer, STMicroelectronics, in 2010. + +Implementation by Daniel Otte and Ronny Van Keer, +hereby denoted as "the implementer". 
+ +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef ROTATE64_H_ +#define ROTATE64_H_ + +#include <stdint.h> + + +#define ROT_CODE(a) ((((a)/8+((((a)%8)>4)?1:0))<<4) | ((a) & 7)) + +uint64_t rotate64_1bit_left(uint64_t a); +uint64_t rotate64_1bit_right(uint64_t a); +uint64_t rotate64left_code(uint64_t a, int8_t code); + +#endif /* ROTATE64_H_ */ + diff --git a/c_src/AVR8-rotate64.s b/c_src/AVR8-rotate64.s new file mode 100755 index 0000000..f30d030 --- /dev/null +++ b/c_src/AVR8-rotate64.s @@ -0,0 +1,285 @@ +/* +File: AVR8-rotate64.s + +This code is originally by Daniel Otte (daniel.otte@rub.de) in 2006-2010 as part of the AVR-Crypto-Lib, and was then improved by Ronny Van Keer, STMicroelectronics, in 2010. + +Implementation by Daniel Otte and Ronny Van Keer, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +.global rotate64_1bit_left +rotate64_4bit_left: + lsl r18 + rol r19 + rol r20 + rol r21 + rol r22 + rol r23 + rol r24 + rol r25 + adc r18, r1 +rotate64_3bit_left: + lsl r18 + rol r19 + rol r20 + rol r21 + rol r22 + rol r23 + rol r24 + rol r25 + adc r18, r1 +rotate64_2bit_left: + lsl r18 + rol r19 + rol r20 + rol r21 + rol r22 + rol r23 + rol r24 + rol r25 + adc r18, r1 +rotate64_1bit_left: + lsl r18 + rol r19 + rol r20 + rol r21 + rol r22 + rol r23 + rol r24 + rol r25 + adc r18, r1 + ret + +.global rotate64_1bit_right +rotate64_3bit_right: + bst r18, 0 + ror r25 + ror r24 + ror r23 + ror r22 + ror r21 + ror r20 + ror r19 + ror r18 + bld r25, 7 +rotate64_2bit_right: + bst r18, 0 + ror r25 + ror r24 + ror r23 + ror r22 + ror r21 + ror r20 + ror r19 + ror r18 + bld r25, 7 +rotate64_1bit_right: + bst r18, 0 + ror r25 + ror r24 + ror r23 + ror r22 + ror r21 + ror r20 + ror r19 + ror r18 + bld r25, 7 + ret + +/* +** Each byte rotate routine must be 16 instructions long. 
+*/ +rotate64_0byte_left: + andi r16, 0x07 + ldi r30, pm_lo8(bit_rot_jmp_table) + ldi r31, pm_hi8(bit_rot_jmp_table) + add r30, r16 + + adc r31, r1 + ijmp + nop + nop + + nop + nop + nop + nop + + nop + nop + nop + nop + +rotate64_1byte_left: + mov r0, r25 + mov r25, r24 + mov r24, r23 + mov r23, r22 + + mov r22, r21 + mov r21, r20 + mov r20, r19 + mov r19, r18 + + mov r18, r0 + andi r16, 0x07 + ldi r30, pm_lo8(bit_rot_jmp_table) + ldi r31, pm_hi8(bit_rot_jmp_table) + + add r30, r16 + adc r31, r1 + ijmp + nop + +rotate64_2byte_left: + movw r0, r24 + movw r24, r22 + movw r22, r20 + movw r20, r18 + + movw r18, r0 + clr r1 + andi r16, 0x07 + ldi r30, pm_lo8(bit_rot_jmp_table) + + ldi r31, pm_hi8(bit_rot_jmp_table) + add r30, r16 + adc r31, r1 + ijmp + + nop + nop + nop + nop + +rotate64_3byte_left: + mov r0, r25 + mov r25, r22 + mov r22, r19 + mov r19, r24 + + mov r24, r21 + mov r21, r18 + mov r18, r23 + mov r23, r20 + + mov r20, r0 + andi r16, 0x07 + ldi r30, pm_lo8(bit_rot_jmp_table) + ldi r31, pm_hi8(bit_rot_jmp_table) + + add r30, r16 + adc r31, r1 + ijmp + nop + +rotate64_4byte_left: + movw r0, r24 + movw r24, r20 + movw r20, r0 + movw r0, r22 + + movw r22, r18 + movw r18, r0 + clr r1 + andi r16, 0x07 + + ldi r30, pm_lo8(bit_rot_jmp_table) + ldi r31, pm_hi8(bit_rot_jmp_table) + add r30, r16 + adc r31, r1 + + ijmp + nop + nop + nop + +rotate64_5byte_left: + mov r0, r25 + mov r25, r20 + mov r20, r23 + mov r23, r18 + + mov r18, r21 + mov r21, r24 + mov r24, r19 + mov r19, r22 + + mov r22, r0 + andi r16, 0x07 + ldi r30, pm_lo8(bit_rot_jmp_table) + ldi r31, pm_hi8(bit_rot_jmp_table) + + add r30, r16 + adc r31, r1 + ijmp + nop + +rotate64_6byte_left: + movw r0, r18 + movw r18, r20 + movw r20, r22 + movw r22, r24 + + movw r24, r0 + clr r1 + andi r16, 0x07 + ldi r30, pm_lo8(bit_rot_jmp_table) + + ldi r31, pm_hi8(bit_rot_jmp_table) + add r30, r16 + adc r31, r1 + ijmp + + nop + nop + nop + nop + +rotate64_7byte_left: + mov r0, r18 + mov r18, r19 + mov r19, r20 + mov r20, 
r21 + + mov r21, r22 + mov r22, r23 + mov r23, r24 + mov r24, r25 + + mov r25, r0 + andi r16, 0x07 + ldi r30, pm_lo8(bit_rot_jmp_table) + ldi r31, pm_hi8(bit_rot_jmp_table) + + add r30, r16 + adc r31, r1 + ijmp + nop + + +bit_rot_jmp_table: + ret + rjmp rotate64_1bit_left + rjmp rotate64_2bit_left + rjmp rotate64_3bit_left + rjmp rotate64_4bit_left + rjmp rotate64_3bit_right + rjmp rotate64_2bit_right + rjmp rotate64_1bit_right + +.global rotate64left_code +rotate64left_code: + ldi r30, pm_lo8(rotate64_0byte_left) + ldi r31, pm_hi8(rotate64_0byte_left) + mov r0, r16 + andi r16, 0x70 + add r30, r16 + adc r31, r1 + mov r16, r0 + ijmp + \ No newline at end of file diff --git a/c_src/Keccak-avr8-settings.h b/c_src/Keccak-avr8-settings.h new file mode 100755 index 0000000..030e8eb --- /dev/null +++ b/c_src/Keccak-avr8-settings.h @@ -0,0 +1,2 @@ +#define cKeccakR 1088 +#define cKeccakFixedOutputLengthInBytes 32 diff --git a/c_src/KeccakF-1600-32-rvk.macros b/c_src/KeccakF-1600-32-rvk.macros new file mode 100755 index 0000000..c0c9029 --- /dev/null +++ b/c_src/KeccakF-1600-32-rvk.macros @@ -0,0 +1,555 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by Ronny Van Keer, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +static const UINT32 KeccakF1600RoundConstants_int2[2*24] = +{ + 0x00000001UL, 0x00000000UL, + 0x00000000UL, 0x00000089UL, + 0x00000000UL, 0x8000008bUL, + 0x00000000UL, 0x80008080UL, + 0x00000001UL, 0x0000008bUL, + 0x00000001UL, 0x00008000UL, + 0x00000001UL, 0x80008088UL, + 0x00000001UL, 0x80000082UL, + 0x00000000UL, 0x0000000bUL, + 0x00000000UL, 0x0000000aUL, + 0x00000001UL, 0x00008082UL, + 0x00000000UL, 0x00008003UL, + 0x00000001UL, 0x0000808bUL, + 0x00000001UL, 0x8000000bUL, + 0x00000001UL, 0x8000008aUL, + 0x00000001UL, 0x80000081UL, + 0x00000000UL, 0x80000081UL, + 0x00000000UL, 0x80000008UL, + 0x00000000UL, 0x00000083UL, + 0x00000000UL, 0x80008003UL, + 0x00000001UL, 0x80008088UL, + 0x00000000UL, 0x80000088UL, + 0x00000001UL, 0x00008000UL, + 0x00000000UL, 0x80008082UL +}; + +#undef rounds + +#define rounds \ +{ \ + UINT32 Da0, De0, Di0, Do0, Du0; \ + UINT32 Da1, De1, Di1, Do1, Du1; \ + UINT32 Ba, Be, Bi, Bo, Bu; \ + UINT32 Aba0, Abe0, Abi0, Abo0, Abu0; \ + UINT32 Aba1, Abe1, Abi1, Abo1, Abu1; \ + UINT32 Aga0, Age0, Agi0, Ago0, Agu0; \ + UINT32 Aga1, Age1, Agi1, Ago1, Agu1; \ + UINT32 Aka0, Ake0, Aki0, Ako0, Aku0; \ + UINT32 Aka1, Ake1, Aki1, Ako1, Aku1; \ + UINT32 Ama0, Ame0, Ami0, Amo0, Amu0; \ + UINT32 Ama1, Ame1, Ami1, Amo1, Amu1; \ + UINT32 Asa0, Ase0, Asi0, Aso0, Asu0; \ + UINT32 Asa1, Ase1, Asi1, Aso1, Asu1; \ + UINT32 Cw, Cx, Cy, Cz; \ + UINT32 Eba0, Ebe0, Ebi0, Ebo0, Ebu0; \ + UINT32 Eba1, Ebe1, Ebi1, Ebo1, Ebu1; \ + UINT32 Ega0, Ege0, Egi0, Ego0, Egu0; \ + UINT32 Ega1, Ege1, Egi1, Ego1, Egu1; \ + UINT32 Eka0, Eke0, Eki0, Eko0, Eku0; \ + UINT32 Eka1, Eke1, Eki1, Eko1, Eku1; \ + UINT32 Ema0, Eme0, Emi0, Emo0, Emu0; \ + UINT32 Ema1, Eme1, Emi1, Emo1, Emu1; \ + UINT32 Esa0, Ese0, Esi0, Eso0, Esu0; \ + UINT32 Esa1, Ese1, Esi1, Eso1, Esu1; \ + const UINT32 * pRoundConstants = KeccakF1600RoundConstants_int2; \ + UINT32 i; \ +\ + copyFromState(A, state) \ +\ + for( i = 12; i != 0; --i ) { \ + Cx = 
Abu0^Agu0^Aku0^Amu0^Asu0; \ + Du1 = Abe1^Age1^Ake1^Ame1^Ase1; \ + Da0 = Cx^ROL32(Du1, 1); \ + Cz = Abu1^Agu1^Aku1^Amu1^Asu1; \ + Du0 = Abe0^Age0^Ake0^Ame0^Ase0; \ + Da1 = Cz^Du0; \ +\ + Cw = Abi0^Agi0^Aki0^Ami0^Asi0; \ + Do0 = Cw^ROL32(Cz, 1); \ + Cy = Abi1^Agi1^Aki1^Ami1^Asi1; \ + Do1 = Cy^Cx; \ +\ + Cx = Aba0^Aga0^Aka0^Ama0^Asa0; \ + De0 = Cx^ROL32(Cy, 1); \ + Cz = Aba1^Aga1^Aka1^Ama1^Asa1; \ + De1 = Cz^Cw; \ +\ + Cy = Abo1^Ago1^Ako1^Amo1^Aso1; \ + Di0 = Du0^ROL32(Cy, 1); \ + Cw = Abo0^Ago0^Ako0^Amo0^Aso0; \ + Di1 = Du1^Cw; \ +\ + Du0 = Cw^ROL32(Cz, 1); \ + Du1 = Cy^Cx; \ +\ + Aba0 ^= Da0; \ + Ba = Aba0; \ + Age0 ^= De0; \ + Be = ROL32(Age0, 22); \ + Aki1 ^= Di1; \ + Bi = ROL32(Aki1, 22); \ + Amo1 ^= Do1; \ + Bo = ROL32(Amo1, 11); \ + Asu0 ^= Du0; \ + Bu = ROL32(Asu0, 7); \ + Eba0 = Ba ^((~Be)& Bi ) ^ *(pRoundConstants++); \ + Ebe0 = Be ^((~Bi)& Bo ); \ + Ebi0 = Bi ^((~Bo)& Bu ); \ + Ebo0 = Bo ^((~Bu)& Ba ); \ + Ebu0 = Bu ^((~Ba)& Be ); \ +\ + Abo0 ^= Do0; \ + Ba = ROL32(Abo0, 14); \ + Agu0 ^= Du0; \ + Be = ROL32(Agu0, 10); \ + Aka1 ^= Da1; \ + Bi = ROL32(Aka1, 2); \ + Ame1 ^= De1; \ + Bo = ROL32(Ame1, 23); \ + Asi1 ^= Di1; \ + Bu = ROL32(Asi1, 31); \ + Ega0 = Ba ^((~Be)& Bi ); \ + Ege0 = Be ^((~Bi)& Bo ); \ + Egi0 = Bi ^((~Bo)& Bu ); \ + Ego0 = Bo ^((~Bu)& Ba ); \ + Egu0 = Bu ^((~Ba)& Be ); \ +\ + Abe1 ^= De1; \ + Ba = ROL32(Abe1, 1); \ + Agi0 ^= Di0; \ + Be = ROL32(Agi0, 3); \ + Ako1 ^= Do1; \ + Bi = ROL32(Ako1, 13); \ + Amu0 ^= Du0; \ + Bo = ROL32(Amu0, 4); \ + Asa0 ^= Da0; \ + Bu = ROL32(Asa0, 9); \ + Eka0 = Ba ^((~Be)& Bi ); \ + Eke0 = Be ^((~Bi)& Bo ); \ + Eki0 = Bi ^((~Bo)& Bu ); \ + Eko0 = Bo ^((~Bu)& Ba ); \ + Eku0 = Bu ^((~Ba)& Be ); \ +\ + Abu1 ^= Du1; \ + Ba = ROL32(Abu1, 14); \ + Aga0 ^= Da0; \ + Be = ROL32(Aga0, 18); \ + Ake0 ^= De0; \ + Bi = ROL32(Ake0, 5); \ + Ami1 ^= Di1; \ + Bo = ROL32(Ami1, 8); \ + Aso0 ^= Do0; \ + Bu = ROL32(Aso0, 28); \ + Ema0 = Ba ^((~Be)& Bi ); \ + Eme0 = Be ^((~Bi)& Bo ); \ + Emi0 = Bi ^((~Bo)& Bu ); \ + Emo0 = Bo ^((~Bu)& 
Ba ); \ + Emu0 = Bu ^((~Ba)& Be ); \ +\ + Abi0 ^= Di0; \ + Ba = ROL32(Abi0, 31); \ + Ago1 ^= Do1; \ + Be = ROL32(Ago1, 28); \ + Aku1 ^= Du1; \ + Bi = ROL32(Aku1, 20); \ + Ama1 ^= Da1; \ + Bo = ROL32(Ama1, 21); \ + Ase0 ^= De0; \ + Bu = ROL32(Ase0, 1); \ + Esa0 = Ba ^((~Be)& Bi ); \ + Ese0 = Be ^((~Bi)& Bo ); \ + Esi0 = Bi ^((~Bo)& Bu ); \ + Eso0 = Bo ^((~Bu)& Ba ); \ + Esu0 = Bu ^((~Ba)& Be ); \ +\ + Aba1 ^= Da1; \ + Ba = Aba1; \ + Age1 ^= De1; \ + Be = ROL32(Age1, 22); \ + Aki0 ^= Di0; \ + Bi = ROL32(Aki0, 21); \ + Amo0 ^= Do0; \ + Bo = ROL32(Amo0, 10); \ + Asu1 ^= Du1; \ + Bu = ROL32(Asu1, 7); \ + Eba1 = Ba ^((~Be)& Bi ); \ + Eba1 ^= *(pRoundConstants++); \ + Ebe1 = Be ^((~Bi)& Bo ); \ + Ebi1 = Bi ^((~Bo)& Bu ); \ + Ebo1 = Bo ^((~Bu)& Ba ); \ + Ebu1 = Bu ^((~Ba)& Be ); \ +\ + Abo1 ^= Do1; \ + Ba = ROL32(Abo1, 14); \ + Agu1 ^= Du1; \ + Be = ROL32(Agu1, 10); \ + Aka0 ^= Da0; \ + Bi = ROL32(Aka0, 1); \ + Ame0 ^= De0; \ + Bo = ROL32(Ame0, 22); \ + Asi0 ^= Di0; \ + Bu = ROL32(Asi0, 30); \ + Ega1 = Ba ^((~Be)& Bi ); \ + Ege1 = Be ^((~Bi)& Bo ); \ + Egi1 = Bi ^((~Bo)& Bu ); \ + Ego1 = Bo ^((~Bu)& Ba ); \ + Egu1 = Bu ^((~Ba)& Be ); \ +\ + Abe0 ^= De0; \ + Ba = Abe0; \ + Agi1 ^= Di1; \ + Be = ROL32(Agi1, 3); \ + Ako0 ^= Do0; \ + Bi = ROL32(Ako0, 12); \ + Amu1 ^= Du1; \ + Bo = ROL32(Amu1, 4); \ + Asa1 ^= Da1; \ + Bu = ROL32(Asa1, 9); \ + Eka1 = Ba ^((~Be)& Bi ); \ + Eke1 = Be ^((~Bi)& Bo ); \ + Eki1 = Bi ^((~Bo)& Bu ); \ + Eko1 = Bo ^((~Bu)& Ba ); \ + Eku1 = Bu ^((~Ba)& Be ); \ +\ + Abu0 ^= Du0; \ + Ba = ROL32(Abu0, 13); \ + Aga1 ^= Da1; \ + Be = ROL32(Aga1, 18); \ + Ake1 ^= De1; \ + Bi = ROL32(Ake1, 5); \ + Ami0 ^= Di0; \ + Bo = ROL32(Ami0, 7); \ + Aso1 ^= Do1; \ + Bu = ROL32(Aso1, 28); \ + Ema1 = Ba ^((~Be)& Bi ); \ + Eme1 = Be ^((~Bi)& Bo ); \ + Emi1 = Bi ^((~Bo)& Bu ); \ + Emo1 = Bo ^((~Bu)& Ba ); \ + Emu1 = Bu ^((~Ba)& Be ); \ +\ + Abi1 ^= Di1; \ + Ba = ROL32(Abi1, 31); \ + Ago0 ^= Do0; \ + Be = ROL32(Ago0, 27); \ + Aku0 ^= Du0; \ + Bi = ROL32(Aku0, 19); \ + Ama0 ^= 
Da0; \ + Bo = ROL32(Ama0, 20); \ + Ase1 ^= De1; \ + Bu = ROL32(Ase1, 1); \ + Esa1 = Ba ^((~Be)& Bi ); \ + Ese1 = Be ^((~Bi)& Bo ); \ + Esi1 = Bi ^((~Bo)& Bu ); \ + Eso1 = Bo ^((~Bu)& Ba ); \ + Esu1 = Bu ^((~Ba)& Be ); \ +\ + Cx = Ebu0^Egu0^Eku0^Emu0^Esu0; \ + Du1 = Ebe1^Ege1^Eke1^Eme1^Ese1; \ + Da0 = Cx^ROL32(Du1, 1); \ + Cz = Ebu1^Egu1^Eku1^Emu1^Esu1; \ + Du0 = Ebe0^Ege0^Eke0^Eme0^Ese0; \ + Da1 = Cz^Du0; \ +\ + Cw = Ebi0^Egi0^Eki0^Emi0^Esi0; \ + Do0 = Cw^ROL32(Cz, 1); \ + Cy = Ebi1^Egi1^Eki1^Emi1^Esi1; \ + Do1 = Cy^Cx; \ +\ + Cx = Eba0^Ega0^Eka0^Ema0^Esa0; \ + De0 = Cx^ROL32(Cy, 1); \ + Cz = Eba1^Ega1^Eka1^Ema1^Esa1; \ + De1 = Cz^Cw; \ +\ + Cy = Ebo1^Ego1^Eko1^Emo1^Eso1; \ + Di0 = Du0^ROL32(Cy, 1); \ + Cw = Ebo0^Ego0^Eko0^Emo0^Eso0; \ + Di1 = Du1^Cw; \ +\ + Du0 = Cw^ROL32(Cz, 1); \ + Du1 = Cy^Cx; \ +\ + Eba0 ^= Da0; \ + Ba = Eba0; \ + Ege0 ^= De0; \ + Be = ROL32(Ege0, 22); \ + Eki1 ^= Di1; \ + Bi = ROL32(Eki1, 22); \ + Emo1 ^= Do1; \ + Bo = ROL32(Emo1, 11); \ + Esu0 ^= Du0; \ + Bu = ROL32(Esu0, 7); \ + Aba0 = Ba ^((~Be)& Bi ); \ + Aba0 ^= *(pRoundConstants++); \ + Abe0 = Be ^((~Bi)& Bo ); \ + Abi0 = Bi ^((~Bo)& Bu ); \ + Abo0 = Bo ^((~Bu)& Ba ); \ + Abu0 = Bu ^((~Ba)& Be ); \ +\ + Ebo0 ^= Do0; \ + Ba = ROL32(Ebo0, 14); \ + Egu0 ^= Du0; \ + Be = ROL32(Egu0, 10); \ + Eka1 ^= Da1; \ + Bi = ROL32(Eka1, 2); \ + Eme1 ^= De1; \ + Bo = ROL32(Eme1, 23); \ + Esi1 ^= Di1; \ + Bu = ROL32(Esi1, 31); \ + Aga0 = Ba ^((~Be)& Bi ); \ + Age0 = Be ^((~Bi)& Bo ); \ + Agi0 = Bi ^((~Bo)& Bu ); \ + Ago0 = Bo ^((~Bu)& Ba ); \ + Agu0 = Bu ^((~Ba)& Be ); \ +\ + Ebe1 ^= De1; \ + Ba = ROL32(Ebe1, 1); \ + Egi0 ^= Di0; \ + Be = ROL32(Egi0, 3); \ + Eko1 ^= Do1; \ + Bi = ROL32(Eko1, 13); \ + Emu0 ^= Du0; \ + Bo = ROL32(Emu0, 4); \ + Esa0 ^= Da0; \ + Bu = ROL32(Esa0, 9); \ + Aka0 = Ba ^((~Be)& Bi ); \ + Ake0 = Be ^((~Bi)& Bo ); \ + Aki0 = Bi ^((~Bo)& Bu ); \ + Ako0 = Bo ^((~Bu)& Ba ); \ + Aku0 = Bu ^((~Ba)& Be ); \ +\ + Ebu1 ^= Du1; \ + Ba = ROL32(Ebu1, 14); \ + Ega0 ^= Da0; \ + Be = ROL32(Ega0, 
18); \ + Eke0 ^= De0; \ + Bi = ROL32(Eke0, 5); \ + Emi1 ^= Di1; \ + Bo = ROL32(Emi1, 8); \ + Eso0 ^= Do0; \ + Bu = ROL32(Eso0, 28); \ + Ama0 = Ba ^((~Be)& Bi ); \ + Ame0 = Be ^((~Bi)& Bo ); \ + Ami0 = Bi ^((~Bo)& Bu ); \ + Amo0 = Bo ^((~Bu)& Ba ); \ + Amu0 = Bu ^((~Ba)& Be ); \ +\ + Ebi0 ^= Di0; \ + Ba = ROL32(Ebi0, 31); \ + Ego1 ^= Do1; \ + Be = ROL32(Ego1, 28); \ + Eku1 ^= Du1; \ + Bi = ROL32(Eku1, 20); \ + Ema1 ^= Da1; \ + Bo = ROL32(Ema1, 21); \ + Ese0 ^= De0; \ + Bu = ROL32(Ese0, 1); \ + Asa0 = Ba ^((~Be)& Bi ); \ + Ase0 = Be ^((~Bi)& Bo ); \ + Asi0 = Bi ^((~Bo)& Bu ); \ + Aso0 = Bo ^((~Bu)& Ba ); \ + Asu0 = Bu ^((~Ba)& Be ); \ +\ + Eba1 ^= Da1; \ + Ba = Eba1; \ + Ege1 ^= De1; \ + Be = ROL32(Ege1, 22); \ + Eki0 ^= Di0; \ + Bi = ROL32(Eki0, 21); \ + Emo0 ^= Do0; \ + Bo = ROL32(Emo0, 10); \ + Esu1 ^= Du1; \ + Bu = ROL32(Esu1, 7); \ + Aba1 = Ba ^((~Be)& Bi ); \ + Aba1 ^= *(pRoundConstants++); \ + Abe1 = Be ^((~Bi)& Bo ); \ + Abi1 = Bi ^((~Bo)& Bu ); \ + Abo1 = Bo ^((~Bu)& Ba ); \ + Abu1 = Bu ^((~Ba)& Be ); \ +\ + Ebo1 ^= Do1; \ + Ba = ROL32(Ebo1, 14); \ + Egu1 ^= Du1; \ + Be = ROL32(Egu1, 10); \ + Eka0 ^= Da0; \ + Bi = ROL32(Eka0, 1); \ + Eme0 ^= De0; \ + Bo = ROL32(Eme0, 22); \ + Esi0 ^= Di0; \ + Bu = ROL32(Esi0, 30); \ + Aga1 = Ba ^((~Be)& Bi ); \ + Age1 = Be ^((~Bi)& Bo ); \ + Agi1 = Bi ^((~Bo)& Bu ); \ + Ago1 = Bo ^((~Bu)& Ba ); \ + Agu1 = Bu ^((~Ba)& Be ); \ +\ + Ebe0 ^= De0; \ + Ba = Ebe0; \ + Egi1 ^= Di1; \ + Be = ROL32(Egi1, 3); \ + Eko0 ^= Do0; \ + Bi = ROL32(Eko0, 12); \ + Emu1 ^= Du1; \ + Bo = ROL32(Emu1, 4); \ + Esa1 ^= Da1; \ + Bu = ROL32(Esa1, 9); \ + Aka1 = Ba ^((~Be)& Bi ); \ + Ake1 = Be ^((~Bi)& Bo ); \ + Aki1 = Bi ^((~Bo)& Bu ); \ + Ako1 = Bo ^((~Bu)& Ba ); \ + Aku1 = Bu ^((~Ba)& Be ); \ +\ + Ebu0 ^= Du0; \ + Ba = ROL32(Ebu0, 13); \ + Ega1 ^= Da1; \ + Be = ROL32(Ega1, 18); \ + Eke1 ^= De1; \ + Bi = ROL32(Eke1, 5); \ + Emi0 ^= Di0; \ + Bo = ROL32(Emi0, 7); \ + Eso1 ^= Do1; \ + Bu = ROL32(Eso1, 28); \ + Ama1 = Ba ^((~Be)& Bi ); \ + Ame1 = Be 
^((~Bi)& Bo ); \ + Ami1 = Bi ^((~Bo)& Bu ); \ + Amo1 = Bo ^((~Bu)& Ba ); \ + Amu1 = Bu ^((~Ba)& Be ); \ +\ + Ebi1 ^= Di1; \ + Ba = ROL32(Ebi1, 31); \ + Ego0 ^= Do0; \ + Be = ROL32(Ego0, 27); \ + Eku0 ^= Du0; \ + Bi = ROL32(Eku0, 19); \ + Ema0 ^= Da0; \ + Bo = ROL32(Ema0, 20); \ + Ese1 ^= De1; \ + Bu = ROL32(Ese1, 1); \ + Asa1 = Ba ^((~Be)& Bi ); \ + Ase1 = Be ^((~Bi)& Bo ); \ + Asi1 = Bi ^((~Bo)& Bu ); \ + Aso1 = Bo ^((~Bu)& Ba ); \ + Asu1 = Bu ^((~Ba)& Be ); \ + } \ + copyToState(state, A) \ +} + +#define copyFromState(X, state) \ + X##ba0 = state[ 0]; \ + X##ba1 = state[ 1]; \ + X##be0 = state[ 2]; \ + X##be1 = state[ 3]; \ + X##bi0 = state[ 4]; \ + X##bi1 = state[ 5]; \ + X##bo0 = state[ 6]; \ + X##bo1 = state[ 7]; \ + X##bu0 = state[ 8]; \ + X##bu1 = state[ 9]; \ + X##ga0 = state[10]; \ + X##ga1 = state[11]; \ + X##ge0 = state[12]; \ + X##ge1 = state[13]; \ + X##gi0 = state[14]; \ + X##gi1 = state[15]; \ + X##go0 = state[16]; \ + X##go1 = state[17]; \ + X##gu0 = state[18]; \ + X##gu1 = state[19]; \ + X##ka0 = state[20]; \ + X##ka1 = state[21]; \ + X##ke0 = state[22]; \ + X##ke1 = state[23]; \ + X##ki0 = state[24]; \ + X##ki1 = state[25]; \ + X##ko0 = state[26]; \ + X##ko1 = state[27]; \ + X##ku0 = state[28]; \ + X##ku1 = state[29]; \ + X##ma0 = state[30]; \ + X##ma1 = state[31]; \ + X##me0 = state[32]; \ + X##me1 = state[33]; \ + X##mi0 = state[34]; \ + X##mi1 = state[35]; \ + X##mo0 = state[36]; \ + X##mo1 = state[37]; \ + X##mu0 = state[38]; \ + X##mu1 = state[39]; \ + X##sa0 = state[40]; \ + X##sa1 = state[41]; \ + X##se0 = state[42]; \ + X##se1 = state[43]; \ + X##si0 = state[44]; \ + X##si1 = state[45]; \ + X##so0 = state[46]; \ + X##so1 = state[47]; \ + X##su0 = state[48]; \ + X##su1 = state[49]; \ + +#define copyToState(state, X) \ + state[ 0] = X##ba0; \ + state[ 1] = X##ba1; \ + state[ 2] = X##be0; \ + state[ 3] = X##be1; \ + state[ 4] = X##bi0; \ + state[ 5] = X##bi1; \ + state[ 6] = X##bo0; \ + state[ 7] = X##bo1; \ + state[ 8] = X##bu0; \ + state[ 
9] = X##bu1; \ + state[10] = X##ga0; \ + state[11] = X##ga1; \ + state[12] = X##ge0; \ + state[13] = X##ge1; \ + state[14] = X##gi0; \ + state[15] = X##gi1; \ + state[16] = X##go0; \ + state[17] = X##go1; \ + state[18] = X##gu0; \ + state[19] = X##gu1; \ + state[20] = X##ka0; \ + state[21] = X##ka1; \ + state[22] = X##ke0; \ + state[23] = X##ke1; \ + state[24] = X##ki0; \ + state[25] = X##ki1; \ + state[26] = X##ko0; \ + state[27] = X##ko1; \ + state[28] = X##ku0; \ + state[29] = X##ku1; \ + state[30] = X##ma0; \ + state[31] = X##ma1; \ + state[32] = X##me0; \ + state[33] = X##me1; \ + state[34] = X##mi0; \ + state[35] = X##mi1; \ + state[36] = X##mo0; \ + state[37] = X##mo1; \ + state[38] = X##mu0; \ + state[39] = X##mu1; \ + state[40] = X##sa0; \ + state[41] = X##sa1; \ + state[42] = X##se0; \ + state[43] = X##se1; \ + state[44] = X##si0; \ + state[45] = X##si1; \ + state[46] = X##so0; \ + state[47] = X##so1; \ + state[48] = X##su0; \ + state[49] = X##su1; \ + diff --git a/c_src/KeccakF-1600-32-s1.macros b/c_src/KeccakF-1600-32-s1.macros new file mode 100755 index 0000000..973cc19 --- /dev/null +++ b/c_src/KeccakF-1600-32-s1.macros @@ -0,0 +1,1187 @@ +/* +Code automatically generated by KeccakTools! + +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by the designers, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#define declareABCDE \ + UINT32 Aba0, Abe0, Abi0, Abo0, Abu0; \ + UINT32 Aba1, Abe1, Abi1, Abo1, Abu1; \ + UINT32 Aga0, Age0, Agi0, Ago0, Agu0; \ + UINT32 Aga1, Age1, Agi1, Ago1, Agu1; \ + UINT32 Aka0, Ake0, Aki0, Ako0, Aku0; \ + UINT32 Aka1, Ake1, Aki1, Ako1, Aku1; \ + UINT32 Ama0, Ame0, Ami0, Amo0, Amu0; \ + UINT32 Ama1, Ame1, Ami1, Amo1, Amu1; \ + UINT32 Asa0, Ase0, Asi0, Aso0, Asu0; \ + UINT32 Asa1, Ase1, Asi1, Aso1, Asu1; \ + UINT32 Bba0, Bbe0, Bbi0, Bbo0, Bbu0; \ + UINT32 Bba1, Bbe1, Bbi1, Bbo1, Bbu1; \ + UINT32 Bga0, Bge0, Bgi0, Bgo0, Bgu0; \ + UINT32 Bga1, Bge1, Bgi1, Bgo1, Bgu1; \ + UINT32 Bka0, Bke0, Bki0, Bko0, Bku0; \ + UINT32 Bka1, Bke1, Bki1, Bko1, Bku1; \ + UINT32 Bma0, Bme0, Bmi0, Bmo0, Bmu0; \ + UINT32 Bma1, Bme1, Bmi1, Bmo1, Bmu1; \ + UINT32 Bsa0, Bse0, Bsi0, Bso0, Bsu0; \ + UINT32 Bsa1, Bse1, Bsi1, Bso1, Bsu1; \ + UINT32 Ca0, Ce0, Ci0, Co0, Cu0; \ + UINT32 Ca1, Ce1, Ci1, Co1, Cu1; \ + UINT32 Da0, De0, Di0, Do0, Du0; \ + UINT32 Da1, De1, Di1, Do1, Du1; \ + UINT32 Eba0, Ebe0, Ebi0, Ebo0, Ebu0; \ + UINT32 Eba1, Ebe1, Ebi1, Ebo1, Ebu1; \ + UINT32 Ega0, Ege0, Egi0, Ego0, Egu0; \ + UINT32 Ega1, Ege1, Egi1, Ego1, Egu1; \ + UINT32 Eka0, Eke0, Eki0, Eko0, Eku0; \ + UINT32 Eka1, Eke1, Eki1, Eko1, Eku1; \ + UINT32 Ema0, Eme0, Emi0, Emo0, Emu0; \ + UINT32 Ema1, Eme1, Emi1, Emo1, Emu1; \ + UINT32 Esa0, Ese0, Esi0, Eso0, Esu0; \ + UINT32 Esa1, Ese1, Esi1, Eso1, Esu1; \ + +#define prepareTheta \ + Ca0 = Aba0^Aga0^Aka0^Ama0^Asa0; \ + Ca1 = Aba1^Aga1^Aka1^Ama1^Asa1; \ + Ce0 = Abe0^Age0^Ake0^Ame0^Ase0; \ + Ce1 = Abe1^Age1^Ake1^Ame1^Ase1; \ + Ci0 = Abi0^Agi0^Aki0^Ami0^Asi0; \ + Ci1 = Abi1^Agi1^Aki1^Ami1^Asi1; \ + Co0 = Abo0^Ago0^Ako0^Amo0^Aso0; \ + Co1 = Abo1^Ago1^Ako1^Amo1^Aso1; \ + Cu0 = Abu0^Agu0^Aku0^Amu0^Asu0; \ + Cu1 = Abu1^Agu1^Aku1^Amu1^Asu1; \ + +#ifdef UseBebigokimisa +// --- Code for round, with prepare-theta (lane complementing pattern 'bebigokimisa') +// --- using factor 2 interleaving, 64-bit 
lanes mapped to 32-bit words +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + Da0 = Cu0^ROL32(Ce1, 1); \ + Da1 = Cu1^Ce0; \ + De0 = Ca0^ROL32(Ci1, 1); \ + De1 = Ca1^Ci0; \ + Di0 = Ce0^ROL32(Co1, 1); \ + Di1 = Ce1^Co0; \ + Do0 = Ci0^ROL32(Cu1, 1); \ + Do1 = Ci1^Cu0; \ + Du0 = Co0^ROL32(Ca1, 1); \ + Du1 = Co1^Ca0; \ +\ + A##ba0 ^= Da0; \ + Bba0 = A##ba0; \ + A##ge0 ^= De0; \ + Bbe0 = ROL32(A##ge0, 22); \ + A##ki1 ^= Di1; \ + Bbi0 = ROL32(A##ki1, 22); \ + A##mo1 ^= Do1; \ + Bbo0 = ROL32(A##mo1, 11); \ + A##su0 ^= Du0; \ + Bbu0 = ROL32(A##su0, 7); \ + E##ba0 = Bba0 ^( Bbe0 | Bbi0 ); \ + E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \ + Ca0 = E##ba0; \ + E##be0 = Bbe0 ^((~Bbi0)| Bbo0 ); \ + Ce0 = E##be0; \ + E##bi0 = Bbi0 ^( Bbo0 & Bbu0 ); \ + Ci0 = E##bi0; \ + E##bo0 = Bbo0 ^( Bbu0 | Bba0 ); \ + Co0 = E##bo0; \ + E##bu0 = Bbu0 ^( Bba0 & Bbe0 ); \ + Cu0 = E##bu0; \ +\ + A##ba1 ^= Da1; \ + Bba1 = A##ba1; \ + A##ge1 ^= De1; \ + Bbe1 = ROL32(A##ge1, 22); \ + A##ki0 ^= Di0; \ + Bbi1 = ROL32(A##ki0, 21); \ + A##mo0 ^= Do0; \ + Bbo1 = ROL32(A##mo0, 10); \ + A##su1 ^= Du1; \ + Bbu1 = ROL32(A##su1, 7); \ + E##ba1 = Bba1 ^( Bbe1 | Bbi1 ); \ + E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \ + Ca1 = E##ba1; \ + E##be1 = Bbe1 ^((~Bbi1)| Bbo1 ); \ + Ce1 = E##be1; \ + E##bi1 = Bbi1 ^( Bbo1 & Bbu1 ); \ + Ci1 = E##bi1; \ + E##bo1 = Bbo1 ^( Bbu1 | Bba1 ); \ + Co1 = E##bo1; \ + E##bu1 = Bbu1 ^( Bba1 & Bbe1 ); \ + Cu1 = E##bu1; \ +\ + A##bo0 ^= Do0; \ + Bga0 = ROL32(A##bo0, 14); \ + A##gu0 ^= Du0; \ + Bge0 = ROL32(A##gu0, 10); \ + A##ka1 ^= Da1; \ + Bgi0 = ROL32(A##ka1, 2); \ + A##me1 ^= De1; \ + Bgo0 = ROL32(A##me1, 23); \ + A##si1 ^= Di1; \ + Bgu0 = ROL32(A##si1, 31); \ + E##ga0 = Bga0 ^( Bge0 | Bgi0 ); \ + Ca0 ^= E##ga0; \ + E##ge0 = Bge0 ^( Bgi0 & Bgo0 ); \ + Ce0 ^= E##ge0; \ + E##gi0 = Bgi0 ^( Bgo0 |(~Bgu0)); \ + Ci0 ^= E##gi0; \ + E##go0 = Bgo0 ^( Bgu0 | Bga0 ); \ + Co0 ^= E##go0; \ + E##gu0 = Bgu0 ^( Bga0 & Bge0 ); \ + Cu0 ^= E##gu0; \ +\ + A##bo1 ^= Do1; \ + Bga1 = 
ROL32(A##bo1, 14); \ + A##gu1 ^= Du1; \ + Bge1 = ROL32(A##gu1, 10); \ + A##ka0 ^= Da0; \ + Bgi1 = ROL32(A##ka0, 1); \ + A##me0 ^= De0; \ + Bgo1 = ROL32(A##me0, 22); \ + A##si0 ^= Di0; \ + Bgu1 = ROL32(A##si0, 30); \ + E##ga1 = Bga1 ^( Bge1 | Bgi1 ); \ + Ca1 ^= E##ga1; \ + E##ge1 = Bge1 ^( Bgi1 & Bgo1 ); \ + Ce1 ^= E##ge1; \ + E##gi1 = Bgi1 ^( Bgo1 |(~Bgu1)); \ + Ci1 ^= E##gi1; \ + E##go1 = Bgo1 ^( Bgu1 | Bga1 ); \ + Co1 ^= E##go1; \ + E##gu1 = Bgu1 ^( Bga1 & Bge1 ); \ + Cu1 ^= E##gu1; \ +\ + A##be1 ^= De1; \ + Bka0 = ROL32(A##be1, 1); \ + A##gi0 ^= Di0; \ + Bke0 = ROL32(A##gi0, 3); \ + A##ko1 ^= Do1; \ + Bki0 = ROL32(A##ko1, 13); \ + A##mu0 ^= Du0; \ + Bko0 = ROL32(A##mu0, 4); \ + A##sa0 ^= Da0; \ + Bku0 = ROL32(A##sa0, 9); \ + E##ka0 = Bka0 ^( Bke0 | Bki0 ); \ + Ca0 ^= E##ka0; \ + E##ke0 = Bke0 ^( Bki0 & Bko0 ); \ + Ce0 ^= E##ke0; \ + E##ki0 = Bki0 ^((~Bko0)& Bku0 ); \ + Ci0 ^= E##ki0; \ + E##ko0 = (~Bko0)^( Bku0 | Bka0 ); \ + Co0 ^= E##ko0; \ + E##ku0 = Bku0 ^( Bka0 & Bke0 ); \ + Cu0 ^= E##ku0; \ +\ + A##be0 ^= De0; \ + Bka1 = A##be0; \ + A##gi1 ^= Di1; \ + Bke1 = ROL32(A##gi1, 3); \ + A##ko0 ^= Do0; \ + Bki1 = ROL32(A##ko0, 12); \ + A##mu1 ^= Du1; \ + Bko1 = ROL32(A##mu1, 4); \ + A##sa1 ^= Da1; \ + Bku1 = ROL32(A##sa1, 9); \ + E##ka1 = Bka1 ^( Bke1 | Bki1 ); \ + Ca1 ^= E##ka1; \ + E##ke1 = Bke1 ^( Bki1 & Bko1 ); \ + Ce1 ^= E##ke1; \ + E##ki1 = Bki1 ^((~Bko1)& Bku1 ); \ + Ci1 ^= E##ki1; \ + E##ko1 = (~Bko1)^( Bku1 | Bka1 ); \ + Co1 ^= E##ko1; \ + E##ku1 = Bku1 ^( Bka1 & Bke1 ); \ + Cu1 ^= E##ku1; \ +\ + A##bu1 ^= Du1; \ + Bma0 = ROL32(A##bu1, 14); \ + A##ga0 ^= Da0; \ + Bme0 = ROL32(A##ga0, 18); \ + A##ke0 ^= De0; \ + Bmi0 = ROL32(A##ke0, 5); \ + A##mi1 ^= Di1; \ + Bmo0 = ROL32(A##mi1, 8); \ + A##so0 ^= Do0; \ + Bmu0 = ROL32(A##so0, 28); \ + E##ma0 = Bma0 ^( Bme0 & Bmi0 ); \ + Ca0 ^= E##ma0; \ + E##me0 = Bme0 ^( Bmi0 | Bmo0 ); \ + Ce0 ^= E##me0; \ + E##mi0 = Bmi0 ^((~Bmo0)| Bmu0 ); \ + Ci0 ^= E##mi0; \ + E##mo0 = (~Bmo0)^( Bmu0 & Bma0 ); \ + Co0 ^= E##mo0; \ + 
E##mu0 = Bmu0 ^( Bma0 | Bme0 ); \ + Cu0 ^= E##mu0; \ +\ + A##bu0 ^= Du0; \ + Bma1 = ROL32(A##bu0, 13); \ + A##ga1 ^= Da1; \ + Bme1 = ROL32(A##ga1, 18); \ + A##ke1 ^= De1; \ + Bmi1 = ROL32(A##ke1, 5); \ + A##mi0 ^= Di0; \ + Bmo1 = ROL32(A##mi0, 7); \ + A##so1 ^= Do1; \ + Bmu1 = ROL32(A##so1, 28); \ + E##ma1 = Bma1 ^( Bme1 & Bmi1 ); \ + Ca1 ^= E##ma1; \ + E##me1 = Bme1 ^( Bmi1 | Bmo1 ); \ + Ce1 ^= E##me1; \ + E##mi1 = Bmi1 ^((~Bmo1)| Bmu1 ); \ + Ci1 ^= E##mi1; \ + E##mo1 = (~Bmo1)^( Bmu1 & Bma1 ); \ + Co1 ^= E##mo1; \ + E##mu1 = Bmu1 ^( Bma1 | Bme1 ); \ + Cu1 ^= E##mu1; \ +\ + A##bi0 ^= Di0; \ + Bsa0 = ROL32(A##bi0, 31); \ + A##go1 ^= Do1; \ + Bse0 = ROL32(A##go1, 28); \ + A##ku1 ^= Du1; \ + Bsi0 = ROL32(A##ku1, 20); \ + A##ma1 ^= Da1; \ + Bso0 = ROL32(A##ma1, 21); \ + A##se0 ^= De0; \ + Bsu0 = ROL32(A##se0, 1); \ + E##sa0 = Bsa0 ^((~Bse0)& Bsi0 ); \ + Ca0 ^= E##sa0; \ + E##se0 = (~Bse0)^( Bsi0 | Bso0 ); \ + Ce0 ^= E##se0; \ + E##si0 = Bsi0 ^( Bso0 & Bsu0 ); \ + Ci0 ^= E##si0; \ + E##so0 = Bso0 ^( Bsu0 | Bsa0 ); \ + Co0 ^= E##so0; \ + E##su0 = Bsu0 ^( Bsa0 & Bse0 ); \ + Cu0 ^= E##su0; \ +\ + A##bi1 ^= Di1; \ + Bsa1 = ROL32(A##bi1, 31); \ + A##go0 ^= Do0; \ + Bse1 = ROL32(A##go0, 27); \ + A##ku0 ^= Du0; \ + Bsi1 = ROL32(A##ku0, 19); \ + A##ma0 ^= Da0; \ + Bso1 = ROL32(A##ma0, 20); \ + A##se1 ^= De1; \ + Bsu1 = ROL32(A##se1, 1); \ + E##sa1 = Bsa1 ^((~Bse1)& Bsi1 ); \ + Ca1 ^= E##sa1; \ + E##se1 = (~Bse1)^( Bsi1 | Bso1 ); \ + Ce1 ^= E##se1; \ + E##si1 = Bsi1 ^( Bso1 & Bsu1 ); \ + Ci1 ^= E##si1; \ + E##so1 = Bso1 ^( Bsu1 | Bsa1 ); \ + Co1 ^= E##so1; \ + E##su1 = Bsu1 ^( Bsa1 & Bse1 ); \ + Cu1 ^= E##su1; \ +\ + +// --- Code for round (lane complementing pattern 'bebigokimisa') +// --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words +#define thetaRhoPiChiIota(i, A, E) \ + Da0 = Cu0^ROL32(Ce1, 1); \ + Da1 = Cu1^Ce0; \ + De0 = Ca0^ROL32(Ci1, 1); \ + De1 = Ca1^Ci0; \ + Di0 = Ce0^ROL32(Co1, 1); \ + Di1 = Ce1^Co0; \ + Do0 = Ci0^ROL32(Cu1, 1); \ + Do1 = Ci1^Cu0; 
\ + Du0 = Co0^ROL32(Ca1, 1); \ + Du1 = Co1^Ca0; \ +\ + A##ba0 ^= Da0; \ + Bba0 = A##ba0; \ + A##ge0 ^= De0; \ + Bbe0 = ROL32(A##ge0, 22); \ + A##ki1 ^= Di1; \ + Bbi0 = ROL32(A##ki1, 22); \ + A##mo1 ^= Do1; \ + Bbo0 = ROL32(A##mo1, 11); \ + A##su0 ^= Du0; \ + Bbu0 = ROL32(A##su0, 7); \ + E##ba0 = Bba0 ^( Bbe0 | Bbi0 ); \ + E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \ + E##be0 = Bbe0 ^((~Bbi0)| Bbo0 ); \ + E##bi0 = Bbi0 ^( Bbo0 & Bbu0 ); \ + E##bo0 = Bbo0 ^( Bbu0 | Bba0 ); \ + E##bu0 = Bbu0 ^( Bba0 & Bbe0 ); \ +\ + A##ba1 ^= Da1; \ + Bba1 = A##ba1; \ + A##ge1 ^= De1; \ + Bbe1 = ROL32(A##ge1, 22); \ + A##ki0 ^= Di0; \ + Bbi1 = ROL32(A##ki0, 21); \ + A##mo0 ^= Do0; \ + Bbo1 = ROL32(A##mo0, 10); \ + A##su1 ^= Du1; \ + Bbu1 = ROL32(A##su1, 7); \ + E##ba1 = Bba1 ^( Bbe1 | Bbi1 ); \ + E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \ + E##be1 = Bbe1 ^((~Bbi1)| Bbo1 ); \ + E##bi1 = Bbi1 ^( Bbo1 & Bbu1 ); \ + E##bo1 = Bbo1 ^( Bbu1 | Bba1 ); \ + E##bu1 = Bbu1 ^( Bba1 & Bbe1 ); \ +\ + A##bo0 ^= Do0; \ + Bga0 = ROL32(A##bo0, 14); \ + A##gu0 ^= Du0; \ + Bge0 = ROL32(A##gu0, 10); \ + A##ka1 ^= Da1; \ + Bgi0 = ROL32(A##ka1, 2); \ + A##me1 ^= De1; \ + Bgo0 = ROL32(A##me1, 23); \ + A##si1 ^= Di1; \ + Bgu0 = ROL32(A##si1, 31); \ + E##ga0 = Bga0 ^( Bge0 | Bgi0 ); \ + E##ge0 = Bge0 ^( Bgi0 & Bgo0 ); \ + E##gi0 = Bgi0 ^( Bgo0 |(~Bgu0)); \ + E##go0 = Bgo0 ^( Bgu0 | Bga0 ); \ + E##gu0 = Bgu0 ^( Bga0 & Bge0 ); \ +\ + A##bo1 ^= Do1; \ + Bga1 = ROL32(A##bo1, 14); \ + A##gu1 ^= Du1; \ + Bge1 = ROL32(A##gu1, 10); \ + A##ka0 ^= Da0; \ + Bgi1 = ROL32(A##ka0, 1); \ + A##me0 ^= De0; \ + Bgo1 = ROL32(A##me0, 22); \ + A##si0 ^= Di0; \ + Bgu1 = ROL32(A##si0, 30); \ + E##ga1 = Bga1 ^( Bge1 | Bgi1 ); \ + E##ge1 = Bge1 ^( Bgi1 & Bgo1 ); \ + E##gi1 = Bgi1 ^( Bgo1 |(~Bgu1)); \ + E##go1 = Bgo1 ^( Bgu1 | Bga1 ); \ + E##gu1 = Bgu1 ^( Bga1 & Bge1 ); \ +\ + A##be1 ^= De1; \ + Bka0 = ROL32(A##be1, 1); \ + A##gi0 ^= Di0; \ + Bke0 = ROL32(A##gi0, 3); \ + A##ko1 ^= Do1; \ + Bki0 = ROL32(A##ko1, 13); \ + A##mu0 
^= Du0; \ + Bko0 = ROL32(A##mu0, 4); \ + A##sa0 ^= Da0; \ + Bku0 = ROL32(A##sa0, 9); \ + E##ka0 = Bka0 ^( Bke0 | Bki0 ); \ + E##ke0 = Bke0 ^( Bki0 & Bko0 ); \ + E##ki0 = Bki0 ^((~Bko0)& Bku0 ); \ + E##ko0 = (~Bko0)^( Bku0 | Bka0 ); \ + E##ku0 = Bku0 ^( Bka0 & Bke0 ); \ +\ + A##be0 ^= De0; \ + Bka1 = A##be0; \ + A##gi1 ^= Di1; \ + Bke1 = ROL32(A##gi1, 3); \ + A##ko0 ^= Do0; \ + Bki1 = ROL32(A##ko0, 12); \ + A##mu1 ^= Du1; \ + Bko1 = ROL32(A##mu1, 4); \ + A##sa1 ^= Da1; \ + Bku1 = ROL32(A##sa1, 9); \ + E##ka1 = Bka1 ^( Bke1 | Bki1 ); \ + E##ke1 = Bke1 ^( Bki1 & Bko1 ); \ + E##ki1 = Bki1 ^((~Bko1)& Bku1 ); \ + E##ko1 = (~Bko1)^( Bku1 | Bka1 ); \ + E##ku1 = Bku1 ^( Bka1 & Bke1 ); \ +\ + A##bu1 ^= Du1; \ + Bma0 = ROL32(A##bu1, 14); \ + A##ga0 ^= Da0; \ + Bme0 = ROL32(A##ga0, 18); \ + A##ke0 ^= De0; \ + Bmi0 = ROL32(A##ke0, 5); \ + A##mi1 ^= Di1; \ + Bmo0 = ROL32(A##mi1, 8); \ + A##so0 ^= Do0; \ + Bmu0 = ROL32(A##so0, 28); \ + E##ma0 = Bma0 ^( Bme0 & Bmi0 ); \ + E##me0 = Bme0 ^( Bmi0 | Bmo0 ); \ + E##mi0 = Bmi0 ^((~Bmo0)| Bmu0 ); \ + E##mo0 = (~Bmo0)^( Bmu0 & Bma0 ); \ + E##mu0 = Bmu0 ^( Bma0 | Bme0 ); \ +\ + A##bu0 ^= Du0; \ + Bma1 = ROL32(A##bu0, 13); \ + A##ga1 ^= Da1; \ + Bme1 = ROL32(A##ga1, 18); \ + A##ke1 ^= De1; \ + Bmi1 = ROL32(A##ke1, 5); \ + A##mi0 ^= Di0; \ + Bmo1 = ROL32(A##mi0, 7); \ + A##so1 ^= Do1; \ + Bmu1 = ROL32(A##so1, 28); \ + E##ma1 = Bma1 ^( Bme1 & Bmi1 ); \ + E##me1 = Bme1 ^( Bmi1 | Bmo1 ); \ + E##mi1 = Bmi1 ^((~Bmo1)| Bmu1 ); \ + E##mo1 = (~Bmo1)^( Bmu1 & Bma1 ); \ + E##mu1 = Bmu1 ^( Bma1 | Bme1 ); \ +\ + A##bi0 ^= Di0; \ + Bsa0 = ROL32(A##bi0, 31); \ + A##go1 ^= Do1; \ + Bse0 = ROL32(A##go1, 28); \ + A##ku1 ^= Du1; \ + Bsi0 = ROL32(A##ku1, 20); \ + A##ma1 ^= Da1; \ + Bso0 = ROL32(A##ma1, 21); \ + A##se0 ^= De0; \ + Bsu0 = ROL32(A##se0, 1); \ + E##sa0 = Bsa0 ^((~Bse0)& Bsi0 ); \ + E##se0 = (~Bse0)^( Bsi0 | Bso0 ); \ + E##si0 = Bsi0 ^( Bso0 & Bsu0 ); \ + E##so0 = Bso0 ^( Bsu0 | Bsa0 ); \ + E##su0 = Bsu0 ^( Bsa0 & Bse0 ); \ +\ + A##bi1 ^= Di1; \ + 
Bsa1 = ROL32(A##bi1, 31); \ + A##go0 ^= Do0; \ + Bse1 = ROL32(A##go0, 27); \ + A##ku0 ^= Du0; \ + Bsi1 = ROL32(A##ku0, 19); \ + A##ma0 ^= Da0; \ + Bso1 = ROL32(A##ma0, 20); \ + A##se1 ^= De1; \ + Bsu1 = ROL32(A##se1, 1); \ + E##sa1 = Bsa1 ^((~Bse1)& Bsi1 ); \ + E##se1 = (~Bse1)^( Bsi1 | Bso1 ); \ + E##si1 = Bsi1 ^( Bso1 & Bsu1 ); \ + E##so1 = Bso1 ^( Bsu1 | Bsa1 ); \ + E##su1 = Bsu1 ^( Bsa1 & Bse1 ); \ +\ + +#else // UseBebigokimisa +// --- Code for round, with prepare-theta +// --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + Da0 = Cu0^ROL32(Ce1, 1); \ + Da1 = Cu1^Ce0; \ + De0 = Ca0^ROL32(Ci1, 1); \ + De1 = Ca1^Ci0; \ + Di0 = Ce0^ROL32(Co1, 1); \ + Di1 = Ce1^Co0; \ + Do0 = Ci0^ROL32(Cu1, 1); \ + Do1 = Ci1^Cu0; \ + Du0 = Co0^ROL32(Ca1, 1); \ + Du1 = Co1^Ca0; \ +\ + A##ba0 ^= Da0; \ + Bba0 = A##ba0; \ + A##ge0 ^= De0; \ + Bbe0 = ROL32(A##ge0, 22); \ + A##ki1 ^= Di1; \ + Bbi0 = ROL32(A##ki1, 22); \ + A##mo1 ^= Do1; \ + Bbo0 = ROL32(A##mo1, 11); \ + A##su0 ^= Du0; \ + Bbu0 = ROL32(A##su0, 7); \ + E##ba0 = Bba0 ^((~Bbe0)& Bbi0 ); \ + E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \ + Ca0 = E##ba0; \ + E##be0 = Bbe0 ^((~Bbi0)& Bbo0 ); \ + Ce0 = E##be0; \ + E##bi0 = Bbi0 ^((~Bbo0)& Bbu0 ); \ + Ci0 = E##bi0; \ + E##bo0 = Bbo0 ^((~Bbu0)& Bba0 ); \ + Co0 = E##bo0; \ + E##bu0 = Bbu0 ^((~Bba0)& Bbe0 ); \ + Cu0 = E##bu0; \ +\ + A##ba1 ^= Da1; \ + Bba1 = A##ba1; \ + A##ge1 ^= De1; \ + Bbe1 = ROL32(A##ge1, 22); \ + A##ki0 ^= Di0; \ + Bbi1 = ROL32(A##ki0, 21); \ + A##mo0 ^= Do0; \ + Bbo1 = ROL32(A##mo0, 10); \ + A##su1 ^= Du1; \ + Bbu1 = ROL32(A##su1, 7); \ + E##ba1 = Bba1 ^((~Bbe1)& Bbi1 ); \ + E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \ + Ca1 = E##ba1; \ + E##be1 = Bbe1 ^((~Bbi1)& Bbo1 ); \ + Ce1 = E##be1; \ + E##bi1 = Bbi1 ^((~Bbo1)& Bbu1 ); \ + Ci1 = E##bi1; \ + E##bo1 = Bbo1 ^((~Bbu1)& Bba1 ); \ + Co1 = E##bo1; \ + E##bu1 = Bbu1 ^((~Bba1)& Bbe1 ); \ + Cu1 = E##bu1; \ +\ + A##bo0 ^= Do0; \ + Bga0 = 
ROL32(A##bo0, 14); \ + A##gu0 ^= Du0; \ + Bge0 = ROL32(A##gu0, 10); \ + A##ka1 ^= Da1; \ + Bgi0 = ROL32(A##ka1, 2); \ + A##me1 ^= De1; \ + Bgo0 = ROL32(A##me1, 23); \ + A##si1 ^= Di1; \ + Bgu0 = ROL32(A##si1, 31); \ + E##ga0 = Bga0 ^((~Bge0)& Bgi0 ); \ + Ca0 ^= E##ga0; \ + E##ge0 = Bge0 ^((~Bgi0)& Bgo0 ); \ + Ce0 ^= E##ge0; \ + E##gi0 = Bgi0 ^((~Bgo0)& Bgu0 ); \ + Ci0 ^= E##gi0; \ + E##go0 = Bgo0 ^((~Bgu0)& Bga0 ); \ + Co0 ^= E##go0; \ + E##gu0 = Bgu0 ^((~Bga0)& Bge0 ); \ + Cu0 ^= E##gu0; \ +\ + A##bo1 ^= Do1; \ + Bga1 = ROL32(A##bo1, 14); \ + A##gu1 ^= Du1; \ + Bge1 = ROL32(A##gu1, 10); \ + A##ka0 ^= Da0; \ + Bgi1 = ROL32(A##ka0, 1); \ + A##me0 ^= De0; \ + Bgo1 = ROL32(A##me0, 22); \ + A##si0 ^= Di0; \ + Bgu1 = ROL32(A##si0, 30); \ + E##ga1 = Bga1 ^((~Bge1)& Bgi1 ); \ + Ca1 ^= E##ga1; \ + E##ge1 = Bge1 ^((~Bgi1)& Bgo1 ); \ + Ce1 ^= E##ge1; \ + E##gi1 = Bgi1 ^((~Bgo1)& Bgu1 ); \ + Ci1 ^= E##gi1; \ + E##go1 = Bgo1 ^((~Bgu1)& Bga1 ); \ + Co1 ^= E##go1; \ + E##gu1 = Bgu1 ^((~Bga1)& Bge1 ); \ + Cu1 ^= E##gu1; \ +\ + A##be1 ^= De1; \ + Bka0 = ROL32(A##be1, 1); \ + A##gi0 ^= Di0; \ + Bke0 = ROL32(A##gi0, 3); \ + A##ko1 ^= Do1; \ + Bki0 = ROL32(A##ko1, 13); \ + A##mu0 ^= Du0; \ + Bko0 = ROL32(A##mu0, 4); \ + A##sa0 ^= Da0; \ + Bku0 = ROL32(A##sa0, 9); \ + E##ka0 = Bka0 ^((~Bke0)& Bki0 ); \ + Ca0 ^= E##ka0; \ + E##ke0 = Bke0 ^((~Bki0)& Bko0 ); \ + Ce0 ^= E##ke0; \ + E##ki0 = Bki0 ^((~Bko0)& Bku0 ); \ + Ci0 ^= E##ki0; \ + E##ko0 = Bko0 ^((~Bku0)& Bka0 ); \ + Co0 ^= E##ko0; \ + E##ku0 = Bku0 ^((~Bka0)& Bke0 ); \ + Cu0 ^= E##ku0; \ +\ + A##be0 ^= De0; \ + Bka1 = A##be0; \ + A##gi1 ^= Di1; \ + Bke1 = ROL32(A##gi1, 3); \ + A##ko0 ^= Do0; \ + Bki1 = ROL32(A##ko0, 12); \ + A##mu1 ^= Du1; \ + Bko1 = ROL32(A##mu1, 4); \ + A##sa1 ^= Da1; \ + Bku1 = ROL32(A##sa1, 9); \ + E##ka1 = Bka1 ^((~Bke1)& Bki1 ); \ + Ca1 ^= E##ka1; \ + E##ke1 = Bke1 ^((~Bki1)& Bko1 ); \ + Ce1 ^= E##ke1; \ + E##ki1 = Bki1 ^((~Bko1)& Bku1 ); \ + Ci1 ^= E##ki1; \ + E##ko1 = Bko1 ^((~Bku1)& Bka1 ); \ + Co1 ^= 
E##ko1; \ + E##ku1 = Bku1 ^((~Bka1)& Bke1 ); \ + Cu1 ^= E##ku1; \ +\ + A##bu1 ^= Du1; \ + Bma0 = ROL32(A##bu1, 14); \ + A##ga0 ^= Da0; \ + Bme0 = ROL32(A##ga0, 18); \ + A##ke0 ^= De0; \ + Bmi0 = ROL32(A##ke0, 5); \ + A##mi1 ^= Di1; \ + Bmo0 = ROL32(A##mi1, 8); \ + A##so0 ^= Do0; \ + Bmu0 = ROL32(A##so0, 28); \ + E##ma0 = Bma0 ^((~Bme0)& Bmi0 ); \ + Ca0 ^= E##ma0; \ + E##me0 = Bme0 ^((~Bmi0)& Bmo0 ); \ + Ce0 ^= E##me0; \ + E##mi0 = Bmi0 ^((~Bmo0)& Bmu0 ); \ + Ci0 ^= E##mi0; \ + E##mo0 = Bmo0 ^((~Bmu0)& Bma0 ); \ + Co0 ^= E##mo0; \ + E##mu0 = Bmu0 ^((~Bma0)& Bme0 ); \ + Cu0 ^= E##mu0; \ +\ + A##bu0 ^= Du0; \ + Bma1 = ROL32(A##bu0, 13); \ + A##ga1 ^= Da1; \ + Bme1 = ROL32(A##ga1, 18); \ + A##ke1 ^= De1; \ + Bmi1 = ROL32(A##ke1, 5); \ + A##mi0 ^= Di0; \ + Bmo1 = ROL32(A##mi0, 7); \ + A##so1 ^= Do1; \ + Bmu1 = ROL32(A##so1, 28); \ + E##ma1 = Bma1 ^((~Bme1)& Bmi1 ); \ + Ca1 ^= E##ma1; \ + E##me1 = Bme1 ^((~Bmi1)& Bmo1 ); \ + Ce1 ^= E##me1; \ + E##mi1 = Bmi1 ^((~Bmo1)& Bmu1 ); \ + Ci1 ^= E##mi1; \ + E##mo1 = Bmo1 ^((~Bmu1)& Bma1 ); \ + Co1 ^= E##mo1; \ + E##mu1 = Bmu1 ^((~Bma1)& Bme1 ); \ + Cu1 ^= E##mu1; \ +\ + A##bi0 ^= Di0; \ + Bsa0 = ROL32(A##bi0, 31); \ + A##go1 ^= Do1; \ + Bse0 = ROL32(A##go1, 28); \ + A##ku1 ^= Du1; \ + Bsi0 = ROL32(A##ku1, 20); \ + A##ma1 ^= Da1; \ + Bso0 = ROL32(A##ma1, 21); \ + A##se0 ^= De0; \ + Bsu0 = ROL32(A##se0, 1); \ + E##sa0 = Bsa0 ^((~Bse0)& Bsi0 ); \ + Ca0 ^= E##sa0; \ + E##se0 = Bse0 ^((~Bsi0)& Bso0 ); \ + Ce0 ^= E##se0; \ + E##si0 = Bsi0 ^((~Bso0)& Bsu0 ); \ + Ci0 ^= E##si0; \ + E##so0 = Bso0 ^((~Bsu0)& Bsa0 ); \ + Co0 ^= E##so0; \ + E##su0 = Bsu0 ^((~Bsa0)& Bse0 ); \ + Cu0 ^= E##su0; \ +\ + A##bi1 ^= Di1; \ + Bsa1 = ROL32(A##bi1, 31); \ + A##go0 ^= Do0; \ + Bse1 = ROL32(A##go0, 27); \ + A##ku0 ^= Du0; \ + Bsi1 = ROL32(A##ku0, 19); \ + A##ma0 ^= Da0; \ + Bso1 = ROL32(A##ma0, 20); \ + A##se1 ^= De1; \ + Bsu1 = ROL32(A##se1, 1); \ + E##sa1 = Bsa1 ^((~Bse1)& Bsi1 ); \ + Ca1 ^= E##sa1; \ + E##se1 = Bse1 ^((~Bsi1)& Bso1 ); \ + Ce1 ^= 
E##se1; \ + E##si1 = Bsi1 ^((~Bso1)& Bsu1 ); \ + Ci1 ^= E##si1; \ + E##so1 = Bso1 ^((~Bsu1)& Bsa1 ); \ + Co1 ^= E##so1; \ + E##su1 = Bsu1 ^((~Bsa1)& Bse1 ); \ + Cu1 ^= E##su1; \ +\ + +// --- Code for round +// --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words +#define thetaRhoPiChiIota(i, A, E) \ + Da0 = Cu0^ROL32(Ce1, 1); \ + Da1 = Cu1^Ce0; \ + De0 = Ca0^ROL32(Ci1, 1); \ + De1 = Ca1^Ci0; \ + Di0 = Ce0^ROL32(Co1, 1); \ + Di1 = Ce1^Co0; \ + Do0 = Ci0^ROL32(Cu1, 1); \ + Do1 = Ci1^Cu0; \ + Du0 = Co0^ROL32(Ca1, 1); \ + Du1 = Co1^Ca0; \ +\ + A##ba0 ^= Da0; \ + Bba0 = A##ba0; \ + A##ge0 ^= De0; \ + Bbe0 = ROL32(A##ge0, 22); \ + A##ki1 ^= Di1; \ + Bbi0 = ROL32(A##ki1, 22); \ + A##mo1 ^= Do1; \ + Bbo0 = ROL32(A##mo1, 11); \ + A##su0 ^= Du0; \ + Bbu0 = ROL32(A##su0, 7); \ + E##ba0 = Bba0 ^((~Bbe0)& Bbi0 ); \ + E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \ + E##be0 = Bbe0 ^((~Bbi0)& Bbo0 ); \ + E##bi0 = Bbi0 ^((~Bbo0)& Bbu0 ); \ + E##bo0 = Bbo0 ^((~Bbu0)& Bba0 ); \ + E##bu0 = Bbu0 ^((~Bba0)& Bbe0 ); \ +\ + A##ba1 ^= Da1; \ + Bba1 = A##ba1; \ + A##ge1 ^= De1; \ + Bbe1 = ROL32(A##ge1, 22); \ + A##ki0 ^= Di0; \ + Bbi1 = ROL32(A##ki0, 21); \ + A##mo0 ^= Do0; \ + Bbo1 = ROL32(A##mo0, 10); \ + A##su1 ^= Du1; \ + Bbu1 = ROL32(A##su1, 7); \ + E##ba1 = Bba1 ^((~Bbe1)& Bbi1 ); \ + E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \ + E##be1 = Bbe1 ^((~Bbi1)& Bbo1 ); \ + E##bi1 = Bbi1 ^((~Bbo1)& Bbu1 ); \ + E##bo1 = Bbo1 ^((~Bbu1)& Bba1 ); \ + E##bu1 = Bbu1 ^((~Bba1)& Bbe1 ); \ +\ + A##bo0 ^= Do0; \ + Bga0 = ROL32(A##bo0, 14); \ + A##gu0 ^= Du0; \ + Bge0 = ROL32(A##gu0, 10); \ + A##ka1 ^= Da1; \ + Bgi0 = ROL32(A##ka1, 2); \ + A##me1 ^= De1; \ + Bgo0 = ROL32(A##me1, 23); \ + A##si1 ^= Di1; \ + Bgu0 = ROL32(A##si1, 31); \ + E##ga0 = Bga0 ^((~Bge0)& Bgi0 ); \ + E##ge0 = Bge0 ^((~Bgi0)& Bgo0 ); \ + E##gi0 = Bgi0 ^((~Bgo0)& Bgu0 ); \ + E##go0 = Bgo0 ^((~Bgu0)& Bga0 ); \ + E##gu0 = Bgu0 ^((~Bga0)& Bge0 ); \ +\ + A##bo1 ^= Do1; \ + Bga1 = ROL32(A##bo1, 14); \ + A##gu1 ^= 
Du1; \ + Bge1 = ROL32(A##gu1, 10); \ + A##ka0 ^= Da0; \ + Bgi1 = ROL32(A##ka0, 1); \ + A##me0 ^= De0; \ + Bgo1 = ROL32(A##me0, 22); \ + A##si0 ^= Di0; \ + Bgu1 = ROL32(A##si0, 30); \ + E##ga1 = Bga1 ^((~Bge1)& Bgi1 ); \ + E##ge1 = Bge1 ^((~Bgi1)& Bgo1 ); \ + E##gi1 = Bgi1 ^((~Bgo1)& Bgu1 ); \ + E##go1 = Bgo1 ^((~Bgu1)& Bga1 ); \ + E##gu1 = Bgu1 ^((~Bga1)& Bge1 ); \ +\ + A##be1 ^= De1; \ + Bka0 = ROL32(A##be1, 1); \ + A##gi0 ^= Di0; \ + Bke0 = ROL32(A##gi0, 3); \ + A##ko1 ^= Do1; \ + Bki0 = ROL32(A##ko1, 13); \ + A##mu0 ^= Du0; \ + Bko0 = ROL32(A##mu0, 4); \ + A##sa0 ^= Da0; \ + Bku0 = ROL32(A##sa0, 9); \ + E##ka0 = Bka0 ^((~Bke0)& Bki0 ); \ + E##ke0 = Bke0 ^((~Bki0)& Bko0 ); \ + E##ki0 = Bki0 ^((~Bko0)& Bku0 ); \ + E##ko0 = Bko0 ^((~Bku0)& Bka0 ); \ + E##ku0 = Bku0 ^((~Bka0)& Bke0 ); \ +\ + A##be0 ^= De0; \ + Bka1 = A##be0; \ + A##gi1 ^= Di1; \ + Bke1 = ROL32(A##gi1, 3); \ + A##ko0 ^= Do0; \ + Bki1 = ROL32(A##ko0, 12); \ + A##mu1 ^= Du1; \ + Bko1 = ROL32(A##mu1, 4); \ + A##sa1 ^= Da1; \ + Bku1 = ROL32(A##sa1, 9); \ + E##ka1 = Bka1 ^((~Bke1)& Bki1 ); \ + E##ke1 = Bke1 ^((~Bki1)& Bko1 ); \ + E##ki1 = Bki1 ^((~Bko1)& Bku1 ); \ + E##ko1 = Bko1 ^((~Bku1)& Bka1 ); \ + E##ku1 = Bku1 ^((~Bka1)& Bke1 ); \ +\ + A##bu1 ^= Du1; \ + Bma0 = ROL32(A##bu1, 14); \ + A##ga0 ^= Da0; \ + Bme0 = ROL32(A##ga0, 18); \ + A##ke0 ^= De0; \ + Bmi0 = ROL32(A##ke0, 5); \ + A##mi1 ^= Di1; \ + Bmo0 = ROL32(A##mi1, 8); \ + A##so0 ^= Do0; \ + Bmu0 = ROL32(A##so0, 28); \ + E##ma0 = Bma0 ^((~Bme0)& Bmi0 ); \ + E##me0 = Bme0 ^((~Bmi0)& Bmo0 ); \ + E##mi0 = Bmi0 ^((~Bmo0)& Bmu0 ); \ + E##mo0 = Bmo0 ^((~Bmu0)& Bma0 ); \ + E##mu0 = Bmu0 ^((~Bma0)& Bme0 ); \ +\ + A##bu0 ^= Du0; \ + Bma1 = ROL32(A##bu0, 13); \ + A##ga1 ^= Da1; \ + Bme1 = ROL32(A##ga1, 18); \ + A##ke1 ^= De1; \ + Bmi1 = ROL32(A##ke1, 5); \ + A##mi0 ^= Di0; \ + Bmo1 = ROL32(A##mi0, 7); \ + A##so1 ^= Do1; \ + Bmu1 = ROL32(A##so1, 28); \ + E##ma1 = Bma1 ^((~Bme1)& Bmi1 ); \ + E##me1 = Bme1 ^((~Bmi1)& Bmo1 ); \ + E##mi1 = Bmi1 ^((~Bmo1)& Bmu1 
); \ + E##mo1 = Bmo1 ^((~Bmu1)& Bma1 ); \ + E##mu1 = Bmu1 ^((~Bma1)& Bme1 ); \ +\ + A##bi0 ^= Di0; \ + Bsa0 = ROL32(A##bi0, 31); \ + A##go1 ^= Do1; \ + Bse0 = ROL32(A##go1, 28); \ + A##ku1 ^= Du1; \ + Bsi0 = ROL32(A##ku1, 20); \ + A##ma1 ^= Da1; \ + Bso0 = ROL32(A##ma1, 21); \ + A##se0 ^= De0; \ + Bsu0 = ROL32(A##se0, 1); \ + E##sa0 = Bsa0 ^((~Bse0)& Bsi0 ); \ + E##se0 = Bse0 ^((~Bsi0)& Bso0 ); \ + E##si0 = Bsi0 ^((~Bso0)& Bsu0 ); \ + E##so0 = Bso0 ^((~Bsu0)& Bsa0 ); \ + E##su0 = Bsu0 ^((~Bsa0)& Bse0 ); \ +\ + A##bi1 ^= Di1; \ + Bsa1 = ROL32(A##bi1, 31); \ + A##go0 ^= Do0; \ + Bse1 = ROL32(A##go0, 27); \ + A##ku0 ^= Du0; \ + Bsi1 = ROL32(A##ku0, 19); \ + A##ma0 ^= Da0; \ + Bso1 = ROL32(A##ma0, 20); \ + A##se1 ^= De1; \ + Bsu1 = ROL32(A##se1, 1); \ + E##sa1 = Bsa1 ^((~Bse1)& Bsi1 ); \ + E##se1 = Bse1 ^((~Bsi1)& Bso1 ); \ + E##si1 = Bsi1 ^((~Bso1)& Bsu1 ); \ + E##so1 = Bso1 ^((~Bsu1)& Bsa1 ); \ + E##su1 = Bsu1 ^((~Bsa1)& Bse1 ); \ +\ + +#endif // UseBebigokimisa + +const UINT32 KeccakF1600RoundConstants_int2_0[24] = { + 0x00000001UL, + 0x00000000UL, + 0x00000000UL, + 0x00000000UL, + 0x00000001UL, + 0x00000001UL, + 0x00000001UL, + 0x00000001UL, + 0x00000000UL, + 0x00000000UL, + 0x00000001UL, + 0x00000000UL, + 0x00000001UL, + 0x00000001UL, + 0x00000001UL, + 0x00000001UL, + 0x00000000UL, + 0x00000000UL, + 0x00000000UL, + 0x00000000UL, + 0x00000001UL, + 0x00000000UL, + 0x00000001UL, + 0x00000000UL }; + +const UINT32 KeccakF1600RoundConstants_int2_1[24] = { + 0x00000000UL, + 0x00000089UL, + 0x8000008bUL, + 0x80008080UL, + 0x0000008bUL, + 0x00008000UL, + 0x80008088UL, + 0x80000082UL, + 0x0000000bUL, + 0x0000000aUL, + 0x00008082UL, + 0x00008003UL, + 0x0000808bUL, + 0x8000000bUL, + 0x8000008aUL, + 0x80000081UL, + 0x80000081UL, + 0x80000008UL, + 0x00000083UL, + 0x80008003UL, + 0x80008088UL, + 0x80000088UL, + 0x00008000UL, + 0x80008082UL }; + +#define copyFromStateAndXor1024bits(X, state, input) \ + X##ba0 = state[ 0]^input[ 0]; \ + X##ba1 = state[ 1]^input[ 1]; \ + X##be0 = 
state[ 2]^input[ 2]; \ + X##be1 = state[ 3]^input[ 3]; \ + X##bi0 = state[ 4]^input[ 4]; \ + X##bi1 = state[ 5]^input[ 5]; \ + X##bo0 = state[ 6]^input[ 6]; \ + X##bo1 = state[ 7]^input[ 7]; \ + X##bu0 = state[ 8]^input[ 8]; \ + X##bu1 = state[ 9]^input[ 9]; \ + X##ga0 = state[10]^input[10]; \ + X##ga1 = state[11]^input[11]; \ + X##ge0 = state[12]^input[12]; \ + X##ge1 = state[13]^input[13]; \ + X##gi0 = state[14]^input[14]; \ + X##gi1 = state[15]^input[15]; \ + X##go0 = state[16]^input[16]; \ + X##go1 = state[17]^input[17]; \ + X##gu0 = state[18]^input[18]; \ + X##gu1 = state[19]^input[19]; \ + X##ka0 = state[20]^input[20]; \ + X##ka1 = state[21]^input[21]; \ + X##ke0 = state[22]^input[22]; \ + X##ke1 = state[23]^input[23]; \ + X##ki0 = state[24]^input[24]; \ + X##ki1 = state[25]^input[25]; \ + X##ko0 = state[26]^input[26]; \ + X##ko1 = state[27]^input[27]; \ + X##ku0 = state[28]^input[28]; \ + X##ku1 = state[29]^input[29]; \ + X##ma0 = state[30]^input[30]; \ + X##ma1 = state[31]^input[31]; \ + X##me0 = state[32]; \ + X##me1 = state[33]; \ + X##mi0 = state[34]; \ + X##mi1 = state[35]; \ + X##mo0 = state[36]; \ + X##mo1 = state[37]; \ + X##mu0 = state[38]; \ + X##mu1 = state[39]; \ + X##sa0 = state[40]; \ + X##sa1 = state[41]; \ + X##se0 = state[42]; \ + X##se1 = state[43]; \ + X##si0 = state[44]; \ + X##si1 = state[45]; \ + X##so0 = state[46]; \ + X##so1 = state[47]; \ + X##su0 = state[48]; \ + X##su1 = state[49]; \ + +#define copyFromStateAndXor1088bits(X, state, input) \ + X##ba0 = state[ 0]^input[ 0]; \ + X##ba1 = state[ 1]^input[ 1]; \ + X##be0 = state[ 2]^input[ 2]; \ + X##be1 = state[ 3]^input[ 3]; \ + X##bi0 = state[ 4]^input[ 4]; \ + X##bi1 = state[ 5]^input[ 5]; \ + X##bo0 = state[ 6]^input[ 6]; \ + X##bo1 = state[ 7]^input[ 7]; \ + X##bu0 = state[ 8]^input[ 8]; \ + X##bu1 = state[ 9]^input[ 9]; \ + X##ga0 = state[10]^input[10]; \ + X##ga1 = state[11]^input[11]; \ + X##ge0 = state[12]^input[12]; \ + X##ge1 = state[13]^input[13]; \ + X##gi0 = 
state[14]^input[14]; \ + X##gi1 = state[15]^input[15]; \ + X##go0 = state[16]^input[16]; \ + X##go1 = state[17]^input[17]; \ + X##gu0 = state[18]^input[18]; \ + X##gu1 = state[19]^input[19]; \ + X##ka0 = state[20]^input[20]; \ + X##ka1 = state[21]^input[21]; \ + X##ke0 = state[22]^input[22]; \ + X##ke1 = state[23]^input[23]; \ + X##ki0 = state[24]^input[24]; \ + X##ki1 = state[25]^input[25]; \ + X##ko0 = state[26]^input[26]; \ + X##ko1 = state[27]^input[27]; \ + X##ku0 = state[28]^input[28]; \ + X##ku1 = state[29]^input[29]; \ + X##ma0 = state[30]^input[30]; \ + X##ma1 = state[31]^input[31]; \ + X##me0 = state[32]^input[32]; \ + X##me1 = state[33]^input[33]; \ + X##mi0 = state[34]; \ + X##mi1 = state[35]; \ + X##mo0 = state[36]; \ + X##mo1 = state[37]; \ + X##mu0 = state[38]; \ + X##mu1 = state[39]; \ + X##sa0 = state[40]; \ + X##sa1 = state[41]; \ + X##se0 = state[42]; \ + X##se1 = state[43]; \ + X##si0 = state[44]; \ + X##si1 = state[45]; \ + X##so0 = state[46]; \ + X##so1 = state[47]; \ + X##su0 = state[48]; \ + X##su1 = state[49]; \ + +#define copyFromState(X, state) \ + X##ba0 = state[ 0]; \ + X##ba1 = state[ 1]; \ + X##be0 = state[ 2]; \ + X##be1 = state[ 3]; \ + X##bi0 = state[ 4]; \ + X##bi1 = state[ 5]; \ + X##bo0 = state[ 6]; \ + X##bo1 = state[ 7]; \ + X##bu0 = state[ 8]; \ + X##bu1 = state[ 9]; \ + X##ga0 = state[10]; \ + X##ga1 = state[11]; \ + X##ge0 = state[12]; \ + X##ge1 = state[13]; \ + X##gi0 = state[14]; \ + X##gi1 = state[15]; \ + X##go0 = state[16]; \ + X##go1 = state[17]; \ + X##gu0 = state[18]; \ + X##gu1 = state[19]; \ + X##ka0 = state[20]; \ + X##ka1 = state[21]; \ + X##ke0 = state[22]; \ + X##ke1 = state[23]; \ + X##ki0 = state[24]; \ + X##ki1 = state[25]; \ + X##ko0 = state[26]; \ + X##ko1 = state[27]; \ + X##ku0 = state[28]; \ + X##ku1 = state[29]; \ + X##ma0 = state[30]; \ + X##ma1 = state[31]; \ + X##me0 = state[32]; \ + X##me1 = state[33]; \ + X##mi0 = state[34]; \ + X##mi1 = state[35]; \ + X##mo0 = state[36]; \ + X##mo1 = state[37]; 
\ + X##mu0 = state[38]; \ + X##mu1 = state[39]; \ + X##sa0 = state[40]; \ + X##sa1 = state[41]; \ + X##se0 = state[42]; \ + X##se1 = state[43]; \ + X##si0 = state[44]; \ + X##si1 = state[45]; \ + X##so0 = state[46]; \ + X##so1 = state[47]; \ + X##su0 = state[48]; \ + X##su1 = state[49]; \ + +#define copyToState(state, X) \ + state[ 0] = X##ba0; \ + state[ 1] = X##ba1; \ + state[ 2] = X##be0; \ + state[ 3] = X##be1; \ + state[ 4] = X##bi0; \ + state[ 5] = X##bi1; \ + state[ 6] = X##bo0; \ + state[ 7] = X##bo1; \ + state[ 8] = X##bu0; \ + state[ 9] = X##bu1; \ + state[10] = X##ga0; \ + state[11] = X##ga1; \ + state[12] = X##ge0; \ + state[13] = X##ge1; \ + state[14] = X##gi0; \ + state[15] = X##gi1; \ + state[16] = X##go0; \ + state[17] = X##go1; \ + state[18] = X##gu0; \ + state[19] = X##gu1; \ + state[20] = X##ka0; \ + state[21] = X##ka1; \ + state[22] = X##ke0; \ + state[23] = X##ke1; \ + state[24] = X##ki0; \ + state[25] = X##ki1; \ + state[26] = X##ko0; \ + state[27] = X##ko1; \ + state[28] = X##ku0; \ + state[29] = X##ku1; \ + state[30] = X##ma0; \ + state[31] = X##ma1; \ + state[32] = X##me0; \ + state[33] = X##me1; \ + state[34] = X##mi0; \ + state[35] = X##mi1; \ + state[36] = X##mo0; \ + state[37] = X##mo1; \ + state[38] = X##mu0; \ + state[39] = X##mu1; \ + state[40] = X##sa0; \ + state[41] = X##sa1; \ + state[42] = X##se0; \ + state[43] = X##se1; \ + state[44] = X##si0; \ + state[45] = X##si1; \ + state[46] = X##so0; \ + state[47] = X##so1; \ + state[48] = X##su0; \ + state[49] = X##su1; \ + +#define copyStateVariables(X, Y) \ + X##ba0 = Y##ba0; \ + X##ba1 = Y##ba1; \ + X##be0 = Y##be0; \ + X##be1 = Y##be1; \ + X##bi0 = Y##bi0; \ + X##bi1 = Y##bi1; \ + X##bo0 = Y##bo0; \ + X##bo1 = Y##bo1; \ + X##bu0 = Y##bu0; \ + X##bu1 = Y##bu1; \ + X##ga0 = Y##ga0; \ + X##ga1 = Y##ga1; \ + X##ge0 = Y##ge0; \ + X##ge1 = Y##ge1; \ + X##gi0 = Y##gi0; \ + X##gi1 = Y##gi1; \ + X##go0 = Y##go0; \ + X##go1 = Y##go1; \ + X##gu0 = Y##gu0; \ + X##gu1 = Y##gu1; \ + X##ka0 = 
Y##ka0; \ + X##ka1 = Y##ka1; \ + X##ke0 = Y##ke0; \ + X##ke1 = Y##ke1; \ + X##ki0 = Y##ki0; \ + X##ki1 = Y##ki1; \ + X##ko0 = Y##ko0; \ + X##ko1 = Y##ko1; \ + X##ku0 = Y##ku0; \ + X##ku1 = Y##ku1; \ + X##ma0 = Y##ma0; \ + X##ma1 = Y##ma1; \ + X##me0 = Y##me0; \ + X##me1 = Y##me1; \ + X##mi0 = Y##mi0; \ + X##mi1 = Y##mi1; \ + X##mo0 = Y##mo0; \ + X##mo1 = Y##mo1; \ + X##mu0 = Y##mu0; \ + X##mu1 = Y##mu1; \ + X##sa0 = Y##sa0; \ + X##sa1 = Y##sa1; \ + X##se0 = Y##se0; \ + X##se1 = Y##se1; \ + X##si0 = Y##si0; \ + X##si1 = Y##si1; \ + X##so0 = Y##so0; \ + X##so1 = Y##so1; \ + X##su0 = Y##su0; \ + X##su1 = Y##su1; \ + diff --git a/c_src/KeccakF-1600-32-s2.macros b/c_src/KeccakF-1600-32-s2.macros new file mode 100755 index 0000000..3c27a34 --- /dev/null +++ b/c_src/KeccakF-1600-32-s2.macros @@ -0,0 +1,1187 @@ +/* +Code automatically generated by KeccakTools! + +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by the designers, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#define declareABCDE \ + UINT32 Aba0, Abe0, Abi0, Abo0, Abu0; \ + UINT32 Aba1, Abe1, Abi1, Abo1, Abu1; \ + UINT32 Aga0, Age0, Agi0, Ago0, Agu0; \ + UINT32 Aga1, Age1, Agi1, Ago1, Agu1; \ + UINT32 Aka0, Ake0, Aki0, Ako0, Aku0; \ + UINT32 Aka1, Ake1, Aki1, Ako1, Aku1; \ + UINT32 Ama0, Ame0, Ami0, Amo0, Amu0; \ + UINT32 Ama1, Ame1, Ami1, Amo1, Amu1; \ + UINT32 Asa0, Ase0, Asi0, Aso0, Asu0; \ + UINT32 Asa1, Ase1, Asi1, Aso1, Asu1; \ + UINT32 Bba0, Bbe0, Bbi0, Bbo0, Bbu0; \ + UINT32 Bba1, Bbe1, Bbi1, Bbo1, Bbu1; \ + UINT32 Bga0, Bge0, Bgi0, Bgo0, Bgu0; \ + UINT32 Bga1, Bge1, Bgi1, Bgo1, Bgu1; \ + UINT32 Bka0, Bke0, Bki0, Bko0, Bku0; \ + UINT32 Bka1, Bke1, Bki1, Bko1, Bku1; \ + UINT32 Bma0, Bme0, Bmi0, Bmo0, Bmu0; \ + UINT32 Bma1, Bme1, Bmi1, Bmo1, Bmu1; \ + UINT32 Bsa0, Bse0, Bsi0, Bso0, Bsu0; \ + UINT32 Bsa1, Bse1, Bsi1, Bso1, Bsu1; \ + UINT32 Ca0, Ce0, Ci0, Co0, Cu0; \ + UINT32 Ca1, Ce1, Ci1, Co1, Cu1; \ + UINT32 Da0, De0, Di0, Do0, Du0; \ + UINT32 Da1, De1, Di1, Do1, Du1; \ + UINT32 Eba0, Ebe0, Ebi0, Ebo0, Ebu0; \ + UINT32 Eba1, Ebe1, Ebi1, Ebo1, Ebu1; \ + UINT32 Ega0, Ege0, Egi0, Ego0, Egu0; \ + UINT32 Ega1, Ege1, Egi1, Ego1, Egu1; \ + UINT32 Eka0, Eke0, Eki0, Eko0, Eku0; \ + UINT32 Eka1, Eke1, Eki1, Eko1, Eku1; \ + UINT32 Ema0, Eme0, Emi0, Emo0, Emu0; \ + UINT32 Ema1, Eme1, Emi1, Emo1, Emu1; \ + UINT32 Esa0, Ese0, Esi0, Eso0, Esu0; \ + UINT32 Esa1, Ese1, Esi1, Eso1, Esu1; \ + +#define prepareTheta \ + Ca0 = Aba0^Aga0^Aka0^Ama0^Asa0; \ + Ca1 = Aba1^Aga1^Aka1^Ama1^Asa1; \ + Ce0 = Abe0^Age0^Ake0^Ame0^Ase0; \ + Ce1 = Abe1^Age1^Ake1^Ame1^Ase1; \ + Ci0 = Abi0^Agi0^Aki0^Ami0^Asi0; \ + Ci1 = Abi1^Agi1^Aki1^Ami1^Asi1; \ + Co0 = Abo0^Ago0^Ako0^Amo0^Aso0; \ + Co1 = Abo1^Ago1^Ako1^Amo1^Aso1; \ + Cu0 = Abu0^Agu0^Aku0^Amu0^Asu0; \ + Cu1 = Abu1^Agu1^Aku1^Amu1^Asu1; \ + +#ifdef UseBebigokimisa +// --- Code for round, with prepare-theta (lane complementing pattern 'bebigokimisa') +// --- using factor 2 interleaving, 64-bit 
lanes mapped to 32-bit words +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + Da0 = Cu0^ROL32(Ce1, 1); \ + Da1 = Cu1^Ce0; \ + De0 = Ca0^ROL32(Ci1, 1); \ + De1 = Ca1^Ci0; \ + Di0 = Ce0^ROL32(Co1, 1); \ + Di1 = Ce1^Co0; \ + Do0 = Ci0^ROL32(Cu1, 1); \ + Do1 = Ci1^Cu0; \ + Du0 = Co0^ROL32(Ca1, 1); \ + Du1 = Co1^Ca0; \ +\ + A##ba0 ^= Da0; \ + Bba0 = A##ba0; \ + A##ge0 ^= De0; \ + Bbe0 = ROL32(A##ge0, 22); \ + A##ki1 ^= Di1; \ + Bbi0 = ROL32(A##ki1, 22); \ + E##ba0 = Bba0 ^( Bbe0 | Bbi0 ); \ + E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \ + Ca0 = E##ba0; \ + A##mo1 ^= Do1; \ + Bbo0 = ROL32(A##mo1, 11); \ + E##be0 = Bbe0 ^((~Bbi0)| Bbo0 ); \ + Ce0 = E##be0; \ + A##su0 ^= Du0; \ + Bbu0 = ROL32(A##su0, 7); \ + E##bi0 = Bbi0 ^( Bbo0 & Bbu0 ); \ + Ci0 = E##bi0; \ + E##bo0 = Bbo0 ^( Bbu0 | Bba0 ); \ + Co0 = E##bo0; \ + E##bu0 = Bbu0 ^( Bba0 & Bbe0 ); \ + Cu0 = E##bu0; \ +\ + A##ba1 ^= Da1; \ + Bba1 = A##ba1; \ + A##ge1 ^= De1; \ + Bbe1 = ROL32(A##ge1, 22); \ + A##ki0 ^= Di0; \ + Bbi1 = ROL32(A##ki0, 21); \ + E##ba1 = Bba1 ^( Bbe1 | Bbi1 ); \ + E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \ + Ca1 = E##ba1; \ + A##mo0 ^= Do0; \ + Bbo1 = ROL32(A##mo0, 10); \ + E##be1 = Bbe1 ^((~Bbi1)| Bbo1 ); \ + Ce1 = E##be1; \ + A##su1 ^= Du1; \ + Bbu1 = ROL32(A##su1, 7); \ + E##bi1 = Bbi1 ^( Bbo1 & Bbu1 ); \ + Ci1 = E##bi1; \ + E##bo1 = Bbo1 ^( Bbu1 | Bba1 ); \ + Co1 = E##bo1; \ + E##bu1 = Bbu1 ^( Bba1 & Bbe1 ); \ + Cu1 = E##bu1; \ +\ + A##bo0 ^= Do0; \ + Bga0 = ROL32(A##bo0, 14); \ + A##gu0 ^= Du0; \ + Bge0 = ROL32(A##gu0, 10); \ + A##ka1 ^= Da1; \ + Bgi0 = ROL32(A##ka1, 2); \ + E##ga0 = Bga0 ^( Bge0 | Bgi0 ); \ + Ca0 ^= E##ga0; \ + A##me1 ^= De1; \ + Bgo0 = ROL32(A##me1, 23); \ + E##ge0 = Bge0 ^( Bgi0 & Bgo0 ); \ + Ce0 ^= E##ge0; \ + A##si1 ^= Di1; \ + Bgu0 = ROL32(A##si1, 31); \ + E##gi0 = Bgi0 ^( Bgo0 |(~Bgu0)); \ + Ci0 ^= E##gi0; \ + E##go0 = Bgo0 ^( Bgu0 | Bga0 ); \ + Co0 ^= E##go0; \ + E##gu0 = Bgu0 ^( Bga0 & Bge0 ); \ + Cu0 ^= E##gu0; \ +\ + A##bo1 ^= Do1; \ + Bga1 = 
ROL32(A##bo1, 14); \ + A##gu1 ^= Du1; \ + Bge1 = ROL32(A##gu1, 10); \ + A##ka0 ^= Da0; \ + Bgi1 = ROL32(A##ka0, 1); \ + E##ga1 = Bga1 ^( Bge1 | Bgi1 ); \ + Ca1 ^= E##ga1; \ + A##me0 ^= De0; \ + Bgo1 = ROL32(A##me0, 22); \ + E##ge1 = Bge1 ^( Bgi1 & Bgo1 ); \ + Ce1 ^= E##ge1; \ + A##si0 ^= Di0; \ + Bgu1 = ROL32(A##si0, 30); \ + E##gi1 = Bgi1 ^( Bgo1 |(~Bgu1)); \ + Ci1 ^= E##gi1; \ + E##go1 = Bgo1 ^( Bgu1 | Bga1 ); \ + Co1 ^= E##go1; \ + E##gu1 = Bgu1 ^( Bga1 & Bge1 ); \ + Cu1 ^= E##gu1; \ +\ + A##be1 ^= De1; \ + Bka0 = ROL32(A##be1, 1); \ + A##gi0 ^= Di0; \ + Bke0 = ROL32(A##gi0, 3); \ + A##ko1 ^= Do1; \ + Bki0 = ROL32(A##ko1, 13); \ + E##ka0 = Bka0 ^( Bke0 | Bki0 ); \ + Ca0 ^= E##ka0; \ + A##mu0 ^= Du0; \ + Bko0 = ROL32(A##mu0, 4); \ + E##ke0 = Bke0 ^( Bki0 & Bko0 ); \ + Ce0 ^= E##ke0; \ + A##sa0 ^= Da0; \ + Bku0 = ROL32(A##sa0, 9); \ + E##ki0 = Bki0 ^((~Bko0)& Bku0 ); \ + Ci0 ^= E##ki0; \ + E##ko0 = (~Bko0)^( Bku0 | Bka0 ); \ + Co0 ^= E##ko0; \ + E##ku0 = Bku0 ^( Bka0 & Bke0 ); \ + Cu0 ^= E##ku0; \ +\ + A##be0 ^= De0; \ + Bka1 = A##be0; \ + A##gi1 ^= Di1; \ + Bke1 = ROL32(A##gi1, 3); \ + A##ko0 ^= Do0; \ + Bki1 = ROL32(A##ko0, 12); \ + E##ka1 = Bka1 ^( Bke1 | Bki1 ); \ + Ca1 ^= E##ka1; \ + A##mu1 ^= Du1; \ + Bko1 = ROL32(A##mu1, 4); \ + E##ke1 = Bke1 ^( Bki1 & Bko1 ); \ + Ce1 ^= E##ke1; \ + A##sa1 ^= Da1; \ + Bku1 = ROL32(A##sa1, 9); \ + E##ki1 = Bki1 ^((~Bko1)& Bku1 ); \ + Ci1 ^= E##ki1; \ + E##ko1 = (~Bko1)^( Bku1 | Bka1 ); \ + Co1 ^= E##ko1; \ + E##ku1 = Bku1 ^( Bka1 & Bke1 ); \ + Cu1 ^= E##ku1; \ +\ + A##bu1 ^= Du1; \ + Bma0 = ROL32(A##bu1, 14); \ + A##ga0 ^= Da0; \ + Bme0 = ROL32(A##ga0, 18); \ + A##ke0 ^= De0; \ + Bmi0 = ROL32(A##ke0, 5); \ + E##ma0 = Bma0 ^( Bme0 & Bmi0 ); \ + Ca0 ^= E##ma0; \ + A##mi1 ^= Di1; \ + Bmo0 = ROL32(A##mi1, 8); \ + E##me0 = Bme0 ^( Bmi0 | Bmo0 ); \ + Ce0 ^= E##me0; \ + A##so0 ^= Do0; \ + Bmu0 = ROL32(A##so0, 28); \ + E##mi0 = Bmi0 ^((~Bmo0)| Bmu0 ); \ + Ci0 ^= E##mi0; \ + E##mo0 = (~Bmo0)^( Bmu0 & Bma0 ); \ + Co0 ^= E##mo0; \ + 
E##mu0 = Bmu0 ^( Bma0 | Bme0 ); \ + Cu0 ^= E##mu0; \ +\ + A##bu0 ^= Du0; \ + Bma1 = ROL32(A##bu0, 13); \ + A##ga1 ^= Da1; \ + Bme1 = ROL32(A##ga1, 18); \ + A##ke1 ^= De1; \ + Bmi1 = ROL32(A##ke1, 5); \ + E##ma1 = Bma1 ^( Bme1 & Bmi1 ); \ + Ca1 ^= E##ma1; \ + A##mi0 ^= Di0; \ + Bmo1 = ROL32(A##mi0, 7); \ + E##me1 = Bme1 ^( Bmi1 | Bmo1 ); \ + Ce1 ^= E##me1; \ + A##so1 ^= Do1; \ + Bmu1 = ROL32(A##so1, 28); \ + E##mi1 = Bmi1 ^((~Bmo1)| Bmu1 ); \ + Ci1 ^= E##mi1; \ + E##mo1 = (~Bmo1)^( Bmu1 & Bma1 ); \ + Co1 ^= E##mo1; \ + E##mu1 = Bmu1 ^( Bma1 | Bme1 ); \ + Cu1 ^= E##mu1; \ +\ + A##bi0 ^= Di0; \ + Bsa0 = ROL32(A##bi0, 31); \ + A##go1 ^= Do1; \ + Bse0 = ROL32(A##go1, 28); \ + A##ku1 ^= Du1; \ + Bsi0 = ROL32(A##ku1, 20); \ + E##sa0 = Bsa0 ^((~Bse0)& Bsi0 ); \ + Ca0 ^= E##sa0; \ + A##ma1 ^= Da1; \ + Bso0 = ROL32(A##ma1, 21); \ + E##se0 = (~Bse0)^( Bsi0 | Bso0 ); \ + Ce0 ^= E##se0; \ + A##se0 ^= De0; \ + Bsu0 = ROL32(A##se0, 1); \ + E##si0 = Bsi0 ^( Bso0 & Bsu0 ); \ + Ci0 ^= E##si0; \ + E##so0 = Bso0 ^( Bsu0 | Bsa0 ); \ + Co0 ^= E##so0; \ + E##su0 = Bsu0 ^( Bsa0 & Bse0 ); \ + Cu0 ^= E##su0; \ +\ + A##bi1 ^= Di1; \ + Bsa1 = ROL32(A##bi1, 31); \ + A##go0 ^= Do0; \ + Bse1 = ROL32(A##go0, 27); \ + A##ku0 ^= Du0; \ + Bsi1 = ROL32(A##ku0, 19); \ + E##sa1 = Bsa1 ^((~Bse1)& Bsi1 ); \ + Ca1 ^= E##sa1; \ + A##ma0 ^= Da0; \ + Bso1 = ROL32(A##ma0, 20); \ + E##se1 = (~Bse1)^( Bsi1 | Bso1 ); \ + Ce1 ^= E##se1; \ + A##se1 ^= De1; \ + Bsu1 = ROL32(A##se1, 1); \ + E##si1 = Bsi1 ^( Bso1 & Bsu1 ); \ + Ci1 ^= E##si1; \ + E##so1 = Bso1 ^( Bsu1 | Bsa1 ); \ + Co1 ^= E##so1; \ + E##su1 = Bsu1 ^( Bsa1 & Bse1 ); \ + Cu1 ^= E##su1; \ +\ + +// --- Code for round (lane complementing pattern 'bebigokimisa') +// --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words +#define thetaRhoPiChiIota(i, A, E) \ + Da0 = Cu0^ROL32(Ce1, 1); \ + Da1 = Cu1^Ce0; \ + De0 = Ca0^ROL32(Ci1, 1); \ + De1 = Ca1^Ci0; \ + Di0 = Ce0^ROL32(Co1, 1); \ + Di1 = Ce1^Co0; \ + Do0 = Ci0^ROL32(Cu1, 1); \ + Do1 = Ci1^Cu0; 
\ + Du0 = Co0^ROL32(Ca1, 1); \ + Du1 = Co1^Ca0; \ +\ + A##ba0 ^= Da0; \ + Bba0 = A##ba0; \ + A##ge0 ^= De0; \ + Bbe0 = ROL32(A##ge0, 22); \ + A##ki1 ^= Di1; \ + Bbi0 = ROL32(A##ki1, 22); \ + E##ba0 = Bba0 ^( Bbe0 | Bbi0 ); \ + E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \ + A##mo1 ^= Do1; \ + Bbo0 = ROL32(A##mo1, 11); \ + E##be0 = Bbe0 ^((~Bbi0)| Bbo0 ); \ + A##su0 ^= Du0; \ + Bbu0 = ROL32(A##su0, 7); \ + E##bi0 = Bbi0 ^( Bbo0 & Bbu0 ); \ + E##bo0 = Bbo0 ^( Bbu0 | Bba0 ); \ + E##bu0 = Bbu0 ^( Bba0 & Bbe0 ); \ +\ + A##ba1 ^= Da1; \ + Bba1 = A##ba1; \ + A##ge1 ^= De1; \ + Bbe1 = ROL32(A##ge1, 22); \ + A##ki0 ^= Di0; \ + Bbi1 = ROL32(A##ki0, 21); \ + E##ba1 = Bba1 ^( Bbe1 | Bbi1 ); \ + E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \ + A##mo0 ^= Do0; \ + Bbo1 = ROL32(A##mo0, 10); \ + E##be1 = Bbe1 ^((~Bbi1)| Bbo1 ); \ + A##su1 ^= Du1; \ + Bbu1 = ROL32(A##su1, 7); \ + E##bi1 = Bbi1 ^( Bbo1 & Bbu1 ); \ + E##bo1 = Bbo1 ^( Bbu1 | Bba1 ); \ + E##bu1 = Bbu1 ^( Bba1 & Bbe1 ); \ +\ + A##bo0 ^= Do0; \ + Bga0 = ROL32(A##bo0, 14); \ + A##gu0 ^= Du0; \ + Bge0 = ROL32(A##gu0, 10); \ + A##ka1 ^= Da1; \ + Bgi0 = ROL32(A##ka1, 2); \ + E##ga0 = Bga0 ^( Bge0 | Bgi0 ); \ + A##me1 ^= De1; \ + Bgo0 = ROL32(A##me1, 23); \ + E##ge0 = Bge0 ^( Bgi0 & Bgo0 ); \ + A##si1 ^= Di1; \ + Bgu0 = ROL32(A##si1, 31); \ + E##gi0 = Bgi0 ^( Bgo0 |(~Bgu0)); \ + E##go0 = Bgo0 ^( Bgu0 | Bga0 ); \ + E##gu0 = Bgu0 ^( Bga0 & Bge0 ); \ +\ + A##bo1 ^= Do1; \ + Bga1 = ROL32(A##bo1, 14); \ + A##gu1 ^= Du1; \ + Bge1 = ROL32(A##gu1, 10); \ + A##ka0 ^= Da0; \ + Bgi1 = ROL32(A##ka0, 1); \ + E##ga1 = Bga1 ^( Bge1 | Bgi1 ); \ + A##me0 ^= De0; \ + Bgo1 = ROL32(A##me0, 22); \ + E##ge1 = Bge1 ^( Bgi1 & Bgo1 ); \ + A##si0 ^= Di0; \ + Bgu1 = ROL32(A##si0, 30); \ + E##gi1 = Bgi1 ^( Bgo1 |(~Bgu1)); \ + E##go1 = Bgo1 ^( Bgu1 | Bga1 ); \ + E##gu1 = Bgu1 ^( Bga1 & Bge1 ); \ +\ + A##be1 ^= De1; \ + Bka0 = ROL32(A##be1, 1); \ + A##gi0 ^= Di0; \ + Bke0 = ROL32(A##gi0, 3); \ + A##ko1 ^= Do1; \ + Bki0 = ROL32(A##ko1, 13); \ + E##ka0 
= Bka0 ^( Bke0 | Bki0 ); \ + A##mu0 ^= Du0; \ + Bko0 = ROL32(A##mu0, 4); \ + E##ke0 = Bke0 ^( Bki0 & Bko0 ); \ + A##sa0 ^= Da0; \ + Bku0 = ROL32(A##sa0, 9); \ + E##ki0 = Bki0 ^((~Bko0)& Bku0 ); \ + E##ko0 = (~Bko0)^( Bku0 | Bka0 ); \ + E##ku0 = Bku0 ^( Bka0 & Bke0 ); \ +\ + A##be0 ^= De0; \ + Bka1 = A##be0; \ + A##gi1 ^= Di1; \ + Bke1 = ROL32(A##gi1, 3); \ + A##ko0 ^= Do0; \ + Bki1 = ROL32(A##ko0, 12); \ + E##ka1 = Bka1 ^( Bke1 | Bki1 ); \ + A##mu1 ^= Du1; \ + Bko1 = ROL32(A##mu1, 4); \ + E##ke1 = Bke1 ^( Bki1 & Bko1 ); \ + A##sa1 ^= Da1; \ + Bku1 = ROL32(A##sa1, 9); \ + E##ki1 = Bki1 ^((~Bko1)& Bku1 ); \ + E##ko1 = (~Bko1)^( Bku1 | Bka1 ); \ + E##ku1 = Bku1 ^( Bka1 & Bke1 ); \ +\ + A##bu1 ^= Du1; \ + Bma0 = ROL32(A##bu1, 14); \ + A##ga0 ^= Da0; \ + Bme0 = ROL32(A##ga0, 18); \ + A##ke0 ^= De0; \ + Bmi0 = ROL32(A##ke0, 5); \ + E##ma0 = Bma0 ^( Bme0 & Bmi0 ); \ + A##mi1 ^= Di1; \ + Bmo0 = ROL32(A##mi1, 8); \ + E##me0 = Bme0 ^( Bmi0 | Bmo0 ); \ + A##so0 ^= Do0; \ + Bmu0 = ROL32(A##so0, 28); \ + E##mi0 = Bmi0 ^((~Bmo0)| Bmu0 ); \ + E##mo0 = (~Bmo0)^( Bmu0 & Bma0 ); \ + E##mu0 = Bmu0 ^( Bma0 | Bme0 ); \ +\ + A##bu0 ^= Du0; \ + Bma1 = ROL32(A##bu0, 13); \ + A##ga1 ^= Da1; \ + Bme1 = ROL32(A##ga1, 18); \ + A##ke1 ^= De1; \ + Bmi1 = ROL32(A##ke1, 5); \ + E##ma1 = Bma1 ^( Bme1 & Bmi1 ); \ + A##mi0 ^= Di0; \ + Bmo1 = ROL32(A##mi0, 7); \ + E##me1 = Bme1 ^( Bmi1 | Bmo1 ); \ + A##so1 ^= Do1; \ + Bmu1 = ROL32(A##so1, 28); \ + E##mi1 = Bmi1 ^((~Bmo1)| Bmu1 ); \ + E##mo1 = (~Bmo1)^( Bmu1 & Bma1 ); \ + E##mu1 = Bmu1 ^( Bma1 | Bme1 ); \ +\ + A##bi0 ^= Di0; \ + Bsa0 = ROL32(A##bi0, 31); \ + A##go1 ^= Do1; \ + Bse0 = ROL32(A##go1, 28); \ + A##ku1 ^= Du1; \ + Bsi0 = ROL32(A##ku1, 20); \ + E##sa0 = Bsa0 ^((~Bse0)& Bsi0 ); \ + A##ma1 ^= Da1; \ + Bso0 = ROL32(A##ma1, 21); \ + E##se0 = (~Bse0)^( Bsi0 | Bso0 ); \ + A##se0 ^= De0; \ + Bsu0 = ROL32(A##se0, 1); \ + E##si0 = Bsi0 ^( Bso0 & Bsu0 ); \ + E##so0 = Bso0 ^( Bsu0 | Bsa0 ); \ + E##su0 = Bsu0 ^( Bsa0 & Bse0 ); \ +\ + A##bi1 ^= Di1; \ + 
Bsa1 = ROL32(A##bi1, 31); \ + A##go0 ^= Do0; \ + Bse1 = ROL32(A##go0, 27); \ + A##ku0 ^= Du0; \ + Bsi1 = ROL32(A##ku0, 19); \ + E##sa1 = Bsa1 ^((~Bse1)& Bsi1 ); \ + A##ma0 ^= Da0; \ + Bso1 = ROL32(A##ma0, 20); \ + E##se1 = (~Bse1)^( Bsi1 | Bso1 ); \ + A##se1 ^= De1; \ + Bsu1 = ROL32(A##se1, 1); \ + E##si1 = Bsi1 ^( Bso1 & Bsu1 ); \ + E##so1 = Bso1 ^( Bsu1 | Bsa1 ); \ + E##su1 = Bsu1 ^( Bsa1 & Bse1 ); \ +\ + +#else // UseBebigokimisa +// --- Code for round, with prepare-theta +// --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + Da0 = Cu0^ROL32(Ce1, 1); \ + Da1 = Cu1^Ce0; \ + De0 = Ca0^ROL32(Ci1, 1); \ + De1 = Ca1^Ci0; \ + Di0 = Ce0^ROL32(Co1, 1); \ + Di1 = Ce1^Co0; \ + Do0 = Ci0^ROL32(Cu1, 1); \ + Do1 = Ci1^Cu0; \ + Du0 = Co0^ROL32(Ca1, 1); \ + Du1 = Co1^Ca0; \ +\ + A##ba0 ^= Da0; \ + Bba0 = A##ba0; \ + A##ge0 ^= De0; \ + Bbe0 = ROL32(A##ge0, 22); \ + A##ki1 ^= Di1; \ + Bbi0 = ROL32(A##ki1, 22); \ + E##ba0 = Bba0 ^((~Bbe0)& Bbi0 ); \ + E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \ + Ca0 = E##ba0; \ + A##mo1 ^= Do1; \ + Bbo0 = ROL32(A##mo1, 11); \ + E##be0 = Bbe0 ^((~Bbi0)& Bbo0 ); \ + Ce0 = E##be0; \ + A##su0 ^= Du0; \ + Bbu0 = ROL32(A##su0, 7); \ + E##bi0 = Bbi0 ^((~Bbo0)& Bbu0 ); \ + Ci0 = E##bi0; \ + E##bo0 = Bbo0 ^((~Bbu0)& Bba0 ); \ + Co0 = E##bo0; \ + E##bu0 = Bbu0 ^((~Bba0)& Bbe0 ); \ + Cu0 = E##bu0; \ +\ + A##ba1 ^= Da1; \ + Bba1 = A##ba1; \ + A##ge1 ^= De1; \ + Bbe1 = ROL32(A##ge1, 22); \ + A##ki0 ^= Di0; \ + Bbi1 = ROL32(A##ki0, 21); \ + E##ba1 = Bba1 ^((~Bbe1)& Bbi1 ); \ + E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \ + Ca1 = E##ba1; \ + A##mo0 ^= Do0; \ + Bbo1 = ROL32(A##mo0, 10); \ + E##be1 = Bbe1 ^((~Bbi1)& Bbo1 ); \ + Ce1 = E##be1; \ + A##su1 ^= Du1; \ + Bbu1 = ROL32(A##su1, 7); \ + E##bi1 = Bbi1 ^((~Bbo1)& Bbu1 ); \ + Ci1 = E##bi1; \ + E##bo1 = Bbo1 ^((~Bbu1)& Bba1 ); \ + Co1 = E##bo1; \ + E##bu1 = Bbu1 ^((~Bba1)& Bbe1 ); \ + Cu1 = E##bu1; \ +\ + A##bo0 ^= Do0; \ + Bga0 = 
ROL32(A##bo0, 14); \ + A##gu0 ^= Du0; \ + Bge0 = ROL32(A##gu0, 10); \ + A##ka1 ^= Da1; \ + Bgi0 = ROL32(A##ka1, 2); \ + E##ga0 = Bga0 ^((~Bge0)& Bgi0 ); \ + Ca0 ^= E##ga0; \ + A##me1 ^= De1; \ + Bgo0 = ROL32(A##me1, 23); \ + E##ge0 = Bge0 ^((~Bgi0)& Bgo0 ); \ + Ce0 ^= E##ge0; \ + A##si1 ^= Di1; \ + Bgu0 = ROL32(A##si1, 31); \ + E##gi0 = Bgi0 ^((~Bgo0)& Bgu0 ); \ + Ci0 ^= E##gi0; \ + E##go0 = Bgo0 ^((~Bgu0)& Bga0 ); \ + Co0 ^= E##go0; \ + E##gu0 = Bgu0 ^((~Bga0)& Bge0 ); \ + Cu0 ^= E##gu0; \ +\ + A##bo1 ^= Do1; \ + Bga1 = ROL32(A##bo1, 14); \ + A##gu1 ^= Du1; \ + Bge1 = ROL32(A##gu1, 10); \ + A##ka0 ^= Da0; \ + Bgi1 = ROL32(A##ka0, 1); \ + E##ga1 = Bga1 ^((~Bge1)& Bgi1 ); \ + Ca1 ^= E##ga1; \ + A##me0 ^= De0; \ + Bgo1 = ROL32(A##me0, 22); \ + E##ge1 = Bge1 ^((~Bgi1)& Bgo1 ); \ + Ce1 ^= E##ge1; \ + A##si0 ^= Di0; \ + Bgu1 = ROL32(A##si0, 30); \ + E##gi1 = Bgi1 ^((~Bgo1)& Bgu1 ); \ + Ci1 ^= E##gi1; \ + E##go1 = Bgo1 ^((~Bgu1)& Bga1 ); \ + Co1 ^= E##go1; \ + E##gu1 = Bgu1 ^((~Bga1)& Bge1 ); \ + Cu1 ^= E##gu1; \ +\ + A##be1 ^= De1; \ + Bka0 = ROL32(A##be1, 1); \ + A##gi0 ^= Di0; \ + Bke0 = ROL32(A##gi0, 3); \ + A##ko1 ^= Do1; \ + Bki0 = ROL32(A##ko1, 13); \ + E##ka0 = Bka0 ^((~Bke0)& Bki0 ); \ + Ca0 ^= E##ka0; \ + A##mu0 ^= Du0; \ + Bko0 = ROL32(A##mu0, 4); \ + E##ke0 = Bke0 ^((~Bki0)& Bko0 ); \ + Ce0 ^= E##ke0; \ + A##sa0 ^= Da0; \ + Bku0 = ROL32(A##sa0, 9); \ + E##ki0 = Bki0 ^((~Bko0)& Bku0 ); \ + Ci0 ^= E##ki0; \ + E##ko0 = Bko0 ^((~Bku0)& Bka0 ); \ + Co0 ^= E##ko0; \ + E##ku0 = Bku0 ^((~Bka0)& Bke0 ); \ + Cu0 ^= E##ku0; \ +\ + A##be0 ^= De0; \ + Bka1 = A##be0; \ + A##gi1 ^= Di1; \ + Bke1 = ROL32(A##gi1, 3); \ + A##ko0 ^= Do0; \ + Bki1 = ROL32(A##ko0, 12); \ + E##ka1 = Bka1 ^((~Bke1)& Bki1 ); \ + Ca1 ^= E##ka1; \ + A##mu1 ^= Du1; \ + Bko1 = ROL32(A##mu1, 4); \ + E##ke1 = Bke1 ^((~Bki1)& Bko1 ); \ + Ce1 ^= E##ke1; \ + A##sa1 ^= Da1; \ + Bku1 = ROL32(A##sa1, 9); \ + E##ki1 = Bki1 ^((~Bko1)& Bku1 ); \ + Ci1 ^= E##ki1; \ + E##ko1 = Bko1 ^((~Bku1)& Bka1 ); \ + Co1 ^= 
E##ko1; \ + E##ku1 = Bku1 ^((~Bka1)& Bke1 ); \ + Cu1 ^= E##ku1; \ +\ + A##bu1 ^= Du1; \ + Bma0 = ROL32(A##bu1, 14); \ + A##ga0 ^= Da0; \ + Bme0 = ROL32(A##ga0, 18); \ + A##ke0 ^= De0; \ + Bmi0 = ROL32(A##ke0, 5); \ + E##ma0 = Bma0 ^((~Bme0)& Bmi0 ); \ + Ca0 ^= E##ma0; \ + A##mi1 ^= Di1; \ + Bmo0 = ROL32(A##mi1, 8); \ + E##me0 = Bme0 ^((~Bmi0)& Bmo0 ); \ + Ce0 ^= E##me0; \ + A##so0 ^= Do0; \ + Bmu0 = ROL32(A##so0, 28); \ + E##mi0 = Bmi0 ^((~Bmo0)& Bmu0 ); \ + Ci0 ^= E##mi0; \ + E##mo0 = Bmo0 ^((~Bmu0)& Bma0 ); \ + Co0 ^= E##mo0; \ + E##mu0 = Bmu0 ^((~Bma0)& Bme0 ); \ + Cu0 ^= E##mu0; \ +\ + A##bu0 ^= Du0; \ + Bma1 = ROL32(A##bu0, 13); \ + A##ga1 ^= Da1; \ + Bme1 = ROL32(A##ga1, 18); \ + A##ke1 ^= De1; \ + Bmi1 = ROL32(A##ke1, 5); \ + E##ma1 = Bma1 ^((~Bme1)& Bmi1 ); \ + Ca1 ^= E##ma1; \ + A##mi0 ^= Di0; \ + Bmo1 = ROL32(A##mi0, 7); \ + E##me1 = Bme1 ^((~Bmi1)& Bmo1 ); \ + Ce1 ^= E##me1; \ + A##so1 ^= Do1; \ + Bmu1 = ROL32(A##so1, 28); \ + E##mi1 = Bmi1 ^((~Bmo1)& Bmu1 ); \ + Ci1 ^= E##mi1; \ + E##mo1 = Bmo1 ^((~Bmu1)& Bma1 ); \ + Co1 ^= E##mo1; \ + E##mu1 = Bmu1 ^((~Bma1)& Bme1 ); \ + Cu1 ^= E##mu1; \ +\ + A##bi0 ^= Di0; \ + Bsa0 = ROL32(A##bi0, 31); \ + A##go1 ^= Do1; \ + Bse0 = ROL32(A##go1, 28); \ + A##ku1 ^= Du1; \ + Bsi0 = ROL32(A##ku1, 20); \ + E##sa0 = Bsa0 ^((~Bse0)& Bsi0 ); \ + Ca0 ^= E##sa0; \ + A##ma1 ^= Da1; \ + Bso0 = ROL32(A##ma1, 21); \ + E##se0 = Bse0 ^((~Bsi0)& Bso0 ); \ + Ce0 ^= E##se0; \ + A##se0 ^= De0; \ + Bsu0 = ROL32(A##se0, 1); \ + E##si0 = Bsi0 ^((~Bso0)& Bsu0 ); \ + Ci0 ^= E##si0; \ + E##so0 = Bso0 ^((~Bsu0)& Bsa0 ); \ + Co0 ^= E##so0; \ + E##su0 = Bsu0 ^((~Bsa0)& Bse0 ); \ + Cu0 ^= E##su0; \ +\ + A##bi1 ^= Di1; \ + Bsa1 = ROL32(A##bi1, 31); \ + A##go0 ^= Do0; \ + Bse1 = ROL32(A##go0, 27); \ + A##ku0 ^= Du0; \ + Bsi1 = ROL32(A##ku0, 19); \ + E##sa1 = Bsa1 ^((~Bse1)& Bsi1 ); \ + Ca1 ^= E##sa1; \ + A##ma0 ^= Da0; \ + Bso1 = ROL32(A##ma0, 20); \ + E##se1 = Bse1 ^((~Bsi1)& Bso1 ); \ + Ce1 ^= E##se1; \ + A##se1 ^= De1; \ + Bsu1 = ROL32(A##se1, 
1); \ + E##si1 = Bsi1 ^((~Bso1)& Bsu1 ); \ + Ci1 ^= E##si1; \ + E##so1 = Bso1 ^((~Bsu1)& Bsa1 ); \ + Co1 ^= E##so1; \ + E##su1 = Bsu1 ^((~Bsa1)& Bse1 ); \ + Cu1 ^= E##su1; \ +\ + +// --- Code for round +// --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words +#define thetaRhoPiChiIota(i, A, E) \ + Da0 = Cu0^ROL32(Ce1, 1); \ + Da1 = Cu1^Ce0; \ + De0 = Ca0^ROL32(Ci1, 1); \ + De1 = Ca1^Ci0; \ + Di0 = Ce0^ROL32(Co1, 1); \ + Di1 = Ce1^Co0; \ + Do0 = Ci0^ROL32(Cu1, 1); \ + Do1 = Ci1^Cu0; \ + Du0 = Co0^ROL32(Ca1, 1); \ + Du1 = Co1^Ca0; \ +\ + A##ba0 ^= Da0; \ + Bba0 = A##ba0; \ + A##ge0 ^= De0; \ + Bbe0 = ROL32(A##ge0, 22); \ + A##ki1 ^= Di1; \ + Bbi0 = ROL32(A##ki1, 22); \ + E##ba0 = Bba0 ^((~Bbe0)& Bbi0 ); \ + E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \ + A##mo1 ^= Do1; \ + Bbo0 = ROL32(A##mo1, 11); \ + E##be0 = Bbe0 ^((~Bbi0)& Bbo0 ); \ + A##su0 ^= Du0; \ + Bbu0 = ROL32(A##su0, 7); \ + E##bi0 = Bbi0 ^((~Bbo0)& Bbu0 ); \ + E##bo0 = Bbo0 ^((~Bbu0)& Bba0 ); \ + E##bu0 = Bbu0 ^((~Bba0)& Bbe0 ); \ +\ + A##ba1 ^= Da1; \ + Bba1 = A##ba1; \ + A##ge1 ^= De1; \ + Bbe1 = ROL32(A##ge1, 22); \ + A##ki0 ^= Di0; \ + Bbi1 = ROL32(A##ki0, 21); \ + E##ba1 = Bba1 ^((~Bbe1)& Bbi1 ); \ + E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \ + A##mo0 ^= Do0; \ + Bbo1 = ROL32(A##mo0, 10); \ + E##be1 = Bbe1 ^((~Bbi1)& Bbo1 ); \ + A##su1 ^= Du1; \ + Bbu1 = ROL32(A##su1, 7); \ + E##bi1 = Bbi1 ^((~Bbo1)& Bbu1 ); \ + E##bo1 = Bbo1 ^((~Bbu1)& Bba1 ); \ + E##bu1 = Bbu1 ^((~Bba1)& Bbe1 ); \ +\ + A##bo0 ^= Do0; \ + Bga0 = ROL32(A##bo0, 14); \ + A##gu0 ^= Du0; \ + Bge0 = ROL32(A##gu0, 10); \ + A##ka1 ^= Da1; \ + Bgi0 = ROL32(A##ka1, 2); \ + E##ga0 = Bga0 ^((~Bge0)& Bgi0 ); \ + A##me1 ^= De1; \ + Bgo0 = ROL32(A##me1, 23); \ + E##ge0 = Bge0 ^((~Bgi0)& Bgo0 ); \ + A##si1 ^= Di1; \ + Bgu0 = ROL32(A##si1, 31); \ + E##gi0 = Bgi0 ^((~Bgo0)& Bgu0 ); \ + E##go0 = Bgo0 ^((~Bgu0)& Bga0 ); \ + E##gu0 = Bgu0 ^((~Bga0)& Bge0 ); \ +\ + A##bo1 ^= Do1; \ + Bga1 = ROL32(A##bo1, 14); \ + A##gu1 ^= Du1; \ 
+ Bge1 = ROL32(A##gu1, 10); \ + A##ka0 ^= Da0; \ + Bgi1 = ROL32(A##ka0, 1); \ + E##ga1 = Bga1 ^((~Bge1)& Bgi1 ); \ + A##me0 ^= De0; \ + Bgo1 = ROL32(A##me0, 22); \ + E##ge1 = Bge1 ^((~Bgi1)& Bgo1 ); \ + A##si0 ^= Di0; \ + Bgu1 = ROL32(A##si0, 30); \ + E##gi1 = Bgi1 ^((~Bgo1)& Bgu1 ); \ + E##go1 = Bgo1 ^((~Bgu1)& Bga1 ); \ + E##gu1 = Bgu1 ^((~Bga1)& Bge1 ); \ +\ + A##be1 ^= De1; \ + Bka0 = ROL32(A##be1, 1); \ + A##gi0 ^= Di0; \ + Bke0 = ROL32(A##gi0, 3); \ + A##ko1 ^= Do1; \ + Bki0 = ROL32(A##ko1, 13); \ + E##ka0 = Bka0 ^((~Bke0)& Bki0 ); \ + A##mu0 ^= Du0; \ + Bko0 = ROL32(A##mu0, 4); \ + E##ke0 = Bke0 ^((~Bki0)& Bko0 ); \ + A##sa0 ^= Da0; \ + Bku0 = ROL32(A##sa0, 9); \ + E##ki0 = Bki0 ^((~Bko0)& Bku0 ); \ + E##ko0 = Bko0 ^((~Bku0)& Bka0 ); \ + E##ku0 = Bku0 ^((~Bka0)& Bke0 ); \ +\ + A##be0 ^= De0; \ + Bka1 = A##be0; \ + A##gi1 ^= Di1; \ + Bke1 = ROL32(A##gi1, 3); \ + A##ko0 ^= Do0; \ + Bki1 = ROL32(A##ko0, 12); \ + E##ka1 = Bka1 ^((~Bke1)& Bki1 ); \ + A##mu1 ^= Du1; \ + Bko1 = ROL32(A##mu1, 4); \ + E##ke1 = Bke1 ^((~Bki1)& Bko1 ); \ + A##sa1 ^= Da1; \ + Bku1 = ROL32(A##sa1, 9); \ + E##ki1 = Bki1 ^((~Bko1)& Bku1 ); \ + E##ko1 = Bko1 ^((~Bku1)& Bka1 ); \ + E##ku1 = Bku1 ^((~Bka1)& Bke1 ); \ +\ + A##bu1 ^= Du1; \ + Bma0 = ROL32(A##bu1, 14); \ + A##ga0 ^= Da0; \ + Bme0 = ROL32(A##ga0, 18); \ + A##ke0 ^= De0; \ + Bmi0 = ROL32(A##ke0, 5); \ + E##ma0 = Bma0 ^((~Bme0)& Bmi0 ); \ + A##mi1 ^= Di1; \ + Bmo0 = ROL32(A##mi1, 8); \ + E##me0 = Bme0 ^((~Bmi0)& Bmo0 ); \ + A##so0 ^= Do0; \ + Bmu0 = ROL32(A##so0, 28); \ + E##mi0 = Bmi0 ^((~Bmo0)& Bmu0 ); \ + E##mo0 = Bmo0 ^((~Bmu0)& Bma0 ); \ + E##mu0 = Bmu0 ^((~Bma0)& Bme0 ); \ +\ + A##bu0 ^= Du0; \ + Bma1 = ROL32(A##bu0, 13); \ + A##ga1 ^= Da1; \ + Bme1 = ROL32(A##ga1, 18); \ + A##ke1 ^= De1; \ + Bmi1 = ROL32(A##ke1, 5); \ + E##ma1 = Bma1 ^((~Bme1)& Bmi1 ); \ + A##mi0 ^= Di0; \ + Bmo1 = ROL32(A##mi0, 7); \ + E##me1 = Bme1 ^((~Bmi1)& Bmo1 ); \ + A##so1 ^= Do1; \ + Bmu1 = ROL32(A##so1, 28); \ + E##mi1 = Bmi1 ^((~Bmo1)& Bmu1 ); \ + 
E##mo1 = Bmo1 ^((~Bmu1)& Bma1 ); \ + E##mu1 = Bmu1 ^((~Bma1)& Bme1 ); \ +\ + A##bi0 ^= Di0; \ + Bsa0 = ROL32(A##bi0, 31); \ + A##go1 ^= Do1; \ + Bse0 = ROL32(A##go1, 28); \ + A##ku1 ^= Du1; \ + Bsi0 = ROL32(A##ku1, 20); \ + E##sa0 = Bsa0 ^((~Bse0)& Bsi0 ); \ + A##ma1 ^= Da1; \ + Bso0 = ROL32(A##ma1, 21); \ + E##se0 = Bse0 ^((~Bsi0)& Bso0 ); \ + A##se0 ^= De0; \ + Bsu0 = ROL32(A##se0, 1); \ + E##si0 = Bsi0 ^((~Bso0)& Bsu0 ); \ + E##so0 = Bso0 ^((~Bsu0)& Bsa0 ); \ + E##su0 = Bsu0 ^((~Bsa0)& Bse0 ); \ +\ + A##bi1 ^= Di1; \ + Bsa1 = ROL32(A##bi1, 31); \ + A##go0 ^= Do0; \ + Bse1 = ROL32(A##go0, 27); \ + A##ku0 ^= Du0; \ + Bsi1 = ROL32(A##ku0, 19); \ + E##sa1 = Bsa1 ^((~Bse1)& Bsi1 ); \ + A##ma0 ^= Da0; \ + Bso1 = ROL32(A##ma0, 20); \ + E##se1 = Bse1 ^((~Bsi1)& Bso1 ); \ + A##se1 ^= De1; \ + Bsu1 = ROL32(A##se1, 1); \ + E##si1 = Bsi1 ^((~Bso1)& Bsu1 ); \ + E##so1 = Bso1 ^((~Bsu1)& Bsa1 ); \ + E##su1 = Bsu1 ^((~Bsa1)& Bse1 ); \ +\ + +#endif // UseBebigokimisa + +const UINT32 KeccakF1600RoundConstants_int2_0[24] = { /* 24 per-round constants, indexed by the round macros' i argument and XORed into E##ba0 */ + 0x00000001UL, + 0x00000000UL, + 0x00000000UL, + 0x00000000UL, + 0x00000001UL, + 0x00000001UL, + 0x00000001UL, + 0x00000001UL, + 0x00000000UL, + 0x00000000UL, + 0x00000001UL, + 0x00000000UL, + 0x00000001UL, + 0x00000001UL, + 0x00000001UL, + 0x00000001UL, + 0x00000000UL, + 0x00000000UL, + 0x00000000UL, + 0x00000000UL, + 0x00000001UL, + 0x00000000UL, + 0x00000001UL, + 0x00000000UL }; + +const UINT32 KeccakF1600RoundConstants_int2_1[24] = { /* 24 per-round constants, indexed by the round macros' i argument and XORed into E##ba1 */ + 0x00000000UL, + 0x00000089UL, + 0x8000008bUL, + 0x80008080UL, + 0x0000008bUL, + 0x00008000UL, + 0x80008088UL, + 0x80000082UL, + 0x0000000bUL, + 0x0000000aUL, + 0x00008082UL, + 0x00008003UL, + 0x0000808bUL, + 0x8000000bUL, + 0x8000008aUL, + 0x80000081UL, + 0x80000081UL, + 0x80000008UL, + 0x00000083UL, + 0x80008003UL, + 0x80008088UL, + 0x80000088UL, + 0x00008000UL, + 0x80008082UL }; + +#define copyFromStateAndXor1024bits(X, state, input) /* load state[0..49] into X##ba0..X##su1; input[0..31] is XORed into the first 32 words, words 32..49 are copied unchanged */ \ + X##ba0 = state[ 0]^input[ 0]; \ + X##ba1 = state[ 1]^input[ 1]; \ + X##be0 = state[ 
2]^input[ 2]; \ + X##be1 = state[ 3]^input[ 3]; \ + X##bi0 = state[ 4]^input[ 4]; \ + X##bi1 = state[ 5]^input[ 5]; \ + X##bo0 = state[ 6]^input[ 6]; \ + X##bo1 = state[ 7]^input[ 7]; \ + X##bu0 = state[ 8]^input[ 8]; \ + X##bu1 = state[ 9]^input[ 9]; \ + X##ga0 = state[10]^input[10]; \ + X##ga1 = state[11]^input[11]; \ + X##ge0 = state[12]^input[12]; \ + X##ge1 = state[13]^input[13]; \ + X##gi0 = state[14]^input[14]; \ + X##gi1 = state[15]^input[15]; \ + X##go0 = state[16]^input[16]; \ + X##go1 = state[17]^input[17]; \ + X##gu0 = state[18]^input[18]; \ + X##gu1 = state[19]^input[19]; \ + X##ka0 = state[20]^input[20]; \ + X##ka1 = state[21]^input[21]; \ + X##ke0 = state[22]^input[22]; \ + X##ke1 = state[23]^input[23]; \ + X##ki0 = state[24]^input[24]; \ + X##ki1 = state[25]^input[25]; \ + X##ko0 = state[26]^input[26]; \ + X##ko1 = state[27]^input[27]; \ + X##ku0 = state[28]^input[28]; \ + X##ku1 = state[29]^input[29]; \ + X##ma0 = state[30]^input[30]; \ + X##ma1 = state[31]^input[31]; \ + X##me0 = state[32]; \ + X##me1 = state[33]; \ + X##mi0 = state[34]; \ + X##mi1 = state[35]; \ + X##mo0 = state[36]; \ + X##mo1 = state[37]; \ + X##mu0 = state[38]; \ + X##mu1 = state[39]; \ + X##sa0 = state[40]; \ + X##sa1 = state[41]; \ + X##se0 = state[42]; \ + X##se1 = state[43]; \ + X##si0 = state[44]; \ + X##si1 = state[45]; \ + X##so0 = state[46]; \ + X##so1 = state[47]; \ + X##su0 = state[48]; \ + X##su1 = state[49]; \ + +#define copyFromStateAndXor1088bits(X, state, input) /* load state[0..49] into X##ba0..X##su1; input[0..33] is XORed into the first 34 words, words 34..49 are copied unchanged */ \ + X##ba0 = state[ 0]^input[ 0]; \ + X##ba1 = state[ 1]^input[ 1]; \ + X##be0 = state[ 2]^input[ 2]; \ + X##be1 = state[ 3]^input[ 3]; \ + X##bi0 = state[ 4]^input[ 4]; \ + X##bi1 = state[ 5]^input[ 5]; \ + X##bo0 = state[ 6]^input[ 6]; \ + X##bo1 = state[ 7]^input[ 7]; \ + X##bu0 = state[ 8]^input[ 8]; \ + X##bu1 = state[ 9]^input[ 9]; \ + X##ga0 = state[10]^input[10]; \ + X##ga1 = state[11]^input[11]; \ + X##ge0 = state[12]^input[12]; \ + X##ge1 = state[13]^input[13]; \ + X##gi0 = state[14]^input[14]; 
\ + X##gi1 = state[15]^input[15]; \ + X##go0 = state[16]^input[16]; \ + X##go1 = state[17]^input[17]; \ + X##gu0 = state[18]^input[18]; \ + X##gu1 = state[19]^input[19]; \ + X##ka0 = state[20]^input[20]; \ + X##ka1 = state[21]^input[21]; \ + X##ke0 = state[22]^input[22]; \ + X##ke1 = state[23]^input[23]; \ + X##ki0 = state[24]^input[24]; \ + X##ki1 = state[25]^input[25]; \ + X##ko0 = state[26]^input[26]; \ + X##ko1 = state[27]^input[27]; \ + X##ku0 = state[28]^input[28]; \ + X##ku1 = state[29]^input[29]; \ + X##ma0 = state[30]^input[30]; \ + X##ma1 = state[31]^input[31]; \ + X##me0 = state[32]^input[32]; \ + X##me1 = state[33]^input[33]; \ + X##mi0 = state[34]; \ + X##mi1 = state[35]; \ + X##mo0 = state[36]; \ + X##mo1 = state[37]; \ + X##mu0 = state[38]; \ + X##mu1 = state[39]; \ + X##sa0 = state[40]; \ + X##sa1 = state[41]; \ + X##se0 = state[42]; \ + X##se1 = state[43]; \ + X##si0 = state[44]; \ + X##si1 = state[45]; \ + X##so0 = state[46]; \ + X##so1 = state[47]; \ + X##su0 = state[48]; \ + X##su1 = state[49]; \ + +#define copyFromState(X, state) /* load state[0..49] into the working variables X##ba0..X##su1 without XORing any input */ \ + X##ba0 = state[ 0]; \ + X##ba1 = state[ 1]; \ + X##be0 = state[ 2]; \ + X##be1 = state[ 3]; \ + X##bi0 = state[ 4]; \ + X##bi1 = state[ 5]; \ + X##bo0 = state[ 6]; \ + X##bo1 = state[ 7]; \ + X##bu0 = state[ 8]; \ + X##bu1 = state[ 9]; \ + X##ga0 = state[10]; \ + X##ga1 = state[11]; \ + X##ge0 = state[12]; \ + X##ge1 = state[13]; \ + X##gi0 = state[14]; \ + X##gi1 = state[15]; \ + X##go0 = state[16]; \ + X##go1 = state[17]; \ + X##gu0 = state[18]; \ + X##gu1 = state[19]; \ + X##ka0 = state[20]; \ + X##ka1 = state[21]; \ + X##ke0 = state[22]; \ + X##ke1 = state[23]; \ + X##ki0 = state[24]; \ + X##ki1 = state[25]; \ + X##ko0 = state[26]; \ + X##ko1 = state[27]; \ + X##ku0 = state[28]; \ + X##ku1 = state[29]; \ + X##ma0 = state[30]; \ + X##ma1 = state[31]; \ + X##me0 = state[32]; \ + X##me1 = state[33]; \ + X##mi0 = state[34]; \ + X##mi1 = state[35]; \ + X##mo0 = state[36]; \ + X##mo1 = state[37]; \ + X##mu0 = 
state[38]; \ + X##mu1 = state[39]; \ + X##sa0 = state[40]; \ + X##sa1 = state[41]; \ + X##se0 = state[42]; \ + X##se1 = state[43]; \ + X##si0 = state[44]; \ + X##si1 = state[45]; \ + X##so0 = state[46]; \ + X##so1 = state[47]; \ + X##su0 = state[48]; \ + X##su1 = state[49]; \ + +#define copyToState(state, X) /* store the working variables X##ba0..X##su1 back into state[0..49] */ \ + state[ 0] = X##ba0; \ + state[ 1] = X##ba1; \ + state[ 2] = X##be0; \ + state[ 3] = X##be1; \ + state[ 4] = X##bi0; \ + state[ 5] = X##bi1; \ + state[ 6] = X##bo0; \ + state[ 7] = X##bo1; \ + state[ 8] = X##bu0; \ + state[ 9] = X##bu1; \ + state[10] = X##ga0; \ + state[11] = X##ga1; \ + state[12] = X##ge0; \ + state[13] = X##ge1; \ + state[14] = X##gi0; \ + state[15] = X##gi1; \ + state[16] = X##go0; \ + state[17] = X##go1; \ + state[18] = X##gu0; \ + state[19] = X##gu1; \ + state[20] = X##ka0; \ + state[21] = X##ka1; \ + state[22] = X##ke0; \ + state[23] = X##ke1; \ + state[24] = X##ki0; \ + state[25] = X##ki1; \ + state[26] = X##ko0; \ + state[27] = X##ko1; \ + state[28] = X##ku0; \ + state[29] = X##ku1; \ + state[30] = X##ma0; \ + state[31] = X##ma1; \ + state[32] = X##me0; \ + state[33] = X##me1; \ + state[34] = X##mi0; \ + state[35] = X##mi1; \ + state[36] = X##mo0; \ + state[37] = X##mo1; \ + state[38] = X##mu0; \ + state[39] = X##mu1; \ + state[40] = X##sa0; \ + state[41] = X##sa1; \ + state[42] = X##se0; \ + state[43] = X##se1; \ + state[44] = X##si0; \ + state[45] = X##si1; \ + state[46] = X##so0; \ + state[47] = X##so1; \ + state[48] = X##su0; \ + state[49] = X##su1; \ + +#define copyStateVariables(X, Y) /* assign each working variable Y##ba0..Y##su1 to the same-named X##ba0..X##su1 */ \ + X##ba0 = Y##ba0; \ + X##ba1 = Y##ba1; \ + X##be0 = Y##be0; \ + X##be1 = Y##be1; \ + X##bi0 = Y##bi0; \ + X##bi1 = Y##bi1; \ + X##bo0 = Y##bo0; \ + X##bo1 = Y##bo1; \ + X##bu0 = Y##bu0; \ + X##bu1 = Y##bu1; \ + X##ga0 = Y##ga0; \ + X##ga1 = Y##ga1; \ + X##ge0 = Y##ge0; \ + X##ge1 = Y##ge1; \ + X##gi0 = Y##gi0; \ + X##gi1 = Y##gi1; \ + X##go0 = Y##go0; \ + X##go1 = Y##go1; \ + X##gu0 = Y##gu0; \ + X##gu1 = Y##gu1; \ + X##ka0 = Y##ka0; \ + X##ka1 
= Y##ka1; \ + X##ke0 = Y##ke0; \ + X##ke1 = Y##ke1; \ + X##ki0 = Y##ki0; \ + X##ki1 = Y##ki1; \ + X##ko0 = Y##ko0; \ + X##ko1 = Y##ko1; \ + X##ku0 = Y##ku0; \ + X##ku1 = Y##ku1; \ + X##ma0 = Y##ma0; \ + X##ma1 = Y##ma1; \ + X##me0 = Y##me0; \ + X##me1 = Y##me1; \ + X##mi0 = Y##mi0; \ + X##mi1 = Y##mi1; \ + X##mo0 = Y##mo0; \ + X##mo1 = Y##mo1; \ + X##mu0 = Y##mu0; \ + X##mu1 = Y##mu1; \ + X##sa0 = Y##sa0; \ + X##sa1 = Y##sa1; \ + X##se0 = Y##se0; \ + X##se1 = Y##se1; \ + X##si0 = Y##si0; \ + X##si1 = Y##si1; \ + X##so0 = Y##so0; \ + X##so1 = Y##so1; \ + X##su0 = Y##su0; \ + X##su1 = Y##su1; \ + diff --git a/c_src/KeccakF-1600-32.macros b/c_src/KeccakF-1600-32.macros new file mode 100755 index 0000000..9ade600 --- /dev/null +++ b/c_src/KeccakF-1600-32.macros @@ -0,0 +1,26 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by the designers, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifdef UseSchedule /* pick the 32-bit macro set: 1 -> s1, 2 -> s2, 3 -> rvk; any other value is a compile error; s1 is used when UseSchedule is undefined */ + #if (UseSchedule == 1) + #include "KeccakF-1600-32-s1.macros" + #elif (UseSchedule == 2) + #include "KeccakF-1600-32-s2.macros" + #elif (UseSchedule == 3) + #include "KeccakF-1600-32-rvk.macros" + #else + #error "This schedule is not supported." + #endif +#else + #include "KeccakF-1600-32-s1.macros" +#endif diff --git a/c_src/KeccakF-1600-64.macros b/c_src/KeccakF-1600-64.macros new file mode 100755 index 0000000..0c20bca --- /dev/null +++ b/c_src/KeccakF-1600-64.macros @@ -0,0 +1,728 @@ +/* +Code automatically generated by KeccakTools! + +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. 
For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by the designers, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#define declareABCDE /* declare the UINT64 working variables used by the round macros: 25 each of A.., B.., E.. and 5 each of C., D. */ \ + UINT64 Aba, Abe, Abi, Abo, Abu; \ + UINT64 Aga, Age, Agi, Ago, Agu; \ + UINT64 Aka, Ake, Aki, Ako, Aku; \ + UINT64 Ama, Ame, Ami, Amo, Amu; \ + UINT64 Asa, Ase, Asi, Aso, Asu; \ + UINT64 Bba, Bbe, Bbi, Bbo, Bbu; \ + UINT64 Bga, Bge, Bgi, Bgo, Bgu; \ + UINT64 Bka, Bke, Bki, Bko, Bku; \ + UINT64 Bma, Bme, Bmi, Bmo, Bmu; \ + UINT64 Bsa, Bse, Bsi, Bso, Bsu; \ + UINT64 Ca, Ce, Ci, Co, Cu; \ + UINT64 Da, De, Di, Do, Du; \ + UINT64 Eba, Ebe, Ebi, Ebo, Ebu; \ + UINT64 Ega, Ege, Egi, Ego, Egu; \ + UINT64 Eka, Eke, Eki, Eko, Eku; \ + UINT64 Ema, Eme, Emi, Emo, Emu; \ + UINT64 Esa, Ese, Esi, Eso, Esu; \ + +#define prepareTheta /* set each of Ca..Cu to the XOR of the five A variables that end in the same letter */ \ + Ca = Aba^Aga^Aka^Ama^Asa; \ + Ce = Abe^Age^Ake^Ame^Ase; \ + Ci = Abi^Agi^Aki^Ami^Asi; \ + Co = Abo^Ago^Ako^Amo^Aso; \ + Cu = Abu^Agu^Aku^Amu^Asu; \ + +#ifdef UseBebigokimisa +// --- Code for round, with prepare-theta (lane complementing pattern 'bebigokimisa') +// --- 64-bit lanes mapped to 64-bit words +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + Da = Cu^ROL64(Ce, 1); \ + De = Ca^ROL64(Ci, 1); \ + Di = Ce^ROL64(Co, 1); \ + Do = Ci^ROL64(Cu, 1); \ + Du = Co^ROL64(Ca, 1); \ +\ + A##ba ^= Da; \ + Bba = A##ba; \ + A##ge ^= De; \ + Bbe = ROL64(A##ge, 44); \ + A##ki ^= Di; \ + Bbi = ROL64(A##ki, 43); \ + A##mo ^= Do; \ + Bbo = ROL64(A##mo, 21); \ + A##su ^= Du; \ + Bbu = ROL64(A##su, 14); \ + E##ba = Bba ^( Bbe | Bbi ); \ + E##ba ^= KeccakF1600RoundConstants[i]; \ + Ca = E##ba; \ + E##be = Bbe ^((~Bbi)| Bbo ); \ + Ce = E##be; \ + E##bi = Bbi ^( Bbo & Bbu ); \ + Ci = E##bi; \ + E##bo = Bbo ^( Bbu | Bba ); \ + Co = E##bo; \ + E##bu = Bbu ^( Bba & Bbe 
); \ + Cu = E##bu; \ +\ + A##bo ^= Do; \ + Bga = ROL64(A##bo, 28); \ + A##gu ^= Du; \ + Bge = ROL64(A##gu, 20); \ + A##ka ^= Da; \ + Bgi = ROL64(A##ka, 3); \ + A##me ^= De; \ + Bgo = ROL64(A##me, 45); \ + A##si ^= Di; \ + Bgu = ROL64(A##si, 61); \ + E##ga = Bga ^( Bge | Bgi ); \ + Ca ^= E##ga; \ + E##ge = Bge ^( Bgi & Bgo ); \ + Ce ^= E##ge; \ + E##gi = Bgi ^( Bgo |(~Bgu)); \ + Ci ^= E##gi; \ + E##go = Bgo ^( Bgu | Bga ); \ + Co ^= E##go; \ + E##gu = Bgu ^( Bga & Bge ); \ + Cu ^= E##gu; \ +\ + A##be ^= De; \ + Bka = ROL64(A##be, 1); \ + A##gi ^= Di; \ + Bke = ROL64(A##gi, 6); \ + A##ko ^= Do; \ + Bki = ROL64(A##ko, 25); \ + A##mu ^= Du; \ + Bko = ROL64(A##mu, 8); \ + A##sa ^= Da; \ + Bku = ROL64(A##sa, 18); \ + E##ka = Bka ^( Bke | Bki ); \ + Ca ^= E##ka; \ + E##ke = Bke ^( Bki & Bko ); \ + Ce ^= E##ke; \ + E##ki = Bki ^((~Bko)& Bku ); \ + Ci ^= E##ki; \ + E##ko = (~Bko)^( Bku | Bka ); \ + Co ^= E##ko; \ + E##ku = Bku ^( Bka & Bke ); \ + Cu ^= E##ku; \ +\ + A##bu ^= Du; \ + Bma = ROL64(A##bu, 27); \ + A##ga ^= Da; \ + Bme = ROL64(A##ga, 36); \ + A##ke ^= De; \ + Bmi = ROL64(A##ke, 10); \ + A##mi ^= Di; \ + Bmo = ROL64(A##mi, 15); \ + A##so ^= Do; \ + Bmu = ROL64(A##so, 56); \ + E##ma = Bma ^( Bme & Bmi ); \ + Ca ^= E##ma; \ + E##me = Bme ^( Bmi | Bmo ); \ + Ce ^= E##me; \ + E##mi = Bmi ^((~Bmo)| Bmu ); \ + Ci ^= E##mi; \ + E##mo = (~Bmo)^( Bmu & Bma ); \ + Co ^= E##mo; \ + E##mu = Bmu ^( Bma | Bme ); \ + Cu ^= E##mu; \ +\ + A##bi ^= Di; \ + Bsa = ROL64(A##bi, 62); \ + A##go ^= Do; \ + Bse = ROL64(A##go, 55); \ + A##ku ^= Du; \ + Bsi = ROL64(A##ku, 39); \ + A##ma ^= Da; \ + Bso = ROL64(A##ma, 41); \ + A##se ^= De; \ + Bsu = ROL64(A##se, 2); \ + E##sa = Bsa ^((~Bse)& Bsi ); \ + Ca ^= E##sa; \ + E##se = (~Bse)^( Bsi | Bso ); \ + Ce ^= E##se; \ + E##si = Bsi ^( Bso & Bsu ); \ + Ci ^= E##si; \ + E##so = Bso ^( Bsu | Bsa ); \ + Co ^= E##so; \ + E##su = Bsu ^( Bsa & Bse ); \ + Cu ^= E##su; \ +\ + +// --- Code for round (lane complementing pattern 'bebigokimisa') +// --- 
64-bit lanes mapped to 64-bit words +#define thetaRhoPiChiIota(i, A, E) \ + Da = Cu^ROL64(Ce, 1); \ + De = Ca^ROL64(Ci, 1); \ + Di = Ce^ROL64(Co, 1); \ + Do = Ci^ROL64(Cu, 1); \ + Du = Co^ROL64(Ca, 1); \ +\ + A##ba ^= Da; \ + Bba = A##ba; \ + A##ge ^= De; \ + Bbe = ROL64(A##ge, 44); \ + A##ki ^= Di; \ + Bbi = ROL64(A##ki, 43); \ + A##mo ^= Do; \ + Bbo = ROL64(A##mo, 21); \ + A##su ^= Du; \ + Bbu = ROL64(A##su, 14); \ + E##ba = Bba ^( Bbe | Bbi ); \ + E##ba ^= KeccakF1600RoundConstants[i]; \ + E##be = Bbe ^((~Bbi)| Bbo ); \ + E##bi = Bbi ^( Bbo & Bbu ); \ + E##bo = Bbo ^( Bbu | Bba ); \ + E##bu = Bbu ^( Bba & Bbe ); \ +\ + A##bo ^= Do; \ + Bga = ROL64(A##bo, 28); \ + A##gu ^= Du; \ + Bge = ROL64(A##gu, 20); \ + A##ka ^= Da; \ + Bgi = ROL64(A##ka, 3); \ + A##me ^= De; \ + Bgo = ROL64(A##me, 45); \ + A##si ^= Di; \ + Bgu = ROL64(A##si, 61); \ + E##ga = Bga ^( Bge | Bgi ); \ + E##ge = Bge ^( Bgi & Bgo ); \ + E##gi = Bgi ^( Bgo |(~Bgu)); \ + E##go = Bgo ^( Bgu | Bga ); \ + E##gu = Bgu ^( Bga & Bge ); \ +\ + A##be ^= De; \ + Bka = ROL64(A##be, 1); \ + A##gi ^= Di; \ + Bke = ROL64(A##gi, 6); \ + A##ko ^= Do; \ + Bki = ROL64(A##ko, 25); \ + A##mu ^= Du; \ + Bko = ROL64(A##mu, 8); \ + A##sa ^= Da; \ + Bku = ROL64(A##sa, 18); \ + E##ka = Bka ^( Bke | Bki ); \ + E##ke = Bke ^( Bki & Bko ); \ + E##ki = Bki ^((~Bko)& Bku ); \ + E##ko = (~Bko)^( Bku | Bka ); \ + E##ku = Bku ^( Bka & Bke ); \ +\ + A##bu ^= Du; \ + Bma = ROL64(A##bu, 27); \ + A##ga ^= Da; \ + Bme = ROL64(A##ga, 36); \ + A##ke ^= De; \ + Bmi = ROL64(A##ke, 10); \ + A##mi ^= Di; \ + Bmo = ROL64(A##mi, 15); \ + A##so ^= Do; \ + Bmu = ROL64(A##so, 56); \ + E##ma = Bma ^( Bme & Bmi ); \ + E##me = Bme ^( Bmi | Bmo ); \ + E##mi = Bmi ^((~Bmo)| Bmu ); \ + E##mo = (~Bmo)^( Bmu & Bma ); \ + E##mu = Bmu ^( Bma | Bme ); \ +\ + A##bi ^= Di; \ + Bsa = ROL64(A##bi, 62); \ + A##go ^= Do; \ + Bse = ROL64(A##go, 55); \ + A##ku ^= Du; \ + Bsi = ROL64(A##ku, 39); \ + A##ma ^= Da; \ + Bso = ROL64(A##ma, 41); \ + A##se ^= De; \ + Bsu = 
ROL64(A##se, 2); \ + E##sa = Bsa ^((~Bse)& Bsi ); \ + E##se = (~Bse)^( Bsi | Bso ); \ + E##si = Bsi ^( Bso & Bsu ); \ + E##so = Bso ^( Bsu | Bsa ); \ + E##su = Bsu ^( Bsa & Bse ); \ +\ + +#else // UseBebigokimisa +// --- Code for round, with prepare-theta +// --- 64-bit lanes mapped to 64-bit words +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + Da = Cu^ROL64(Ce, 1); \ + De = Ca^ROL64(Ci, 1); \ + Di = Ce^ROL64(Co, 1); \ + Do = Ci^ROL64(Cu, 1); \ + Du = Co^ROL64(Ca, 1); \ +\ + A##ba ^= Da; \ + Bba = A##ba; \ + A##ge ^= De; \ + Bbe = ROL64(A##ge, 44); \ + A##ki ^= Di; \ + Bbi = ROL64(A##ki, 43); \ + A##mo ^= Do; \ + Bbo = ROL64(A##mo, 21); \ + A##su ^= Du; \ + Bbu = ROL64(A##su, 14); \ + E##ba = Bba ^((~Bbe)& Bbi ); \ + E##ba ^= KeccakF1600RoundConstants[i]; \ + Ca = E##ba; \ + E##be = Bbe ^((~Bbi)& Bbo ); \ + Ce = E##be; \ + E##bi = Bbi ^((~Bbo)& Bbu ); \ + Ci = E##bi; \ + E##bo = Bbo ^((~Bbu)& Bba ); \ + Co = E##bo; \ + E##bu = Bbu ^((~Bba)& Bbe ); \ + Cu = E##bu; \ +\ + A##bo ^= Do; \ + Bga = ROL64(A##bo, 28); \ + A##gu ^= Du; \ + Bge = ROL64(A##gu, 20); \ + A##ka ^= Da; \ + Bgi = ROL64(A##ka, 3); \ + A##me ^= De; \ + Bgo = ROL64(A##me, 45); \ + A##si ^= Di; \ + Bgu = ROL64(A##si, 61); \ + E##ga = Bga ^((~Bge)& Bgi ); \ + Ca ^= E##ga; \ + E##ge = Bge ^((~Bgi)& Bgo ); \ + Ce ^= E##ge; \ + E##gi = Bgi ^((~Bgo)& Bgu ); \ + Ci ^= E##gi; \ + E##go = Bgo ^((~Bgu)& Bga ); \ + Co ^= E##go; \ + E##gu = Bgu ^((~Bga)& Bge ); \ + Cu ^= E##gu; \ +\ + A##be ^= De; \ + Bka = ROL64(A##be, 1); \ + A##gi ^= Di; \ + Bke = ROL64(A##gi, 6); \ + A##ko ^= Do; \ + Bki = ROL64(A##ko, 25); \ + A##mu ^= Du; \ + Bko = ROL64(A##mu, 8); \ + A##sa ^= Da; \ + Bku = ROL64(A##sa, 18); \ + E##ka = Bka ^((~Bke)& Bki ); \ + Ca ^= E##ka; \ + E##ke = Bke ^((~Bki)& Bko ); \ + Ce ^= E##ke; \ + E##ki = Bki ^((~Bko)& Bku ); \ + Ci ^= E##ki; \ + E##ko = Bko ^((~Bku)& Bka ); \ + Co ^= E##ko; \ + E##ku = Bku ^((~Bka)& Bke ); \ + Cu ^= E##ku; \ +\ + A##bu ^= Du; \ + Bma = ROL64(A##bu, 27); \ + A##ga ^= Da; 
\ + Bme = ROL64(A##ga, 36); \ + A##ke ^= De; \ + Bmi = ROL64(A##ke, 10); \ + A##mi ^= Di; \ + Bmo = ROL64(A##mi, 15); \ + A##so ^= Do; \ + Bmu = ROL64(A##so, 56); \ + E##ma = Bma ^((~Bme)& Bmi ); \ + Ca ^= E##ma; \ + E##me = Bme ^((~Bmi)& Bmo ); \ + Ce ^= E##me; \ + E##mi = Bmi ^((~Bmo)& Bmu ); \ + Ci ^= E##mi; \ + E##mo = Bmo ^((~Bmu)& Bma ); \ + Co ^= E##mo; \ + E##mu = Bmu ^((~Bma)& Bme ); \ + Cu ^= E##mu; \ +\ + A##bi ^= Di; \ + Bsa = ROL64(A##bi, 62); \ + A##go ^= Do; \ + Bse = ROL64(A##go, 55); \ + A##ku ^= Du; \ + Bsi = ROL64(A##ku, 39); \ + A##ma ^= Da; \ + Bso = ROL64(A##ma, 41); \ + A##se ^= De; \ + Bsu = ROL64(A##se, 2); \ + E##sa = Bsa ^((~Bse)& Bsi ); \ + Ca ^= E##sa; \ + E##se = Bse ^((~Bsi)& Bso ); \ + Ce ^= E##se; \ + E##si = Bsi ^((~Bso)& Bsu ); \ + Ci ^= E##si; \ + E##so = Bso ^((~Bsu)& Bsa ); \ + Co ^= E##so; \ + E##su = Bsu ^((~Bsa)& Bse ); \ + Cu ^= E##su; \ +\ + +// --- Code for round +// --- 64-bit lanes mapped to 64-bit words +#define thetaRhoPiChiIota(i, A, E) \ + Da = Cu^ROL64(Ce, 1); \ + De = Ca^ROL64(Ci, 1); \ + Di = Ce^ROL64(Co, 1); \ + Do = Ci^ROL64(Cu, 1); \ + Du = Co^ROL64(Ca, 1); \ +\ + A##ba ^= Da; \ + Bba = A##ba; \ + A##ge ^= De; \ + Bbe = ROL64(A##ge, 44); \ + A##ki ^= Di; \ + Bbi = ROL64(A##ki, 43); \ + A##mo ^= Do; \ + Bbo = ROL64(A##mo, 21); \ + A##su ^= Du; \ + Bbu = ROL64(A##su, 14); \ + E##ba = Bba ^((~Bbe)& Bbi ); \ + E##ba ^= KeccakF1600RoundConstants[i]; \ + E##be = Bbe ^((~Bbi)& Bbo ); \ + E##bi = Bbi ^((~Bbo)& Bbu ); \ + E##bo = Bbo ^((~Bbu)& Bba ); \ + E##bu = Bbu ^((~Bba)& Bbe ); \ +\ + A##bo ^= Do; \ + Bga = ROL64(A##bo, 28); \ + A##gu ^= Du; \ + Bge = ROL64(A##gu, 20); \ + A##ka ^= Da; \ + Bgi = ROL64(A##ka, 3); \ + A##me ^= De; \ + Bgo = ROL64(A##me, 45); \ + A##si ^= Di; \ + Bgu = ROL64(A##si, 61); \ + E##ga = Bga ^((~Bge)& Bgi ); \ + E##ge = Bge ^((~Bgi)& Bgo ); \ + E##gi = Bgi ^((~Bgo)& Bgu ); \ + E##go = Bgo ^((~Bgu)& Bga ); \ + E##gu = Bgu ^((~Bga)& Bge ); \ +\ + A##be ^= De; \ + Bka = ROL64(A##be, 1); \ + 
A##gi ^= Di; \ + Bke = ROL64(A##gi, 6); \ + A##ko ^= Do; \ + Bki = ROL64(A##ko, 25); \ + A##mu ^= Du; \ + Bko = ROL64(A##mu, 8); \ + A##sa ^= Da; \ + Bku = ROL64(A##sa, 18); \ + E##ka = Bka ^((~Bke)& Bki ); \ + E##ke = Bke ^((~Bki)& Bko ); \ + E##ki = Bki ^((~Bko)& Bku ); \ + E##ko = Bko ^((~Bku)& Bka ); \ + E##ku = Bku ^((~Bka)& Bke ); \ +\ + A##bu ^= Du; \ + Bma = ROL64(A##bu, 27); \ + A##ga ^= Da; \ + Bme = ROL64(A##ga, 36); \ + A##ke ^= De; \ + Bmi = ROL64(A##ke, 10); \ + A##mi ^= Di; \ + Bmo = ROL64(A##mi, 15); \ + A##so ^= Do; \ + Bmu = ROL64(A##so, 56); \ + E##ma = Bma ^((~Bme)& Bmi ); \ + E##me = Bme ^((~Bmi)& Bmo ); \ + E##mi = Bmi ^((~Bmo)& Bmu ); \ + E##mo = Bmo ^((~Bmu)& Bma ); \ + E##mu = Bmu ^((~Bma)& Bme ); \ +\ + A##bi ^= Di; \ + Bsa = ROL64(A##bi, 62); \ + A##go ^= Do; \ + Bse = ROL64(A##go, 55); \ + A##ku ^= Du; \ + Bsi = ROL64(A##ku, 39); \ + A##ma ^= Da; \ + Bso = ROL64(A##ma, 41); \ + A##se ^= De; \ + Bsu = ROL64(A##se, 2); \ + E##sa = Bsa ^((~Bse)& Bsi ); \ + E##se = Bse ^((~Bsi)& Bso ); \ + E##si = Bsi ^((~Bso)& Bsu ); \ + E##so = Bso ^((~Bsu)& Bsa ); \ + E##su = Bsu ^((~Bsa)& Bse ); \ +\ + +#endif // UseBebigokimisa + +const UINT64 KeccakF1600RoundConstants[24] = { /* 24 per-round constants, indexed by the round macros' i argument and XORed into E##ba */ + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL }; + +#define copyFromStateAndXor576bits(X, state, input) /* load state[0..24] into X##ba..X##su; input[0..8] is XORed into the first 9 lanes, lanes 9..24 are copied unchanged */ \ + X##ba = state[ 0]^input[ 0]; \ + X##be = state[ 1]^input[ 1]; \ + X##bi = state[ 2]^input[ 2]; \ + X##bo = state[ 3]^input[ 3]; \ 
+ X##bu = state[ 4]^input[ 4]; \ + X##ga = state[ 5]^input[ 5]; \ + X##ge = state[ 6]^input[ 6]; \ + X##gi = state[ 7]^input[ 7]; \ + X##go = state[ 8]^input[ 8]; \ + X##gu = state[ 9]; \ + X##ka = state[10]; \ + X##ke = state[11]; \ + X##ki = state[12]; \ + X##ko = state[13]; \ + X##ku = state[14]; \ + X##ma = state[15]; \ + X##me = state[16]; \ + X##mi = state[17]; \ + X##mo = state[18]; \ + X##mu = state[19]; \ + X##sa = state[20]; \ + X##se = state[21]; \ + X##si = state[22]; \ + X##so = state[23]; \ + X##su = state[24]; \ + +#define copyFromStateAndXor832bits(X, state, input) /* load state[0..24] into X##ba..X##su; input[0..12] is XORed into the first 13 lanes, lanes 13..24 are copied unchanged */ \ + X##ba = state[ 0]^input[ 0]; \ + X##be = state[ 1]^input[ 1]; \ + X##bi = state[ 2]^input[ 2]; \ + X##bo = state[ 3]^input[ 3]; \ + X##bu = state[ 4]^input[ 4]; \ + X##ga = state[ 5]^input[ 5]; \ + X##ge = state[ 6]^input[ 6]; \ + X##gi = state[ 7]^input[ 7]; \ + X##go = state[ 8]^input[ 8]; \ + X##gu = state[ 9]^input[ 9]; \ + X##ka = state[10]^input[10]; \ + X##ke = state[11]^input[11]; \ + X##ki = state[12]^input[12]; \ + X##ko = state[13]; \ + X##ku = state[14]; \ + X##ma = state[15]; \ + X##me = state[16]; \ + X##mi = state[17]; \ + X##mo = state[18]; \ + X##mu = state[19]; \ + X##sa = state[20]; \ + X##se = state[21]; \ + X##si = state[22]; \ + X##so = state[23]; \ + X##su = state[24]; \ + +#define copyFromStateAndXor1024bits(X, state, input) /* load state[0..24] into X##ba..X##su; input[0..15] is XORed into the first 16 lanes, lanes 16..24 are copied unchanged */ \ + X##ba = state[ 0]^input[ 0]; \ + X##be = state[ 1]^input[ 1]; \ + X##bi = state[ 2]^input[ 2]; \ + X##bo = state[ 3]^input[ 3]; \ + X##bu = state[ 4]^input[ 4]; \ + X##ga = state[ 5]^input[ 5]; \ + X##ge = state[ 6]^input[ 6]; \ + X##gi = state[ 7]^input[ 7]; \ + X##go = state[ 8]^input[ 8]; \ + X##gu = state[ 9]^input[ 9]; \ + X##ka = state[10]^input[10]; \ + X##ke = state[11]^input[11]; \ + X##ki = state[12]^input[12]; \ + X##ko = state[13]^input[13]; \ + X##ku = state[14]^input[14]; \ + X##ma = state[15]^input[15]; \ + X##me = state[16]; \ + X##mi = state[17]; \ + X##mo = state[18]; \ + X##mu = state[19]; \ + X##sa = state[20]; \ + 
X##se = state[21]; \ + X##si = state[22]; \ + X##so = state[23]; \ + X##su = state[24]; \ + +#define copyFromStateAndXor1088bits(X, state, input) /* load state[0..24] into X##ba..X##su; input[0..16] is XORed into the first 17 lanes, lanes 17..24 are copied unchanged */ \ + X##ba = state[ 0]^input[ 0]; \ + X##be = state[ 1]^input[ 1]; \ + X##bi = state[ 2]^input[ 2]; \ + X##bo = state[ 3]^input[ 3]; \ + X##bu = state[ 4]^input[ 4]; \ + X##ga = state[ 5]^input[ 5]; \ + X##ge = state[ 6]^input[ 6]; \ + X##gi = state[ 7]^input[ 7]; \ + X##go = state[ 8]^input[ 8]; \ + X##gu = state[ 9]^input[ 9]; \ + X##ka = state[10]^input[10]; \ + X##ke = state[11]^input[11]; \ + X##ki = state[12]^input[12]; \ + X##ko = state[13]^input[13]; \ + X##ku = state[14]^input[14]; \ + X##ma = state[15]^input[15]; \ + X##me = state[16]^input[16]; \ + X##mi = state[17]; \ + X##mo = state[18]; \ + X##mu = state[19]; \ + X##sa = state[20]; \ + X##se = state[21]; \ + X##si = state[22]; \ + X##so = state[23]; \ + X##su = state[24]; \ + +#define copyFromStateAndXor1152bits(X, state, input) /* load state[0..24] into X##ba..X##su; input[0..17] is XORed into the first 18 lanes, lanes 18..24 are copied unchanged */ \ + X##ba = state[ 0]^input[ 0]; \ + X##be = state[ 1]^input[ 1]; \ + X##bi = state[ 2]^input[ 2]; \ + X##bo = state[ 3]^input[ 3]; \ + X##bu = state[ 4]^input[ 4]; \ + X##ga = state[ 5]^input[ 5]; \ + X##ge = state[ 6]^input[ 6]; \ + X##gi = state[ 7]^input[ 7]; \ + X##go = state[ 8]^input[ 8]; \ + X##gu = state[ 9]^input[ 9]; \ + X##ka = state[10]^input[10]; \ + X##ke = state[11]^input[11]; \ + X##ki = state[12]^input[12]; \ + X##ko = state[13]^input[13]; \ + X##ku = state[14]^input[14]; \ + X##ma = state[15]^input[15]; \ + X##me = state[16]^input[16]; \ + X##mi = state[17]^input[17]; \ + X##mo = state[18]; \ + X##mu = state[19]; \ + X##sa = state[20]; \ + X##se = state[21]; \ + X##si = state[22]; \ + X##so = state[23]; \ + X##su = state[24]; \ + +#define copyFromStateAndXor1344bits(X, state, input) /* load state[0..24] into X##ba..X##su; input[0..20] is XORed into the first 21 lanes, lanes 21..24 are copied unchanged */ \ + X##ba = state[ 0]^input[ 0]; \ + X##be = state[ 1]^input[ 1]; \ + X##bi = state[ 2]^input[ 2]; \ + X##bo = state[ 3]^input[ 3]; \ + X##bu = state[ 4]^input[ 4]; \ + X##ga = state[ 5]^input[ 5]; \ + X##ge = state[ 6]^input[ 6]; \ + 
X##gi = state[ 7]^input[ 7]; \ + X##go = state[ 8]^input[ 8]; \ + X##gu = state[ 9]^input[ 9]; \ + X##ka = state[10]^input[10]; \ + X##ke = state[11]^input[11]; \ + X##ki = state[12]^input[12]; \ + X##ko = state[13]^input[13]; \ + X##ku = state[14]^input[14]; \ + X##ma = state[15]^input[15]; \ + X##me = state[16]^input[16]; \ + X##mi = state[17]^input[17]; \ + X##mo = state[18]^input[18]; \ + X##mu = state[19]^input[19]; \ + X##sa = state[20]^input[20]; \ + X##se = state[21]; \ + X##si = state[22]; \ + X##so = state[23]; \ + X##su = state[24]; \ + +#define copyFromState(X, state) \ + X##ba = state[ 0]; \ + X##be = state[ 1]; \ + X##bi = state[ 2]; \ + X##bo = state[ 3]; \ + X##bu = state[ 4]; \ + X##ga = state[ 5]; \ + X##ge = state[ 6]; \ + X##gi = state[ 7]; \ + X##go = state[ 8]; \ + X##gu = state[ 9]; \ + X##ka = state[10]; \ + X##ke = state[11]; \ + X##ki = state[12]; \ + X##ko = state[13]; \ + X##ku = state[14]; \ + X##ma = state[15]; \ + X##me = state[16]; \ + X##mi = state[17]; \ + X##mo = state[18]; \ + X##mu = state[19]; \ + X##sa = state[20]; \ + X##se = state[21]; \ + X##si = state[22]; \ + X##so = state[23]; \ + X##su = state[24]; \ + +#define copyToState(state, X) \ + state[ 0] = X##ba; \ + state[ 1] = X##be; \ + state[ 2] = X##bi; \ + state[ 3] = X##bo; \ + state[ 4] = X##bu; \ + state[ 5] = X##ga; \ + state[ 6] = X##ge; \ + state[ 7] = X##gi; \ + state[ 8] = X##go; \ + state[ 9] = X##gu; \ + state[10] = X##ka; \ + state[11] = X##ke; \ + state[12] = X##ki; \ + state[13] = X##ko; \ + state[14] = X##ku; \ + state[15] = X##ma; \ + state[16] = X##me; \ + state[17] = X##mi; \ + state[18] = X##mo; \ + state[19] = X##mu; \ + state[20] = X##sa; \ + state[21] = X##se; \ + state[22] = X##si; \ + state[23] = X##so; \ + state[24] = X##su; \ + +#define copyStateVariables(X, Y) \ + X##ba = Y##ba; \ + X##be = Y##be; \ + X##bi = Y##bi; \ + X##bo = Y##bo; \ + X##bu = Y##bu; \ + X##ga = Y##ga; \ + X##ge = Y##ge; \ + X##gi = Y##gi; \ + X##go = Y##go; \ + X##gu = Y##gu; 
\ + X##ka = Y##ka; \ + X##ke = Y##ke; \ + X##ki = Y##ki; \ + X##ko = Y##ko; \ + X##ku = Y##ku; \ + X##ma = Y##ma; \ + X##me = Y##me; \ + X##mi = Y##mi; \ + X##mo = Y##mo; \ + X##mu = Y##mu; \ + X##sa = Y##sa; \ + X##se = Y##se; \ + X##si = Y##si; \ + X##so = Y##so; \ + X##su = Y##su; \ + diff --git a/c_src/KeccakF-1600-arm.c b/c_src/KeccakF-1600-arm.c new file mode 100755 index 0000000..abd6dc9 --- /dev/null +++ b/c_src/KeccakF-1600-arm.c @@ -0,0 +1,123 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by Ronny Van Keer, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include "KeccakF-1600-interface.h" +#include "KeccakSponge.h" +#include + +typedef unsigned char UINT8; +typedef unsigned short UINT16; +typedef unsigned int UINT32; +typedef unsigned long long int UINT64; + +void KeccakPermutationOnWordsAfterXoring_ARM_asm(UINT32 *state, const UINT8 *input, int laneCount); + +void KeccakInitialize( void ) +{ +} + +void KeccakInitializeState(unsigned char *state) +{ + memset(state, 0, KeccakPermutationSizeInBytes); +} + +void KeccakPermutation(unsigned char *state) +{ + KeccakPermutationOnWordsAfterXoring_ARM_asm((UINT32*)state, 0, 0); +} + +#ifdef ProvideFast576 +void KeccakAbsorb576bits(unsigned char *state, const unsigned char *data) +{ + KeccakPermutationOnWordsAfterXoring_ARM_asm((UINT32*)state, data, 9); +} +#endif + +#ifdef ProvideFast832 +void KeccakAbsorb832bits(unsigned char *state, const unsigned char *data) +{ + KeccakPermutationOnWordsAfterXoring_ARM_asm((UINT32*)state, data, 13); +} +#endif + +#ifdef ProvideFast1024 +void KeccakAbsorb1024bits(unsigned char *state, const unsigned 
char *data) +{ + KeccakPermutationOnWordsAfterXoring_ARM_asm((UINT32*)state, data, 16); +} +#endif + +#ifdef ProvideFast1088 +void KeccakAbsorb1088bits(unsigned char *state, const unsigned char *data) +{ + KeccakPermutationOnWordsAfterXoring_ARM_asm((UINT32*)state, data, 17); +} +#endif + +#ifdef ProvideFast1152 +void KeccakAbsorb1152bits(unsigned char *state, const unsigned char *data) +{ + KeccakPermutationOnWordsAfterXoring_ARM_asm((UINT32*)state, data, 18); +} +#endif + +#ifdef ProvideFast1344 +void KeccakAbsorb1344bits(unsigned char *state, const unsigned char *data) +{ + KeccakPermutationOnWordsAfterXoring_ARM_asm((UINT32*)state, data, 21); +} +#endif + + +void KeccakAbsorb(unsigned char *state, const unsigned char *data, unsigned int laneCount) +{ + KeccakPermutationOnWordsAfterXoring_ARM_asm((UINT32*)state, data, laneCount); +} + +// Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 +UINT64 fromInterleaving(UINT64 x) +{ + UINT64 t; + + t = (x ^ (x >> 16)) & 0x00000000FFFF0000ULL; x = x ^ t ^ (t << 16); + t = (x ^ (x >> 8)) & 0x0000FF000000FF00ULL; x = x ^ t ^ (t << 8); + t = (x ^ (x >> 4)) & 0x00F000F000F000F0ULL; x = x ^ t ^ (t << 4); + t = (x ^ (x >> 2)) & 0x0C0C0C0C0C0C0C0CULL; x = x ^ t ^ (t << 2); + t = (x ^ (x >> 1)) & 0x2222222222222222ULL; x = x ^ t ^ (t << 1); + + return x; +} + +void setInterleavedWordsInto8bytes(UINT8* dest, UINT32* evenAndOdd) +{ + ((UINT64*)dest)[0] = fromInterleaving(*(UINT64*)evenAndOdd); +} + +#define extractLanes(laneCount, state, data) \ + { \ + int i; \ + for(i=0; i<(laneCount); i++) \ + setInterleavedWordsInto8bytes(data+i*8, (UINT32*)state+i*2); \ + } + +#ifdef ProvideFast1024 +void KeccakExtract1024bits(const unsigned char *state, unsigned char *data) +{ + extractLanes(16, state, data) +} +#endif + +void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount) +{ + extractLanes(laneCount, state, data) +} diff --git a/c_src/KeccakF-1600-armcc.s b/c_src/KeccakF-1600-armcc.s new 
file mode 100755 index 0000000..b87d0ba --- /dev/null +++ b/c_src/KeccakF-1600-armcc.s @@ -0,0 +1,653 @@ +;// The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +;// Michaël Peeters and Gilles Van Assche. For more information, feedback or +;// questions, please refer to our website: http://keccak.noekeon.org/ +;// +;// Implementation by Ronny Van Keer, +;// hereby denoted as "the implementer". +;// +;// To the extent possible under law, the implementer has waived all copyright +;// and related or neighboring rights to the source code in this file. +;// http://creativecommons.org/publicdomain/zero/1.0/ + + + PRESERVE8 + THUMB + AREA |.text|, CODE, READONLY + +;// --- defines + +_ba0 equ 0*4 +_ba1 equ 1*4 +_be0 equ 2*4 +_be1 equ 3*4 +_bi0 equ 4*4 +_bi1 equ 5*4 +_bo0 equ 6*4 +_bo1 equ 7*4 +_bu0 equ 8*4 +_bu1 equ 9*4 +_ga0 equ 10*4 +_ga1 equ 11*4 +_ge0 equ 12*4 +_ge1 equ 13*4 +_gi0 equ 14*4 +_gi1 equ 15*4 +_go0 equ 16*4 +_go1 equ 17*4 +_gu0 equ 18*4 +_gu1 equ 19*4 +_ka0 equ 20*4 +_ka1 equ 21*4 +_ke0 equ 22*4 +_ke1 equ 23*4 +_ki0 equ 24*4 +_ki1 equ 25*4 +_ko0 equ 26*4 +_ko1 equ 27*4 +_ku0 equ 28*4 +_ku1 equ 29*4 +_ma0 equ 30*4 +_ma1 equ 31*4 +_me0 equ 32*4 +_me1 equ 33*4 +_mi0 equ 34*4 +_mi1 equ 35*4 +_mo0 equ 36*4 +_mo1 equ 37*4 +_mu0 equ 38*4 +_mu1 equ 39*4 +_sa0 equ 40*4 +_sa1 equ 41*4 +_se0 equ 42*4 +_se1 equ 43*4 +_si0 equ 44*4 +_si1 equ 45*4 +_so0 equ 46*4 +_so1 equ 47*4 +_su0 equ 48*4 +_su1 equ 49*4 + +mDe1 equ 50*4 +mDi0 equ 51*4 +mDo0 equ 52*4 +mDo1 equ 53*4 + +;// --- macros + + MACRO + xor5 $result,$ptr,$b,$g,$k,$m,$s + + ldr $result, [$ptr, #$b] + ldr r1, [$ptr, #$g] + ldr r2, [$ptr, #$k] + eor $result, $result, r1 + ldr r1, [$ptr, #$m] + eor $result, $result, r2 + ldr r2, [$ptr, #$s] + eor $result, $result, r1 + eor $result, $result, r2 + MEND + + MACRO + xorrol $b, $yy, $rr + + eor $b, $b, $yy + ror $b, #32-$rr + MEND + + + MACRO + xandnot $resptr, $resofs, $aa, $bb, $cc + + bic r1, $cc, $bb + eor r1, r1, $aa + str r1, [$resptr, #$resofs] 
+ MEND + + MACRO + xandnotRC $resptr, $resofs, $aa, $bb, $cc + + ldr r1, [r3], #4 + bic $cc, $cc, $bb + eor $cc, $cc, r1 + eor $cc, $cc, $aa + str $cc, [$resptr, #$resofs] + MEND + + + EXPORT KeccakPermutationOnWordsAfterXoring_ARM_asm +KeccakPermutationOnWordsAfterXoring_ARM_asm PROC + + push {r4-r12,lr} + sub sp,sp,#4*(50+4) + + movs r9, r2 + beq interleaveDone + mov r8,r0 +interleaveLoop + + ldr r4, [r1], #4 + ldr r5, [r1], #4 + ldrd r6, r7, [r8] + + ;// Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 + and r3,r4,#0x55555555 + orr r3,r3,r3, LSR #1 + and r3,r3,#0x33333333 + orr r3,r3,r3, LSR #2 + and r3,r3,#0x0F0F0F0F + orr r3,r3,r3, LSR #4 + and r3,r3,#0x00FF00FF + bfi r3,r3,#8, #8 + eor r6,r6,r3, LSR #8 + + and r3,r5,#0x55555555 + orr r3,r3,r3, LSR #1 + and r3,r3,#0x33333333 + orr r3,r3,r3, LSR #2 + and r3,r3,#0x0F0F0F0F + orr r3,r3,r3, LSR #4 + and r3,r3,#0x00FF00FF + orr r3,r3,r3, LSR #8 + eor r6,r6,r3, LSL #16 + + and r3,r4,#0xAAAAAAAA + orr r3,r3,r3, LSL #1 + and r3,r3,#0xCCCCCCCC + orr r3,r3,r3, LSL #2 + and r3,r3,#0xF0F0F0F0 + orr r3,r3,r3, LSL #4 + and r3,r3,#0xFF00FF00 + orr r3,r3,r3, LSL #8 + eor r7,r7,r3, LSR #16 + + and r3,r5,#0xAAAAAAAA + orr r3,r3,r3, LSL #1 + and r3,r3,#0xCCCCCCCC + orr r3,r3,r3, LSL #2 + and r3,r3,#0xF0F0F0F0 + orr r3,r3,r3, LSL #4 + and r3,r3,#0xFF00FF00 + orr r3,r3,r3, LSL #8 + bfc r3, #0, #16 + eor r7,r7,r3 + + strd r6,r7,[r8], #8 + + subs r9,r9,#1 + bne interleaveLoop + +interleaveDone + + ldr r3, =KeccakF1600RoundConstantsWithTerminator + b roundLoop ;//jump over the table + LTORG + + ALIGN + +KeccakF1600RoundConstantsWithTerminator + ;// 0 1 + dcd 0x00000001, 0x00000000 + dcd 0x00000000, 0x00000089 + dcd 0x00000000, 0x8000008b + dcd 0x00000000, 0x80008080 + dcd 0x00000001, 0x0000008b + dcd 0x00000001, 0x00008000 + dcd 0x00000001, 0x80008088 + dcd 0x00000001, 0x80000082 + dcd 0x00000000, 0x0000000b + dcd 0x00000000, 0x0000000a + dcd 0x00000001, 0x00008082 + dcd 0x00000000, 0x00008003 + dcd 0x00000001, 
0x0000808b + dcd 0x00000001, 0x8000000b + dcd 0x00000001, 0x8000008a + dcd 0x00000001, 0x80000081 + dcd 0x00000000, 0x80000081 + dcd 0x00000000, 0x80000008 + dcd 0x00000000, 0x00000083 + dcd 0x00000000, 0x80008003 + dcd 0x00000001, 0x80008088 + dcd 0x00000000, 0x80000088 + dcd 0x00000001, 0x00008000 + dcd 0x00000000, 0x80008082 + dcd 0xFFFFFFFF ;//terminator + +roundLoop + + ;//prepTheta A + xor5 r10, r0,_bu0, _gu0, _ku0, _mu0, _su0 + xor5 r6, r0,_be1, _ge1, _ke1, _me1, _se1 + eor r5, r10, r6, ROR #31 + xor5 r11, r0,_bu1, _gu1, _ku1, _mu1, _su1 + xor5 r7, r0,_be0, _ge0, _ke0, _me0, _se0 + eor r4, r11, r7 + + xor5 r8, r0,_bi0, _gi0, _ki0, _mi0, _si0 + eor r1, r8, r11, ROR #31 + str r1, [sp, #mDo0] + xor5 r9, r0,_bi1, _gi1, _ki1, _mi1, _si1 + eor r1, r9, r10 + str r1, [sp, #mDo1] + + xor5 r10, r0,_ba0, _ga0, _ka0, _ma0, _sa0 + eor lr, r10, r9, ROR #31 + xor5 r11, r0,_ba1, _ga1, _ka1, _ma1, _sa1 + eor r1, r11, r8 + str r1, [sp, #mDe1] + + xor5 r9, r0,_bo1, _go1, _ko1, _mo1, _so1 + eor r1, r7, r9, ROR #31 + str r1, [sp, #mDi0] + xor5 r8, r0,_bo0, _go0, _ko0, _mo0, _so0 + eor r2, r6, r8 + + eor r7, r8, r11, ROR #31 + eor r6, r9, r10 + + ;//thetaRhoPiChiIota 0, in A, out E + ldr r8, [r0, #_ba0] + ldr r9, [r0, #_ge0] + ldr r10, [r0, #_ki1] + ldr r11, [r0, #_mo1] + ldr r12, [r0, #_su0] + ldr r1, [sp, #mDo1] + eor r8, r8, r5 + xorrol r9, lr, 22 + xorrol r10, r2, 22 + xorrol r11, r1, 11 + xorrol r12, r7, 7 + xandnot sp, _be0, r9, r10, r11 + xandnot sp, _bi0, r10, r11, r12 + xandnot sp, _bo0, r11, r12, r8 + xandnot sp, _bu0, r12, r8, r9 + xandnotRC sp, _ba0, r8, r9, r10 + + ldr r8, [r0, #_bo0] + ldr r1, [sp, #mDo0] + ldr r9, [r0, #_gu0] + xorrol r8, r1, 14 + ldr r1, [sp, #mDe1] + ldr r10, [r0, #_ka1] + ldr r11, [r0, #_me1] + ldr r12, [r0, #_si1] + xorrol r9, r7, 10 + xorrol r10, r4, 2 + xorrol r11, r1, 23 + xorrol r12, r2, 31 + xandnot sp, _ga0, r8, r9, r10 + xandnot sp, _ge0, r9, r10, r11 + xandnot sp, _gi0, r10, r11, r12 + xandnot sp, _go0, r11, r12, r8 + xandnot sp, _gu0, 
r12, r8, r9 + + ldr r8, [r0, #_be1] + ldr r1, [sp, #mDe1] + ldr r9, [r0, #_gi0] + xorrol r8, r1, 1 + ldr r1, [sp, #mDi0] + ldr r10, [r0, #_ko1] + xorrol r9, r1, 3 + ldr r1, [sp, #mDo1] + ldr r11, [r0, #_mu0] + ldr r12, [r0, #_sa0] + xorrol r10, r1, 13 + xorrol r11, r7, 4 + xorrol r12, r5, 9 + xandnot sp, _ka0, r8, r9, r10 + xandnot sp, _ke0, r9, r10, r11 + xandnot sp, _ki0, r10, r11, r12 + xandnot sp, _ko0, r11, r12, r8 + xandnot sp, _ku0, r12, r8, r9 + + ldr r8, [r0, #_bu1] + ldr r9, [r0, #_ga0] + ldr r10, [r0, #_ke0] + ldr r11, [r0, #_mi1] + ldr r12, [r0, #_so0] + ldr r1, [sp, #mDo0] + xorrol r8, r6, 14 + xorrol r9, r5, 18 + xorrol r10, lr, 5 + xorrol r11, r2, 8 + xorrol r12, r1, 28 + xandnot sp, _ma0, r8, r9, r10 + xandnot sp, _me0, r9, r10, r11 + xandnot sp, _mi0, r10, r11, r12 + xandnot sp, _mo0, r11, r12, r8 + xandnot sp, _mu0, r12, r8, r9 + + ldr r1, [sp, #mDi0] + ldr r8, [r0, #_bi0] + ldr r9, [r0, #_go1] + xorrol r8, r1, 31 + ldr r1, [sp, #mDo1] + ldr r10, [r0, #_ku1] + xorrol r9, r1, 28 + ldr r11, [r0, #_ma1] + ldr r12, [r0, #_se0] + xorrol r10, r6, 20 + xorrol r11, r4, 21 + xorrol r12, lr, 1 + xandnot sp, _sa0, r8, r9, r10 + xandnot sp, _se0, r9, r10, r11 + xandnot sp, _si0, r10, r11, r12 + xandnot sp, _so0, r11, r12, r8 + xandnot sp, _su0, r12, r8, r9 + + ;// thetaRhoPiChiIota 1, in A, out E + ldr r1, [sp, #mDe1] + ldr r9, [r0, #_ge1] + ldr r8, [r0, #_ba1] + xorrol r9, r1, 22 + ldr r1, [sp, #mDi0] + ldr r10, [r0, #_ki0] + eor r8, r8, r4 + xorrol r10, r1, 21 + ldr r1, [sp, #mDo0] + ldr r11, [r0, #_mo0] + ldr r12, [r0, #_su1] + xorrol r11, r1, 10 + xorrol r12, r6, 7 + xandnot sp, _be1, r9, r10, r11 + xandnot sp, _bi1, r10, r11, r12 + xandnot sp, _bo1, r11, r12, r8 + xandnot sp, _bu1, r12, r8, r9 + xandnotRC sp, _ba1, r8, r9, r10 + + ldr r1, [sp, #mDo1] + ldr r8, [r0, #_bo1] + ldr r12, [r0, #_si0] + xorrol r8, r1, 14 + ldr r1, [sp, #mDi0] + ldr r9, [r0, #_gu1] + xorrol r12, r1, 30 + ldr r10, [r0, #_ka0] + ldr r11, [r0, #_me0] + xorrol r9, r6, 10 + xorrol 
r10, r5, 1 + xorrol r11, lr, 22 + xandnot sp, _ga1, r8, r9, r10 + xandnot sp, _ge1, r9, r10, r11 + xandnot sp, _gi1, r10, r11, r12 + xandnot sp, _go1, r11, r12, r8 + xandnot sp, _gu1, r12, r8, r9 + + ldr r1, [sp, #mDo0] + ldr r10, [r0, #_ko0] + ldr r8, [r0, #_be0] + xorrol r10, r1, 12 + ldr r9, [r0, #_gi1] + ldr r11, [r0, #_mu1] + ldr r12, [r0, #_sa1] + eor r8, r8, lr + xorrol r9, r2, 3 + xorrol r11, r6, 4 + xorrol r12, r4, 9 + xandnot sp, _ka1, r8, r9, r10 + xandnot sp, _ke1, r9, r10, r11 + xandnot sp, _ki1, r10, r11, r12 + xandnot sp, _ko1, r11, r12, r8 + xandnot sp, _ku1, r12, r8, r9 + + ldr r1, [sp, #mDe1] + ldr r10, [r0, #_ke1] + ldr r11, [r0, #_mi0] + xorrol r10, r1, 5 + ldr r1, [sp, #mDi0] + ldr r12, [r0, #_so1] + xorrol r11, r1, 7 + ldr r1, [sp, #mDo1] + ldr r8, [r0, #_bu0] + ldr r9, [r0, #_ga1] + xorrol r8, r7, 13 + xorrol r9, r4, 18 + xorrol r12, r1, 28 + xandnot sp, _ma1, r8, r9, r10 + xandnot sp, _me1, r9, r10, r11 + xandnot sp, _mi1, r10, r11, r12 + xandnot sp, _mo1, r11, r12, r8 + xandnot sp, _mu1, r12, r8, r9 + + ldr r1, [sp, #mDo0] + ldr r9, [r0, #_go0] + ldr r8, [r0, #_bi1] + xorrol r9, r1, 27 + ldr r10, [r0, #_ku0] + ldr r11, [r0, #_ma0] + ldr r12, [r0, #_se1] + ldr r1, [sp, #mDe1] + xorrol r8, r2, 31 + xorrol r10, r7, 19 + xorrol r11, r5, 20 + xorrol r12, r1, 1 + xandnot sp, _sa1, r8, r9, r10 + xandnot sp, _se1, r9, r10, r11 + xandnot sp, _si1, r10, r11, r12 + xandnot sp, _so1, r11, r12, r8 + xandnot sp, _su1, r12, r8, r9 + + ;//prepTheta E + xor5 r10, sp,_bu0, _gu0, _ku0, _mu0, _su0 + xor5 r6, sp,_be1, _ge1, _ke1, _me1, _se1 + eor r5, r10, r6, ROR #31 + xor5 r11, sp,_bu1, _gu1, _ku1, _mu1, _su1 + xor5 r7, sp,_be0, _ge0, _ke0, _me0, _se0 + eor r4, r11, r7 + + xor5 r8, sp,_bi0, _gi0, _ki0, _mi0, _si0 + eor r1, r8, r11, ROR #31 + str r1, [sp, #mDo0] + xor5 r9, sp,_bi1, _gi1, _ki1, _mi1, _si1 + eor r1, r9, r10 + str r1, [sp, #mDo1] + + xor5 r10, sp,_ba0, _ga0, _ka0, _ma0, _sa0 + eor lr, r10, r9, ROR #31 + xor5 r11, sp,_ba1, _ga1, _ka1, _ma1, _sa1 + 
eor r1, r11, r8 + str r1, [sp, #mDe1] + + xor5 r9, sp,_bo1, _go1, _ko1, _mo1, _so1 + eor r1, r7, r9, ROR #31 + str r1, [sp, #mDi0] + xor5 r8, sp,_bo0, _go0, _ko0, _mo0, _so0 + eor r2, r6, r8 + + eor r7, r8, r11, ROR #31 + eor r6, r9, r10 + + ;//thetaRhoPiChiIota 0, in E, out A + ldr r8, [sp, #_ba0] + ldr r9, [sp, #_ge0] + ldr r10, [sp, #_ki1] + ldr r11, [sp, #_mo1] + ldr r12, [sp, #_su0] + ldr r1, [sp, #mDo1] + eor r8, r8, r5 + xorrol r9, lr, 22 + xorrol r10, r2, 22 + xorrol r11, r1, 11 + xorrol r12, r7, 7 + xandnot r0, _be0, r9, r10, r11 + xandnot r0, _bi0, r10, r11, r12 + xandnot r0, _bo0, r11, r12, r8 + xandnot r0, _bu0, r12, r8, r9 + xandnotRC r0, _ba0, r8, r9, r10 + + ldr r8, [sp, #_bo0] + ldr r1, [sp, #mDo0] + ldr r9, [sp, #_gu0] + xorrol r8, r1, 14 + ldr r1, [sp, #mDe1] + ldr r10, [sp, #_ka1] + ldr r11, [sp, #_me1] + ldr r12, [sp, #_si1] + xorrol r9, r7, 10 + xorrol r10, r4, 2 + xorrol r11, r1, 23 + xorrol r12, r2, 31 + xandnot r0, _ga0, r8, r9, r10 + xandnot r0, _ge0, r9, r10, r11 + xandnot r0, _gi0, r10, r11, r12 + xandnot r0, _go0, r11, r12, r8 + xandnot r0, _gu0, r12, r8, r9 + + ldr r8, [sp, #_be1] + ldr r1, [sp, #mDe1] + ldr r9, [sp, #_gi0] + xorrol r8, r1, 1 + ldr r1, [sp, #mDi0] + ldr r10, [sp, #_ko1] + xorrol r9, r1, 3 + ldr r1, [sp, #mDo1] + ldr r11, [sp, #_mu0] + ldr r12, [sp, #_sa0] + xorrol r10, r1, 13 + xorrol r11, r7, 4 + xorrol r12, r5, 9 + xandnot r0, _ka0, r8, r9, r10 + xandnot r0, _ke0, r9, r10, r11 + xandnot r0, _ki0, r10, r11, r12 + xandnot r0, _ko0, r11, r12, r8 + xandnot r0, _ku0, r12, r8, r9 + + ldr r8, [sp, #_bu1] + ldr r9, [sp, #_ga0] + ldr r10, [sp, #_ke0] + ldr r11, [sp, #_mi1] + ldr r12, [sp, #_so0] + ldr r1, [sp, #mDo0] + xorrol r8, r6, 14 + xorrol r9, r5, 18 + xorrol r10, lr, 5 + xorrol r11, r2, 8 + xorrol r12, r1, 28 + xandnot r0, _ma0, r8, r9, r10 + xandnot r0, _me0, r9, r10, r11 + xandnot r0, _mi0, r10, r11, r12 + xandnot r0, _mo0, r11, r12, r8 + xandnot r0, _mu0, r12, r8, r9 + + ldr r1, [sp, #mDi0] + ldr r8, [sp, #_bi0] + 
ldr r9, [sp, #_go1]
+    xorrol r8, r1, 31
+    ldr r1, [sp, #mDo1]
+    ldr r10, [sp, #_ku1]
+    xorrol r9, r1, 28
+    ldr r11, [sp, #_ma1]
+    ldr r12, [sp, #_se0]
+    xorrol r10, r6, 20
+    xorrol r11, r4, 21
+    xorrol r12, lr, 1
+    xandnot r0, _sa0, r8, r9, r10
+    xandnot r0, _se0, r9, r10, r11
+    xandnot r0, _si0, r10, r11, r12
+    xandnot r0, _so0, r11, r12, r8
+    xandnot r0, _su0, r12, r8, r9
+
+    ;// thetaRhoPiChiIota 1, in E, out A
+    ;// (inputs are loaded from the stack copy via sp, results are stored through r0)
+    ldr r1, [sp, #mDe1]
+    ldr r9, [sp, #_ge1]
+    ldr r8, [sp, #_ba1]
+    xorrol r9, r1, 22
+    ldr r1, [sp, #mDi0]
+    ldr r10, [sp, #_ki0]
+    eor r8, r8, r4
+    xorrol r10, r1, 21
+    ldr r1, [sp, #mDo0]
+    ldr r11, [sp, #_mo0]
+    ldr r12, [sp, #_su1]
+    xorrol r11, r1, 10
+    xorrol r12, r6, 7
+    xandnot r0, _be1, r9, r10, r11
+    xandnot r0, _bi1, r10, r11, r12
+    xandnot r0, _bo1, r11, r12, r8
+    xandnot r0, _bu1, r12, r8, r9
+    xandnotRC r0, _ba1, r8, r9, r10
+
+    ldr r1, [sp, #mDo1]
+    ldr r8, [sp, #_bo1]
+    ldr r12, [sp, #_si0]
+    xorrol r8, r1, 14
+    ldr r1, [sp, #mDi0]
+    ldr r9, [sp, #_gu1]
+    xorrol r12, r1, 30
+    ldr r10, [sp, #_ka0]
+    ldr r11, [sp, #_me0]
+    xorrol r9, r6, 10
+    xorrol r10, r5, 1
+    xorrol r11, lr, 22
+    xandnot r0, _ga1, r8, r9, r10
+    xandnot r0, _ge1, r9, r10, r11
+    xandnot r0, _gi1, r10, r11, r12
+    xandnot r0, _go1, r11, r12, r8
+    xandnot r0, _gu1, r12, r8, r9
+
+    ldr r1, [sp, #mDo0]
+    ldr r10, [sp, #_ko0]
+    ldr r8, [sp, #_be0]
+    xorrol r10, r1, 12
+    ldr r9, [sp, #_gi1]
+    ldr r11, [sp, #_mu1]
+    ldr r12, [sp, #_sa1]
+    eor r8, r8, lr
+    xorrol r9, r2, 3
+    xorrol r11, r6, 4
+    xorrol r12, r4, 9
+    xandnot r0, _ka1, r8, r9, r10
+    xandnot r0, _ke1, r9, r10, r11
+    xandnot r0, _ki1, r10, r11, r12
+    xandnot r0, _ko1, r11, r12, r8
+    xandnot r0, _ku1, r12, r8, r9
+
+    ldr r1, [sp, #mDe1]
+    ldr r10, [sp, #_ke1]
+    ldr r11, [sp, #_mi0]
+    xorrol r10, r1, 5
+    ldr r1, [sp, #mDi0]
+    ldr r12, [sp, #_so1]
+    xorrol r11, r1, 7
+    ldr r1, [sp, #mDo1]
+    ldr r8, [sp, #_bu0]
+    ldr r9, [sp, #_ga1]
+    xorrol r8, r7, 13
+    xorrol r9, r4, 18
+    xorrol r12, r1, 28
+    xandnot r0, _ma1, r8, r9, r10
+ xandnot r0, _me1, r9, r10, r11 + xandnot r0, _mi1, r10, r11, r12 + xandnot r0, _mo1, r11, r12, r8 + xandnot r0, _mu1, r12, r8, r9 + + ldr r1, [sp, #mDo0] + ldr r9, [sp, #_go0] + ldr r8, [sp, #_bi1] + xorrol r9, r1, 27 + ldr r10, [sp, #_ku0] + ldr r11, [sp, #_ma0] + ldr r12, [sp, #_se1] + ldr r1, [sp, #mDe1] + xorrol r8, r2, 31 + xorrol r10, r7, 19 + xorrol r11, r5, 20 + xorrol r12, r1, 1 + xandnot r0, _sa1, r8, r9, r10 + xandnot r0, _se1, r9, r10, r11 + xandnot r0, _si1, r10, r11, r12 + xandnot r0, _so1, r11, r12, r8 + ldr r10, [r3] + xandnot r0, _su1, r12, r8, r9 + + cmp r10, #0xFFFFFFFF + bne roundLoop + + add sp,sp,#4*(50+4) + pop {r4-r12,pc} + + ENDP + + ALIGN + + END diff --git a/c_src/KeccakF-1600-armgcc.s b/c_src/KeccakF-1600-armgcc.s new file mode 100755 index 0000000..d16594b --- /dev/null +++ b/c_src/KeccakF-1600-armgcc.s @@ -0,0 +1,686 @@ +@ The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +@ Michaël Peeters and Gilles Van Assche. For more information, feedback or +@ questions, please refer to our website: http://keccak.noekeon.org/ +@ +@ Implementation by Ronny Van Keer, +@ hereby denoted as "the implementer". +@ +@ To the extent possible under law, the implementer has waived all copyright +@ and related or neighboring rights to the source code in this file. +@ http://creativecommons.org/publicdomain/zero/1.0/ + +@ This file was created from a .asm file +@ using the ads2gas.pl script. 
+ .equ DO1STROUNDING, 0 + + @ PRESERVE8 + @ THUMB + .syntax unified + .cpu cortex-m3 + .thumb + + +@// --- defines + +.equ _ba0 , 0*4 +.equ _ba1 , 1*4 +.equ _be0 , 2*4 +.equ _be1 , 3*4 +.equ _bi0 , 4*4 +.equ _bi1 , 5*4 +.equ _bo0 , 6*4 +.equ _bo1 , 7*4 +.equ _bu0 , 8*4 +.equ _bu1 , 9*4 +.equ _ga0 , 10*4 +.equ _ga1 , 11*4 +.equ _ge0 , 12*4 +.equ _ge1 , 13*4 +.equ _gi0 , 14*4 +.equ _gi1 , 15*4 +.equ _go0 , 16*4 +.equ _go1 , 17*4 +.equ _gu0 , 18*4 +.equ _gu1 , 19*4 +.equ _ka0 , 20*4 +.equ _ka1 , 21*4 +.equ _ke0 , 22*4 +.equ _ke1 , 23*4 +.equ _ki0 , 24*4 +.equ _ki1 , 25*4 +.equ _ko0 , 26*4 +.equ _ko1 , 27*4 +.equ _ku0 , 28*4 +.equ _ku1 , 29*4 +.equ _ma0 , 30*4 +.equ _ma1 , 31*4 +.equ _me0 , 32*4 +.equ _me1 , 33*4 +.equ _mi0 , 34*4 +.equ _mi1 , 35*4 +.equ _mo0 , 36*4 +.equ _mo1 , 37*4 +.equ _mu0 , 38*4 +.equ _mu1 , 39*4 +.equ _sa0 , 40*4 +.equ _sa1 , 41*4 +.equ _se0 , 42*4 +.equ _se1 , 43*4 +.equ _si0 , 44*4 +.equ _si1 , 45*4 +.equ _so0 , 46*4 +.equ _so1 , 47*4 +.equ _su0 , 48*4 +.equ _su1 , 49*4 + +.equ mDe1 , 50*4 +.equ mDi0 , 51*4 +.equ mDo0 , 52*4 +.equ mDo1 , 53*4 + +@// --- macros + +.macro xor5 result,ptr,b,g,k,m,s + + ldr \result, [\ptr, #\b] + ldr r1, [\ptr, #\g] + ldr r2, [\ptr, #\k] + eor \result, \result, r1 + ldr r1, [\ptr, #\m] + eor \result, \result, r2 + ldr r2, [\ptr, #\s] + eor \result, \result, r1 + eor \result, \result, r2 + .endm + +.macro xorrol b, yy, rr + + eor \b, \b, \yy + ror \b, #32-\rr + .endm + + +.macro xandnot resptr, resofs, aa, bb, cc + + bic r1, \cc, \bb + eor r1, r1, \aa + str r1, [\resptr, #\resofs] + .endm + +.macro xandnotRC resptr, resofs, aa, bb, cc + + ldr r1, [r3], #4 + bic \cc, \cc, \bb + eor \cc, \cc, r1 + eor \cc, \cc, \aa + str \cc, [\resptr, #\resofs] + .endm + + + .size KeccakPermutationOnWords, .-KeccakPermutationOnWords + .align 2 + .global KeccakPermutationOnWordsAfterXoring_ARM_asm + .thumb + .thumb_func + .type KeccakPermutationOnWordsAfterXoring_ARM_asm, %function +KeccakPermutationOnWordsAfterXoring_ARM_asm: + @ 
args = 0, pretend = 0, frame = 408 + @ frame_needed = 0, uses_anonymous_args = 0 + @ link register save eliminated. + + push {r4-r12,lr} + sub sp,sp,#4*(50+4) + + movs r9, r2 + beq interleaveDone + mov r8,r0 +interleaveLoop: + + ldr r4, [r1], #4 + ldr r5, [r1], #4 + ldrd r6, r7, [r8] + + @// Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 + and r3,r4,#0x55555555 + orr r3,r3,r3, LSR #1 + and r3,r3,#0x33333333 + orr r3,r3,r3, LSR #2 + and r3,r3,#0x0F0F0F0F + orr r3,r3,r3, LSR #4 + and r3,r3,#0x00FF00FF + bfi r3,r3,#8, #8 + eor r6,r6,r3, LSR #8 + + and r3,r5,#0x55555555 + orr r3,r3,r3, LSR #1 + and r3,r3,#0x33333333 + orr r3,r3,r3, LSR #2 + and r3,r3,#0x0F0F0F0F + orr r3,r3,r3, LSR #4 + and r3,r3,#0x00FF00FF + orr r3,r3,r3, LSR #8 + eor r6,r6,r3, LSL #16 + + and r3,r4,#0xAAAAAAAA + orr r3,r3,r3, LSL #1 + and r3,r3,#0xCCCCCCCC + orr r3,r3,r3, LSL #2 + and r3,r3,#0xF0F0F0F0 + orr r3,r3,r3, LSL #4 + and r3,r3,#0xFF00FF00 + orr r3,r3,r3, LSL #8 + eor r7,r7,r3, LSR #16 + + and r3,r5,#0xAAAAAAAA + orr r3,r3,r3, LSL #1 + and r3,r3,#0xCCCCCCCC + orr r3,r3,r3, LSL #2 + and r3,r3,#0xF0F0F0F0 + orr r3,r3,r3, LSL #4 + and r3,r3,#0xFF00FF00 + orr r3,r3,r3, LSL #8 + bfc r3, #0, #16 + eor r7,r7,r3 + + strd r6,r7,[r8], #8 + + subs r9,r9,#1 + bne interleaveLoop + +interleaveDone: + + ldr r3, =KeccakF1600RoundConstantsWithTerminator + b roundLoop @//jump over the table + .ltorg + + @ ALIGN + +KeccakF1600RoundConstantsWithTerminator: + @// 0 1 + .word 0x00000001 + .word 0x00000000 + .word 0x00000000 + .word 0x00000089 + .word 0x00000000 + .word 0x8000008b + .word 0x00000000 + .word 0x80008080 + .word 0x00000001 + .word 0x0000008b + .word 0x00000001 + .word 0x00008000 + .word 0x00000001 + .word 0x80008088 + .word 0x00000001 + .word 0x80000082 + .word 0x00000000 + .word 0x0000000b + .word 0x00000000 + .word 0x0000000a + .word 0x00000001 + .word 0x00008082 + .word 0x00000000 + .word 0x00008003 + .word 0x00000001 + .word 0x0000808b + .word 0x00000001 + .word 0x8000000b + 
.word 0x00000001 + .word 0x8000008a + .word 0x00000001 + .word 0x80000081 + .word 0x00000000 + .word 0x80000081 + .word 0x00000000 + .word 0x80000008 + .word 0x00000000 + .word 0x00000083 + .word 0x00000000 + .word 0x80008003 + .word 0x00000001 + .word 0x80008088 + .word 0x00000000 + .word 0x80000088 + .word 0x00000001 + .word 0x00008000 + .word 0x00000000 + .word 0x80008082 + .word 0xFFFFFFFF @//terminator + +roundLoop: + + @//prepTheta A + xor5 r10, r0,_bu0, _gu0, _ku0, _mu0, _su0 + xor5 r6, r0,_be1, _ge1, _ke1, _me1, _se1 + eor r5, r10, r6, ROR #31 + xor5 r11, r0,_bu1, _gu1, _ku1, _mu1, _su1 + xor5 r7, r0,_be0, _ge0, _ke0, _me0, _se0 + eor r4, r11, r7 + + xor5 r8, r0,_bi0, _gi0, _ki0, _mi0, _si0 + eor r1, r8, r11, ROR #31 + str r1, [sp, #mDo0] + xor5 r9, r0,_bi1, _gi1, _ki1, _mi1, _si1 + eor r1, r9, r10 + str r1, [sp, #mDo1] + + xor5 r10, r0,_ba0, _ga0, _ka0, _ma0, _sa0 + eor lr, r10, r9, ROR #31 + xor5 r11, r0,_ba1, _ga1, _ka1, _ma1, _sa1 + eor r1, r11, r8 + str r1, [sp, #mDe1] + + xor5 r9, r0,_bo1, _go1, _ko1, _mo1, _so1 + eor r1, r7, r9, ROR #31 + str r1, [sp, #mDi0] + xor5 r8, r0,_bo0, _go0, _ko0, _mo0, _so0 + eor r2, r6, r8 + + eor r7, r8, r11, ROR #31 + eor r6, r9, r10 + + @//thetaRhoPiChiIota 0, in A, out E + ldr r8, [r0, #_ba0] + ldr r9, [r0, #_ge0] + ldr r10, [r0, #_ki1] + ldr r11, [r0, #_mo1] + ldr r12, [r0, #_su0] + ldr r1, [sp, #mDo1] + eor r8, r8, r5 + xorrol r9, lr, 22 + xorrol r10, r2, 22 + xorrol r11, r1, 11 + xorrol r12, r7, 7 + xandnot sp, _be0, r9, r10, r11 + xandnot sp, _bi0, r10, r11, r12 + xandnot sp, _bo0, r11, r12, r8 + xandnot sp, _bu0, r12, r8, r9 + xandnotRC sp, _ba0, r8, r9, r10 + + ldr r8, [r0, #_bo0] + ldr r1, [sp, #mDo0] + ldr r9, [r0, #_gu0] + xorrol r8, r1, 14 + ldr r1, [sp, #mDe1] + ldr r10, [r0, #_ka1] + ldr r11, [r0, #_me1] + ldr r12, [r0, #_si1] + xorrol r9, r7, 10 + xorrol r10, r4, 2 + xorrol r11, r1, 23 + xorrol r12, r2, 31 + xandnot sp, _ga0, r8, r9, r10 + xandnot sp, _ge0, r9, r10, r11 + xandnot sp, _gi0, r10, r11, r12 + 
xandnot sp, _go0, r11, r12, r8 + xandnot sp, _gu0, r12, r8, r9 + + ldr r8, [r0, #_be1] + ldr r1, [sp, #mDe1] + ldr r9, [r0, #_gi0] + xorrol r8, r1, 1 + ldr r1, [sp, #mDi0] + ldr r10, [r0, #_ko1] + xorrol r9, r1, 3 + ldr r1, [sp, #mDo1] + ldr r11, [r0, #_mu0] + ldr r12, [r0, #_sa0] + xorrol r10, r1, 13 + xorrol r11, r7, 4 + xorrol r12, r5, 9 + xandnot sp, _ka0, r8, r9, r10 + xandnot sp, _ke0, r9, r10, r11 + xandnot sp, _ki0, r10, r11, r12 + xandnot sp, _ko0, r11, r12, r8 + xandnot sp, _ku0, r12, r8, r9 + + ldr r8, [r0, #_bu1] + ldr r9, [r0, #_ga0] + ldr r10, [r0, #_ke0] + ldr r11, [r0, #_mi1] + ldr r12, [r0, #_so0] + ldr r1, [sp, #mDo0] + xorrol r8, r6, 14 + xorrol r9, r5, 18 + xorrol r10, lr, 5 + xorrol r11, r2, 8 + xorrol r12, r1, 28 + xandnot sp, _ma0, r8, r9, r10 + xandnot sp, _me0, r9, r10, r11 + xandnot sp, _mi0, r10, r11, r12 + xandnot sp, _mo0, r11, r12, r8 + xandnot sp, _mu0, r12, r8, r9 + + ldr r1, [sp, #mDi0] + ldr r8, [r0, #_bi0] + ldr r9, [r0, #_go1] + xorrol r8, r1, 31 + ldr r1, [sp, #mDo1] + ldr r10, [r0, #_ku1] + xorrol r9, r1, 28 + ldr r11, [r0, #_ma1] + ldr r12, [r0, #_se0] + xorrol r10, r6, 20 + xorrol r11, r4, 21 + xorrol r12, lr, 1 + xandnot sp, _sa0, r8, r9, r10 + xandnot sp, _se0, r9, r10, r11 + xandnot sp, _si0, r10, r11, r12 + xandnot sp, _so0, r11, r12, r8 + xandnot sp, _su0, r12, r8, r9 + + @// thetaRhoPiChiIota 1, in A, out E + ldr r1, [sp, #mDe1] + ldr r9, [r0, #_ge1] + ldr r8, [r0, #_ba1] + xorrol r9, r1, 22 + ldr r1, [sp, #mDi0] + ldr r10, [r0, #_ki0] + eor r8, r8, r4 + xorrol r10, r1, 21 + ldr r1, [sp, #mDo0] + ldr r11, [r0, #_mo0] + ldr r12, [r0, #_su1] + xorrol r11, r1, 10 + xorrol r12, r6, 7 + xandnot sp, _be1, r9, r10, r11 + xandnot sp, _bi1, r10, r11, r12 + xandnot sp, _bo1, r11, r12, r8 + xandnot sp, _bu1, r12, r8, r9 + xandnotRC sp, _ba1, r8, r9, r10 + + ldr r1, [sp, #mDo1] + ldr r8, [r0, #_bo1] + ldr r12, [r0, #_si0] + xorrol r8, r1, 14 + ldr r1, [sp, #mDi0] + ldr r9, [r0, #_gu1] + xorrol r12, r1, 30 + ldr r10, [r0, #_ka0] + 
ldr r11, [r0, #_me0] + xorrol r9, r6, 10 + xorrol r10, r5, 1 + xorrol r11, lr, 22 + xandnot sp, _ga1, r8, r9, r10 + xandnot sp, _ge1, r9, r10, r11 + xandnot sp, _gi1, r10, r11, r12 + xandnot sp, _go1, r11, r12, r8 + xandnot sp, _gu1, r12, r8, r9 + + ldr r1, [sp, #mDo0] + ldr r10, [r0, #_ko0] + ldr r8, [r0, #_be0] + xorrol r10, r1, 12 + ldr r9, [r0, #_gi1] + ldr r11, [r0, #_mu1] + ldr r12, [r0, #_sa1] + eor r8, r8, lr + xorrol r9, r2, 3 + xorrol r11, r6, 4 + xorrol r12, r4, 9 + xandnot sp, _ka1, r8, r9, r10 + xandnot sp, _ke1, r9, r10, r11 + xandnot sp, _ki1, r10, r11, r12 + xandnot sp, _ko1, r11, r12, r8 + xandnot sp, _ku1, r12, r8, r9 + + ldr r1, [sp, #mDe1] + ldr r10, [r0, #_ke1] + ldr r11, [r0, #_mi0] + xorrol r10, r1, 5 + ldr r1, [sp, #mDi0] + ldr r12, [r0, #_so1] + xorrol r11, r1, 7 + ldr r1, [sp, #mDo1] + ldr r8, [r0, #_bu0] + ldr r9, [r0, #_ga1] + xorrol r8, r7, 13 + xorrol r9, r4, 18 + xorrol r12, r1, 28 + xandnot sp, _ma1, r8, r9, r10 + xandnot sp, _me1, r9, r10, r11 + xandnot sp, _mi1, r10, r11, r12 + xandnot sp, _mo1, r11, r12, r8 + xandnot sp, _mu1, r12, r8, r9 + + ldr r1, [sp, #mDo0] + ldr r9, [r0, #_go0] + ldr r8, [r0, #_bi1] + xorrol r9, r1, 27 + ldr r10, [r0, #_ku0] + ldr r11, [r0, #_ma0] + ldr r12, [r0, #_se1] + ldr r1, [sp, #mDe1] + xorrol r8, r2, 31 + xorrol r10, r7, 19 + xorrol r11, r5, 20 + xorrol r12, r1, 1 + xandnot sp, _sa1, r8, r9, r10 + xandnot sp, _se1, r9, r10, r11 + xandnot sp, _si1, r10, r11, r12 + xandnot sp, _so1, r11, r12, r8 + xandnot sp, _su1, r12, r8, r9 + + @//prepTheta E + xor5 r10, sp,_bu0, _gu0, _ku0, _mu0, _su0 + xor5 r6, sp,_be1, _ge1, _ke1, _me1, _se1 + eor r5, r10, r6, ROR #31 + xor5 r11, sp,_bu1, _gu1, _ku1, _mu1, _su1 + xor5 r7, sp,_be0, _ge0, _ke0, _me0, _se0 + eor r4, r11, r7 + + xor5 r8, sp,_bi0, _gi0, _ki0, _mi0, _si0 + eor r1, r8, r11, ROR #31 + str r1, [sp, #mDo0] + xor5 r9, sp,_bi1, _gi1, _ki1, _mi1, _si1 + eor r1, r9, r10 + str r1, [sp, #mDo1] + + xor5 r10, sp,_ba0, _ga0, _ka0, _ma0, _sa0 + eor lr, r10, r9, ROR 
#31 + xor5 r11, sp,_ba1, _ga1, _ka1, _ma1, _sa1 + eor r1, r11, r8 + str r1, [sp, #mDe1] + + xor5 r9, sp,_bo1, _go1, _ko1, _mo1, _so1 + eor r1, r7, r9, ROR #31 + str r1, [sp, #mDi0] + xor5 r8, sp,_bo0, _go0, _ko0, _mo0, _so0 + eor r2, r6, r8 + + eor r7, r8, r11, ROR #31 + eor r6, r9, r10 + + @//thetaRhoPiChiIota 0, in E, out A + ldr r8, [sp, #_ba0] + ldr r9, [sp, #_ge0] + ldr r10, [sp, #_ki1] + ldr r11, [sp, #_mo1] + ldr r12, [sp, #_su0] + ldr r1, [sp, #mDo1] + eor r8, r8, r5 + xorrol r9, lr, 22 + xorrol r10, r2, 22 + xorrol r11, r1, 11 + xorrol r12, r7, 7 + xandnot r0, _be0, r9, r10, r11 + xandnot r0, _bi0, r10, r11, r12 + xandnot r0, _bo0, r11, r12, r8 + xandnot r0, _bu0, r12, r8, r9 + xandnotRC r0, _ba0, r8, r9, r10 + + ldr r8, [sp, #_bo0] + ldr r1, [sp, #mDo0] + ldr r9, [sp, #_gu0] + xorrol r8, r1, 14 + ldr r1, [sp, #mDe1] + ldr r10, [sp, #_ka1] + ldr r11, [sp, #_me1] + ldr r12, [sp, #_si1] + xorrol r9, r7, 10 + xorrol r10, r4, 2 + xorrol r11, r1, 23 + xorrol r12, r2, 31 + xandnot r0, _ga0, r8, r9, r10 + xandnot r0, _ge0, r9, r10, r11 + xandnot r0, _gi0, r10, r11, r12 + xandnot r0, _go0, r11, r12, r8 + xandnot r0, _gu0, r12, r8, r9 + + ldr r8, [sp, #_be1] + ldr r1, [sp, #mDe1] + ldr r9, [sp, #_gi0] + xorrol r8, r1, 1 + ldr r1, [sp, #mDi0] + ldr r10, [sp, #_ko1] + xorrol r9, r1, 3 + ldr r1, [sp, #mDo1] + ldr r11, [sp, #_mu0] + ldr r12, [sp, #_sa0] + xorrol r10, r1, 13 + xorrol r11, r7, 4 + xorrol r12, r5, 9 + xandnot r0, _ka0, r8, r9, r10 + xandnot r0, _ke0, r9, r10, r11 + xandnot r0, _ki0, r10, r11, r12 + xandnot r0, _ko0, r11, r12, r8 + xandnot r0, _ku0, r12, r8, r9 + + ldr r8, [sp, #_bu1] + ldr r9, [sp, #_ga0] + ldr r10, [sp, #_ke0] + ldr r11, [sp, #_mi1] + ldr r12, [sp, #_so0] + ldr r1, [sp, #mDo0] + xorrol r8, r6, 14 + xorrol r9, r5, 18 + xorrol r10, lr, 5 + xorrol r11, r2, 8 + xorrol r12, r1, 28 + xandnot r0, _ma0, r8, r9, r10 + xandnot r0, _me0, r9, r10, r11 + xandnot r0, _mi0, r10, r11, r12 + xandnot r0, _mo0, r11, r12, r8 + xandnot r0, _mu0, r12, r8, r9 
+ + ldr r1, [sp, #mDi0] + ldr r8, [sp, #_bi0] + ldr r9, [sp, #_go1] + xorrol r8, r1, 31 + ldr r1, [sp, #mDo1] + ldr r10, [sp, #_ku1] + xorrol r9, r1, 28 + ldr r11, [sp, #_ma1] + ldr r12, [sp, #_se0] + xorrol r10, r6, 20 + xorrol r11, r4, 21 + xorrol r12, lr, 1 + xandnot r0, _sa0, r8, r9, r10 + xandnot r0, _se0, r9, r10, r11 + xandnot r0, _si0, r10, r11, r12 + xandnot r0, _so0, r11, r12, r8 + xandnot r0, _su0, r12, r8, r9 + + @// thetaRhoPiChiIota 1, in A, out E + ldr r1, [sp, #mDe1] + ldr r9, [sp, #_ge1] + ldr r8, [sp, #_ba1] + xorrol r9, r1, 22 + ldr r1, [sp, #mDi0] + ldr r10, [sp, #_ki0] + eor r8, r8, r4 + xorrol r10, r1, 21 + ldr r1, [sp, #mDo0] + ldr r11, [sp, #_mo0] + ldr r12, [sp, #_su1] + xorrol r11, r1, 10 + xorrol r12, r6, 7 + xandnot r0, _be1, r9, r10, r11 + xandnot r0, _bi1, r10, r11, r12 + xandnot r0, _bo1, r11, r12, r8 + xandnot r0, _bu1, r12, r8, r9 + xandnotRC r0, _ba1, r8, r9, r10 + + ldr r1, [sp, #mDo1] + ldr r8, [sp, #_bo1] + ldr r12, [sp, #_si0] + xorrol r8, r1, 14 + ldr r1, [sp, #mDi0] + ldr r9, [sp, #_gu1] + xorrol r12, r1, 30 + ldr r10, [sp, #_ka0] + ldr r11, [sp, #_me0] + xorrol r9, r6, 10 + xorrol r10, r5, 1 + xorrol r11, lr, 22 + xandnot r0, _ga1, r8, r9, r10 + xandnot r0, _ge1, r9, r10, r11 + xandnot r0, _gi1, r10, r11, r12 + xandnot r0, _go1, r11, r12, r8 + xandnot r0, _gu1, r12, r8, r9 + + ldr r1, [sp, #mDo0] + ldr r10, [sp, #_ko0] + ldr r8, [sp, #_be0] + xorrol r10, r1, 12 + ldr r9, [sp, #_gi1] + ldr r11, [sp, #_mu1] + ldr r12, [sp, #_sa1] + eor r8, r8, lr + xorrol r9, r2, 3 + xorrol r11, r6, 4 + xorrol r12, r4, 9 + xandnot r0, _ka1, r8, r9, r10 + xandnot r0, _ke1, r9, r10, r11 + xandnot r0, _ki1, r10, r11, r12 + xandnot r0, _ko1, r11, r12, r8 + xandnot r0, _ku1, r12, r8, r9 + + ldr r1, [sp, #mDe1] + ldr r10, [sp, #_ke1] + ldr r11, [sp, #_mi0] + xorrol r10, r1, 5 + ldr r1, [sp, #mDi0] + ldr r12, [sp, #_so1] + xorrol r11, r1, 7 + ldr r1, [sp, #mDo1] + ldr r8, [sp, #_bu0] + ldr r9, [sp, #_ga1] + xorrol r8, r7, 13 + xorrol r9, r4, 18 + 
xorrol r12, r1, 28 + xandnot r0, _ma1, r8, r9, r10 + xandnot r0, _me1, r9, r10, r11 + xandnot r0, _mi1, r10, r11, r12 + xandnot r0, _mo1, r11, r12, r8 + xandnot r0, _mu1, r12, r8, r9 + + ldr r1, [sp, #mDo0] + ldr r9, [sp, #_go0] + ldr r8, [sp, #_bi1] + xorrol r9, r1, 27 + ldr r10, [sp, #_ku0] + ldr r11, [sp, #_ma0] + ldr r12, [sp, #_se1] + ldr r1, [sp, #mDe1] + xorrol r8, r2, 31 + xorrol r10, r7, 19 + xorrol r11, r5, 20 + xorrol r12, r1, 1 + xandnot r0, _sa1, r8, r9, r10 + xandnot r0, _se1, r9, r10, r11 + xandnot r0, _si1, r10, r11, r12 + xandnot r0, _so1, r11, r12, r8 + ldr r10, [r3] + xandnot r0, _su1, r12, r8, r9 + + cmp r10, #0xFFFFFFFF + bne roundLoop + + add sp,sp,#4*(50+4) + pop {r4-r12,pc} + + @ + + @ ALIGN + diff --git a/c_src/KeccakF-1600-avr8.c b/c_src/KeccakF-1600-avr8.c new file mode 100755 index 0000000..7ea2679 --- /dev/null +++ b/c_src/KeccakF-1600-avr8.c @@ -0,0 +1,163 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by Ronny Van Keer, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include +#include +#include "AVR8-rotate64.h" + +typedef unsigned char UINT8; +typedef UINT8 tSmallUInt; +typedef unsigned long long UINT64; +typedef UINT64 tKeccakLane; + +#define cKeccakLaneSizeInBits (sizeof(tKeccakLane) * 8) + +#define cKeccakNumberOfRounds 24 + +static tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] PROGMEM = +{ + (tKeccakLane)0x0000000000000001ULL, + (tKeccakLane)0x0000000000008082ULL, + (tKeccakLane)0x800000000000808aULL, + (tKeccakLane)0x8000000080008000ULL, + (tKeccakLane)0x000000000000808bULL, + (tKeccakLane)0x0000000080000001ULL, + (tKeccakLane)0x8000000080008081ULL, + (tKeccakLane)0x8000000000008009ULL, + (tKeccakLane)0x000000000000008aULL, + (tKeccakLane)0x0000000000000088ULL, + (tKeccakLane)0x0000000080008009ULL, + (tKeccakLane)0x000000008000000aULL, + (tKeccakLane)0x000000008000808bULL, + (tKeccakLane)0x800000000000008bULL, + (tKeccakLane)0x8000000000008089ULL, + (tKeccakLane)0x8000000000008003ULL, + (tKeccakLane)0x8000000000008002ULL, + (tKeccakLane)0x8000000000000080ULL, + (tKeccakLane)0x000000000000800aULL, + (tKeccakLane)0x800000008000000aULL, + (tKeccakLane)0x8000000080008081ULL, + (tKeccakLane)0x8000000000008080ULL, + (tKeccakLane)0x0000000080000001ULL, + (tKeccakLane)0x8000000080008008ULL +}; + +static tSmallUInt KeccakF_RotationConstants[24] PROGMEM = +{ + ROT_CODE( 1), ROT_CODE( 3), ROT_CODE( 6), ROT_CODE(10), ROT_CODE(15), + ROT_CODE(21), ROT_CODE(28), ROT_CODE(36), ROT_CODE(45), ROT_CODE(55), + ROT_CODE( 2), ROT_CODE(14), ROT_CODE(27), ROT_CODE(41), ROT_CODE(56), + ROT_CODE( 8), ROT_CODE(25), ROT_CODE(43), ROT_CODE(62), ROT_CODE(18), + ROT_CODE(39), ROT_CODE(61), ROT_CODE(20), ROT_CODE(44) +}; + +static tSmallUInt KeccakF_PiLane[24] PROGMEM = +{ + 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 +}; + +static tSmallUInt KeccakF_Mod5[10] PROGMEM = +{ + 0, 1, 2, 3, 4, 0, 1, 2, 3, 4 +}; + + +void KeccakF( tKeccakLane * 
state ) +{ + tSmallUInt round; + tKeccakLane C[5]; + + // prepare Theta + { + tSmallUInt x; + tKeccakLane * pC; + for ( x = 0, pC = C; x < 5; ++x, ++pC ) + { + *pC = state[x] ^ state[5 + x] ^ state[10 + x] ^ state[15 + x] ^ state[20 + x]; + } + } + + for( round = 0; round < cKeccakNumberOfRounds; ++round ) + { + // Theta + { + tSmallUInt x; + for ( x = 0; x < 5; ++x ) + { + tKeccakLane temp; + tSmallUInt y; + temp = rotate64_1bit_left( C[pgm_read_byte((KeccakF_Mod5+1)+x)] ); + temp ^= C[pgm_read_byte((KeccakF_Mod5+4)+x)]; + for ( y = 0; y < 25; y += 5 ) + { + state[y + x] ^= temp; + } + } + } + + // Rho Pi + { + tKeccakLane temp; + tSmallUInt x; + + temp = state[1]; + for ( x = 0; x < 24; ++x ) + { + tSmallUInt t; + tKeccakLane T[1]; + t = pgm_read_byte(&KeccakF_PiLane[x]); + T[0] = state[t]; + state[t] = rotate64left_code( temp, pgm_read_byte(&KeccakF_RotationConstants[x]) ); + temp = T[0]; + } + } + + // Chi Iota Prepare Theta + { + tSmallUInt z; + UINT8 * p = (unsigned char *)state; + UINT8 * pC = (unsigned char *)C; + + for( z = 0; z < 8; ++z, ++p, ++pC ) + { + tSmallUInt y; + UINT8 c0, c1, c2, c3, c4, t; + + c0 = c1 = c2 = c3 = c4 = 0; + for( y = 5; y != 0; --y, p += 40 ) + { + UINT8 a0 = *p; + UINT8 a1 = *(p+8); + UINT8 a2 = *(p+16); + UINT8 a3 = *(p+24); + UINT8 a4 = *(p+32); + + *p = t = a0 ^ ((~a1) & a2); c0 ^= t; + *(p+8) = t = a1 ^ ((~a2) & a3); c1 ^= t; + *(p+16) = a2 ^= ((~a3) & a4); c2 ^= a2; + *(p+24) = a3 ^= ((~a4) & a0); c3 ^= a3; + *(p+32) = a4 ^= ((~a0) & a1); c4 ^= a4; + } + p -= 5 * 5 * 8; + y = pgm_read_byte( (UINT8 *)(KeccakF_RoundConstants+round) + z ); + *p ^= y; + *pC = c0 ^ y; + *(pC+ 8) = c1; + *(pC+16) = c2; + *(pC+24) = c3; + *(pC+32) = c4; + } + } + } + +} diff --git a/c_src/KeccakF-1600-avr8asm-compact.s b/c_src/KeccakF-1600-avr8asm-compact.s new file mode 100755 index 0000000..c87920f --- /dev/null +++ b/c_src/KeccakF-1600-avr8asm-compact.s @@ -0,0 +1,647 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, 
+Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by Ronny Van Keer, hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include "Keccak-avr8-settings.h" +#include "crypto_hash.h" + +#define cKeccakR_SizeInBytes (cKeccakR/8) + +#ifndef crypto_hash_BYTES + #ifdef cKeccakFixedOutputLengthInBytes + #define crypto_hash_BYTES cKeccakFixedOutputLengthInBytes + #else + #define crypto_hash_BYTES cKeccakR_SizeInBytes + #endif +#endif + +// Registers used in all routines +#define zero 1 +#define rpState 24 +#define rX 26 +#define rY 28 +#define rZ 30 + + +/* + * int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen ) + * + * argument out is passed in r24:r25 + * argument in is passed in r22:r23 + * argument inlen is passed in r14:r21, only lowest 16-bits (r14-r15) are used + */ +.global crypto_hash // populate.py, please update crypto_hash +crypto_hash: // populate.py, please update crypto_hash + + // crypto_hash only registers + #define rT1 16 + #define rT2 17 + #define rT3 18 + #define rInLen 22 //(2 regs) + #define sp 0x3D + + push r2 + push r3 + push r4 + push r5 + push r6 + push r7 + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push r16 + push r17 + push r28 + push r29 + + // Allocate state (25*8) + C variables (5*8) + in rZ, sp + in rZ+1, sp+1 + subi rZ, 240 + sbci rZ+1, 0 + in r0, 0x3F + cli + out sp+1, rZ+1 + out sp, rZ + out 0x3F, r0 + adiw rZ, 41 // pointer to start of state, end of C, compensate post decrement + + push r24 // save out pointer + push r25 + + movw rpState, rZ + movw rY, r22 //y contains in pointer + movw rInLen, r14 + + ldi rT3, 5*5*8 //clear state +clearStateLoop: + st z+, zero + 
dec rT3 + brne clearStateLoop + + // Full blocks + cpi rInLen, cKeccakR_SizeInBytes + cpc rInLen+1, zero + brcs ch_lastblock + +ch_FullRateLoop: + ldi rT3, cKeccakR_SizeInBytes + movw rZ, rpState +ch_XorLanesLoop: + ld rT1, Y+ + ld rT2, Z + eor rT1, rT2 + st Z+, rT1 + subi rT3, 1 + brne ch_XorLanesLoop + + push rY + push rY+1 + call KeccakF + pop rY+1 + pop rY + + subi rInLen, cKeccakR_SizeInBytes + sbci rInLen+1, 0 + cpi rInLen, cKeccakR_SizeInBytes + cpc rInLen+1, zero + brcc ch_FullRateLoop + +ch_lastblock: // XOR last uncomplete block into state + movw rZ, rpState + + subi rInLen, 0 + breq ch_Padding +ch_xorBytesLoop: + ld rT1, Y+ + ld rT2, Z + eor rT1, rT2 + st Z+, rT1 + subi rInLen, 1 + brne ch_xorBytesLoop + +ch_Padding: + ldi rT1, 1 + ld rT2, Z + eor rT1, rT2 + st Z, rT1 + + ldi rZ, cKeccakR_SizeInBytes-1 + add rZ, rpState + mov rZ+1, rpState+1 + adc rZ+1, zero + ld rT1, Z + subi rT1, 0x80 + st Z, rT1 + + call KeccakF + + //output + ldi rT3, crypto_hash_BYTES + movw rY, rpState + pop rZ+1 ; restore out pointer + pop rZ +outputLoop: + ld rT1, Y+ + st Z+, rT1 + dec rT3 + brne outputLoop + + + // Free state and pop registers + ldi rZ, 199 + add rpState, rZ + adc rpState+1, zero + in r0, 0x3F + cli + out sp+1, rpState+1 + out sp, rpState + out 0x3F, r0 + + pop r29 + pop r28 + pop r17 + pop r16 + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop r8 + pop r7 + pop r6 + pop r5 + pop r4 + pop r3 + pop r2 + + // return 0 + mov r24, zero + mov r25, zero + + #undef rInLen + #undef rT1 + #undef rT2 + #undef rT3 + #undef sp + + ret + + +//#define ROT_BIT(a) (a <= 4) ? ((a == 0) ? 
0x80 : (a & 7)) : (0x80 | (8-a)) + +#define ROT_BIT(a) ((a) & 7) +#define ROT_BYTE(a) (((a)/8 + !!(((a)%8) > 4)) & 7) + +KeccakF_RhoPiConstants: + .BYTE ROT_BIT( 1), ROT_BYTE( 3), 10 * 8 + .BYTE ROT_BIT( 3), ROT_BYTE( 6), 7 * 8 + .BYTE ROT_BIT( 6), ROT_BYTE(10), 11 * 8 + .BYTE ROT_BIT(10), ROT_BYTE(15), 17 * 8 + .BYTE ROT_BIT(15), ROT_BYTE(21), 18 * 8 + .BYTE ROT_BIT(21), ROT_BYTE(28), 3 * 8 + .BYTE ROT_BIT(28), ROT_BYTE(36), 5 * 8 + .BYTE ROT_BIT(36), ROT_BYTE(45), 16 * 8 + .BYTE ROT_BIT(45), ROT_BYTE(55), 8 * 8 + .BYTE ROT_BIT(55), ROT_BYTE( 2), 21 * 8 + .BYTE ROT_BIT( 2), ROT_BYTE(14), 24 * 8 + .BYTE ROT_BIT(14), ROT_BYTE(27), 4 * 8 + .BYTE ROT_BIT(27), ROT_BYTE(41), 15 * 8 + .BYTE ROT_BIT(41), ROT_BYTE(56), 23 * 8 + .BYTE ROT_BIT(56), ROT_BYTE( 8), 19 * 8 + .BYTE ROT_BIT( 8), ROT_BYTE(25), 13 * 8 + .BYTE ROT_BIT(25), ROT_BYTE(43), 12 * 8 + .BYTE ROT_BIT(43), ROT_BYTE(62), 2 * 8 + .BYTE ROT_BIT(62), ROT_BYTE(18), 20 * 8 + .BYTE ROT_BIT(18), ROT_BYTE(39), 14 * 8 + .BYTE ROT_BIT(39), ROT_BYTE(61), 22 * 8 + .BYTE ROT_BIT(61), ROT_BYTE(20), 9 * 8 + .BYTE ROT_BIT(20), ROT_BYTE(44), 6 * 8 + .BYTE ROT_BIT(44), ROT_BYTE( 1), 1 * 8 + + +KeccakF_RoundConstants: + .BYTE 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x82, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x8a, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x00, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x8b, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x01, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x81, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x09, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x8a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x88, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x09, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x0a, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x8b, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x89, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 + 
.BYTE 0x03, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x02, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x0a, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x0a, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x81, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x01, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x08, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80 + .BYTE 0xFF, 0 //terminator + + .text + + + +// KeccakF +// Not callable from C! +// +// argument rpState is passed in r24:r25 +// +KeccakF: + + // Variables used in multiple operations + #define rTemp 2 // 8 regs (2-9) + #define rTempBis 10 // 8 regs (10-17) + #define rTempTer 18 // 2 regs (18-19) + #define pRound 20 // 2 regs (20-21) + + // Initial Prepare Theta + #define TCIPx rTempTer + + movw rZ, rpState // Z points to 5 C lanes + sbiw rZ, 40 + movw rY, rpState + ldi TCIPx, 5*8 +KeccakInitialPrepTheta_Loop: + ld r0, Y + adiw rY, 40 + ld rTemp, Y + adiw rY, 40 + eor r0, rTemp + ld rTemp, Y + adiw rY, 40 + eor r0, rTemp + ld rTemp, Y + eor r0, rTemp + ldd rTemp, Y+40 + eor r0, rTemp + st Z+, r0 + subi rY, 119 + sbc rY+1, zero + dec TCIPx + brne KeccakInitialPrepTheta_Loop + #undef TCIPx + + ldi pRound, lo8(KeccakF_RoundConstants) + ldi pRound+1, hi8(KeccakF_RoundConstants) +Keccak_RoundLoop: + + // Theta + #define TCplus rX + #define TCminus rZ + #define TCcoordX rTempTer + #define TCcoordY rTempTer+1 + + movw TCminus, rpState + sbiw TCminus, 1*8 + movw TCplus, rpState + sbiw TCplus, 4*8 + movw rY, rpState + + ldi TCcoordX, 0x16 +KeccakTheta_Loop1: + ld rTemp+0, X+ + ld rTemp+1, X+ + ld rTemp+2, X+ + ld rTemp+3, X+ + ld rTemp+4, X+ + ld rTemp+5, X+ + ld rTemp+6, X+ + ld rTemp+7, X+ + + lsl rTemp+0 + rol rTemp+1 + rol rTemp+2 + rol rTemp+3 + rol rTemp+4 + rol rTemp+5 + rol rTemp+6 + rol rTemp+7 + adc rTemp+0, zero + + ld r0, Z+ + eor rTemp+0, r0 + ld r0, Z+ + eor rTemp+1, r0 + 
ld r0, Z+ + eor rTemp+2, r0 + ld r0, Z+ + eor rTemp+3, r0 + ld r0, Z+ + eor rTemp+4, r0 + ld r0, Z+ + eor rTemp+5, r0 + ld r0, Z+ + eor rTemp+6, r0 + ld r0, Z+ + eor rTemp+7, r0 + + ldi TCcoordY, 5 +KeccakTheta_Loop2: + ld r0, Y + eor r0, rTemp+0 + st Y+, r0 + ld r0, Y + eor r0, rTemp+1 + st Y+, r0 + ld r0, Y + eor r0, rTemp+2 + st Y+, r0 + ld r0, Y + eor r0, rTemp+3 + st Y+, r0 + ld r0, Y + eor r0, rTemp+4 + st Y+, r0 + ld r0, Y + eor r0, rTemp+5 + st Y+, r0 + ld r0, Y + eor r0, rTemp+6 + st Y+, r0 + ld r0, Y + eor r0, rTemp+7 + st Y+, r0 + adiw rY, 32 + + dec TCcoordY + brne KeccakTheta_Loop2 + + subi rY, 200-8 + sbc rY+1, zero + + lsr TCcoordX + brcc 1f + breq KeccakTheta_End + rjmp KeccakTheta_Loop1 +1: + cpi TCcoordX, 0x0B + brne 2f + sbiw TCminus, 40 + rjmp KeccakTheta_Loop1 +2: + sbiw TCplus, 40 + rjmp KeccakTheta_Loop1 + +KeccakTheta_End: + #undef TCplus + #undef TCminus + #undef TCcoordX + #undef TCcoordY + + + // Rho Pi + #define RPindex rTempTer+0 + #define RPTemp rTempTer+1 + + sbiw rY, 32 + + ld rTemp+0, Y+ + ld rTemp+1, Y+ + ld rTemp+2, Y+ + ld rTemp+3, Y+ + ld rTemp+4, Y+ + ld rTemp+5, Y+ + ld rTemp+6, Y+ + ld rTemp+7, Y+ + + ldi rZ, lo8(KeccakF_RhoPiConstants) + ldi rZ+1, hi8(KeccakF_RhoPiConstants) + +KeccakRhoPi_Loop: + ; do bit rotation + lpm RPTemp, Z+ ;get nuber of bits to rotate + cpi RPTemp, 5 + brcs rotate64_nbit_leftOrNot + neg RPTemp + andi RPTemp, 3 + +rotate64_nbit_right: + bst rTemp, 0 + ror rTemp+7 + ror rTemp+6 + ror rTemp+5 + ror rTemp+4 + ror rTemp+3 + ror rTemp+2 + ror rTemp+1 + ror rTemp + bld rTemp+7, 7 + dec RPTemp + brne rotate64_nbit_right + rjmp KeccakRhoPi_RhoBitRotateDone + +rotate64_nbit_leftOrNot: + tst RPTemp + breq KeccakRhoPi_RhoBitRotateDone +rotate64_nbit_left: + lsl rTemp + rol rTemp+1 + rol rTemp+2 + rol rTemp+3 + rol rTemp+4 + rol rTemp+5 + rol rTemp+6 + rol rTemp+7 + adc rTemp, r1 + dec RPTemp + brne rotate64_nbit_left + +KeccakRhoPi_RhoBitRotateDone: + lpm r0, Z+ ;get number of bytes to rotate + lpm RPindex, Z+ 
;get index in state + movw rY, rpState + add rY, RPindex + adc rY+1, zero + + ldi rX, rTempBis + add rX, r0 + mov rX+1, zero + ldi RPTemp, 8 +KeccakRhoPi_PiByteRotLoop: + ld r0, Y+ + st X+, r0 + cpi rX, rTempBis+8 + brne KeccakRhoPi_PiByteRotFirst + ldi rX, rTempBis +KeccakRhoPi_PiByteRotFirst: + dec RPTemp + brne KeccakRhoPi_PiByteRotLoop + + sbiw rY, 8 + st Y+, rTemp+0 + st Y+, rTemp+1 + st Y+, rTemp+2 + st Y+, rTemp+3 + st Y+, rTemp+4 + st Y+, rTemp+5 + st Y+, rTemp+6 + st Y+, rTemp+7 + + movw rTemp+0, rTempBis+0 + movw rTemp+2, rTempBis+2 + movw rTemp+4, rTempBis+4 + movw rTemp+6, rTempBis+6 +KeccakRhoPi_RhoDone: + subi RPindex, 8 + brne KeccakRhoPi_Loop + + #undef RPindex + #undef RPTemp + + + // Chi Iota prepare Theta + #define CIPTa0 rTemp + #define CIPTa1 rTemp+1 + #define CIPTa2 rTemp+2 + #define CIPTa3 rTemp+3 + #define CIPTa4 rTemp+4 + #define CIPTc0 rTempBis + #define CIPTc1 rTempBis+1 + #define CIPTc2 rTempBis+2 + #define CIPTc3 rTempBis+3 + #define CIPTc4 rTempBis+4 + #define CIPTz rTempBis+6 + #define CIPTy rTempBis+7 + + movw rY, rpState + movw rX, rpState ; 5 * C + sbiw rX, 40 + movw rZ, pRound + + ldi CIPTz, 8 +KeccakChiIotaPrepareTheta_zLoop: + mov CIPTc0, zero + mov CIPTc1, zero + movw CIPTc2, CIPTc0 + mov CIPTc4, zero + + ldi CIPTy, 5 +KeccakChiIotaPrepareTheta_yLoop: + ld CIPTa0, Y + ldd CIPTa1, Y+8 + ldd CIPTa2, Y+16 + ldd CIPTa3, Y+24 + ldd CIPTa4, Y+32 + + ;*p = t = a0 ^ ((~a1) & a2); c0 ^= t; + mov r0, CIPTa1 + com r0 + and r0, CIPTa2 + eor r0, CIPTa0 + eor CIPTc0, r0 + st Y, r0 + + ;*(p+8) = t = a1 ^ ((~a2) & a3); c1 ^= t; + mov r0, CIPTa2 + com r0 + and r0, CIPTa3 + eor r0, CIPTa1 + eor CIPTc1, r0 + std Y+8, r0 + + ;*(p+16) = a2 ^= ((~a3) & a4); c2 ^= a2; + mov r0, CIPTa3 + com r0 + and r0, CIPTa4 + eor r0, CIPTa2 + eor CIPTc2, r0 + std Y+16, r0 + + ;*(p+24) = a3 ^= ((~a4) & a0); c3 ^= a3; + mov r0, CIPTa4 + com r0 + and r0, CIPTa0 + eor r0, CIPTa3 + eor CIPTc3, r0 + std Y+24, r0 + + ;*(p+32) = a4 ^= ((~a0) & a1); c4 ^= a4; + com CIPTa0 
+ and CIPTa0, CIPTa1 + eor CIPTa0, CIPTa4 + eor CIPTc4, CIPTa0 + std Y+32, CIPTa0 + + adiw rY, 40 + dec CIPTy + brne KeccakChiIotaPrepareTheta_yLoop + + subi rY, 200 + sbc rY+1, zero + + lpm r0, Z+ ;Round Constant + ld CIPTa0, Y + eor CIPTa0, r0 + st Y+, CIPTa0 + + movw pRound, rZ + movw rZ, rX + eor CIPTc0, r0 + st Z+, CIPTc0 + std Z+7, CIPTc1 + std Z+15, CIPTc2 + std Z+23, CIPTc3 + std Z+31, CIPTc4 + movw rX, rZ + movw rZ, pRound + + dec CIPTz + brne KeccakChiIotaPrepareTheta_zLoop + + #undef CIPTa0 + #undef CIPTa1 + #undef CIPTa2 + #undef CIPTa3 + #undef CIPTa4 + #undef CIPTc0 + #undef CIPTc1 + #undef CIPTc2 + #undef CIPTc3 + #undef CIPTc4 + #undef CIPTz + #undef CIPTy + + + ;Check for terminator + lpm r0, Z + inc r0 + breq Keccak_Done + rjmp Keccak_RoundLoop +Keccak_Done: + ret + + #undef rTemp + #undef rTempBis + #undef rTempTer + #undef pRound + + #undef rpState + #undef zero + #undef rX + #undef rY + #undef rZ diff --git a/c_src/KeccakF-1600-avr8asm-fast.s b/c_src/KeccakF-1600-avr8asm-fast.s new file mode 100755 index 0000000..e27f174 --- /dev/null +++ b/c_src/KeccakF-1600-avr8asm-fast.s @@ -0,0 +1,934 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by Ronny Van Keer, hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include "Keccak-avr8-settings.h" +#include "crypto_hash.h" + +#define cKeccakR_SizeInBytes (cKeccakR/8) + +#ifndef crypto_hash_BYTES + #ifdef cKeccakFixedOutputLengthInBytes + #define crypto_hash_BYTES cKeccakFixedOutputLengthInBytes + #else + #define crypto_hash_BYTES cKeccakR_SizeInBytes + #endif +#endif + +// Registers used in all routines +#define zero 1 +#define rpState 24 +#define rX 26 +#define rY 28 +#define rZ 30 + + +/* + * int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen ) + * + * argument out is passed in r24:r25 + * argument in is passed in r22:r23 + * argument inlen is passed in r14:r21, only lowest 16-bits (r14-r15) are used + */ +.global crypto_hash // populate.py, please update crypto_hash +crypto_hash: // populate.py, please update crypto_hash + + // crypto_hash only registers + #define rInLen 16 //(2 regs) + #define rT1 18 + #define rT2 19 + #define rT3 20 + #define sp 0x3D + + push r2 + push r3 + push r4 + push r5 + push r6 + push r7 + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push r16 + push r17 + push r28 + push r29 + + // Allocate state (25*8) + C variables (5*8) + in rZ, sp + in rZ+1, sp+1 + subi rZ, 240 + sbci rZ+1, 0 + in r0, 0x3F + cli + out sp+1, rZ+1 + out sp, rZ + out 0x3F, r0 + adiw rZ, 41 // pointer to start of state, end of C, compensate post decrement + + push r24 // save out pointer + push r25 + + movw rpState, rZ + movw rY, r22 //y contains in pointer + movw rInLen, r14 + + ldi rT3, 5*5*2 //clear state (4 bytes each iteration) +clearStateLoop: + st z+, zero + st z+, zero + st z+, zero + st z+, zero + dec rT3 + brne clearStateLoop + + // Full blocks + cpi rInLen, cKeccakR_SizeInBytes + cpc rInLen+1, zero + brcs ch_lastblock + +ch_FullRateLoop: + ldi rT3, cKeccakR_SizeInBytes/8 + movw rZ, rpState +ch_XorLanesLoop: + ld rT1, Y+ + ld rT2, Z + eor rT1, rT2 + st Z+, rT1 + ld rT1, Y+ + ld rT2, Z + 
eor rT1, rT2 + st Z+, rT1 + ld rT1, Y+ + ld rT2, Z + eor rT1, rT2 + st Z+, rT1 + ld rT1, Y+ + ld rT2, Z + eor rT1, rT2 + st Z+, rT1 + ld rT1, Y+ + ld rT2, Z + eor rT1, rT2 + st Z+, rT1 + ld rT1, Y+ + ld rT2, Z + eor rT1, rT2 + st Z+, rT1 + ld rT1, Y+ + ld rT2, Z + eor rT1, rT2 + st Z+, rT1 + ld rT1, Y+ + ld rT2, Z + eor rT1, rT2 + st Z+, rT1 + + subi rT3, 1 + brne ch_XorLanesLoop + + push rY + push rY+1 + push rInLen + push rInLen+1 + call KeccakF + pop rInLen+1 + pop rInLen + pop rY+1 + pop rY + + subi rInLen, cKeccakR_SizeInBytes + sbci rInLen+1, 0 + cpi rInLen, cKeccakR_SizeInBytes + cpc rInLen+1, zero + brcc ch_FullRateLoop + +ch_lastblock: // XOR last uncomplete block into state + movw rZ, rpState + + lsr rInLen + brcc ch_xorBytes2 + ld rT1, Y+ + ld rT2, Z + eor rT1, rT2 + st Z+, rT1 + subi rInLen, 0 +ch_xorBytes2: + breq ch_Padding +ch_xorBytes2Loop: + ld rT1, Y+ + ld rT2, Z + eor rT1, rT2 + st Z+, rT1 + ld rT1, Y+ + ld rT2, Z + eor rT1, rT2 + st Z+, rT1 + subi rInLen, 1 + brne ch_xorBytes2Loop + +ch_Padding: + ldi rT1, 1 + ld rT2, Z + eor rT1, rT2 + st Z, rT1 + + ldi rZ, cKeccakR_SizeInBytes-1 + add rZ, rpState + mov rZ+1, rpState+1 + adc rZ+1, zero + ld rT1, Z + subi rT1, 0x80 + st Z, rT1 + + call KeccakF + + //output + ldi rT3, crypto_hash_BYTES/4 ; copy 4 bytes per iteration + movw rY, rpState + pop rZ+1 ; restore out pointer + pop rZ +outputLoop: + ld rT1, Y+ + st Z+, rT1 + ld rT1, Y+ + st Z+, rT1 + ld rT1, Y+ + st Z+, rT1 + ld rT1, Y+ + st Z+, rT1 + dec rT3 + brne outputLoop + + + // Free state and pop registers + ldi rZ, 199 + add rpState, rZ + adc rpState+1, zero + in r0, 0x3F + cli + out sp+1, rpState+1 + out sp, rpState + out 0x3F, r0 + + pop r29 + pop r28 + pop r17 + pop r16 + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop r8 + pop r7 + pop r6 + pop r5 + pop r4 + pop r3 + pop r2 + + // return 0 + mov r24, zero + mov r25, zero + + #undef rInLen + #undef rT1 + #undef rT2 + #undef rT3 + #undef sp + + ret + + +#define 
ROT_BIT(a) ((a) & 7) +#define ROT_BYTE(a) ((((a)/8 + !!(((a)%8) > 4)) & 7) * 9) + +KeccakF_RhoPiConstants: + .BYTE ROT_BIT( 1), ROT_BYTE( 3), 10 * 8 + .BYTE ROT_BIT( 3), ROT_BYTE( 6), 7 * 8 + .BYTE ROT_BIT( 6), ROT_BYTE(10), 11 * 8 + .BYTE ROT_BIT(10), ROT_BYTE(15), 17 * 8 + .BYTE ROT_BIT(15), ROT_BYTE(21), 18 * 8 + .BYTE ROT_BIT(21), ROT_BYTE(28), 3 * 8 + .BYTE ROT_BIT(28), ROT_BYTE(36), 5 * 8 + .BYTE ROT_BIT(36), ROT_BYTE(45), 16 * 8 + .BYTE ROT_BIT(45), ROT_BYTE(55), 8 * 8 + .BYTE ROT_BIT(55), ROT_BYTE( 2), 21 * 8 + .BYTE ROT_BIT( 2), ROT_BYTE(14), 24 * 8 + .BYTE ROT_BIT(14), ROT_BYTE(27), 4 * 8 + .BYTE ROT_BIT(27), ROT_BYTE(41), 15 * 8 + .BYTE ROT_BIT(41), ROT_BYTE(56), 23 * 8 + .BYTE ROT_BIT(56), ROT_BYTE( 8), 19 * 8 + .BYTE ROT_BIT( 8), ROT_BYTE(25), 13 * 8 + .BYTE ROT_BIT(25), ROT_BYTE(43), 12 * 8 + .BYTE ROT_BIT(43), ROT_BYTE(62), 2 * 8 + .BYTE ROT_BIT(62), ROT_BYTE(18), 20 * 8 + .BYTE ROT_BIT(18), ROT_BYTE(39), 14 * 8 + .BYTE ROT_BIT(39), ROT_BYTE(61), 22 * 8 + .BYTE ROT_BIT(61), ROT_BYTE(20), 9 * 8 + .BYTE ROT_BIT(20), ROT_BYTE(44), 6 * 8 + .BYTE ROT_BIT(44), ROT_BYTE( 1), 1 * 8 + + +KeccakF_RoundConstants: + .BYTE 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x82, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x8a, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x00, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x8b, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x01, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x81, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x09, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x8a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x88, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x09, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x0a, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x8b, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x89, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x03, 0x80, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x80 + .BYTE 0x02, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x0a, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x0a, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x81, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 + .BYTE 0x01, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00 + .BYTE 0x08, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80 + .BYTE 0xFF, 0 //terminator + + .text + + + +// KeccakF +// Not callable from C! +// +// argument rpState is passed in r24:r25 +// +KeccakF: + + // Variables used in multiple operations + #define rTemp 2 // 8 regs (2-9) + #define rTempBis 10 // 8 regs (10-17) + #define rTempTer 18 // 4 regs (18-21) + #define pRound 22 // 2 regs (22-23) + + // Initial Prepare Theta + #define TCIPx rTempTer + + movw rZ, rpState // Z points to 8 C + sbiw rZ, 40 + ldi TCIPx, 5 + movw rY, rpState +KeccakInitialPrepTheta_Loop: + ld rTemp+0, Y+ ;state[x] + ld rTemp+1, Y+ + ld rTemp+2, Y+ + ld rTemp+3, Y+ + ld rTemp+4, Y+ + ld rTemp+5, Y+ + ld rTemp+6, Y+ + ld rTemp+7, Y+ + + adiw rY, 32 + ld r0, Y+ ;state[5+x] + eor rTemp+0, r0 + ld r0, Y+ + eor rTemp+1, r0 + ld r0, Y+ + eor rTemp+2, r0 + ld r0, Y+ + eor rTemp+3, r0 + ld r0, Y+ + eor rTemp+4, r0 + ld r0, Y+ + eor rTemp+5, r0 + ld r0, Y+ + eor rTemp+6, r0 + ld r0, Y+ + eor rTemp+7, r0 + + adiw rY, 32 + ld r0, Y+ ;state[10+x] + eor rTemp+0, r0 + ld r0, Y+ + eor rTemp+1, r0 + ld r0, Y+ + eor rTemp+2, r0 + ld r0, Y+ + eor rTemp+3, r0 + ld r0, Y+ + eor rTemp+4, r0 + ld r0, Y+ + eor rTemp+5, r0 + ld r0, Y+ + eor rTemp+6, r0 + ld r0, Y+ + eor rTemp+7, r0 + + adiw rY, 32 + ld r0, Y+ ;state[15+x] + eor rTemp+0, r0 + ld r0, Y+ + eor rTemp+1, r0 + ld r0, Y+ + eor rTemp+2, r0 + ld r0, Y+ + eor rTemp+3, r0 + ld r0, Y+ + eor rTemp+4, r0 + ld r0, Y+ + eor rTemp+5, r0 + ld r0, Y+ + eor rTemp+6, r0 + ld r0, Y+ + eor rTemp+7, r0 + + adiw rY, 32 + ld r0, Y+ ;state[20+x] + eor rTemp+0, r0 + ld r0, Y+ + eor 
rTemp+1, r0 + ld r0, Y+ + eor rTemp+2, r0 + ld r0, Y+ + eor rTemp+3, r0 + ld r0, Y+ + eor rTemp+4, r0 + ld r0, Y+ + eor rTemp+5, r0 + ld r0, Y+ + eor rTemp+6, r0 + ld r0, Y+ + eor rTemp+7, r0 + + st Z+, rTemp+0 + st Z+, rTemp+1 + st Z+, rTemp+2 + st Z+, rTemp+3 + st Z+, rTemp+4 + st Z+, rTemp+5 + st Z+, rTemp+6 + st Z+, rTemp+7 + + subi rY, 160 + sbc rY+1, zero + + subi TCIPx, 1 + breq KeccakInitialPrepTheta_Done + rjmp KeccakInitialPrepTheta_Loop +KeccakInitialPrepTheta_Done: + #undef TCIPx + + ldi pRound, lo8(KeccakF_RoundConstants) + ldi pRound+1, hi8(KeccakF_RoundConstants) +Keccak_RoundLoop: + + // Theta + #define TCplus rX + #define TCminus rZ + #define TCcoordX rTempTer + #define TCcoordY rTempTer+1 + + movw TCminus, rpState + sbiw TCminus, 1*8 + movw TCplus, rpState + sbiw TCplus, 4*8 + movw rY, rpState + + ldi TCcoordX, 0x16 +KeccakTheta_Loop1: + ld rTemp+0, X+ + ld rTemp+1, X+ + ld rTemp+2, X+ + ld rTemp+3, X+ + ld rTemp+4, X+ + ld rTemp+5, X+ + ld rTemp+6, X+ + ld rTemp+7, X+ + + lsl rTemp+0 + rol rTemp+1 + rol rTemp+2 + rol rTemp+3 + rol rTemp+4 + rol rTemp+5 + rol rTemp+6 + rol rTemp+7 + adc rTemp+0, zero + + ld r0, Z+ + eor rTemp+0, r0 + ld r0, Z+ + eor rTemp+1, r0 + ld r0, Z+ + eor rTemp+2, r0 + ld r0, Z+ + eor rTemp+3, r0 + ld r0, Z+ + eor rTemp+4, r0 + ld r0, Z+ + eor rTemp+5, r0 + ld r0, Z+ + eor rTemp+6, r0 + ld r0, Z+ + eor rTemp+7, r0 + + ldi TCcoordY, 5 +KeccakTheta_Loop2: + ld r0, Y + eor r0, rTemp+0 + st Y+, r0 + ld r0, Y + eor r0, rTemp+1 + st Y+, r0 + ld r0, Y + eor r0, rTemp+2 + st Y+, r0 + ld r0, Y + eor r0, rTemp+3 + st Y+, r0 + ld r0, Y + eor r0, rTemp+4 + st Y+, r0 + ld r0, Y + eor r0, rTemp+5 + st Y+, r0 + ld r0, Y + eor r0, rTemp+6 + st Y+, r0 + ld r0, Y + eor r0, rTemp+7 + st Y+, r0 + adiw rY, 32 + + dec TCcoordY + brne KeccakTheta_Loop2 + + subi rY, 200-8 + sbc rY+1, zero + + lsr TCcoordX + brcc 1f + breq KeccakTheta_End + rjmp KeccakTheta_Loop1 +1: + cpi TCcoordX, 0x0B + brne 2f + sbiw TCminus, 40 + rjmp KeccakTheta_Loop1 +2: + 
sbiw TCplus, 40 + rjmp KeccakTheta_Loop1 + +KeccakTheta_End: + #undef TCplus + #undef TCminus + #undef TCcoordX + #undef TCcoordY + + + // Rho Pi + #define RPpConst rTempTer // 2 regs + #define RPindex rTempTer+2 + #define RPpBitRot rX + #define RPpByteRot pRound + + sbiw rY, 32 + + ld rTemp+0, Y+ + ld rTemp+1, Y+ + ld rTemp+2, Y+ + ld rTemp+3, Y+ + ld rTemp+4, Y+ + ld rTemp+5, Y+ + ld rTemp+6, Y+ + ld rTemp+7, Y+ + + push pRound + push pRound+1 + ldi RPpConst, lo8(KeccakF_RhoPiConstants) + ldi RPpConst+1, hi8(KeccakF_RhoPiConstants) + ldi RPpBitRot, pm_lo8(bit_rot_jmp_table) + ldi RPpBitRot+1, pm_hi8(bit_rot_jmp_table) + ldi RPpByteRot, pm_lo8(rotate64_0byte_left) + ldi RPpByteRot+1, pm_hi8(rotate64_0byte_left) + +KeccakRhoPi_Loop: + ; get rotation codes and state index + movw rZ, RPpConst + lpm r0, Z+ ;bits + lpm rTempBis, Z+ ;bytes + lpm RPindex, Z+ + movw RPpConst, rZ + + ; do bit rotation + movw rZ, RPpBitRot + add rZ, r0 + adc rZ+1, zero + ijmp + +KeccakRhoPi_RhoBitRotateDone: + movw rY, rpState + add rY, RPindex + adc rY+1, zero + + movw rZ, RPpByteRot + add rZ, rTempBis + adc rZ+1, zero + ijmp + +KeccakRhoPi_PiStore: + sbiw rY, 8 + st Y+, rTemp+0 + st Y+, rTemp+1 + st Y+, rTemp+2 + st Y+, rTemp+3 + st Y+, rTemp+4 + st Y+, rTemp+5 + st Y+, rTemp+6 + st Y+, rTemp+7 + + movw rTemp+0, rTempBis+0 + movw rTemp+2, rTempBis+2 + movw rTemp+4, rTempBis+4 + movw rTemp+6, rTempBis+6 +KeccakRhoPi_RhoDone: + subi RPindex, 8 + brne KeccakRhoPi_Loop + pop pRound+1 + pop pRound + + #undef RPpConst + #undef RPindex + #undef RPpBitRot + #undef RPpByteRot + + + // Chi Iota prepare Theta + #define CIPTa0 rTemp + #define CIPTa1 rTemp+1 + #define CIPTa2 rTemp+2 + #define CIPTa3 rTemp+3 + #define CIPTa4 rTemp+4 + #define CIPTc0 rTempBis + #define CIPTc1 rTempBis+1 + #define CIPTc2 rTempBis+2 + #define CIPTc3 rTempBis+3 + #define CIPTc4 rTempBis+4 + #define CIPTz rTempBis+6 + #define CIPTy rTempBis+7 + + movw rY, rpState + movw rX, rpState ; 5 * C + sbiw rX, 40 + movw rZ, pRound + 
+ ldi CIPTz, 8 +KeccakChiIotaPrepareTheta_zLoop: + mov CIPTc0, zero + mov CIPTc1, zero + movw CIPTc2, CIPTc0 + mov CIPTc4, zero + + ldi CIPTy, 5 +KeccakChiIotaPrepareTheta_yLoop: + ld CIPTa0, Y + ldd CIPTa1, Y+8 + ldd CIPTa2, Y+16 + ldd CIPTa3, Y+24 + ldd CIPTa4, Y+32 + + ;*p = t = a0 ^ ((~a1) & a2); c0 ^= t; + mov r0, CIPTa1 + com r0 + and r0, CIPTa2 + eor r0, CIPTa0 + eor CIPTc0, r0 + st Y, r0 + + ;*(p+8) = t = a1 ^ ((~a2) & a3); c1 ^= t; + mov r0, CIPTa2 + com r0 + and r0, CIPTa3 + eor r0, CIPTa1 + eor CIPTc1, r0 + std Y+8, r0 + + ;*(p+16) = a2 ^= ((~a3) & a4); c2 ^= a2; + mov r0, CIPTa3 + com r0 + and r0, CIPTa4 + eor r0, CIPTa2 + eor CIPTc2, r0 + std Y+16, r0 + + ;*(p+24) = a3 ^= ((~a4) & a0); c3 ^= a3; + mov r0, CIPTa4 + com r0 + and r0, CIPTa0 + eor r0, CIPTa3 + eor CIPTc3, r0 + std Y+24, r0 + + ;*(p+32) = a4 ^= ((~a0) & a1); c4 ^= a4; + com CIPTa0 + and CIPTa0, CIPTa1 + eor CIPTa0, CIPTa4 + eor CIPTc4, CIPTa0 + std Y+32, CIPTa0 + + adiw rY, 40 + dec CIPTy + brne KeccakChiIotaPrepareTheta_yLoop + + subi rY, 200 + sbc rY+1, zero + + lpm r0, Z+ ;Round Constant + ld CIPTa0, Y + eor CIPTa0, r0 + st Y+, CIPTa0 + + movw pRound, rZ + movw rZ, rX + eor CIPTc0, r0 + st Z+, CIPTc0 + std Z+7, CIPTc1 + std Z+15, CIPTc2 + std Z+23, CIPTc3 + std Z+31, CIPTc4 + movw rX, rZ + movw rZ, pRound + + dec CIPTz + brne KeccakChiIotaPrepareTheta_zLoop + + #undef CIPTa0 + #undef CIPTa1 + #undef CIPTa2 + #undef CIPTa3 + #undef CIPTa4 + #undef CIPTc0 + #undef CIPTc1 + #undef CIPTc2 + #undef CIPTc3 + #undef CIPTc4 + #undef CIPTz + #undef CIPTy + + + ;Check for terminator + lpm r0, Z + inc r0 + breq Keccak_Done + rjmp Keccak_RoundLoop +Keccak_Done: + ret + + +bit_rot_jmp_table: + rjmp KeccakRhoPi_RhoBitRotateDone + rjmp rotate64_1bit_left + rjmp rotate64_2bit_left + rjmp rotate64_3bit_left + rjmp rotate64_4bit_left + rjmp rotate64_3bit_right + rjmp rotate64_2bit_right + rjmp rotate64_1bit_right + +rotate64_4bit_left: + lsl rTemp + rol rTemp+1 + rol rTemp+2 + rol rTemp+3 + rol rTemp+4 + 
rol rTemp+5 + rol rTemp+6 + rol rTemp+7 + adc rTemp, r1 +rotate64_3bit_left: + lsl rTemp + rol rTemp+1 + rol rTemp+2 + rol rTemp+3 + rol rTemp+4 + rol rTemp+5 + rol rTemp+6 + rol rTemp+7 + adc rTemp, r1 +rotate64_2bit_left: + lsl rTemp + rol rTemp+1 + rol rTemp+2 + rol rTemp+3 + rol rTemp+4 + rol rTemp+5 + rol rTemp+6 + rol rTemp+7 + adc rTemp, r1 +rotate64_1bit_left: + lsl rTemp + rol rTemp+1 + rol rTemp+2 + rol rTemp+3 + rol rTemp+4 + rol rTemp+5 + rol rTemp+6 + rol rTemp+7 + adc rTemp, r1 + rjmp KeccakRhoPi_RhoBitRotateDone + +rotate64_3bit_right: + bst rTemp, 0 + ror rTemp+7 + ror rTemp+6 + ror rTemp+5 + ror rTemp+4 + ror rTemp+3 + ror rTemp+2 + ror rTemp+1 + ror rTemp + bld rTemp+7, 7 +rotate64_2bit_right: + bst rTemp, 0 + ror rTemp+7 + ror rTemp+6 + ror rTemp+5 + ror rTemp+4 + ror rTemp+3 + ror rTemp+2 + ror rTemp+1 + ror rTemp + bld rTemp+7, 7 +rotate64_1bit_right: + bst rTemp, 0 + ror rTemp+7 + ror rTemp+6 + ror rTemp+5 + ror rTemp+4 + ror rTemp+3 + ror rTemp+2 + ror rTemp+1 + ror rTemp + bld rTemp+7, 7 + rjmp KeccakRhoPi_RhoBitRotateDone + +/* +** Each byte rotate routine must be 9 instructions long. 
+*/ +rotate64_0byte_left: ; byte-rotate entry 0: the 8 bytes at Y are copied to rTempBis+0..7 in order (no rotation) + ld rTempBis+0, Y+ + ld rTempBis+1, Y+ + ld rTempBis+2, Y+ + ld rTempBis+3, Y+ + ld rTempBis+4, Y+ + ld rTempBis+5, Y+ + ld rTempBis+6, Y+ + ld rTempBis+7, Y+ + rjmp KeccakRhoPi_PiStore ; 8 loads + this jump = the 9 instructions required per routine + +rotate64_1byte_left: ; first byte read from Y goes to rTempBis+1, the last one wraps to rTempBis+0 + ld rTempBis+1, Y+ + ld rTempBis+2, Y+ + ld rTempBis+3, Y+ + ld rTempBis+4, Y+ + ld rTempBis+5, Y+ + ld rTempBis+6, Y+ + ld rTempBis+7, Y+ + ld rTempBis+0, Y+ + rjmp KeccakRhoPi_PiStore + +rotate64_2byte_left: ; first byte read from Y goes to rTempBis+2, destinations wrap after rTempBis+7 + ld rTempBis+2, Y+ + ld rTempBis+3, Y+ + ld rTempBis+4, Y+ + ld rTempBis+5, Y+ + ld rTempBis+6, Y+ + ld rTempBis+7, Y+ + ld rTempBis+0, Y+ + ld rTempBis+1, Y+ + rjmp KeccakRhoPi_PiStore + +rotate64_3byte_left: ; first byte read from Y goes to rTempBis+3, destinations wrap after rTempBis+7 + ld rTempBis+3, Y+ + ld rTempBis+4, Y+ + ld rTempBis+5, Y+ + ld rTempBis+6, Y+ + ld rTempBis+7, Y+ + ld rTempBis+0, Y+ + ld rTempBis+1, Y+ + ld rTempBis+2, Y+ + rjmp KeccakRhoPi_PiStore + +rotate64_4byte_left: ; first byte read from Y goes to rTempBis+4, destinations wrap after rTempBis+7 + ld rTempBis+4, Y+ + ld rTempBis+5, Y+ + ld rTempBis+6, Y+ + ld rTempBis+7, Y+ + ld rTempBis+0, Y+ + ld rTempBis+1, Y+ + ld rTempBis+2, Y+ + ld rTempBis+3, Y+ + rjmp KeccakRhoPi_PiStore + +rotate64_5byte_left: ; first byte read from Y goes to rTempBis+5, destinations wrap after rTempBis+7 + ld rTempBis+5, Y+ + ld rTempBis+6, Y+ + ld rTempBis+7, Y+ + ld rTempBis+0, Y+ + ld rTempBis+1, Y+ + ld rTempBis+2, Y+ + ld rTempBis+3, Y+ + ld rTempBis+4, Y+ + rjmp KeccakRhoPi_PiStore + +rotate64_6byte_left: ; first byte read from Y goes to rTempBis+6, destinations wrap after rTempBis+7 + ld rTempBis+6, Y+ + ld rTempBis+7, Y+ + ld rTempBis+0, Y+ + ld rTempBis+1, Y+ + ld rTempBis+2, Y+ + ld rTempBis+3, Y+ + ld rTempBis+4, Y+ + ld rTempBis+5, Y+ + rjmp KeccakRhoPi_PiStore + +rotate64_7byte_left: ; first byte read from Y goes to rTempBis+7, the remaining seven fill rTempBis+0..6 + ld rTempBis+7, Y+ + ld rTempBis+0, Y+ + ld rTempBis+1, Y+ + ld rTempBis+2, Y+ + ld rTempBis+3, Y+ + ld rTempBis+4, Y+ + ld rTempBis+5, Y+ + ld rTempBis+6, Y+ + rjmp KeccakRhoPi_PiStore + + #undef rTemp + #undef rTempBis + #undef rTempTer + #undef pRound + + #undef rpState + #undef zero + #undef rX + #undef rY + #undef rZ diff --git a/c_src/KeccakF-1600-inplace-armgcc-ARMv7A-NEON.s b/c_src/KeccakF-1600-inplace-armgcc-ARMv7A-NEON.s new file mode 100755 index 0000000..539e8ea --- /dev/null 
+++ b/c_src/KeccakF-1600-inplace-armgcc-ARMv7A-NEON.s @@ -0,0 +1,446 @@ +@ The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +@ Michaël Peeters and Gilles Van Assche. For more information, feedback or +@ questions, please refer to our website: http://keccak.noekeon.org/ +@ +@ Implementation by Ronny Van Keer, hereby denoted as "the implementer". +@ +@ To the extent possible under law, the implementer has waived all copyright +@ and related or neighboring rights to the source code in this file. +@ http://creativecommons.org/publicdomain/zero/1.0/ + +@ This file was created from a .asm file +@ using the ads2gas.pl script. +.equ DO1STROUNDING, 0 + + @ PRESERVE8 +.text + +@// --- offsets in state +.equ Aba, 0*8 +.equ Aga, 1*8 +.equ Aka, 2*8 +.equ Ama, 3*8 +.equ Asa, 4*8 + +@// --- macros + +.macro KeccakThetaRhoPiChiIota argA1, argA2, argA3, argA4, argA5 + + @Prepare Theta + @Ca = Aba^Aga^Aka^Ama^Asa@ + @Ce = Abe^Age^Ake^Ame^Ase@ + @Ci = Abi^Agi^Aki^Ami^Asi@ + @Co = Abo^Ago^Ako^Amo^Aso@ + @Cu = Abu^Agu^Aku^Amu^Asu@ + @De = Ca^ROL64(Ci, 1)@ + @Di = Ce^ROL64(Co, 1)@ + @Do = Ci^ROL64(Cu, 1)@ + @Du = Co^ROL64(Ca, 1)@ + @Da = Cu^ROL64(Ce, 1)@ + + veor.64 q4, q6, q7 + veor.64 q5, q9, q10 + veor.64 d8, d8, d9 + veor.64 d10, d10, d11 + veor.64 d1, d8, d16 + veor.64 d2, d10, d17 + + veor.64 q4, q11, q12 + veor.64 q5, q14, q15 + veor.64 d8, d8, d9 + veor.64 d10, d10, d11 + veor.64 d3, d8, d26 + + vadd.u64 q4, q1, q1 + veor.64 d4, d10, d27 + vmov.64 d0, d5 + vsri.64 q4, q1, #63 + + vadd.u64 q5, q2, q2 + veor.64 q4, q4, q0 + vsri.64 q5, q2, #63 + vadd.u64 d7, d1, d1 + veor.64 \argA2, \argA2, d8 + veor.64 q5, q5, q1 + + vsri.64 d7, d1, #63 + vshl.u64 d1, \argA2, #44 + veor.64 \argA3, \argA3, d9 + veor.64 d7, d7, d4 + + @Ba = argA1^Da@ + @Be = ROL64((argA2^De), 44)@ + @Bi = ROL64((argA3^Di), 43)@ + @Bo = ROL64((argA4^Do), 21)@ + @Bu = ROL64((argA5^Du), 14)@ + @argA2 = Be ^((~Bi)& Bo )@ + @argA3 = Bi ^((~Bo)& Bu )@ + @argA4 = Bo ^((~Bu)& Ba )@ + @argA5 = Bu ^((~Ba)& 
Be )@ + @argA1 = Ba ^((~Be)& Bi )@ argA1 ^= KeccakF1600RoundConstants[i+round]@ + vsri.64 d1, \argA2, #64-44 + vshl.u64 d2, \argA3, #43 + vldr.64 d0, [sp, #\argA1] + veor.64 \argA4, \argA4, d10 + vsri.64 d2, \argA3, #64-43 + vshl.u64 d3, \argA4, #21 + veor.64 \argA5, \argA5, d11 + veor.64 d0, d0, d7 + vsri.64 d3, \argA4, #64-21 + vbic.64 d5, d2, d1 + vshl.u64 d4, \argA5, #14 + vbic.64 \argA2, d3, d2 + vld1.64 d6, [r3]! + veor.64 d5, d0 + vsri.64 d4, \argA5, #64-14 + veor.64 d5, d6 + vbic.64 \argA5, d1, d0 + vbic.64 \argA3, d4, d3 + vbic.64 \argA4, d0, d4 + veor.64 \argA2, d1 + vstr.64 d5, [sp, #\argA1] + veor.64 \argA3, d2 + veor.64 \argA4, d3 + veor.64 \argA5, d4 + + .endm + +.macro KeccakThetaRhoPiChi1 argA1, argA2, argA3, argA4, argA5 + + @d2 = ROL64((argA1^Da), 3)@ + @d3 = ROL64((argA2^De), 45)@ + @d4 = ROL64((argA3^Di), 61)@ + @d0 = ROL64((argA4^Do), 28)@ + @d1 = ROL64((argA5^Du), 20)@ + @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@ + @argA2 = Be ^((~Bi)& Bo )@ + @argA3 = Bi ^((~Bo)& Bu )@ + @argA4 = Bo ^((~Bu)& Ba )@ + @argA5 = Bu ^((~Ba)& Be )@ + + veor.64 \argA2, \argA2, d8 + veor.64 \argA3, \argA3, d9 + vshl.u64 d3, \argA2, #45 + vldr.64 d6, [sp, #\argA1] + vshl.u64 d4, \argA3, #61 + veor.64 \argA4, \argA4, d10 + vsri.64 d3, \argA2, #64-45 + veor.64 \argA5, \argA5, d11 + vsri.64 d4, \argA3, #64-61 + vshl.u64 d0, \argA4, #28 + veor.64 d6, d6, d7 + vshl.u64 d1, \argA5, #20 + vbic.64 \argA3, d4, d3 + vsri.64 d0, \argA4, #64-28 + vbic.64 \argA4, d0, d4 + vshl.u64 d2, d6, #3 + vsri.64 d1, \argA5, #64-20 + veor.64 \argA4, d3 + vsri.64 d2, d6, #64-3 + vbic.64 \argA5, d1, d0 + vbic.64 d6, d2, d1 + vbic.64 \argA2, d3, d2 + veor.64 d6, d0 + veor.64 \argA2, d1 + vstr.64 d6, [sp, #\argA1] + veor.64 \argA3, d2 + veor.64 d5, d6 + veor.64 \argA5, d4 + + .endm + +.macro KeccakThetaRhoPiChi2 argA1, argA2, argA3, argA4, argA5 + + @d4 = ROL64((argA1^Da), 18)@ + @d0 = ROL64((argA2^De), 1)@ + @d1 = ROL64((argA3^Di), 6)@ + @d2 = ROL64((argA4^Do), 25)@ + @d3 = ROL64((argA5^Du), 8)@ + 
@argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@ + @argA2 = Be ^((~Bi)& Bo )@ + @argA3 = Bi ^((~Bo)& Bu )@ + @argA4 = Bo ^((~Bu)& Ba )@ + @argA5 = Bu ^((~Ba)& Be )@ + + veor.64 \argA3, \argA3, d9 + veor.64 \argA4, \argA4, d10 + vshl.u64 d1, \argA3, #6 + vldr.64 d6, [sp, #\argA1] + vshl.u64 d2, \argA4, #25 + veor.64 \argA5, \argA5, d11 + vsri.64 d1, \argA3, #64-6 + veor.64 \argA2, \argA2, d8 + vsri.64 d2, \argA4, #64-25 + vext.8 d3, \argA5, \argA5, #7 + veor.64 d6, d6, d7 + vbic.64 \argA3, d2, d1 + vadd.u64 d0, \argA2, \argA2 + vbic.64 \argA4, d3, d2 + vsri.64 d0, \argA2, #64-1 + vshl.u64 d4, d6, #18 + veor.64 \argA2, d1, \argA4 + veor.64 \argA3, d0 + vsri.64 d4, d6, #64-18 + vstr.64 \argA3, [sp, #\argA1] + veor.64 d5, \argA3 + vbic.64 \argA5, d1, d0 + vbic.64 \argA3, d4, d3 + vbic.64 \argA4, d0, d4 + veor.64 \argA3, d2 + veor.64 \argA4, d3 + veor.64 \argA5, d4 + + .endm + +.macro KeccakThetaRhoPiChi3 argA1, argA2, argA3, argA4, argA5 + + @d1 = ROL64((argA1^Da), 36)@ + @d2 = ROL64((argA2^De), 10)@ + @d3 = ROL64((argA3^Di), 15)@ + @d4 = ROL64((argA4^Do), 56)@ + @d0 = ROL64((argA5^Du), 27)@ + @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@ + @argA2 = Be ^((~Bi)& Bo )@ + @argA3 = Bi ^((~Bo)& Bu )@ + @argA4 = Bo ^((~Bu)& Ba )@ + @argA5 = Bu ^((~Ba)& Be )@ + + veor.64 \argA2, \argA2, d8 + veor.64 \argA3, \argA3, d9 + vshl.u64 d2, \argA2, #10 + vldr.64 d6, [sp, #\argA1] + vshl.u64 d3, \argA3, #15 + veor.64 \argA4, \argA4, d10 + vsri.64 d2, \argA2, #64-10 + vsri.64 d3, \argA3, #64-15 + veor.64 \argA5, \argA5, d11 + vext.8 d4, \argA4, \argA4, #1 + vbic.64 \argA2, d3, d2 + vshl.u64 d0, \argA5, #27 + veor.64 d6, d6, d7 + vbic.64 \argA3, d4, d3 + vsri.64 d0, \argA5, #64-27 + vshl.u64 d1, d6, #36 + veor.64 \argA3, d2 + vbic.64 \argA4, d0, d4 + vsri.64 d1, d6, #64-36 + + veor.64 \argA4, d3 + vbic.64 d6, d2, d1 + vbic.64 \argA5, d1, d0 + veor.64 d6, d0 + veor.64 \argA2, d1 + vstr.64 d6, [sp, #\argA1] + veor.64 d5, d6 + veor.64 \argA5, d4 + + .endm + +.macro KeccakThetaRhoPiChi4 argA1, argA2, 
argA3, argA4, argA5 + + @d3 = ROL64((argA1^Da), 41)@ + @d4 = ROL64((argA2^De), 2)@ + @d0 = ROL64((argA3^Di), 62)@ + @d1 = ROL64((argA4^Do), 55)@ + @d2 = ROL64((argA5^Du), 39)@ + @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@ + @argA2 = Be ^((~Bi)& Bo )@ + @argA3 = Bi ^((~Bo)& Bu )@ + @argA4 = Bo ^((~Bu)& Ba )@ + @argA5 = Bu ^((~Ba)& Be )@ + + veor.64 \argA2, \argA2, d8 + veor.64 \argA3, \argA3, d9 + vshl.u64 d4, \argA2, #2 + veor.64 \argA5, \argA5, d11 + vshl.u64 d0, \argA3, #62 + vldr.64 d6, [sp, #\argA1] + vsri.64 d4, \argA2, #64-2 + veor.64 \argA4, \argA4, d10 + vsri.64 d0, \argA3, #64-62 + + vshl.u64 d1, \argA4, #55 + veor.64 d6, d6, d7 + vshl.u64 d2, \argA5, #39 + vsri.64 d1, \argA4, #64-55 + vbic.64 \argA4, d0, d4 + vsri.64 d2, \argA5, #64-39 + vbic.64 \argA2, d1, d0 + vshl.u64 d3, d6, #41 + veor.64 \argA5, d4, \argA2 + vbic.64 \argA2, d2, d1 + vsri.64 d3, d6, #64-41 + veor.64 d6, d0, \argA2 + + vbic.64 \argA2, d3, d2 + vbic.64 \argA3, d4, d3 + veor.64 \argA2, d1 + vstr.64 d6, [sp, #\argA1] + veor.64 d5, d6 + veor.64 \argA3, d2 + veor.64 \argA4, d3 + + .endm + +@// --- constants + + + .align 8 + .ltorg +KeccakF1600RoundConstantsWithTerminator: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0xFFFFFFFFFFFFFFFF @//terminator + + .align 8 + +@// --- code + +@not callable from C! 
+.global KeccakF_armv7a_neon_asm +KeccakF_armv7a_neon_asm: @ internal round driver: works on lanes kept in d12-d31 plus five 8-byte slots at sp, as set up by KeccakF_armv7a_neon + + adr r3, KeccakF1600RoundConstantsWithTerminator @ r3 = cursor into the round-constant table; each ...Iota macro post-increments it by one entry +roundLoop: @ one pass = four rounds (four Iota macro expansions) + + KeccakThetaRhoPiChiIota Aba, d13, d19, d25, d31 + KeccakThetaRhoPiChi1 Aka, d15, d21, d22, d28 + KeccakThetaRhoPiChi2 Asa, d12, d18, d24, d30 + KeccakThetaRhoPiChi3 Aga, d14, d20, d26, d27 + KeccakThetaRhoPiChi4 Ama, d16, d17, d23, d29 + + KeccakThetaRhoPiChiIota Aba, d15, d18, d26, d29 + KeccakThetaRhoPiChi1 Asa, d14, d17, d25, d28 + KeccakThetaRhoPiChi2 Ama, d13, d21, d24, d27 + KeccakThetaRhoPiChi3 Aka, d12, d20, d23, d31 + KeccakThetaRhoPiChi4 Aga, d16, d19, d22, d30 + + KeccakThetaRhoPiChiIota Aba, d14, d21, d23, d30 + KeccakThetaRhoPiChi1 Ama, d12, d19, d26, d28 + KeccakThetaRhoPiChi2 Aga, d15, d17, d24, d31 + KeccakThetaRhoPiChi3 Asa, d13, d20, d22, d29 + KeccakThetaRhoPiChi4 Aka, d16, d18, d25, d27 + + KeccakThetaRhoPiChiIota Aba, d12, d17, d22, d27 + KeccakThetaRhoPiChi1 Aga, d13, d18, d23, d28 + KeccakThetaRhoPiChi2 Aka, d14, d19, d24, d29 + ldr r0, [r3] @ low 32 bits of the next table entry + KeccakThetaRhoPiChi3 Ama, d15, d20, d25, d30 + cmp r0, #0xFFFFFFFF @ equal only for the terminator entry; the flags are consumed by bne after the next macro + KeccakThetaRhoPiChi4 Asa, d16, d21, d26, d31 + + bne roundLoop @ keep going until the terminator entry is the next one + bx lr + + @ + .align 8 @ NOTE(review): GNU as for ARM presumably treats this as a power of two (256-byte alignment) - confirm the padding is intended + +@//void KeccakF_armv7a( tKeccakLane * state ) callable from C +.global KeccakF_armv7a_neon +KeccakF_armv7a_neon: @ C entry point: r0 = pointer to the 25-lane (200-byte) state, permuted in place + + vpush {q4-q7} @ save d8-d15, which the round macros overwrite + sub sp,sp, #5*8 @ five 8-byte scratch slots (Aba..Asa) for lanes 0, 5, 10, 15, 20 + + vldr.64 d0, [r0, #0*8] + vldr.64 d12, [r0, #1*8] + vldr.64 d17, [r0, #2*8] + vldr.64 d22, [r0, #3*8] + vldr.64 d27, [r0, #4*8] + + vldr.64 d1, [r0, #5*8] + vldr.64 d13, [r0, #6*8] + vldr.64 d18, [r0, #7*8] + vldr.64 d23, [r0, #8*8] + vldr.64 d28, [r0, #9*8] + + vldr.64 d2, [r0, #10*8] + vldr.64 d14, [r0, #11*8] + vldr.64 d19, [r0, #12*8] + vldr.64 d24, [r0, #13*8] + vldr.64 d29, [r0, #14*8] + + vldr.64 d3, [r0, #15*8] + vldr.64 d15, [r0, #16*8] + vldr.64 d20, [r0, #17*8] + vldr.64 d25, [r0, #18*8] + vldr.64 d30, [r0, #19*8] + + vldr.64 d4, [r0, #20*8] + vldr.64 d16, [r0, #21*8] + vldr.64 d21, [r0, #22*8] + vldr.64 d26, [r0, #23*8] + vldr.64 d31, [r0, #24*8] + + 
vstr.64 d0, [sp, #Aba] + mov r2, lr @ keep the return address; the bl below overwrites lr + vstr.64 d1, [sp, #Aga] + veor.64 q0, q0, q1 @ d0 ^= d2, d1 ^= d3 (d0 and d1 were already spilled above) + vstr.64 d2, [sp, #Aka] + veor.64 d5, d0, d1 + vstr.64 d3, [sp, #Ama] + mov r1, r0 @ r1 = state pointer; r0 is reused as scratch inside the round loop + vstr.64 d4, [sp, #Asa] + veor.64 d5, d5, d4 @ d5 = XOR of lanes 0, 5, 10, 15 and 20 + + bl KeccakF_armv7a_neon_asm + + vpop.64 { d0- d4 } @ reload the five spilled lanes and release the scratch slots + + vstr.64 d0, [r1, #0*8] + vstr.64 d12, [r1, #1*8] + vstr.64 d17, [r1, #2*8] + vstr.64 d22, [r1, #3*8] + vstr.64 d27, [r1, #4*8] + + vstr.64 d1, [r1, #5*8] + vstr.64 d13, [r1, #6*8] + vstr.64 d18, [r1, #7*8] + vstr.64 d23, [r1, #8*8] + vstr.64 d28, [r1, #9*8] + + vstr.64 d2, [r1, #10*8] + vstr.64 d14, [r1, #11*8] + vstr.64 d19, [r1, #12*8] + vstr.64 d24, [r1, #13*8] + vstr.64 d29, [r1, #14*8] + + vstr.64 d3, [r1, #15*8] + vstr.64 d15, [r1, #16*8] + vstr.64 d20, [r1, #17*8] + vstr.64 d25, [r1, #18*8] + vstr.64 d30, [r1, #19*8] + + vstr.64 d4, [r1, #20*8] + vstr.64 d16, [r1, #21*8] + vstr.64 d21, [r1, #22*8] + vstr.64 d26, [r1, #23*8] + vstr.64 d31, [r1, #24*8] + + vpop {q4-q7} @ restore d8-d15 + bx r2 @ return through the lr value saved in r2 + + @ + diff --git a/c_src/KeccakF-1600-int-set.h b/c_src/KeccakF-1600-int-set.h new file mode 100755 index 0000000..0ed1d80 --- /dev/null +++ b/c_src/KeccakF-1600-int-set.h @@ -0,0 +1,6 @@ +#define ProvideFast576 +#define ProvideFast832 +#define ProvideFast1024 +#define ProvideFast1088 +#define ProvideFast1152 +#define ProvideFast1344 diff --git a/c_src/KeccakF-1600-interface.h b/c_src/KeccakF-1600-interface.h new file mode 100755 index 0000000..22185a4 --- /dev/null +++ b/c_src/KeccakF-1600-interface.h @@ -0,0 +1,46 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by the designers, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _KeccakPermutationInterface_h_ +#define _KeccakPermutationInterface_h_ + +#include "KeccakF-1600-int-set.h" /* selects which ProvideFastNNN variants are declared below */ + +void KeccakInitialize( void ); /* one-time setup; KeccakF-1600-opt32.c only builds lookup tables here when UseInterleaveTables is defined */ +void KeccakInitializeState(unsigned char *state); /* state is a 200-byte buffer; KeccakF-1600-opt32.c zero-fills it (plus lane complementing under UseBebigokimisa) */ +void KeccakPermutation(unsigned char *state); /* applies the permutation to state in place */ +#ifdef ProvideFast576 +void KeccakAbsorb576bits(unsigned char *state, const unsigned char *data); /* XORs 9 lanes (72 bytes) of data into state, then permutes */ +#endif +#ifdef ProvideFast832 +void KeccakAbsorb832bits(unsigned char *state, const unsigned char *data); /* XORs 13 lanes (104 bytes) of data into state, then permutes */ +#endif +#ifdef ProvideFast1024 +void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data); /* XORs 16 lanes (128 bytes) of data into state, then permutes */ +#endif +#ifdef ProvideFast1088 +void KeccakAbsorb1088bits(unsigned char *state, const unsigned char *data); /* XORs 17 lanes (136 bytes) of data into state, then permutes */ +#endif +#ifdef ProvideFast1152 +void KeccakAbsorb1152bits(unsigned char *state, const unsigned char *data); /* XORs 18 lanes (144 bytes) of data into state, then permutes */ +#endif +#ifdef ProvideFast1344 +void KeccakAbsorb1344bits(unsigned char *state, const unsigned char *data); /* XORs 21 lanes (168 bytes) of data into state, then permutes */ +#endif +void KeccakAbsorb(unsigned char *state, const unsigned char *data, unsigned int laneCount); /* general form: XORs laneCount 8-byte lanes of data into state, then permutes */ +#ifdef ProvideFast1024 +void KeccakExtract1024bits(const unsigned char *state, unsigned char *data); /* copies 16 lanes (128 bytes) of state out to data */ +#endif +void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount); /* general form: copies laneCount 8-byte lanes of state out to data */ + +#endif diff --git a/c_src/KeccakF-1600-opt32-settings.h b/c_src/KeccakF-1600-opt32-settings.h new file mode 100755 index 0000000..b135918 --- /dev/null +++ b/c_src/KeccakF-1600-opt32-settings.h @@ -0,0 +1,4 @@ +#define Unrolling 2 /* schedule 3 accepts no other value (checked with #error in KeccakF-1600-opt32.c) */ +//#define UseBebigokimisa +//#define UseInterleaveTables +#define UseSchedule 3 /* incompatible with UseBebigokimisa (checked with #error in KeccakF-1600-opt32.c) */ diff --git a/c_src/KeccakF-1600-opt32.c b/c_src/KeccakF-1600-opt32.c new file mode 100755 index 0000000..aded3a9 --- /dev/null +++ b/c_src/KeccakF-1600-opt32.c @@ -0,0 +1,524 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. 
For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by the designers, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include <string.h> /* memset(), used by KeccakInitializeState */ +#include "brg_endian.h" +#include "KeccakF-1600-opt32-settings.h" +#include "KeccakF-1600-interface.h" + +typedef unsigned char UINT8; +typedef unsigned short UINT16; +typedef unsigned int UINT32; +typedef unsigned long long int UINT64; /* NOTE(review): the names assume 8/16/32/64-bit widths for these C types - confirm for the target ABI */ + +#ifdef UseInterleaveTables /* table-driven bit interleaving: two 65536-entry lookup tables */ +int interleaveTablesBuilt = 0; +UINT16 interleaveTable[65536]; +UINT16 deinterleaveTable[65536]; + +void buildInterleaveTables() /* fills both tables on the first call only; entry i packs the even-numbered bits of i into its low byte and the odd-numbered bits into its high byte, and deinterleaveTable holds the inverse mapping */ +{ + UINT32 i, j; + UINT16 x; + + if (!interleaveTablesBuilt) { + for(i=0; i<65536; i++) { + x = 0; + for(j=0; j<16; j++) { + if (i & (1 << j)) + x |= (1 << (j/2 + 8*(j%2))); + } + interleaveTable[i] = x; + deinterleaveTable[x] = (UINT16)i; + } + interleaveTablesBuilt = 1; + } +} + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) /* little-endian: source and dest bytes are accessed directly as UINT16 words */ + +#define xor2bytesIntoInterleavedWords(even, odd, source, j) \ + i##j = interleaveTable[((const UINT16*)source)[j]]; \ + ((UINT8*)even)[j] ^= i##j & 0xFF; \ + ((UINT8*)odd)[j] ^= i##j >> 8; + +#define setInterleavedWordsInto2bytes(dest, even, odd, j) \ + d##j = deinterleaveTable[((even >> (j*8)) & 0xFF) ^ (((odd >> (j*8)) & 0xFF) << 8)]; \ + ((UINT16*)dest)[j] = d##j; + +#else // (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) + +#define xor2bytesIntoInterleavedWords(even, odd, source, j) \ + i##j = interleaveTable[source[2*j] ^ ((UINT16)source[2*j+1] << 8)]; \ + *even ^= (i##j & 0xFF) << (j*8); \ + *odd ^= ((i##j >> 8) & 0xFF) << (j*8); + +#define setInterleavedWordsInto2bytes(dest, even, odd, j) \ + d##j = deinterleaveTable[((even >> (j*8)) & 0xFF) ^ (((odd >> (j*8)) & 0xFF) << 8)]; \ + dest[2*j] = d##j & 0xFF; \ + dest[2*j+1] = d##j >> 8; + +#endif // Endianness + +void 
xor8bytesIntoInterleavedWords(UINT32 *even, UINT32 *odd, const UINT8* source) +{ + UINT16 i0, i1, i2, i3; + + xor2bytesIntoInterleavedWords(even, odd, source, 0) + xor2bytesIntoInterleavedWords(even, odd, source, 1) + xor2bytesIntoInterleavedWords(even, odd, source, 2) + xor2bytesIntoInterleavedWords(even, odd, source, 3) +} + +#define xorLanesIntoState(laneCount, state, input) \ + { \ + int i; \ + for(i=0; i<(laneCount); i++) \ + xor8bytesIntoInterleavedWords(state+i*2, state+i*2+1, input+i*8); \ + } + +void setInterleavedWordsInto8bytes(UINT8* dest, UINT32 even, UINT32 odd) +{ + UINT16 d0, d1, d2, d3; + + setInterleavedWordsInto2bytes(dest, even, odd, 0) + setInterleavedWordsInto2bytes(dest, even, odd, 1) + setInterleavedWordsInto2bytes(dest, even, odd, 2) + setInterleavedWordsInto2bytes(dest, even, odd, 3) +} + +#define extractLanes(laneCount, state, data) \ + { \ + int i; \ + for(i=0; i<(laneCount); i++) \ + setInterleavedWordsInto8bytes(data+i*8, ((UINT32*)state)[i*2], ((UINT32*)state)[i*2+1]); \ + } + +#else // No interleaving tables + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + +// Credit: Henry S. 
Warren, Hacker's Delight, Addison-Wesley, 2002 +#define xorInterleavedLE(rateInLanes, state, input) \ + { \ + const UINT32 * pI = (const UINT32 *)input; \ + UINT32 * pS = state; \ + UINT32 t, x0, x1; \ + int i; \ + for (i = (rateInLanes)-1; i >= 0; --i) \ + { \ + x0 = *(pI++); \ + t = (x0 ^ (x0 >> 1)) & 0x22222222UL; x0 = x0 ^ t ^ (t << 1); \ + t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0CUL; x0 = x0 ^ t ^ (t << 2); \ + t = (x0 ^ (x0 >> 4)) & 0x00F000F0UL; x0 = x0 ^ t ^ (t << 4); \ + t = (x0 ^ (x0 >> 8)) & 0x0000FF00UL; x0 = x0 ^ t ^ (t << 8); \ + x1 = *(pI++); \ + t = (x1 ^ (x1 >> 1)) & 0x22222222UL; x1 = x1 ^ t ^ (t << 1); \ + t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0CUL; x1 = x1 ^ t ^ (t << 2); \ + t = (x1 ^ (x1 >> 4)) & 0x00F000F0UL; x1 = x1 ^ t ^ (t << 4); \ + t = (x1 ^ (x1 >> 8)) & 0x0000FF00UL; x1 = x1 ^ t ^ (t << 8); \ + *(pS++) ^= (UINT16)x0 | (x1 << 16); \ + *(pS++) ^= (x0 >> 16) | (x1 & 0xFFFF0000); \ + } \ + } + +#define xorLanesIntoState(laneCount, state, input) \ + xorInterleavedLE(laneCount, state, input) + +#else // (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) + +// Credit: Henry S. 
Warren, Hacker's Delight, Addison-Wesley, 2002 +UINT64 toInterleaving(UINT64 x) +{ + UINT64 t; + + t = (x ^ (x >> 1)) & 0x2222222222222222ULL; x = x ^ t ^ (t << 1); + t = (x ^ (x >> 2)) & 0x0C0C0C0C0C0C0C0CULL; x = x ^ t ^ (t << 2); + t = (x ^ (x >> 4)) & 0x00F000F000F000F0ULL; x = x ^ t ^ (t << 4); + t = (x ^ (x >> 8)) & 0x0000FF000000FF00ULL; x = x ^ t ^ (t << 8); + t = (x ^ (x >> 16)) & 0x00000000FFFF0000ULL; x = x ^ t ^ (t << 16); + + return x; +} + +void xor8bytesIntoInterleavedWords(UINT32* evenAndOdd, const UINT8* source) +{ + // This can be optimized + UINT64 sourceWord = + (UINT64)source[0] + ^ (((UINT64)source[1]) << 8) + ^ (((UINT64)source[2]) << 16) + ^ (((UINT64)source[3]) << 24) + ^ (((UINT64)source[4]) << 32) + ^ (((UINT64)source[5]) << 40) + ^ (((UINT64)source[6]) << 48) + ^ (((UINT64)source[7]) << 56); + UINT64 evenAndOddWord = toInterleaving(sourceWord); + evenAndOdd[0] ^= (UINT32)evenAndOddWord; + evenAndOdd[1] ^= (UINT32)(evenAndOddWord >> 32); +} + +#define xorLanesIntoState(laneCount, state, input) \ + { \ + int i; \ + for(i=0; i<(laneCount); i++) \ + xor8bytesIntoInterleavedWords(state+i*2, input+i*8); \ + } + +#endif // Endianness + +// Credit: Henry S. 
Warren, Hacker's Delight, Addison-Wesley, 2002 +UINT64 fromInterleaving(UINT64 x) +{ + UINT64 t; + + t = (x ^ (x >> 16)) & 0x00000000FFFF0000ULL; x = x ^ t ^ (t << 16); + t = (x ^ (x >> 8)) & 0x0000FF000000FF00ULL; x = x ^ t ^ (t << 8); + t = (x ^ (x >> 4)) & 0x00F000F000F000F0ULL; x = x ^ t ^ (t << 4); + t = (x ^ (x >> 2)) & 0x0C0C0C0C0C0C0C0CULL; x = x ^ t ^ (t << 2); + t = (x ^ (x >> 1)) & 0x2222222222222222ULL; x = x ^ t ^ (t << 1); + + return x; +} + +void setInterleavedWordsInto8bytes(UINT8* dest, UINT32* evenAndOdd) +{ +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + ((UINT64*)dest)[0] = fromInterleaving(*(UINT64*)evenAndOdd); +#else // (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) + // This can be optimized + UINT64 evenAndOddWord = (UINT64)evenAndOdd[0] ^ ((UINT64)evenAndOdd[1] << 32); + UINT64 destWord = fromInterleaving(evenAndOddWord); + dest[0] = destWord & 0xFF; + dest[1] = (destWord >> 8) & 0xFF; + dest[2] = (destWord >> 16) & 0xFF; + dest[3] = (destWord >> 24) & 0xFF; + dest[4] = (destWord >> 32) & 0xFF; + dest[5] = (destWord >> 40) & 0xFF; + dest[6] = (destWord >> 48) & 0xFF; + dest[7] = (destWord >> 56) & 0xFF; +#endif // Endianness +} + +#define extractLanes(laneCount, state, data) \ + { \ + int i; \ + for(i=0; i<(laneCount); i++) \ + setInterleavedWordsInto8bytes(data+i*8, (UINT32*)state+i*2); \ + } + +#endif // With or without interleaving tables + +#if defined(_MSC_VER) +#define ROL32(a, offset) _rotl(a, offset) +#elif (defined (__arm__) && defined(__ARMCC_VERSION)) +#define ROL32(a, offset) __ror(a, 32-(offset)) +#else +#define ROL32(a, offset) ((((UINT32)a) << (offset)) ^ (((UINT32)a) >> (32-(offset)))) +#endif + +#include "KeccakF-1600-unrolling.macros" +#include "KeccakF-1600-32.macros" + +#if (UseSchedule == 3) + +#ifdef UseBebigokimisa +#error "No lane complementing with schedule 3." +#endif + +#if (Unrolling != 2) +#error "Only unrolling 2 is supported by schedule 3." 
+#endif + +void KeccakPermutationOnWords(UINT32 *state) +{ + rounds +} + +void KeccakPermutationOnWordsAfterXoring(UINT32 *state, const UINT8 *input, unsigned int laneCount) +{ + xorLanesIntoState(laneCount, state, input) + rounds +} + +#ifdef ProvideFast576 +void KeccakPermutationOnWordsAfterXoring576bits(UINT32 *state, const UINT8 *input) +{ + xorLanesIntoState(9, state, input) + rounds +} +#endif + +#ifdef ProvideFast832 +void KeccakPermutationOnWordsAfterXoring832bits(UINT32 *state, const UINT8 *input) +{ + xorLanesIntoState(13, state, input) + rounds +} +#endif + +#ifdef ProvideFast1024 +void KeccakPermutationOnWordsAfterXoring1024bits(UINT32 *state, const UINT8 *input) +{ + xorLanesIntoState(16, state, input) + rounds +} +#endif + +#ifdef ProvideFast1088 +void KeccakPermutationOnWordsAfterXoring1088bits(UINT32 *state, const UINT8 *input) +{ + xorLanesIntoState(17, state, input) + rounds +} +#endif + +#ifdef ProvideFast1152 +void KeccakPermutationOnWordsAfterXoring1152bits(UINT32 *state, const UINT8 *input) +{ + xorLanesIntoState(18, state, input) + rounds +} +#endif + +#ifdef ProvideFast1344 +void KeccakPermutationOnWordsAfterXoring1344bits(UINT32 *state, const UINT8 *input) +{ + xorLanesIntoState(21, state, input) + rounds +} +#endif + +#else // (Schedule != 3) + +void KeccakPermutationOnWords(UINT32 *state) +{ + declareABCDE +#if (Unrolling != 24) + unsigned int i; +#endif + + copyFromState(A, state) + rounds +} + +void KeccakPermutationOnWordsAfterXoring(UINT32 *state, const UINT8 *input, unsigned int laneCount) +{ + declareABCDE + unsigned int i; + + xorLanesIntoState(laneCount, state, input) + copyFromState(A, state) + rounds +} + +#ifdef ProvideFast576 +void KeccakPermutationOnWordsAfterXoring576bits(UINT32 *state, const UINT8 *input) +{ + declareABCDE + unsigned int i; + + xorLanesIntoState(9, state, input) + copyFromState(A, state) + rounds +} +#endif + +#ifdef ProvideFast832 +void KeccakPermutationOnWordsAfterXoring832bits(UINT32 *state, const UINT8 
*input) +{ + declareABCDE + unsigned int i; + + xorLanesIntoState(13, state, input) + copyFromState(A, state) + rounds +} +#endif + +#ifdef ProvideFast1024 +void KeccakPermutationOnWordsAfterXoring1024bits(UINT32 *state, const UINT8 *input) +{ + declareABCDE + unsigned int i; + + xorLanesIntoState(16, state, input) + copyFromState(A, state) + rounds +} +#endif + +#ifdef ProvideFast1088 +void KeccakPermutationOnWordsAfterXoring1088bits(UINT32 *state, const UINT8 *input) +{ + declareABCDE + unsigned int i; + + xorLanesIntoState(17, state, input) + copyFromState(A, state) + rounds +} +#endif + +#ifdef ProvideFast1152 +void KeccakPermutationOnWordsAfterXoring1152bits(UINT32 *state, const UINT8 *input) +{ + declareABCDE + unsigned int i; + + xorLanesIntoState(18, state, input) + copyFromState(A, state) + rounds +} +#endif + +#ifdef ProvideFast1344 +void KeccakPermutationOnWordsAfterXoring1344bits(UINT32 *state, const UINT8 *input) +{ + declareABCDE + unsigned int i; + + xorLanesIntoState(21, state, input) + copyFromState(A, state) + rounds +} +#endif + +#endif + +void KeccakInitialize() +{ +#ifdef UseInterleaveTables + buildInterleaveTables(); +#endif +} + +void KeccakInitializeState(unsigned char *state) +{ + memset(state, 0, 200); +#ifdef UseBebigokimisa + ((UINT32*)state)[ 2] = ~(UINT32)0; + ((UINT32*)state)[ 3] = ~(UINT32)0; + ((UINT32*)state)[ 4] = ~(UINT32)0; + ((UINT32*)state)[ 5] = ~(UINT32)0; + ((UINT32*)state)[16] = ~(UINT32)0; + ((UINT32*)state)[17] = ~(UINT32)0; + ((UINT32*)state)[24] = ~(UINT32)0; + ((UINT32*)state)[25] = ~(UINT32)0; + ((UINT32*)state)[34] = ~(UINT32)0; + ((UINT32*)state)[35] = ~(UINT32)0; + ((UINT32*)state)[40] = ~(UINT32)0; + ((UINT32*)state)[41] = ~(UINT32)0; +#endif +} + +void KeccakPermutation(unsigned char *state) +{ + // We assume the state is always stored as interleaved 32-bit words + KeccakPermutationOnWords((UINT32*)state); +} + +#ifdef ProvideFast576 +void KeccakAbsorb576bits(unsigned char *state, const unsigned char *data) +{ + 
KeccakPermutationOnWordsAfterXoring576bits((UINT32*)state, data); +} +#endif + +#ifdef ProvideFast832 +void KeccakAbsorb832bits(unsigned char *state, const unsigned char *data) +{ + KeccakPermutationOnWordsAfterXoring832bits((UINT32*)state, data); +} +#endif + +#ifdef ProvideFast1024 +void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data) +{ + KeccakPermutationOnWordsAfterXoring1024bits((UINT32*)state, data); +} +#endif + +#ifdef ProvideFast1088 +void KeccakAbsorb1088bits(unsigned char *state, const unsigned char *data) +{ + KeccakPermutationOnWordsAfterXoring1088bits((UINT32*)state, data); +} +#endif + +#ifdef ProvideFast1152 +void KeccakAbsorb1152bits(unsigned char *state, const unsigned char *data) +{ + KeccakPermutationOnWordsAfterXoring1152bits((UINT32*)state, data); +} +#endif + +#ifdef ProvideFast1344 +void KeccakAbsorb1344bits(unsigned char *state, const unsigned char *data) +{ + KeccakPermutationOnWordsAfterXoring1344bits((UINT32*)state, data); +} +#endif + +void KeccakAbsorb(unsigned char *state, const unsigned char *data, unsigned int laneCount) +{ + KeccakPermutationOnWordsAfterXoring((UINT32*)state, data, laneCount); +} + +#ifdef ProvideFast1024 +void KeccakExtract1024bits(const unsigned char *state, unsigned char *data) +{ + extractLanes(16, state, data) +#ifdef UseBebigokimisa + ((UINT32*)data)[ 2] = ~((UINT32*)data)[ 2]; + ((UINT32*)data)[ 3] = ~((UINT32*)data)[ 3]; + ((UINT32*)data)[ 4] = ~((UINT32*)data)[ 4]; + ((UINT32*)data)[ 5] = ~((UINT32*)data)[ 5]; + ((UINT32*)data)[16] = ~((UINT32*)data)[16]; + ((UINT32*)data)[17] = ~((UINT32*)data)[17]; + ((UINT32*)data)[24] = ~((UINT32*)data)[24]; + ((UINT32*)data)[25] = ~((UINT32*)data)[25]; +#endif +} +#endif + +void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount) +{ + extractLanes(laneCount, state, data) +#ifdef UseBebigokimisa + if (laneCount > 1) { + ((UINT32*)data)[ 2] = ~((UINT32*)data)[ 2]; + ((UINT32*)data)[ 3] = ~((UINT32*)data)[ 3]; + 
if (laneCount > 2) { + ((UINT32*)data)[ 4] = ~((UINT32*)data)[ 4]; + ((UINT32*)data)[ 5] = ~((UINT32*)data)[ 5]; + if (laneCount > 8) { + ((UINT32*)data)[16] = ~((UINT32*)data)[16]; + ((UINT32*)data)[17] = ~((UINT32*)data)[17]; + if (laneCount > 12) { + ((UINT32*)data)[24] = ~((UINT32*)data)[24]; + ((UINT32*)data)[25] = ~((UINT32*)data)[25]; + if (laneCount > 17) { + ((UINT32*)data)[34] = ~((UINT32*)data)[34]; + ((UINT32*)data)[35] = ~((UINT32*)data)[35]; + if (laneCount > 20) { + ((UINT32*)data)[40] = ~((UINT32*)data)[40]; + ((UINT32*)data)[41] = ~((UINT32*)data)[41]; + } + } + } + } + } + } +#endif +} diff --git a/c_src/KeccakF-1600-opt64-settings.h b/c_src/KeccakF-1600-opt64-settings.h new file mode 100755 index 0000000..8f16ada --- /dev/null +++ b/c_src/KeccakF-1600-opt64-settings.h @@ -0,0 +1,7 @@ +#define Unrolling 24 +#define UseBebigokimisa +//#define UseSSE +//#define UseOnlySIMD64 +//#define UseMMX +//#define UseSHLD +//#define UseXOP diff --git a/c_src/KeccakF-1600-opt64.c b/c_src/KeccakF-1600-opt64.c new file mode 100755 index 0000000..9349f03 --- /dev/null +++ b/c_src/KeccakF-1600-opt64.c @@ -0,0 +1,504 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by the designers, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include +#include "brg_endian.h" +#include "KeccakF-1600-opt64-settings.h" +#include "KeccakF-1600-interface.h" + +typedef unsigned char UINT8; +typedef unsigned long long int UINT64; + +#if defined(__GNUC__) +#define ALIGN __attribute__ ((aligned(32))) +#elif defined(_MSC_VER) +#define ALIGN __declspec(align(32)) +#else +#define ALIGN +#endif + +#if defined(UseSSE) + #include + typedef __m128i V64; + typedef __m128i V128; + typedef union { + V128 v128; + UINT64 v64[2]; + } V6464; + + #define ANDnu64(a, b) _mm_andnot_si128(a, b) + #define LOAD64(a) _mm_loadl_epi64((const V64 *)&(a)) + #define CONST64(a) _mm_loadl_epi64((const V64 *)&(a)) + #define ROL64(a, o) _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o))) + #define STORE64(a, b) _mm_storel_epi64((V64 *)&(a), b) + #define XOR64(a, b) _mm_xor_si128(a, b) + #define XOReq64(a, b) a = _mm_xor_si128(a, b) + #define SHUFFLEBYTES128(a, b) _mm_shuffle_epi8(a, b) + + #define ANDnu128(a, b) _mm_andnot_si128(a, b) + #define LOAD6464(a, b) _mm_set_epi64((__m64)(a), (__m64)(b)) + #define CONST128(a) _mm_load_si128((const V128 *)&(a)) + #define LOAD128(a) _mm_load_si128((const V128 *)&(a)) + #define LOAD128u(a) _mm_loadu_si128((const V128 *)&(a)) + #define ROL64in128(a, o) _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o))) + #define STORE128(a, b) _mm_store_si128((V128 *)&(a), b) + #define XOR128(a, b) _mm_xor_si128(a, b) + #define XOReq128(a, b) a = _mm_xor_si128(a, b) + #define GET64LOLO(a, b) _mm_unpacklo_epi64(a, b) + #define GET64HIHI(a, b) _mm_unpackhi_epi64(a, b) + #define COPY64HI2LO(a) _mm_shuffle_epi32(a, 0xEE) + #define COPY64LO2HI(a) _mm_shuffle_epi32(a, 0x44) + #define ZERO128() _mm_setzero_si128() + + #ifdef UseOnlySIMD64 + #include "KeccakF-1600-simd64.macros" + #else +ALIGN const UINT64 rho8_56[2] = {0x0605040302010007, 0x080F0E0D0C0B0A09}; + #include "KeccakF-1600-simd128.macros" + #endif + + #ifdef UseBebigokimisa + #error 
"UseBebigokimisa cannot be used in combination with UseSSE" + #endif +#elif defined(UseXOP) + #include + typedef __m128i V64; + typedef __m128i V128; + + #define LOAD64(a) _mm_loadl_epi64((const V64 *)&(a)) + #define CONST64(a) _mm_loadl_epi64((const V64 *)&(a)) + #define STORE64(a, b) _mm_storel_epi64((V64 *)&(a), b) + #define XOR64(a, b) _mm_xor_si128(a, b) + #define XOReq64(a, b) a = _mm_xor_si128(a, b) + + #define ANDnu128(a, b) _mm_andnot_si128(a, b) + #define LOAD6464(a, b) _mm_set_epi64((__m64)(a), (__m64)(b)) + #define CONST128(a) _mm_load_si128((const V128 *)&(a)) + #define LOAD128(a) _mm_load_si128((const V128 *)&(a)) + #define LOAD128u(a) _mm_loadu_si128((const V128 *)&(a)) + #define STORE128(a, b) _mm_store_si128((V128 *)&(a), b) + #define XOR128(a, b) _mm_xor_si128(a, b) + #define XOReq128(a, b) a = _mm_xor_si128(a, b) + #define ZERO128() _mm_setzero_si128() + + #define SWAP64(a) _mm_shuffle_epi32(a, 0x4E) + #define GET64LOLO(a, b) _mm_unpacklo_epi64(a, b) + #define GET64HIHI(a, b) _mm_unpackhi_epi64(a, b) + #define GET64LOHI(a, b) ((__m128i)_mm_blend_pd((__m128d)a, (__m128d)b, 2)) + #define GET64HILO(a, b) SWAP64(GET64LOHI(b, a)) + #define COPY64HI2LO(a) _mm_shuffle_epi32(a, 0xEE) + #define COPY64LO2HI(a) _mm_shuffle_epi32(a, 0x44) + + #define ROL6464same(a, o) _mm_roti_epi64(a, o) + #define ROL6464(a, r1, r2) _mm_rot_epi64(a, CONST128( rot_##r1##_##r2 )) +ALIGN const UINT64 rot_0_20[2] = { 0, 20}; +ALIGN const UINT64 rot_44_3[2] = {44, 3}; +ALIGN const UINT64 rot_43_45[2] = {43, 45}; +ALIGN const UINT64 rot_21_61[2] = {21, 61}; +ALIGN const UINT64 rot_14_28[2] = {14, 28}; +ALIGN const UINT64 rot_1_36[2] = { 1, 36}; +ALIGN const UINT64 rot_6_10[2] = { 6, 10}; +ALIGN const UINT64 rot_25_15[2] = {25, 15}; +ALIGN const UINT64 rot_8_56[2] = { 8, 56}; +ALIGN const UINT64 rot_18_27[2] = {18, 27}; +ALIGN const UINT64 rot_62_55[2] = {62, 55}; +ALIGN const UINT64 rot_39_41[2] = {39, 41}; + +#if defined(UseSimulatedXOP) + // For debugging purposes, when XOP is 
not available + #undef ROL6464 + #undef ROL6464same + #define ROL6464same(a, o) _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o))) + V128 ROL6464(V128 a, int r0, int r1) + { + V128 a0 = ROL64(a, r0); + V128 a1 = COPY64HI2LO(ROL64(a, r1)); + return GET64LOLO(a0, a1); + } +#endif + + #include "KeccakF-1600-xop.macros" + + #ifdef UseBebigokimisa + #error "UseBebigokimisa cannot be used in combination with UseXOP" + #endif +#elif defined(UseMMX) + #include + typedef __m64 V64; + #define ANDnu64(a, b) _mm_andnot_si64(a, b) + + #if (defined(_MSC_VER) || defined (__INTEL_COMPILER)) + #define LOAD64(a) *(V64*)&(a) + #define CONST64(a) *(V64*)&(a) + #define STORE64(a, b) *(V64*)&(a) = b + #else + #define LOAD64(a) (V64)a + #define CONST64(a) (V64)a + #define STORE64(a, b) a = (UINT64)b + #endif + #define ROL64(a, o) _mm_or_si64(_mm_slli_si64(a, o), _mm_srli_si64(a, 64-(o))) + #define XOR64(a, b) _mm_xor_si64(a, b) + #define XOReq64(a, b) a = _mm_xor_si64(a, b) + + #include "KeccakF-1600-simd64.macros" + + #ifdef UseBebigokimisa + #error "UseBebigokimisa cannot be used in combination with UseMMX" + #endif +#else + #if defined(_MSC_VER) + #define ROL64(a, offset) _rotl64(a, offset) + #elif defined(UseSHLD) + #define ROL64(x,N) ({ \ + register UINT64 __out; \ + register UINT64 __in = x; \ + __asm__ ("shld %2,%0,%0" : "=r"(__out) : "0"(__in), "i"(N)); \ + __out; \ + }) + #else + #define ROL64(a, offset) ((((UINT64)a) << offset) ^ (((UINT64)a) >> (64-offset))) + #endif + + #include "KeccakF-1600-64.macros" +#endif + +#include "KeccakF-1600-unrolling.macros" + +void KeccakPermutationOnWords(UINT64 *state) +{ + declareABCDE +#if (Unrolling != 24) + unsigned int i; +#endif + + copyFromState(A, state) + rounds +#if defined(UseMMX) + _mm_empty(); +#endif +} + +void KeccakPermutationOnWordsAfterXoring(UINT64 *state, const UINT64 *input, unsigned int laneCount) +{ + declareABCDE +#if (Unrolling != 24) + unsigned int i; +#endif + unsigned int j; + + for(j=0; j> (8*i)) & 0xFF; 
+} + +#ifdef ProvideFast1024 +void KeccakExtract1024bits(const unsigned char *state, unsigned char *data) +{ +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + memcpy(data, state, 128); +#else + unsigned int i; + + for(i=0; i<16; i++) + fromWordToBytes(data+(i*8), ((const UINT64*)state)[i]); +#endif +#ifdef UseBebigokimisa + ((UINT64*)data)[ 1] = ~((UINT64*)data)[ 1]; + ((UINT64*)data)[ 2] = ~((UINT64*)data)[ 2]; + ((UINT64*)data)[ 8] = ~((UINT64*)data)[ 8]; + ((UINT64*)data)[12] = ~((UINT64*)data)[12]; +#endif +} +#endif + +void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount) +{ +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + memcpy(data, state, laneCount*8); +#else + unsigned int i; + + for(i=0; i 1) { + ((UINT64*)data)[ 1] = ~((UINT64*)data)[ 1]; + if (laneCount > 2) { + ((UINT64*)data)[ 2] = ~((UINT64*)data)[ 2]; + if (laneCount > 8) { + ((UINT64*)data)[ 8] = ~((UINT64*)data)[ 8]; + if (laneCount > 12) { + ((UINT64*)data)[12] = ~((UINT64*)data)[12]; + if (laneCount > 17) { + ((UINT64*)data)[17] = ~((UINT64*)data)[17]; + if (laneCount > 20) { + ((UINT64*)data)[20] = ~((UINT64*)data)[20]; + } + } + } + } + } + } +#endif +} diff --git a/c_src/KeccakF-1600-reference.c b/c_src/KeccakF-1600-reference.c new file mode 100755 index 0000000..628f710 --- /dev/null +++ b/c_src/KeccakF-1600-reference.c @@ -0,0 +1,300 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by the designers, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include +#include +#include "brg_endian.h" +#include "displayIntermediateValues.h" +#include "KeccakNISTInterface.h" +#include "KeccakF-1600-interface.h" + +typedef unsigned char UINT8; +typedef unsigned long long int UINT64; + +#define nrRounds 24 +UINT64 KeccakRoundConstants[nrRounds]; +#define nrLanes 25 +unsigned int KeccakRhoOffsets[nrLanes]; + +void KeccakPermutationOnWords(UINT64 *state); +void theta(UINT64 *A); +void rho(UINT64 *A); +void pi(UINT64 *A); +void chi(UINT64 *A); +void iota(UINT64 *A, unsigned int indexRound); + +void fromBytesToWords(UINT64 *stateAsWords, const unsigned char *state) +{ + unsigned int i, j; + + for(i=0; i<(KeccakPermutationSize/64); i++) { + stateAsWords[i] = 0; + for(j=0; j<(64/8); j++) + stateAsWords[i] |= (UINT64)(state[i*(64/8)+j]) << (8*j); + } +} + +void fromWordsToBytes(unsigned char *state, const UINT64 *stateAsWords) +{ + unsigned int i, j; + + for(i=0; i<(KeccakPermutationSize/64); i++) + for(j=0; j<(64/8); j++) + state[i*(64/8)+j] = (stateAsWords[i] >> (8*j)) & 0xFF; +} + +void KeccakPermutation(unsigned char *state) +{ +#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN) + UINT64 stateAsWords[KeccakPermutationSize/64]; +#endif + + displayStateAsBytes(1, "Input of permutation", state); +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + KeccakPermutationOnWords((UINT64*)state); +#else + fromBytesToWords(stateAsWords, state); + KeccakPermutationOnWords(stateAsWords); + fromWordsToBytes(state, stateAsWords); +#endif + displayStateAsBytes(1, "State after permutation", state); +} + +void KeccakPermutationAfterXor(unsigned char *state, const unsigned char *data, unsigned int dataLengthInBytes) +{ + unsigned int i; + + for(i=0; i> (64-offset))) : a) + +void theta(UINT64 *A) +{ + unsigned int x, y; + UINT64 C[5], D[5]; + + for(x=0; x<5; x++) { + C[x] = 0; + for(y=0; y<5; y++) + C[x] ^= A[index(x, y)]; + } + for(x=0; x<5; x++) + D[x] = ROL64(C[(x+1)%5], 1) ^ C[(x+4)%5]; + 
for(x=0; x<5; x++) + for(y=0; y<5; y++) + A[index(x, y)] ^= D[x]; +} + +void rho(UINT64 *A) +{ + unsigned int x, y; + + for(x=0; x<5; x++) for(y=0; y<5; y++) + A[index(x, y)] = ROL64(A[index(x, y)], KeccakRhoOffsets[index(x, y)]); +} + +void pi(UINT64 *A) +{ + unsigned int x, y; + UINT64 tempA[25]; + + for(x=0; x<5; x++) for(y=0; y<5; y++) + tempA[index(x, y)] = A[index(x, y)]; + for(x=0; x<5; x++) for(y=0; y<5; y++) + A[index(0*x+1*y, 2*x+3*y)] = tempA[index(x, y)]; +} + +void chi(UINT64 *A) +{ + unsigned int x, y; + UINT64 C[5]; + + for(y=0; y<5; y++) { + for(x=0; x<5; x++) + C[x] = A[index(x, y)] ^ ((~A[index(x+1, y)]) & A[index(x+2, y)]); + for(x=0; x<5; x++) + A[index(x, y)] = C[x]; + } +} + +void iota(UINT64 *A, unsigned int indexRound) +{ + A[index(0, 0)] ^= KeccakRoundConstants[indexRound]; +} + +int LFSR86540(UINT8 *LFSR) +{ + int result = ((*LFSR) & 0x01) != 0; + if (((*LFSR) & 0x80) != 0) + // Primitive polynomial over GF(2): x^8+x^6+x^5+x^4+1 + (*LFSR) = ((*LFSR) << 1) ^ 0x71; + else + (*LFSR) <<= 1; + return result; +} + +void KeccakInitializeRoundConstants() +{ + UINT8 LFSRstate = 0x01; + unsigned int i, j, bitPosition; + + for(i=0; i> 32)); + fprintf(f, "%08X", (unsigned int)(KeccakRoundConstants[i] & 0xFFFFFFFFULL)); + fprintf(f, "\n"); + } + fprintf(f, "\n"); +} + +void displayRhoOffsets(FILE *f) +{ + unsigned int x, y; + + for(y=0; y<5; y++) for(x=0; x<5; x++) { + fprintf(f, "RhoOffset[%i][%i] = ", x, y); + fprintf(f, "%2i", KeccakRhoOffsets[index(x, y)]); + fprintf(f, "\n"); + } + fprintf(f, "\n"); +} + +void KeccakInitializeState(unsigned char *state) +{ + memset(state, 0, KeccakPermutationSizeInBytes); +} + +#ifdef ProvideFast576 +void KeccakAbsorb576bits(unsigned char *state, const unsigned char *data) +{ + KeccakPermutationAfterXor(state, data, 72); +} +#endif + +#ifdef ProvideFast832 +void KeccakAbsorb832bits(unsigned char *state, const unsigned char *data) +{ + KeccakPermutationAfterXor(state, data, 104); +} +#endif + +#ifdef 
ProvideFast1024 +void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data) +{ + KeccakPermutationAfterXor(state, data, 128); +} +#endif + +#ifdef ProvideFast1088 +void KeccakAbsorb1088bits(unsigned char *state, const unsigned char *data) +{ + KeccakPermutationAfterXor(state, data, 136); +} +#endif + +#ifdef ProvideFast1152 +void KeccakAbsorb1152bits(unsigned char *state, const unsigned char *data) +{ + KeccakPermutationAfterXor(state, data, 144); +} +#endif + +#ifdef ProvideFast1344 +void KeccakAbsorb1344bits(unsigned char *state, const unsigned char *data) +{ + KeccakPermutationAfterXor(state, data, 168); +} +#endif + +void KeccakAbsorb(unsigned char *state, const unsigned char *data, unsigned int laneCount) +{ + KeccakPermutationAfterXor(state, data, laneCount*8); +} + +#ifdef ProvideFast1024 +void KeccakExtract1024bits(const unsigned char *state, unsigned char *data) +{ + memcpy(data, state, 128); +} +#endif + +void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount) +{ + memcpy(data, state, laneCount*8); +} diff --git a/c_src/KeccakF-1600-reference.h b/c_src/KeccakF-1600-reference.h new file mode 100755 index 0000000..698bab8 --- /dev/null +++ b/c_src/KeccakF-1600-reference.h @@ -0,0 +1,20 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by the designers, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _KeccakPermutationReference_h_ +#define _KeccakPermutationReference_h_ + +void displayRoundConstants(FILE *f); +void displayRhoOffsets(FILE *f); + +#endif diff --git a/c_src/KeccakF-1600-reference32BI.c b/c_src/KeccakF-1600-reference32BI.c new file mode 100755 index 0000000..1ec4c23 --- /dev/null +++ b/c_src/KeccakF-1600-reference32BI.c @@ -0,0 +1,371 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by the designers, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include +#include +#include "brg_endian.h" +#include "displayIntermediateValues.h" +#include "KeccakNISTInterface.h" +#include "KeccakF-1600-interface.h" + +typedef unsigned char UINT8; +typedef unsigned int UINT32; + +#define nrRounds 24 +UINT32 KeccakRoundConstants[nrRounds][2]; +#define nrLanes 25 +unsigned int KeccakRhoOffsets[nrLanes]; + +void KeccakPermutationOnWords(UINT32 *state); +void theta(UINT32 *A); +void rho(UINT32 *A); +void pi(UINT32 *A); +void chi(UINT32 *A); +void iota(UINT32 *A, unsigned int indexRound); + +void toBitInterleaving(UINT32 low, UINT32 high, UINT32 *even, UINT32 *odd) +{ + unsigned int i; + + *even = 0; + *odd = 0; + for(i=0; i<64; i++) { + unsigned int inBit; + if (i < 32) + inBit = (low >> i) & 1; + else + inBit = (high >> (i-32)) & 1; + if ((i % 2) == 0) + *even |= inBit << (i/2); + else + *odd |= inBit << ((i-1)/2); + } +} + +void fromBitInterleaving(UINT32 even, UINT32 odd, UINT32 *low, UINT32 *high) +{ + unsigned int i; + + *low = 0; + *high = 0; + for(i=0; i<64; i++) { + unsigned int inBit; + if ((i % 2) == 0) + inBit = (even 
>> (i/2)) & 1; + else + inBit = (odd >> ((i-1)/2)) & 1; + if (i < 32) + *low |= inBit << i; + else + *high |= inBit << (i-32); + } +} + +void fromBytesToWords(UINT32 *stateAsWords, const unsigned char *state) +{ + unsigned int i, j; + UINT32 low, high; + UINT32 even, odd; + + for(i=0; i<(KeccakPermutationSize/64); i++) { + low = 0; + high = 0; + for(j=0; j<(32/8); j++) + low |= (UINT32)(state[i*(64/8)+j]) << (8*j); + for(j=(32/8); j<(64/8); j++) + high |= (UINT32)(state[i*(64/8)+j]) << (8*j-32); + toBitInterleaving(low, high, &even, &odd); + stateAsWords[2*i+0] = even; + stateAsWords[2*i+1] = odd; + } +} + +void fromWordsToBytes(unsigned char *state, const UINT32 *stateAsWords) +{ + unsigned int i, j; + UINT32 low, high; + + for(i=0; i<(KeccakPermutationSize/64); i++) { + fromBitInterleaving(stateAsWords[2*i+0], stateAsWords[2*i+1], &low, &high); + for(j=0; j<(32/8); j++) + state[i*(64/8)+j] = (low >> (8*j)) & 0xFF; + for(j=32/8; j<(64/8); j++) + state[i*(64/8)+j] = (high >> (8*j-32)) & 0xFF; + } +} + +void KeccakPermutation(unsigned char *state) +{ + UINT32 stateAsWords[KeccakPermutationSize/32]; + + displayStateAsBytes(1, "Input of permutation", state); + fromBytesToWords(stateAsWords, state); + KeccakPermutationOnWords(stateAsWords); + fromWordsToBytes(state, stateAsWords); + displayStateAsBytes(1, "State after permutation", state); +} + +void KeccakPermutationAfterXor(unsigned char *state, const unsigned char *data, unsigned int dataLengthInBytes) +{ + unsigned int i; + + for(i=0; i> (32-offset))) : a) + +void ROL64(UINT32 inEven, UINT32 inOdd, UINT32 *outEven, UINT32 *outOdd, unsigned int offset) +{ + if ((offset % 2) == 0) { + *outEven = ROL32(inEven, offset/2); + *outOdd = ROL32(inOdd, offset/2); + } + else { + *outEven = ROL32(inOdd, (offset+1)/2); + *outOdd = ROL32(inEven, (offset-1)/2); + } +} + +void theta(UINT32 *A) +{ + unsigned int x, y, z; + UINT32 C[5][2], D[5][2]; + + for(x=0; x<5; x++) { + for(z=0; z<2; z++) { + C[x][z] = 0; + for(y=0; y<5; y++) + 
C[x][z] ^= A[index(x, y, z)]; + } + } + for(x=0; x<5; x++) { + ROL64(C[(x+1)%5][0], C[(x+1)%5][1], &(D[x][0]), &(D[x][1]), 1); + for(z=0; z<2; z++) + D[x][z] ^= C[(x+4)%5][z]; + } + for(x=0; x<5; x++) + for(y=0; y<5; y++) + for(z=0; z<2; z++) + A[index(x, y, z)] ^= D[x][z]; +} + +void rho(UINT32 *A) +{ + unsigned int x, y; + + for(x=0; x<5; x++) for(y=0; y<5; y++) + ROL64(A[index(x, y, 0)], A[index(x, y, 1)], &(A[index(x, y, 0)]), &(A[index(x, y, 1)]), KeccakRhoOffsets[5*y+x]); +} + +void pi(UINT32 *A) +{ + unsigned int x, y, z; + UINT32 tempA[50]; + + for(x=0; x<5; x++) for(y=0; y<5; y++) for(z=0; z<2; z++) + tempA[index(x, y, z)] = A[index(x, y, z)]; + for(x=0; x<5; x++) for(y=0; y<5; y++) for(z=0; z<2; z++) + A[index(0*x+1*y, 2*x+3*y, z)] = tempA[index(x, y, z)]; +} + +void chi(UINT32 *A) +{ + unsigned int x, y, z; + UINT32 C[5][2]; + + for(y=0; y<5; y++) { + for(x=0; x<5; x++) + for(z=0; z<2; z++) + C[x][z] = A[index(x, y, z)] ^ ((~A[index(x+1, y, z)]) & A[index(x+2, y, z)]); + for(x=0; x<5; x++) + for(z=0; z<2; z++) + A[index(x, y, z)] = C[x][z]; + } +} + +void iota(UINT32 *A, unsigned int indexRound) +{ + A[index(0, 0, 0)] ^= KeccakRoundConstants[indexRound][0]; + A[index(0, 0, 1)] ^= KeccakRoundConstants[indexRound][1]; +} + +int LFSR86540(UINT8 *LFSR) +{ + int result = ((*LFSR) & 0x01) != 0; + if (((*LFSR) & 0x80) != 0) + // Primitive polynomial over GF(2): x^8+x^6+x^5+x^4+1 + (*LFSR) = ((*LFSR) << 1) ^ 0x71; + else + (*LFSR) <<= 1; + return result; +} + +void KeccakInitializeRoundConstants() +{ + UINT8 LFSRstate = 0x01; + unsigned int i, j, bitPosition; + UINT32 low, high; + + for(i=0; i +#include "KeccakF-1600-interface.h" + +#define UseBebigokimisa + +typedef unsigned char UINT8; +typedef unsigned long long int UINT64; + +void KeccakInitialize() +{ +} + +void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount) +{ + memcpy(data, state, laneCount*8); +#ifdef UseBebigokimisa + if (laneCount > 8) + { + ((UINT64*)data)[ 1] = 
~((UINT64*)data)[ 1]; + ((UINT64*)data)[ 2] = ~((UINT64*)data)[ 2]; + ((UINT64*)data)[ 8] = ~((UINT64*)data)[ 8]; + + if (laneCount > 12) + { + ((UINT64*)data)[12] = ~((UINT64*)data)[12]; + if (laneCount > 17) + { + ((UINT64*)data)[17] = ~((UINT64*)data)[17]; + if (laneCount > 20) + { + ((UINT64*)data)[20] = ~((UINT64*)data)[20]; + } + } + } + } + else + { + if (laneCount > 1) + { + ((UINT64*)data)[ 1] = ~((UINT64*)data)[ 1]; + if (laneCount > 2) + { + ((UINT64*)data)[ 2] = ~((UINT64*)data)[ 2]; + } + } + } + +#endif +} diff --git a/c_src/KeccakF-1600-x86-64-gas.s b/c_src/KeccakF-1600-x86-64-gas.s new file mode 100755 index 0000000..289a84e --- /dev/null +++ b/c_src/KeccakF-1600-x86-64-gas.s @@ -0,0 +1,766 @@ +# +# The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +# Michaël Peeters and Gilles Van Assche. For more information, feedback or +# questions, please refer to our website: http://keccak.noekeon.org/ +# +# Implementation by Ronny Van Keer, +# hereby denoted as "the implementer". +# +# To the extent possible under law, the implementer has waived all copyright +# and related or neighboring rights to the source code in this file. 
+# http://creativecommons.org/publicdomain/zero/1.0/ +# + + .text + + +#// --- defines + +.equ UseSIMD, 1 + + +.equ _ba, 0*8 +.equ _be, 1*8 +.equ _bi, 2*8 +.equ _bo, 3*8 +.equ _bu, 4*8 +.equ _ga, 5*8 +.equ _ge, 6*8 +.equ _gi, 7*8 +.equ _go, 8*8 +.equ _gu, 9*8 +.equ _ka, 10*8 +.equ _ke, 11*8 +.equ _ki, 12*8 +.equ _ko, 13*8 +.equ _ku, 14*8 +.equ _ma, 15*8 +.equ _me, 16*8 +.equ _mi, 17*8 +.equ _mo, 18*8 +.equ _mu, 19*8 +.equ _sa, 20*8 +.equ _se, 21*8 +.equ _si, 22*8 +.equ _so, 23*8 +.equ _su, 24*8 + + +# arguments +.equ apState, %rdi +.equ apInput, %rsi +.equ aNbrWords, %rdx + +# xor input into state section +.equ xpState, %r9 + +# round vars +.equ rT1, %rax +.equ rpState, %rdi +.equ rpStack, %rsp + +.equ rDa, %rbx +.equ rDe, %rcx +.equ rDi, %rdx +.equ rDo, %r8 +.equ rDu, %r9 + +.equ rBa, %r10 +.equ rBe, %r11 +.equ rBi, %r12 +.equ rBo, %r13 +.equ rBu, %r14 + +.equ rCa, %rsi +.equ rCe, %rbp +.equ rCi, rBi +.equ rCo, rBo +.equ rCu, %r15 + +.macro mKeccakRound iState, oState, rc, lastRound + + movq rCe, rDa + rolq rDa + + movq _bi(\iState), rCi + xorq _gi(\iState), rDi + xorq rCu, rDa + xorq _ki(\iState), rCi + xorq _mi(\iState), rDi + xorq rDi, rCi + + movq rCi, rDe + rolq rDe + + movq _bo(\iState), rCo + xorq _go(\iState), rDo + xorq rCa, rDe + xorq _ko(\iState), rCo + xorq _mo(\iState), rDo + xorq rDo, rCo + + movq rCo, rDi + rolq rDi + + movq rCu, rDo + xorq rCe, rDi + rolq rDo + + movq rCa, rDu + xorq rCi, rDo + rolq rDu + + movq _ba(\iState), rBa + movq _ge(\iState), rBe + xorq rCo, rDu + movq _ki(\iState), rBi + movq _mo(\iState), rBo + movq _su(\iState), rBu + xorq rDe, rBe + rolq $44, rBe + xorq rDi, rBi + xorq rDa, rBa + rolq $43, rBi + + movq rBe, rCa + movq $\rc, rT1 + orq rBi, rCa + xorq rBa, rT1 + xorq rT1, rCa + movq rCa, _ba(\oState) + + xorq rDu, rBu + rolq $14, rBu + movq rBa, rCu + andq rBe, rCu + xorq rBu, rCu + movq rCu, _bu(\oState) + + xorq rDo, rBo + rolq $21, rBo + movq rBo, rT1 + andq rBu, rT1 + xorq rBi, rT1 + movq rT1, _bi(\oState) + + notq 
rBi + orq rBa, rBu + orq rBo, rBi + xorq rBo, rBu + xorq rBe, rBi + movq rBu, _bo(\oState) + movq rBi, _be(\oState) + .if \lastRound == 0 + movq rBi, rCe + .endif + + + movq _gu(\iState), rBe + xorq rDu, rBe + movq _ka(\iState), rBi + rolq $20, rBe + xorq rDa, rBi + rolq $3, rBi + movq _bo(\iState), rBa + movq rBe, rT1 + orq rBi, rT1 + xorq rDo, rBa + movq _me(\iState), rBo + movq _si(\iState), rBu + rolq $28, rBa + xorq rBa, rT1 + movq rT1, _ga(\oState) + .if \lastRound == 0 + xor rT1, rCa + .endif + + xorq rDe, rBo + rolq $45, rBo + movq rBi, rT1 + andq rBo, rT1 + xorq rBe, rT1 + movq rT1, _ge(\oState) + .if \lastRound == 0 + xorq rT1, rCe + .endif + + xorq rDi, rBu + rolq $61, rBu + movq rBu, rT1 + orq rBa, rT1 + xorq rBo, rT1 + movq rT1, _go(\oState) + + andq rBe, rBa + xorq rBu, rBa + movq rBa, _gu(\oState) + notq rBu + .if \lastRound == 0 + xorq rBa, rCu + .endif + + orq rBu, rBo + xorq rBi, rBo + movq rBo, _gi(\oState) + + + movq _be(\iState), rBa + movq _gi(\iState), rBe + movq _ko(\iState), rBi + movq _mu(\iState), rBo + movq _sa(\iState), rBu + xorq rDi, rBe + rolq $6, rBe + xorq rDo, rBi + rolq $25, rBi + movq rBe, rT1 + orq rBi, rT1 + xorq rDe, rBa + rolq $1, rBa + xorq rBa, rT1 + movq rT1, _ka(\oState) + .if \lastRound == 0 + xor rT1, rCa + .endif + + xorq rDu, rBo + rolq $8, rBo + movq rBi, rT1 + andq rBo, rT1 + xorq rBe, rT1 + movq rT1, _ke(\oState) + .if \lastRound == 0 + xorq rT1, rCe + .endif + + xorq rDa, rBu + rolq $18, rBu + notq rBo + movq rBo, rT1 + andq rBu, rT1 + xorq rBi, rT1 + movq rT1, _ki(\oState) + + movq rBu, rT1 + orq rBa, rT1 + xorq rBo, rT1 + movq rT1, _ko(\oState) + + andq rBe, rBa + xorq rBu, rBa + movq rBa, _ku(\oState) + .if \lastRound == 0 + xorq rBa, rCu + .endif + + movq _ga(\iState), rBe + xorq rDa, rBe + movq _ke(\iState), rBi + rolq $36, rBe + xorq rDe, rBi + movq _bu(\iState), rBa + rolq $10, rBi + movq rBe, rT1 + movq _mi(\iState), rBo + andq rBi, rT1 + xorq rDu, rBa + movq _so(\iState), rBu + rolq $27, rBa + xorq rBa, 
rT1 + movq rT1, _ma(\oState) + .if \lastRound == 0 + xor rT1, rCa + .endif + + xorq rDi, rBo + rolq $15, rBo + movq rBi, rT1 + orq rBo, rT1 + xorq rBe, rT1 + movq rT1, _me(\oState) + .if \lastRound == 0 + xorq rT1, rCe + .endif + + xorq rDo, rBu + rolq $56, rBu + notq rBo + movq rBo, rT1 + orq rBu, rT1 + xorq rBi, rT1 + movq rT1, _mi(\oState) + + orq rBa, rBe + xorq rBu, rBe + movq rBe, _mu(\oState) + + andq rBa, rBu + xorq rBo, rBu + movq rBu, _mo(\oState) + .if \lastRound == 0 + xorq rBe, rCu + .endif + + + movq _bi(\iState), rBa + movq _go(\iState), rBe + movq _ku(\iState), rBi + xorq rDi, rBa + movq _ma(\iState), rBo + rolq $62, rBa + xorq rDo, rBe + movq _se(\iState), rBu + rolq $55, rBe + + xorq rDu, rBi + movq rBa, rDu + xorq rDe, rBu + rolq $2, rBu + andq rBe, rDu + xorq rBu, rDu + movq rDu, _su(\oState) + + rolq $39, rBi + .if \lastRound == 0 + xorq rDu, rCu + .endif + notq rBe + xorq rDa, rBo + movq rBe, rDa + andq rBi, rDa + xorq rBa, rDa + movq rDa, _sa(\oState) + .if \lastRound == 0 + xor rDa, rCa + .endif + + rolq $41, rBo + movq rBi, rDe + orq rBo, rDe + xorq rBe, rDe + movq rDe, _se(\oState) + .if \lastRound == 0 + xorq rDe, rCe + .endif + + movq rBo, rDi + movq rBu, rDo + andq rBu, rDi + orq rBa, rDo + xorq rBi, rDi + xorq rBo, rDo + movq rDi, _si(\oState) + movq rDo, _so(\oState) + + .endm + +.macro mKeccakPermutation + + subq $8*25, %rsp + + movq _ba(rpState), rCa + movq _be(rpState), rCe + movq _bu(rpState), rCu + + xorq _ga(rpState), rCa + xorq _ge(rpState), rCe + xorq _gu(rpState), rCu + + xorq _ka(rpState), rCa + xorq _ke(rpState), rCe + xorq _ku(rpState), rCu + + xorq _ma(rpState), rCa + xorq _me(rpState), rCe + xorq _mu(rpState), rCu + + xorq _sa(rpState), rCa + xorq _se(rpState), rCe + movq _si(rpState), rDi + movq _so(rpState), rDo + xorq _su(rpState), rCu + + + mKeccakRound rpState, rpStack, 0x0000000000000001, 0 + mKeccakRound rpStack, rpState, 0x0000000000008082, 0 + mKeccakRound rpState, rpStack, 0x800000000000808a, 0 + mKeccakRound 
rpStack, rpState, 0x8000000080008000, 0 + mKeccakRound rpState, rpStack, 0x000000000000808b, 0 + mKeccakRound rpStack, rpState, 0x0000000080000001, 0 + + mKeccakRound rpState, rpStack, 0x8000000080008081, 0 + mKeccakRound rpStack, rpState, 0x8000000000008009, 0 + mKeccakRound rpState, rpStack, 0x000000000000008a, 0 + mKeccakRound rpStack, rpState, 0x0000000000000088, 0 + mKeccakRound rpState, rpStack, 0x0000000080008009, 0 + mKeccakRound rpStack, rpState, 0x000000008000000a, 0 + + mKeccakRound rpState, rpStack, 0x000000008000808b, 0 + mKeccakRound rpStack, rpState, 0x800000000000008b, 0 + mKeccakRound rpState, rpStack, 0x8000000000008089, 0 + mKeccakRound rpStack, rpState, 0x8000000000008003, 0 + mKeccakRound rpState, rpStack, 0x8000000000008002, 0 + mKeccakRound rpStack, rpState, 0x8000000000000080, 0 + + mKeccakRound rpState, rpStack, 0x000000000000800a, 0 + mKeccakRound rpStack, rpState, 0x800000008000000a, 0 + mKeccakRound rpState, rpStack, 0x8000000080008081, 0 + mKeccakRound rpStack, rpState, 0x8000000000008080, 0 + mKeccakRound rpState, rpStack, 0x0000000080000001, 0 + mKeccakRound rpStack, rpState, 0x8000000080008008, 1 + + addq $8*25, %rsp + + .endm + +.macro mPushRegs + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + .endm + + +.macro mPopRegs + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + + .endm + + +.macro mXorState128 input, state, offset + .if UseSIMD == 0 + movq \offset(\input), %rax + movq \offset+8(\input), %rcx + xorq %rax, \offset(\state) + xorq %rcx, \offset+8(\state) + .else + movdqu \offset(\input), %xmm0 + pxor \offset(\state), %xmm0 + movdqu %xmm0, \offset(\state) + .endif + .endm + +.macro mXorState256 input, state, offset + .if UseSIMD == 0 + movq \offset(\input), %rax + movq \offset+8(\input), %r10 + movq \offset+16(\input), %rcx + movq \offset+24(\input), %r8 + xorq %rax, \offset(\state) + xorq %r10, \offset+8(\state) + xorq %rcx, \offset+16(\state) + xorq %r8, 
\offset+24(\state) + .else + movdqu \offset(\input), %xmm0 + pxor \offset(\state), %xmm0 + movdqu \offset+16(\input), %xmm1 + pxor \offset+16(\state), %xmm1 + movdqu %xmm0, \offset(\state) + movdqu %xmm1, \offset+16(\state) + .endif + .endm + +.macro mXorState512 input, state, offset + .if UseSIMD == 0 + mXorState256 \input, \state, \offset + mXorState256 \input, \state, \offset+32 + .else + movdqu \offset(\input), %xmm0 + movdqu \offset+16(\input), %xmm1 + pxor \offset(\state), %xmm0 + movdqu \offset+32(\input), %xmm2 + pxor \offset+16(\state), %xmm1 + movdqu %xmm0, \offset(\state) + movdqu \offset+48(\input), %xmm3 + pxor \offset+32(\state), %xmm2 + movdqu %xmm1, \offset+16(\state) + pxor \offset+48(\state), %xmm3 + movdqu %xmm2, \offset+32(\state) + movdqu %xmm3, \offset+48(\state) + .endif + .endm + +# ------------------------------------------------------------------------- + + .size KeccakPermutation, .-KeccakPermutation + .align 2 + .global KeccakPermutation + .type KeccakPermutation, %function +KeccakPermutation: + + mPushRegs + mKeccakPermutation + mPopRegs + ret + +# ------------------------------------------------------------------------- + + .size KeccakAbsorb576bits, .-KeccakAbsorb576bits + .align 2 + .global KeccakAbsorb576bits + .type KeccakAbsorb576bits, %function +KeccakAbsorb576bits: + + mXorState512 apInput, apState, 0 + movq 64(apInput), %rax + xorq %rax, 64(apState) + mPushRegs + mKeccakPermutation + mPopRegs + ret + +# ------------------------------------------------------------------------- + + .size KeccakAbsorb832bits, .-KeccakAbsorb832bits + .align 2 + .global KeccakAbsorb832bits + .type KeccakAbsorb832bits, %function +KeccakAbsorb832bits: + + mXorState512 apInput, apState, 0 + mXorState256 apInput, apState, 64 + movq 96(apInput), %rax + xorq %rax, 96(apState) + mPushRegs + mKeccakPermutation + mPopRegs + ret + +# ------------------------------------------------------------------------- + + .size KeccakAbsorb1024bits, 
.-KeccakAbsorb1024bits + .align 2 + .global KeccakAbsorb1024bits + .type KeccakAbsorb1024bits, %function +KeccakAbsorb1024bits: + + mXorState512 apInput, apState, 0 + mXorState512 apInput, apState, 64 + mPushRegs + mKeccakPermutation + mPopRegs + ret + +# ------------------------------------------------------------------------- + + .size KeccakAbsorb1088bits, .-KeccakAbsorb1088bits + .align 2 + .global KeccakAbsorb1088bits + .type KeccakAbsorb1088bits, %function +KeccakAbsorb1088bits: + + mXorState512 apInput, apState, 0 + mXorState512 apInput, apState, 64 + movq 128(apInput), %rax + xorq %rax, 128(apState) + mPushRegs + mKeccakPermutation + mPopRegs + ret + +# ------------------------------------------------------------------------- + + .size KeccakAbsorb1152bits, .-KeccakAbsorb1152bits + .align 2 + .global KeccakAbsorb1152bits + .type KeccakAbsorb1152bits, %function +KeccakAbsorb1152bits: + + mXorState512 apInput, apState, 0 + mXorState512 apInput, apState, 64 + mXorState128 apInput, apState, 128 + mPushRegs + mKeccakPermutation + mPopRegs + ret + +# ------------------------------------------------------------------------- + + .size KeccakAbsorb1344bits, .-KeccakAbsorb1344bits + .align 2 + .global KeccakAbsorb1344bits + .type KeccakAbsorb1344bits, %function +KeccakAbsorb1344bits: + + mXorState512 apInput, apState, 0 + mXorState512 apInput, apState, 64 + mXorState256 apInput, apState, 128 + movq 160(apInput), %rax + xorq %rax, 160(apState) + mPushRegs + mKeccakPermutation + mPopRegs + ret + +# ------------------------------------------------------------------------- + + .size KeccakAbsorb, .-KeccakAbsorb + .align 2 + .global KeccakAbsorb + .type KeccakAbsorb, %function +KeccakAbsorb: + + movq apState, xpState + + test $16, aNbrWords + jz xorInputToState8 + mXorState512 apInput, xpState, 0 + mXorState512 apInput, xpState, 64 + addq $128, apInput + addq $128, xpState + +xorInputToState8: + test $8, aNbrWords + jz xorInputToState4 + mXorState512 apInput, xpState, 0 + 
addq $64, apInput + addq $64, xpState + +xorInputToState4: + test $4, aNbrWords + jz xorInputToState2 + mXorState256 apInput, xpState, 0 + addq $32, apInput + addq $32, xpState + +xorInputToState2: + test $2, aNbrWords + jz xorInputToState1 + mXorState128 apInput, xpState, 0 + addq $16, apInput + addq $16, xpState + +xorInputToState1: + test $1, aNbrWords + jz xorInputToStateDone + movq (apInput), %rax + xorq %rax, (xpState) + +xorInputToStateDone: + + mPushRegs + mKeccakPermutation + mPopRegs + ret + +# ------------------------------------------------------------------------- + + .size KeccakInitializeState, .-KeccakInitializeState + .align 2 + .global KeccakInitializeState + .type KeccakInitializeState, %function +KeccakInitializeState: + xorq %rax, %rax + xorq %rcx, %rcx + notq %rcx + + .if UseSIMD == 0 + movq %rax, 0*8(apState) + movq %rcx, 1*8(apState) + movq %rcx, 2*8(apState) + movq %rax, 3*8(apState) + movq %rax, 4*8(apState) + movq %rax, 5*8(apState) + movq %rax, 6*8(apState) + movq %rax, 7*8(apState) + movq %rcx, 8*8(apState) + movq %rax, 9*8(apState) + movq %rax, 10*8(apState) + movq %rax, 11*8(apState) + movq %rcx, 12*8(apState) + movq %rax, 13*8(apState) + movq %rax, 14*8(apState) + movq %rax, 15*8(apState) + movq %rax, 16*8(apState) + movq %rcx, 17*8(apState) + movq %rax, 18*8(apState) + movq %rax, 19*8(apState) + movq %rcx, 20*8(apState) + movq %rax, 21*8(apState) + movq %rax, 22*8(apState) + movq %rax, 23*8(apState) + movq %rax, 24*8(apState) + .else + pxor %xmm0, %xmm0 + + movq %rax, 0*8(apState) + movq %rcx, 1*8(apState) + movq %rcx, 2*8(apState) + movq %rax, 3*8(apState) + movdqu %xmm0, 4*8(apState) + movdqu %xmm0, 6*8(apState) + movq %rcx, 8*8(apState) + movq %rax, 9*8(apState) + movdqu %xmm0, 10*8(apState) + movq %rcx, 12*8(apState) + movq %rax, 13*8(apState) + movdqu %xmm0, 14*8(apState) + movq %rax, 16*8(apState) + movq %rcx, 17*8(apState) + movdqu %xmm0, 18*8(apState) + movq %rcx, 20*8(apState) + movq %rax, 21*8(apState) + movdqu %xmm0, 
22*8(apState) + movq %rax, 24*8(apState) + .endif + ret + +# ------------------------------------------------------------------------- + + .size KeccakExtract1024bits, .-KeccakExtract1024bits + .align 2 + .global KeccakExtract1024bits + .type KeccakExtract1024bits, %function +KeccakExtract1024bits: + + movq 0*8(apState), %rax + movq 1*8(apState), %rcx + movq 2*8(apState), %rdx + movq 3*8(apState), %r8 + notq %rcx + notq %rdx + movq %rax, 0*8(%rsi) + movq %rcx, 1*8(%rsi) + movq %rdx, 2*8(%rsi) + movq %r8, 3*8(%rsi) + + movq 4*8(apState), %rax + movq 5*8(apState), %rcx + movq 6*8(apState), %rdx + movq 7*8(apState), %r8 + movq %rax, 4*8(%rsi) + movq %rcx, 5*8(%rsi) + movq %rdx, 6*8(%rsi) + movq %r8, 7*8(%rsi) + + movq 8*8(apState), %rax + movq 9*8(apState), %rcx + movq 10*8(apState), %rdx + movq 11*8(apState), %r8 + notq %rax + movq %rax, 8*8(%rsi) + movq %rcx, 9*8(%rsi) + movq %rdx, 10*8(%rsi) + movq %r8, 11*8(%rsi) + + movq 12*8(apState), %rax + movq 13*8(apState), %rcx + movq 14*8(apState), %rdx + movq 15*8(apState), %r8 + notq %rax + movq %rax, 12*8(%rsi) + movq %rcx, 13*8(%rsi) + movq %rdx, 14*8(%rsi) + movq %r8, 15*8(%rsi) + ret + diff --git a/c_src/KeccakF-1600-x86-64-shld-gas.s b/c_src/KeccakF-1600-x86-64-shld-gas.s new file mode 100755 index 0000000..bc84762 --- /dev/null +++ b/c_src/KeccakF-1600-x86-64-shld-gas.s @@ -0,0 +1,766 @@ +# +# The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +# Michaël Peeters and Gilles Van Assche. For more information, feedback or +# questions, please refer to our website: http://keccak.noekeon.org/ +# +# Implementation by Ronny Van Keer, +# hereby denoted as "the implementer". +# +# To the extent possible under law, the implementer has waived all copyright +# and related or neighboring rights to the source code in this file. 
+# http://creativecommons.org/publicdomain/zero/1.0/ +# + + .text + + +#// --- defines + +.equ UseSIMD, 1 + + +.equ _ba, 0*8 +.equ _be, 1*8 +.equ _bi, 2*8 +.equ _bo, 3*8 +.equ _bu, 4*8 +.equ _ga, 5*8 +.equ _ge, 6*8 +.equ _gi, 7*8 +.equ _go, 8*8 +.equ _gu, 9*8 +.equ _ka, 10*8 +.equ _ke, 11*8 +.equ _ki, 12*8 +.equ _ko, 13*8 +.equ _ku, 14*8 +.equ _ma, 15*8 +.equ _me, 16*8 +.equ _mi, 17*8 +.equ _mo, 18*8 +.equ _mu, 19*8 +.equ _sa, 20*8 +.equ _se, 21*8 +.equ _si, 22*8 +.equ _so, 23*8 +.equ _su, 24*8 + + +# arguments +.equ apState, %rdi +.equ apInput, %rsi +.equ aNbrWords, %rdx + +# xor input into state section +.equ xpState, %r9 + +# round vars +.equ rT1, %rax +.equ rpState, %rdi +.equ rpStack, %rsp + +.equ rDa, %rbx +.equ rDe, %rcx +.equ rDi, %rdx +.equ rDo, %r8 +.equ rDu, %r9 + +.equ rBa, %r10 +.equ rBe, %r11 +.equ rBi, %r12 +.equ rBo, %r13 +.equ rBu, %r14 + +.equ rCa, %rsi +.equ rCe, %rbp +.equ rCi, rBi +.equ rCo, rBo +.equ rCu, %r15 + +.macro mKeccakRound iState, oState, rc, lastRound + + movq rCe, rDa + shld $1, rDa, rDa + + movq _bi(\iState), rCi + xorq _gi(\iState), rDi + xorq _ki(\iState), rCi + xorq rCu, rDa + xorq _mi(\iState), rDi + xorq rDi, rCi + + movq rCi, rDe + shld $1, rDe, rDe + + movq _bo(\iState), rCo + xorq _go(\iState), rDo + xorq _ko(\iState), rCo + xorq rCa, rDe + xorq _mo(\iState), rDo + xorq rDo, rCo + + movq rCo, rDi + shld $1, rDi, rDi + + movq rCu, rDo + xorq rCe, rDi + shld $1, rDo, rDo + + movq rCa, rDu + xorq rCi, rDo + shld $1, rDu, rDu + + movq _ba(\iState), rBa + movq _ge(\iState), rBe + xorq rCo, rDu + movq _ki(\iState), rBi + movq _mo(\iState), rBo + movq _su(\iState), rBu + xorq rDe, rBe + shld $44, rBe, rBe + xorq rDi, rBi + xorq rDa, rBa + shld $43, rBi, rBi + + movq rBe, rCa + movq $\rc, rT1 + orq rBi, rCa + xorq rBa, rT1 + xorq rT1, rCa + movq rCa, _ba(\oState) + + xorq rDu, rBu + shld $14, rBu, rBu + movq rBa, rCu + andq rBe, rCu + xorq rBu, rCu + movq rCu, _bu(\oState) + + xorq rDo, rBo + shld $21, rBo, rBo + movq rBo, rT1 + 
andq rBu, rT1 + xorq rBi, rT1 + movq rT1, _bi(\oState) + + notq rBi + orq rBa, rBu + orq rBo, rBi + xorq rBo, rBu + xorq rBe, rBi + movq rBu, _bo(\oState) + movq rBi, _be(\oState) + .if \lastRound == 0 + movq rBi, rCe + .endif + + + movq _gu(\iState), rBe + xorq rDu, rBe + movq _ka(\iState), rBi + shld $20, rBe, rBe + xorq rDa, rBi + shld $3, rBi, rBi + movq _bo(\iState), rBa + movq rBe, rT1 + orq rBi, rT1 + xorq rDo, rBa + movq _me(\iState), rBo + movq _si(\iState), rBu + shld $28, rBa, rBa + xorq rBa, rT1 + movq rT1, _ga(\oState) + .if \lastRound == 0 + xor rT1, rCa + .endif + + xorq rDe, rBo + shld $45, rBo, rBo + movq rBi, rT1 + andq rBo, rT1 + xorq rBe, rT1 + movq rT1, _ge(\oState) + .if \lastRound == 0 + xorq rT1, rCe + .endif + + xorq rDi, rBu + shld $61, rBu, rBu + movq rBu, rT1 + orq rBa, rT1 + xorq rBo, rT1 + movq rT1, _go(\oState) + + andq rBe, rBa + xorq rBu, rBa + movq rBa, _gu(\oState) + notq rBu + .if \lastRound == 0 + xorq rBa, rCu + .endif + + orq rBu, rBo + xorq rBi, rBo + movq rBo, _gi(\oState) + + + movq _be(\iState), rBa + movq _gi(\iState), rBe + movq _ko(\iState), rBi + movq _mu(\iState), rBo + movq _sa(\iState), rBu + xorq rDi, rBe + shld $6, rBe, rBe + xorq rDo, rBi + shld $25, rBi, rBi + movq rBe, rT1 + orq rBi, rT1 + xorq rDe, rBa + shld $1, rBa, rBa + xorq rBa, rT1 + movq rT1, _ka(\oState) + .if \lastRound == 0 + xor rT1, rCa + .endif + + xorq rDu, rBo + shld $8, rBo, rBo + movq rBi, rT1 + andq rBo, rT1 + xorq rBe, rT1 + movq rT1, _ke(\oState) + .if \lastRound == 0 + xorq rT1, rCe + .endif + + xorq rDa, rBu + shld $18, rBu, rBu + notq rBo + movq rBo, rT1 + andq rBu, rT1 + xorq rBi, rT1 + movq rT1, _ki(\oState) + + movq rBu, rT1 + orq rBa, rT1 + xorq rBo, rT1 + movq rT1, _ko(\oState) + + andq rBe, rBa + xorq rBu, rBa + movq rBa, _ku(\oState) + .if \lastRound == 0 + xorq rBa, rCu + .endif + + movq _ga(\iState), rBe + xorq rDa, rBe + movq _ke(\iState), rBi + shld $36, rBe, rBe + xorq rDe, rBi + movq _bu(\iState), rBa + shld $10, rBi, rBi + 
movq rBe, rT1 + movq _mi(\iState), rBo + andq rBi, rT1 + xorq rDu, rBa + movq _so(\iState), rBu + shld $27, rBa, rBa + xorq rBa, rT1 + movq rT1, _ma(\oState) + .if \lastRound == 0 + xor rT1, rCa + .endif + + xorq rDi, rBo + shld $15, rBo, rBo + movq rBi, rT1 + orq rBo, rT1 + xorq rBe, rT1 + movq rT1, _me(\oState) + .if \lastRound == 0 + xorq rT1, rCe + .endif + + xorq rDo, rBu + shld $56, rBu, rBu + notq rBo + movq rBo, rT1 + orq rBu, rT1 + xorq rBi, rT1 + movq rT1, _mi(\oState) + + orq rBa, rBe + xorq rBu, rBe + movq rBe, _mu(\oState) + + andq rBa, rBu + xorq rBo, rBu + movq rBu, _mo(\oState) + .if \lastRound == 0 + xorq rBe, rCu + .endif + + + movq _bi(\iState), rBa + movq _go(\iState), rBe + movq _ku(\iState), rBi + xorq rDi, rBa + movq _ma(\iState), rBo + shld $62, rBa, rBa + xorq rDo, rBe + movq _se(\iState), rBu + shld $55, rBe, rBe + + xorq rDu, rBi + movq rBa, rDu + xorq rDe, rBu + shld $2, rBu, rBu + andq rBe, rDu + xorq rBu, rDu + movq rDu, _su(\oState) + + shld $39, rBi, rBi + .if \lastRound == 0 + xorq rDu, rCu + .endif + notq rBe + xorq rDa, rBo + movq rBe, rDa + andq rBi, rDa + xorq rBa, rDa + movq rDa, _sa(\oState) + .if \lastRound == 0 + xor rDa, rCa + .endif + + shld $41, rBo, rBo + movq rBi, rDe + orq rBo, rDe + xorq rBe, rDe + movq rDe, _se(\oState) + .if \lastRound == 0 + xorq rDe, rCe + .endif + + movq rBo, rDi + movq rBu, rDo + andq rBu, rDi + orq rBa, rDo + xorq rBi, rDi + xorq rBo, rDo + movq rDi, _si(\oState) + movq rDo, _so(\oState) + + .endm + +.macro mKeccakPermutation + + subq $8*25, %rsp + + movq _ba(rpState), rCa + movq _be(rpState), rCe + movq _bu(rpState), rCu + + xorq _ga(rpState), rCa + xorq _ge(rpState), rCe + xorq _gu(rpState), rCu + + xorq _ka(rpState), rCa + xorq _ke(rpState), rCe + xorq _ku(rpState), rCu + + xorq _ma(rpState), rCa + xorq _me(rpState), rCe + xorq _mu(rpState), rCu + + xorq _sa(rpState), rCa + xorq _se(rpState), rCe + movq _si(rpState), rDi + movq _so(rpState), rDo + xorq _su(rpState), rCu + + + mKeccakRound 
rpState, rpStack, 0x0000000000000001, 0 + mKeccakRound rpStack, rpState, 0x0000000000008082, 0 + mKeccakRound rpState, rpStack, 0x800000000000808a, 0 + mKeccakRound rpStack, rpState, 0x8000000080008000, 0 + mKeccakRound rpState, rpStack, 0x000000000000808b, 0 + mKeccakRound rpStack, rpState, 0x0000000080000001, 0 + + mKeccakRound rpState, rpStack, 0x8000000080008081, 0 + mKeccakRound rpStack, rpState, 0x8000000000008009, 0 + mKeccakRound rpState, rpStack, 0x000000000000008a, 0 + mKeccakRound rpStack, rpState, 0x0000000000000088, 0 + mKeccakRound rpState, rpStack, 0x0000000080008009, 0 + mKeccakRound rpStack, rpState, 0x000000008000000a, 0 + + mKeccakRound rpState, rpStack, 0x000000008000808b, 0 + mKeccakRound rpStack, rpState, 0x800000000000008b, 0 + mKeccakRound rpState, rpStack, 0x8000000000008089, 0 + mKeccakRound rpStack, rpState, 0x8000000000008003, 0 + mKeccakRound rpState, rpStack, 0x8000000000008002, 0 + mKeccakRound rpStack, rpState, 0x8000000000000080, 0 + + mKeccakRound rpState, rpStack, 0x000000000000800a, 0 + mKeccakRound rpStack, rpState, 0x800000008000000a, 0 + mKeccakRound rpState, rpStack, 0x8000000080008081, 0 + mKeccakRound rpStack, rpState, 0x8000000000008080, 0 + mKeccakRound rpState, rpStack, 0x0000000080000001, 0 + mKeccakRound rpStack, rpState, 0x8000000080008008, 1 + + addq $8*25, %rsp + + .endm + +.macro mPushRegs + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + .endm + + +.macro mPopRegs + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + + .endm + + +.macro mXorState128 input, state, offset + .if UseSIMD == 0 + movq \offset(\input), %rax + movq \offset+8(\input), %rcx + xorq %rax, \offset(\state) + xorq %rcx, \offset+8(\state) + .else + movdqu \offset(\input), %xmm0 + pxor \offset(\state), %xmm0 + movdqu %xmm0, \offset(\state) + .endif + .endm + +.macro mXorState256 input, state, offset + .if UseSIMD == 0 + movq \offset(\input), %rax + movq \offset+8(\input), %r10 + movq 
\offset+16(\input), %rcx + movq \offset+24(\input), %r8 + xorq %rax, \offset(\state) + xorq %r10, \offset+8(\state) + xorq %rcx, \offset+16(\state) + xorq %r8, \offset+24(\state) + .else + movdqu \offset(\input), %xmm0 + pxor \offset(\state), %xmm0 + movdqu \offset+16(\input), %xmm1 + pxor \offset+16(\state), %xmm1 + movdqu %xmm0, \offset(\state) + movdqu %xmm1, \offset+16(\state) + .endif + .endm + +.macro mXorState512 input, state, offset + .if UseSIMD == 0 + mXorState256 \input, \state, \offset + mXorState256 \input, \state, \offset+32 + .else + movdqu \offset(\input), %xmm0 + movdqu \offset+16(\input), %xmm1 + pxor \offset(\state), %xmm0 + movdqu \offset+32(\input), %xmm2 + pxor \offset+16(\state), %xmm1 + movdqu %xmm0, \offset(\state) + movdqu \offset+48(\input), %xmm3 + pxor \offset+32(\state), %xmm2 + movdqu %xmm1, \offset+16(\state) + pxor \offset+48(\state), %xmm3 + movdqu %xmm2, \offset+32(\state) + movdqu %xmm3, \offset+48(\state) + .endif + .endm + +# ------------------------------------------------------------------------- + + .size KeccakPermutation, .-KeccakPermutation + .align 2 + .global KeccakPermutation + .type KeccakPermutation, %function +KeccakPermutation: + + mPushRegs + mKeccakPermutation + mPopRegs + ret + +# ------------------------------------------------------------------------- + + .size KeccakAbsorb576bits, .-KeccakAbsorb576bits + .align 2 + .global KeccakAbsorb576bits + .type KeccakAbsorb576bits, %function +KeccakAbsorb576bits: + + mXorState512 apInput, apState, 0 + movq 64(apInput), %rax + xorq %rax, 64(apState) + mPushRegs + mKeccakPermutation + mPopRegs + ret + +# ------------------------------------------------------------------------- + + .size KeccakAbsorb832bits, .-KeccakAbsorb832bits + .align 2 + .global KeccakAbsorb832bits + .type KeccakAbsorb832bits, %function +KeccakAbsorb832bits: + + mXorState512 apInput, apState, 0 + mXorState256 apInput, apState, 64 + movq 96(apInput), %rax + xorq %rax, 96(apState) + mPushRegs + 
mKeccakPermutation + mPopRegs + ret + +# ------------------------------------------------------------------------- + + .size KeccakAbsorb1024bits, .-KeccakAbsorb1024bits + .align 2 + .global KeccakAbsorb1024bits + .type KeccakAbsorb1024bits, %function +KeccakAbsorb1024bits: + + mXorState512 apInput, apState, 0 + mXorState512 apInput, apState, 64 + mPushRegs + mKeccakPermutation + mPopRegs + ret + +# ------------------------------------------------------------------------- + + .size KeccakAbsorb1088bits, .-KeccakAbsorb1088bits + .align 2 + .global KeccakAbsorb1088bits + .type KeccakAbsorb1088bits, %function +KeccakAbsorb1088bits: + + mXorState512 apInput, apState, 0 + mXorState512 apInput, apState, 64 + movq 128(apInput), %rax + xorq %rax, 128(apState) + mPushRegs + mKeccakPermutation + mPopRegs + ret + +# ------------------------------------------------------------------------- + + .size KeccakAbsorb1152bits, .-KeccakAbsorb1152bits + .align 2 + .global KeccakAbsorb1152bits + .type KeccakAbsorb1152bits, %function +KeccakAbsorb1152bits: + + mXorState512 apInput, apState, 0 + mXorState512 apInput, apState, 64 + mXorState128 apInput, apState, 128 + mPushRegs + mKeccakPermutation + mPopRegs + ret + +# ------------------------------------------------------------------------- + + .size KeccakAbsorb1344bits, .-KeccakAbsorb1344bits + .align 2 + .global KeccakAbsorb1344bits + .type KeccakAbsorb1344bits, %function +KeccakAbsorb1344bits: + + mXorState512 apInput, apState, 0 + mXorState512 apInput, apState, 64 + mXorState256 apInput, apState, 128 + movq 160(apInput), %rax + xorq %rax, 160(apState) + mPushRegs + mKeccakPermutation + mPopRegs + ret + +# ------------------------------------------------------------------------- + + .size KeccakAbsorb, .-KeccakAbsorb + .align 2 + .global KeccakAbsorb + .type KeccakAbsorb, %function +KeccakAbsorb: + + movq apState, xpState + + test $16, aNbrWords + jz xorInputToState8 + mXorState512 apInput, xpState, 0 + mXorState512 apInput, 
xpState, 64 + addq $128, apInput + addq $128, xpState + +xorInputToState8: + test $8, aNbrWords + jz xorInputToState4 + mXorState512 apInput, xpState, 0 + addq $64, apInput + addq $64, xpState + +xorInputToState4: + test $4, aNbrWords + jz xorInputToState2 + mXorState256 apInput, xpState, 0 + addq $32, apInput + addq $32, xpState + +xorInputToState2: + test $2, aNbrWords + jz xorInputToState1 + mXorState128 apInput, xpState, 0 + addq $16, apInput + addq $16, xpState + +xorInputToState1: + test $1, aNbrWords + jz xorInputToStateDone + movq (apInput), %rax + xorq %rax, (xpState) + +xorInputToStateDone: + + mPushRegs + mKeccakPermutation + mPopRegs + ret + +# ------------------------------------------------------------------------- + + .size KeccakInitializeState, .-KeccakInitializeState + .align 2 + .global KeccakInitializeState + .type KeccakInitializeState, %function +KeccakInitializeState: + xorq %rax, %rax + xorq %rcx, %rcx + notq %rcx + + .if UseSIMD == 0 + movq %rax, 0*8(apState) + movq %rcx, 1*8(apState) + movq %rcx, 2*8(apState) + movq %rax, 3*8(apState) + movq %rax, 4*8(apState) + movq %rax, 5*8(apState) + movq %rax, 6*8(apState) + movq %rax, 7*8(apState) + movq %rcx, 8*8(apState) + movq %rax, 9*8(apState) + movq %rax, 10*8(apState) + movq %rax, 11*8(apState) + movq %rcx, 12*8(apState) + movq %rax, 13*8(apState) + movq %rax, 14*8(apState) + movq %rax, 15*8(apState) + movq %rax, 16*8(apState) + movq %rcx, 17*8(apState) + movq %rax, 18*8(apState) + movq %rax, 19*8(apState) + movq %rcx, 20*8(apState) + movq %rax, 21*8(apState) + movq %rax, 22*8(apState) + movq %rax, 23*8(apState) + movq %rax, 24*8(apState) + .else + pxor %xmm0, %xmm0 + + movq %rax, 0*8(apState) + movq %rcx, 1*8(apState) + movq %rcx, 2*8(apState) + movq %rax, 3*8(apState) + movdqu %xmm0, 4*8(apState) + movdqu %xmm0, 6*8(apState) + movq %rcx, 8*8(apState) + movq %rax, 9*8(apState) + movdqu %xmm0, 10*8(apState) + movq %rcx, 12*8(apState) + movq %rax, 13*8(apState) + movdqu %xmm0, 14*8(apState) + 
movq %rax, 16*8(apState) + movq %rcx, 17*8(apState) + movdqu %xmm0, 18*8(apState) + movq %rcx, 20*8(apState) + movq %rax, 21*8(apState) + movdqu %xmm0, 22*8(apState) + movq %rax, 24*8(apState) + .endif + ret + +# ------------------------------------------------------------------------- + + .size KeccakExtract1024bits, .-KeccakExtract1024bits + .align 2 + .global KeccakExtract1024bits + .type KeccakExtract1024bits, %function +KeccakExtract1024bits: + + movq 0*8(apState), %rax + movq 1*8(apState), %rcx + movq 2*8(apState), %rdx + movq 3*8(apState), %r8 + notq %rcx + notq %rdx + movq %rax, 0*8(%rsi) + movq %rcx, 1*8(%rsi) + movq %rdx, 2*8(%rsi) + movq %r8, 3*8(%rsi) + + movq 4*8(apState), %rax + movq 5*8(apState), %rcx + movq 6*8(apState), %rdx + movq 7*8(apState), %r8 + movq %rax, 4*8(%rsi) + movq %rcx, 5*8(%rsi) + movq %rdx, 6*8(%rsi) + movq %r8, 7*8(%rsi) + + movq 8*8(apState), %rax + movq 9*8(apState), %rcx + movq 10*8(apState), %rdx + movq 11*8(apState), %r8 + notq %rax + movq %rax, 8*8(%rsi) + movq %rcx, 9*8(%rsi) + movq %rdx, 10*8(%rsi) + movq %r8, 11*8(%rsi) + + movq 12*8(apState), %rax + movq 13*8(apState), %rcx + movq 14*8(apState), %rdx + movq 15*8(apState), %r8 + notq %rax + movq %rax, 12*8(%rsi) + movq %rcx, 13*8(%rsi) + movq %rdx, 14*8(%rsi) + movq %r8, 15*8(%rsi) + ret + diff --git a/c_src/KeccakF-1600-xop.macros b/c_src/KeccakF-1600-xop.macros new file mode 100755 index 0000000..e5d6514 --- /dev/null +++ b/c_src/KeccakF-1600-xop.macros @@ -0,0 +1,573 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by the designers, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#define declareABCDE \ + V128 Abage, Abegi, Abigo, Abogu, Abuga; \ + V128 Akame, Akemi, Akimo, Akomu, Akuma; \ + V128 Abae, Abio, Agae, Agio, Akae, Akio, Amae, Amio; \ + V64 Aba, Abe, Abi, Abo, Abu; \ + V64 Aga, Age, Agi, Ago, Agu; \ + V64 Aka, Ake, Aki, Ako, Aku; \ + V64 Ama, Ame, Ami, Amo, Amu; \ + V128 Asase, Asiso; \ + V64 Asu; \ + V128 Bbage, Bbegi, Bbigo, Bbogu, Bbuga; \ + V128 Bkame, Bkemi, Bkimo, Bkomu, Bkuma; \ + V128 Bsase, Bsesi, Bsiso, Bsosu, Bsusa; \ + V128 Cae, Cei, Cio, Cou, Cua; \ + V128 Dau, Dea, Die, Doi, Duo; \ + V128 Dua, Dae, Dei, Dio, Dou; \ + V128 Ebage, Ebegi, Ebigo, Ebogu, Ebuga; \ + V128 Ekame, Ekemi, Ekimo, Ekomu, Ekuma; \ + V128 Esase, Esiso; \ + V64 Esu; \ + V128 Zero; + +#define prepareTheta + +#define computeD \ + Cua = GET64LOLO(Cua, Cae); \ + Dei = XOR128(Cae, ROL6464same(Cio, 1)); \ + Dou = XOR128(Cio, ROL6464same(Cua, 1)); \ + Cei = GET64HILO(Cae, Cio); \ + Dae = XOR128(Cua, ROL6464same(Cei, 1)); \ + Dau = GET64LOHI(Dae, Dou); \ + Dea = SWAP64(Dae); \ + Die = SWAP64(Dei); \ + Doi = GET64LOLO(Dou, Die); \ + Duo = SWAP64(Dou); + +// --- Theta Rho Pi Chi Iota Prepare-theta +// --- 64-bit lanes mapped to 64-bit and 128-bit words +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + computeD \ + \ + Bbage = XOR128(GET64LOHI(A##bage, A##bogu), Dau); \ + Bbage = ROL6464(Bbage, 0, 20); \ + Bbegi = XOR128(GET64HILO(A##bage, A##kame), Dea); \ + Bbegi = ROL6464(Bbegi, 44, 3); \ + Bbigo = XOR128(GET64LOHI(A##kimo, A##kame), Die); \ + Bbigo = ROL6464(Bbigo, 43, 45); \ + E##bage = XOR128(Bbage, ANDnu128(Bbegi, Bbigo)); \ + XOReq128(E##bage, CONST64(KeccakF1600RoundConstants[i])); \ + Cae = E##bage; \ + Bbogu = XOR128(GET64HILO(A##kimo, A##siso), Doi); \ + Bbogu = ROL6464(Bbogu, 21, 61); \ + E##begi = XOR128(Bbegi, ANDnu128(Bbigo, Bbogu)); \ + Cei = E##begi; \ + Bbuga = XOR128(GET64LOLO(A##su, A##bogu), Duo); \ + Bbuga = ROL6464(Bbuga, 14, 28); \ + E##bigo = XOR128(Bbigo, ANDnu128(Bbogu, 
Bbuga)); \ + Cio = E##bigo; \ + E##bogu = XOR128(Bbogu, ANDnu128(Bbuga, Bbage)); \ + Cou = E##bogu; \ + E##buga = XOR128(Bbuga, ANDnu128(Bbage, Bbegi)); \ + Cua = E##buga; \ +\ + Bkame = XOR128(GET64LOHI(A##begi, A##buga), Dea); \ + Bkame = ROL6464(Bkame, 1, 36); \ + Bkemi = XOR128(GET64HILO(A##begi, A##kemi), Die); \ + Bkemi = ROL6464(Bkemi, 6, 10); \ + Bkimo = XOR128(GET64LOHI(A##komu, A##kemi), Doi); \ + Bkimo = ROL6464(Bkimo, 25, 15); \ + E##kame = XOR128(Bkame, ANDnu128(Bkemi, Bkimo)); \ + XOReq128(Cae, E##kame); \ + Bkomu = XOR128(GET64HIHI(A##komu, A##siso), Duo); \ + Bkomu = ROL6464(Bkomu, 8, 56); \ + E##kemi = XOR128(Bkemi, ANDnu128(Bkimo, Bkomu)); \ + XOReq128(Cei, E##kemi); \ + Bkuma = XOR128(GET64LOLO(A##sase, A##buga), Dau); \ + Bkuma = ROL6464(Bkuma, 18, 27); \ + E##kimo = XOR128(Bkimo, ANDnu128(Bkomu, Bkuma)); \ + XOReq128(Cio, E##kimo); \ + E##komu = XOR128(Bkomu, ANDnu128(Bkuma, Bkame)); \ + XOReq128(Cou, E##komu); \ + E##kuma = XOR128(Bkuma, ANDnu128(Bkame, Bkemi)); \ + XOReq128(Cua, E##kuma); \ +\ + Bsase = XOR128(A##bigo, SWAP64(Doi)); \ + Bsase = ROL6464(Bsase, 62, 55); \ + Bsiso = XOR128(A##kuma, SWAP64(Dau)); \ + Bsiso = ROL6464(Bsiso, 39, 41); \ + Bsusa = XOR64(COPY64HI2LO(A##sase), Dei); \ + Bsusa = ROL6464same(Bsusa, 2); \ + Bsusa = GET64LOLO(Bsusa, Bsase); \ + Bsesi = GET64HILO(Bsase, Bsiso); \ + Bsosu = GET64HILO(Bsiso, Bsusa); \ + E##sase = XOR128(Bsase, ANDnu128(Bsesi, Bsiso)); \ + XOReq128(Cae, E##sase); \ + E##siso = XOR128(Bsiso, ANDnu128(Bsosu, Bsusa)); \ + XOReq128(Cio, E##siso); \ + E##su = GET64LOLO(XOR128(Bsusa, ANDnu128(Bsase, Bsesi)), Zero); \ + XOReq128(Cua, E##su); \ +\ + Zero = ZERO128(); \ + XOReq128(Cae, GET64HIHI(Cua, Zero)); \ + XOReq128(Cae, GET64LOLO(Zero, Cei)); \ + XOReq128(Cio, GET64HIHI(Cei, Zero)); \ + XOReq128(Cio, GET64LOLO(Zero, Cou)); \ + XOReq128(Cua, GET64HIHI(Cou, Zero)); \ + +// --- Theta Rho Pi Chi Iota +// --- 64-bit lanes mapped to 64-bit and 128-bit words +#define thetaRhoPiChiIota(i, A, E) 
thetaRhoPiChiIotaPrepareTheta(i, A, E) + +const UINT64 KeccakF1600RoundConstants[24] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL }; + +#define copyFromStateAndXor576bits(X, state, input) \ + X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \ + X##ba = X##bae; \ + X##be = GET64HIHI(X##bae, X##bae); \ + Cae = X##bae; \ + X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \ + X##bi = X##bio; \ + X##bo = GET64HIHI(X##bio, X##bio); \ + Cio = X##bio; \ + X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \ + Cua = X##bu; \ + X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \ + X##ga = X##gae; \ + X##buga = GET64LOLO(X##bu, X##ga); \ + X##ge = GET64HIHI(X##gae, X##gae); \ + X##bage = GET64LOLO(X##ba, X##ge); \ + XOReq128(Cae, X##gae); \ + X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \ + X##gi = X##gio; \ + X##begi = GET64LOLO(X##be, X##gi); \ + X##go = GET64HIHI(X##gio, X##gio); \ + X##bigo = GET64LOLO(X##bi, X##go); \ + XOReq128(Cio, X##gio); \ + X##gu = LOAD64(state[ 9]); \ + X##bogu = GET64LOLO(X##bo, X##gu); \ + XOReq64(Cua, X##gu); \ + X##kae = LOAD128(state[10]); \ + X##ka = X##kae; \ + X##ke = GET64HIHI(X##kae, X##kae); \ + XOReq128(Cae, X##kae); \ + X##kio = LOAD128(state[12]); \ + X##ki = X##kio; \ + X##ko = GET64HIHI(X##kio, X##kio); \ + XOReq128(Cio, X##kio); \ + X##kuma = LOAD128(state[14]); \ + XOReq64(Cua, X##kuma); \ + X##me = LOAD64(state[16]); \ + X##kame = 
GET64LOLO(X##ka, X##me); \ + XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \ + X##mio = LOAD128u(state[17]); \ + X##mi = X##mio; \ + X##kemi = GET64LOLO(X##ke, X##mi); \ + X##mo = GET64HIHI(X##mio, X##mio); \ + X##kimo = GET64LOLO(X##ki, X##mo); \ + XOReq128(Cio, X##mio); \ + X##mu = LOAD64(state[19]); \ + X##komu = GET64LOLO(X##ko, X##mu); \ + XOReq64(Cua, X##mu); \ + X##sase = LOAD128(state[20]); \ + XOReq128(Cae, X##sase); \ + X##siso = LOAD128(state[22]); \ + XOReq128(Cio, X##siso); \ + X##su = LOAD64(state[24]); \ + XOReq64(Cua, X##su); \ + +#define copyFromStateAndXor832bits(X, state, input) \ + X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \ + X##ba = X##bae; \ + X##be = GET64HIHI(X##bae, X##bae); \ + Cae = X##bae; \ + X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \ + X##bi = X##bio; \ + X##bo = GET64HIHI(X##bio, X##bio); \ + Cio = X##bio; \ + X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \ + Cua = X##bu; \ + X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \ + X##ga = X##gae; \ + X##buga = GET64LOLO(X##bu, X##ga); \ + X##ge = GET64HIHI(X##gae, X##gae); \ + X##bage = GET64LOLO(X##ba, X##ge); \ + XOReq128(Cae, X##gae); \ + X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \ + X##gi = X##gio; \ + X##begi = GET64LOLO(X##be, X##gi); \ + X##go = GET64HIHI(X##gio, X##gio); \ + X##bigo = GET64LOLO(X##bi, X##go); \ + XOReq128(Cio, X##gio); \ + X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \ + X##bogu = GET64LOLO(X##bo, X##gu); \ + XOReq64(Cua, X##gu); \ + X##kae = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \ + X##ka = X##kae; \ + X##ke = GET64HIHI(X##kae, X##kae); \ + XOReq128(Cae, X##kae); \ + X##kio = XOR128(LOAD128(state[12]), LOAD64(input[12])); \ + X##ki = X##kio; \ + X##ko = GET64HIHI(X##kio, X##kio); \ + XOReq128(Cio, X##kio); \ + X##kuma = LOAD128(state[14]); \ + XOReq64(Cua, X##kuma); \ + X##me = LOAD64(state[16]); \ + X##kame = GET64LOLO(X##ka, X##me); \ + XOReq128(Cae, GET64HIHI(X##kuma, 
X##kame)); \ + X##mio = LOAD128u(state[17]); \ + X##mi = X##mio; \ + X##kemi = GET64LOLO(X##ke, X##mi); \ + X##mo = GET64HIHI(X##mio, X##mio); \ + X##kimo = GET64LOLO(X##ki, X##mo); \ + XOReq128(Cio, X##mio); \ + X##mu = LOAD64(state[19]); \ + X##komu = GET64LOLO(X##ko, X##mu); \ + XOReq64(Cua, X##mu); \ + X##sase = LOAD128(state[20]); \ + XOReq128(Cae, X##sase); \ + X##siso = LOAD128(state[22]); \ + XOReq128(Cio, X##siso); \ + X##su = LOAD64(state[24]); \ + XOReq64(Cua, X##su); \ + +#define copyFromStateAndXor1024bits(X, state, input) \ + X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \ + X##ba = X##bae; \ + X##be = GET64HIHI(X##bae, X##bae); \ + Cae = X##bae; \ + X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \ + X##bi = X##bio; \ + X##bo = GET64HIHI(X##bio, X##bio); \ + Cio = X##bio; \ + X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \ + Cua = X##bu; \ + X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \ + X##ga = X##gae; \ + X##buga = GET64LOLO(X##bu, X##ga); \ + X##ge = GET64HIHI(X##gae, X##gae); \ + X##bage = GET64LOLO(X##ba, X##ge); \ + XOReq128(Cae, X##gae); \ + X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \ + X##gi = X##gio; \ + X##begi = GET64LOLO(X##be, X##gi); \ + X##go = GET64HIHI(X##gio, X##gio); \ + X##bigo = GET64LOLO(X##bi, X##go); \ + XOReq128(Cio, X##gio); \ + X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \ + X##bogu = GET64LOLO(X##bo, X##gu); \ + XOReq64(Cua, X##gu); \ + X##kae = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \ + X##ka = X##kae; \ + X##ke = GET64HIHI(X##kae, X##kae); \ + XOReq128(Cae, X##kae); \ + X##kio = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \ + X##ki = X##kio; \ + X##ko = GET64HIHI(X##kio, X##kio); \ + XOReq128(Cio, X##kio); \ + X##kuma = XOR128(LOAD128(state[14]), LOAD128(input[14])); \ + XOReq64(Cua, X##kuma); \ + X##me = LOAD64(state[16]); \ + X##kame = GET64LOLO(X##ka, X##me); \ + XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \ + X##mio = 
LOAD128u(state[17]); \ + X##mi = X##mio; \ + X##kemi = GET64LOLO(X##ke, X##mi); \ + X##mo = GET64HIHI(X##mio, X##mio); \ + X##kimo = GET64LOLO(X##ki, X##mo); \ + XOReq128(Cio, X##mio); \ + X##mu = LOAD64(state[19]); \ + X##komu = GET64LOLO(X##ko, X##mu); \ + XOReq64(Cua, X##mu); \ + X##sase = LOAD128(state[20]); \ + XOReq128(Cae, X##sase); \ + X##siso = LOAD128(state[22]); \ + XOReq128(Cio, X##siso); \ + X##su = LOAD64(state[24]); \ + XOReq64(Cua, X##su); \ + +#define copyFromStateAndXor1088bits(X, state, input) \ + X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \ + X##ba = X##bae; \ + X##be = GET64HIHI(X##bae, X##bae); \ + Cae = X##bae; \ + X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \ + X##bi = X##bio; \ + X##bo = GET64HIHI(X##bio, X##bio); \ + Cio = X##bio; \ + X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \ + Cua = X##bu; \ + X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \ + X##ga = X##gae; \ + X##buga = GET64LOLO(X##bu, X##ga); \ + X##ge = GET64HIHI(X##gae, X##gae); \ + X##bage = GET64LOLO(X##ba, X##ge); \ + XOReq128(Cae, X##gae); \ + X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \ + X##gi = X##gio; \ + X##begi = GET64LOLO(X##be, X##gi); \ + X##go = GET64HIHI(X##gio, X##gio); \ + X##bigo = GET64LOLO(X##bi, X##go); \ + XOReq128(Cio, X##gio); \ + X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \ + X##bogu = GET64LOLO(X##bo, X##gu); \ + XOReq64(Cua, X##gu); \ + X##kae = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \ + X##ka = X##kae; \ + X##ke = GET64HIHI(X##kae, X##kae); \ + XOReq128(Cae, X##kae); \ + X##kio = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \ + X##ki = X##kio; \ + X##ko = GET64HIHI(X##kio, X##kio); \ + XOReq128(Cio, X##kio); \ + X##kuma = XOR128(LOAD128(state[14]), LOAD128(input[14])); \ + XOReq64(Cua, X##kuma); \ + X##me = XOR64(LOAD64(state[16]), LOAD64(input[16])); \ + X##kame = GET64LOLO(X##ka, X##me); \ + XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \ + X##mio = 
LOAD128u(state[17]); \ + X##mi = X##mio; \ + X##kemi = GET64LOLO(X##ke, X##mi); \ + X##mo = GET64HIHI(X##mio, X##mio); \ + X##kimo = GET64LOLO(X##ki, X##mo); \ + XOReq128(Cio, X##mio); \ + X##mu = LOAD64(state[19]); \ + X##komu = GET64LOLO(X##ko, X##mu); \ + XOReq64(Cua, X##mu); \ + X##sase = LOAD128(state[20]); \ + XOReq128(Cae, X##sase); \ + X##siso = LOAD128(state[22]); \ + XOReq128(Cio, X##siso); \ + X##su = LOAD64(state[24]); \ + XOReq64(Cua, X##su); \ + +#define copyFromStateAndXor1152bits(X, state, input) \ + X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \ + X##ba = X##bae; \ + X##be = GET64HIHI(X##bae, X##bae); \ + Cae = X##bae; \ + X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \ + X##bi = X##bio; \ + X##bo = GET64HIHI(X##bio, X##bio); \ + Cio = X##bio; \ + X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \ + Cua = X##bu; \ + X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \ + X##ga = X##gae; \ + X##buga = GET64LOLO(X##bu, X##ga); \ + X##ge = GET64HIHI(X##gae, X##gae); \ + X##bage = GET64LOLO(X##ba, X##ge); \ + XOReq128(Cae, X##gae); \ + X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \ + X##gi = X##gio; \ + X##begi = GET64LOLO(X##be, X##gi); \ + X##go = GET64HIHI(X##gio, X##gio); \ + X##bigo = GET64LOLO(X##bi, X##go); \ + XOReq128(Cio, X##gio); \ + X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \ + X##bogu = GET64LOLO(X##bo, X##gu); \ + XOReq64(Cua, X##gu); \ + X##kae = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \ + X##ka = X##kae; \ + X##ke = GET64HIHI(X##kae, X##kae); \ + XOReq128(Cae, X##kae); \ + X##kio = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \ + X##ki = X##kio; \ + X##ko = GET64HIHI(X##kio, X##kio); \ + XOReq128(Cio, X##kio); \ + X##kuma = XOR128(LOAD128(state[14]), LOAD128(input[14])); \ + XOReq64(Cua, X##kuma); \ + X##me = XOR64(LOAD64(state[16]), LOAD64(input[16])); \ + X##kame = GET64LOLO(X##ka, X##me); \ + XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \ + X##mio = 
XOR128(LOAD128u(state[17]), LOAD64(input[17])); \ + X##mi = X##mio; \ + X##kemi = GET64LOLO(X##ke, X##mi); \ + X##mo = GET64HIHI(X##mio, X##mio); \ + X##kimo = GET64LOLO(X##ki, X##mo); \ + XOReq128(Cio, X##mio); \ + X##mu = LOAD64(state[19]); \ + X##komu = GET64LOLO(X##ko, X##mu); \ + XOReq64(Cua, X##mu); \ + X##sase = LOAD128(state[20]); \ + XOReq128(Cae, X##sase); \ + X##siso = LOAD128(state[22]); \ + XOReq128(Cio, X##siso); \ + X##su = LOAD64(state[24]); \ + XOReq64(Cua, X##su); \ + +#define copyFromStateAndXor1344bits(X, state, input) \ + X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \ + X##ba = X##bae; \ + X##be = GET64HIHI(X##bae, X##bae); \ + Cae = X##bae; \ + X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \ + X##bi = X##bio; \ + X##bo = GET64HIHI(X##bio, X##bio); \ + Cio = X##bio; \ + X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \ + Cua = X##bu; \ + X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \ + X##ga = X##gae; \ + X##buga = GET64LOLO(X##bu, X##ga); \ + X##ge = GET64HIHI(X##gae, X##gae); \ + X##bage = GET64LOLO(X##ba, X##ge); \ + XOReq128(Cae, X##gae); \ + X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \ + X##gi = X##gio; \ + X##begi = GET64LOLO(X##be, X##gi); \ + X##go = GET64HIHI(X##gio, X##gio); \ + X##bigo = GET64LOLO(X##bi, X##go); \ + XOReq128(Cio, X##gio); \ + X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \ + X##bogu = GET64LOLO(X##bo, X##gu); \ + XOReq64(Cua, X##gu); \ + X##kae = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \ + X##ka = X##kae; \ + X##ke = GET64HIHI(X##kae, X##kae); \ + XOReq128(Cae, X##kae); \ + X##kio = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \ + X##ki = X##kio; \ + X##ko = GET64HIHI(X##kio, X##kio); \ + XOReq128(Cio, X##kio); \ + X##kuma = XOR128(LOAD128(state[14]), LOAD128(input[14])); \ + XOReq64(Cua, X##kuma); \ + X##me = XOR64(LOAD64(state[16]), LOAD64(input[16])); \ + X##kame = GET64LOLO(X##ka, X##me); \ + XOReq128(Cae, GET64HIHI(X##kuma, 
X##kame)); \ + X##mio = XOR128(LOAD128u(state[17]), LOAD128u(input[17])); \ + X##mi = X##mio; \ + X##kemi = GET64LOLO(X##ke, X##mi); \ + X##mo = GET64HIHI(X##mio, X##mio); \ + X##kimo = GET64LOLO(X##ki, X##mo); \ + XOReq128(Cio, X##mio); \ + X##mu = XOR64(LOAD64(state[19]), LOAD64(input[19])); \ + X##komu = GET64LOLO(X##ko, X##mu); \ + XOReq64(Cua, X##mu); \ + X##sase = XOR128(LOAD128(state[20]), LOAD64(input[20])); \ + XOReq128(Cae, X##sase); \ + X##siso = LOAD128(state[22]); \ + XOReq128(Cio, X##siso); \ + X##su = LOAD64(state[24]); \ + XOReq64(Cua, X##su); \ + +#define copyFromState(X, state) \ + X##bae = LOAD128(state[ 0]); \ + X##ba = X##bae; \ + X##be = GET64HIHI(X##bae, X##bae); \ + Cae = X##bae; \ + X##bio = LOAD128(state[ 2]); \ + X##bi = X##bio; \ + X##bo = GET64HIHI(X##bio, X##bio); \ + Cio = X##bio; \ + X##bu = LOAD64(state[ 4]); \ + Cua = X##bu; \ + X##gae = LOAD128u(state[ 5]); \ + X##ga = X##gae; \ + X##buga = GET64LOLO(X##bu, X##ga); \ + X##ge = GET64HIHI(X##gae, X##gae); \ + X##bage = GET64LOLO(X##ba, X##ge); \ + XOReq128(Cae, X##gae); \ + X##gio = LOAD128u(state[ 7]); \ + X##gi = X##gio; \ + X##begi = GET64LOLO(X##be, X##gi); \ + X##go = GET64HIHI(X##gio, X##gio); \ + X##bigo = GET64LOLO(X##bi, X##go); \ + XOReq128(Cio, X##gio); \ + X##gu = LOAD64(state[ 9]); \ + X##bogu = GET64LOLO(X##bo, X##gu); \ + XOReq64(Cua, X##gu); \ + X##kae = LOAD128(state[10]); \ + X##ka = X##kae; \ + X##ke = GET64HIHI(X##kae, X##kae); \ + XOReq128(Cae, X##kae); \ + X##kio = LOAD128(state[12]); \ + X##ki = X##kio; \ + X##ko = GET64HIHI(X##kio, X##kio); \ + XOReq128(Cio, X##kio); \ + X##kuma = LOAD128(state[14]); \ + XOReq64(Cua, X##kuma); \ + X##me = LOAD64(state[16]); \ + X##kame = GET64LOLO(X##ka, X##me); \ + XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \ + X##mio = LOAD128u(state[17]); \ + X##mi = X##mio; \ + X##kemi = GET64LOLO(X##ke, X##mi); \ + X##mo = GET64HIHI(X##mio, X##mio); \ + X##kimo = GET64LOLO(X##ki, X##mo); \ + XOReq128(Cio, X##mio); \ + X##mu = 
LOAD64(state[19]); \ + X##komu = GET64LOLO(X##ko, X##mu); \ + XOReq64(Cua, X##mu); \ + X##sase = LOAD128(state[20]); \ + XOReq128(Cae, X##sase); \ + X##siso = LOAD128(state[22]); \ + XOReq128(Cio, X##siso); \ + X##su = LOAD64(state[24]); \ + XOReq64(Cua, X##su); \ + +#define copyToState(state, X) \ + STORE64(state[ 0], X##bage); \ + STORE64(state[ 1], X##begi); \ + STORE64(state[ 2], X##bigo); \ + STORE64(state[ 3], X##bogu); \ + STORE128(state[ 4], X##buga); \ + STORE64(state[ 6], COPY64HI2LO(X##bage)); \ + STORE64(state[ 7], COPY64HI2LO(X##begi)); \ + STORE64(state[ 8], COPY64HI2LO(X##bigo)); \ + STORE64(state[ 9], COPY64HI2LO(X##bogu)); \ + STORE64(state[10], X##kame); \ + STORE64(state[11], X##kemi); \ + STORE64(state[12], X##kimo); \ + STORE64(state[13], X##komu); \ + STORE128(state[14], X##kuma); \ + STORE64(state[16], COPY64HI2LO(X##kame)); \ + STORE64(state[17], COPY64HI2LO(X##kemi)); \ + STORE64(state[18], COPY64HI2LO(X##kimo)); \ + STORE64(state[19], COPY64HI2LO(X##komu)); \ + STORE128(state[20], X##sase); \ + STORE128(state[22], X##siso); \ + STORE64(state[24], X##su); \ + +#define copyStateVariables(X, Y) \ + X##bage = Y##bage; \ + X##begi = Y##begi; \ + X##bigo = Y##bigo; \ + X##bogu = Y##bogu; \ + X##buga = Y##buga; \ + X##kame = Y##kame; \ + X##kemi = Y##kemi; \ + X##kimo = Y##kimo; \ + X##komu = Y##komu; \ + X##kuma = Y##kuma; \ + X##sase = Y##sase; \ + X##siso = Y##siso; \ + X##su = Y##su; \ + diff --git a/c_src/KeccakNISTInterface.c b/c_src/KeccakNISTInterface.c new file mode 100755 index 0000000..5d92c74 --- /dev/null +++ b/c_src/KeccakNISTInterface.c @@ -0,0 +1,81 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by the designers, +hereby denoted as "the implementer". 
+ +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include +#include "KeccakNISTInterface.h" +#include "KeccakF-1600-interface.h" + +HashReturn Init(hashState *state, int hashbitlen) +{ + switch(hashbitlen) { + case 0: // Default parameters, arbitrary length output + InitSponge((spongeState*)state, 1024, 576); + break; + case 224: + InitSponge((spongeState*)state, 1152, 448); + break; + case 256: + InitSponge((spongeState*)state, 1088, 512); + break; + case 384: + InitSponge((spongeState*)state, 832, 768); + break; + case 512: + InitSponge((spongeState*)state, 576, 1024); + break; + default: + return BAD_HASHLEN; + } + state->fixedOutputLength = hashbitlen; + return SUCCESS; +} + +HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen) +{ + if ((databitlen % 8) == 0) + return Absorb((spongeState*)state, data, databitlen); + else { + HashReturn ret = Absorb((spongeState*)state, data, databitlen - (databitlen % 8)); + if (ret == SUCCESS) { + unsigned char lastByte; + // Align the last partial byte to the least significant bits + lastByte = data[databitlen/8] >> (8 - (databitlen % 8)); + return Absorb((spongeState*)state, &lastByte, databitlen % 8); + } + else + return ret; + } +} + +HashReturn Final(hashState *state, BitSequence *hashval) +{ + return Squeeze(state, hashval, state->fixedOutputLength); +} + +HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval) +{ + hashState state; + HashReturn result; + + if ((hashbitlen != 224) && (hashbitlen != 256) && (hashbitlen != 384) && (hashbitlen != 512)) + return BAD_HASHLEN; // Only the four fixed output lengths available through this API + result = Init(&state, hashbitlen); + if (result != SUCCESS) + return result; + result = Update(&state, data, databitlen); + if (result != SUCCESS) + return 
result; + result = Final(&state, hashval); + return result; +} + diff --git a/c_src/KeccakNISTInterface.h b/c_src/KeccakNISTInterface.h new file mode 100755 index 0000000..c6987d4 --- /dev/null +++ b/c_src/KeccakNISTInterface.h @@ -0,0 +1,70 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by the designers, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _KeccakNISTInterface_h_ +#define _KeccakNISTInterface_h_ + +#include "KeccakSponge.h" + +typedef unsigned char BitSequence; +typedef unsigned long long DataLength; +typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2 } HashReturn; + +typedef spongeState hashState; + +/** + * Function to initialize the state of the Keccak[r, c] sponge function. + * The rate r and capacity c values are determined from @a hashbitlen. + * @param state Pointer to the state of the sponge function to be initialized. + * @param hashbitlen The desired number of output bits, + * or 0 for Keccak[] with default parameters + * and arbitrarily-long output. + * @pre The value of hashbitlen must be one of 0, 224, 256, 384 and 512. + * @return SUCCESS if successful, BAD_HASHLEN if the value of hashbitlen is incorrect. + */ +HashReturn Init(hashState *state, int hashbitlen); +/** + * Function to give input data for the sponge function to absorb. + * @param state Pointer to the state of the sponge function initialized by Init(). + * @param data Pointer to the input data. + * When @a databitLen is not a multiple of 8, the last bits of data must be + * in the most significant bits of the last byte. 
+ * @param databitLen The number of input bits provided in the input data. + * @pre In the previous call to Absorb(), databitLen was a multiple of 8. + * @return SUCCESS if successful, FAIL otherwise. + */ +HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen); +/** + * Function to squeeze output data from the sponge function. + * If @a hashbitlen was not 0 in the call to Init(), the number of output bits is equal to @a hashbitlen. + * If @a hashbitlen was 0 in the call to Init(), the output bits must be extracted using the Squeeze() function. + * @param state Pointer to the state of the sponge function initialized by Init(). + * @param hashval Pointer to the buffer where to store the output data. + * @return SUCCESS if successful, FAIL otherwise. + */ +HashReturn Final(hashState *state, BitSequence *hashval); +/** + * Function to compute a hash using the Keccak[r, c] sponge function. + * The rate r and capacity c values are determined from @a hashbitlen. + * @param hashbitlen The desired number of output bits. + * @param data Pointer to the input data. + * When @a databitLen is not a multiple of 8, the last bits of data must be + * in the most significant bits of the last byte. + * @param databitLen The number of input bits provided in the input data. + * @param hashval Pointer to the buffer where to store the output data. + * @pre The value of hashbitlen must be one of 224, 256, 384 and 512. + * @return SUCCESS if successful, BAD_HASHLEN if the value of hashbitlen is incorrect. + */ +HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval); + +#endif diff --git a/c_src/KeccakSponge.c b/c_src/KeccakSponge.c new file mode 100755 index 0000000..5939ba4 --- /dev/null +++ b/c_src/KeccakSponge.c @@ -0,0 +1,266 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. 
For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by the designers, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include +#include "KeccakSponge.h" +#include "KeccakF-1600-interface.h" +#ifdef KeccakReference +#include "displayIntermediateValues.h" +#endif + +int InitSponge(spongeState *state, unsigned int rate, unsigned int capacity) +{ + if (rate+capacity != 1600) + return 1; + if ((rate <= 0) || (rate >= 1600) || ((rate % 64) != 0)) + return 1; + KeccakInitialize(); + state->rate = rate; + state->capacity = capacity; + state->fixedOutputLength = 0; + KeccakInitializeState(state->state); + memset(state->dataQueue, 0, KeccakMaximumRateInBytes); + state->bitsInQueue = 0; + state->squeezing = 0; + state->bitsAvailableForSqueezing = 0; + + return 0; +} + +void AbsorbQueue(spongeState *state) +{ + // state->bitsInQueue is assumed to be equal to state->rate + #ifdef KeccakReference + displayBytes(1, "Block to be absorbed", state->dataQueue, state->rate/8); + #endif +#ifdef ProvideFast576 + if (state->rate == 576) + KeccakAbsorb576bits(state->state, state->dataQueue); + else +#endif +#ifdef ProvideFast832 + if (state->rate == 832) + KeccakAbsorb832bits(state->state, state->dataQueue); + else +#endif +#ifdef ProvideFast1024 + if (state->rate == 1024) + KeccakAbsorb1024bits(state->state, state->dataQueue); + else +#endif +#ifdef ProvideFast1088 + if (state->rate == 1088) + KeccakAbsorb1088bits(state->state, state->dataQueue); + else +#endif +#ifdef ProvideFast1152 + if (state->rate == 1152) + KeccakAbsorb1152bits(state->state, state->dataQueue); + else +#endif +#ifdef ProvideFast1344 + if (state->rate == 1344) + KeccakAbsorb1344bits(state->state, state->dataQueue); + else +#endif + KeccakAbsorb(state->state, 
state->dataQueue, state->rate/64); + state->bitsInQueue = 0; +} + +int Absorb(spongeState *state, const unsigned char *data, unsigned long long databitlen) +{ + unsigned long long i, j, wholeBlocks; + unsigned int partialBlock, partialByte; + const unsigned char *curData; + + if ((state->bitsInQueue % 8) != 0) + return 1; // Only the last call may contain a partial byte + if (state->squeezing) + return 1; // Too late for additional input + + i = 0; + while(i < databitlen) { + if ((state->bitsInQueue == 0) && (databitlen >= state->rate) && (i <= (databitlen-state->rate))) { + wholeBlocks = (databitlen-i)/state->rate; + curData = data+i/8; +#ifdef ProvideFast576 + if (state->rate == 576) { + for(j=0; jrate/8); + #endif + KeccakAbsorb576bits(state->state, curData); + } + } + else +#endif +#ifdef ProvideFast832 + if (state->rate == 832) { + for(j=0; jrate/8); + #endif + KeccakAbsorb832bits(state->state, curData); + } + } + else +#endif +#ifdef ProvideFast1024 + if (state->rate == 1024) { + for(j=0; jrate/8); + #endif + KeccakAbsorb1024bits(state->state, curData); + } + } + else +#endif +#ifdef ProvideFast1088 + if (state->rate == 1088) { + for(j=0; jrate/8); + #endif + KeccakAbsorb1088bits(state->state, curData); + } + } + else +#endif +#ifdef ProvideFast1152 + if (state->rate == 1152) { + for(j=0; jrate/8); + #endif + KeccakAbsorb1152bits(state->state, curData); + } + } + else +#endif +#ifdef ProvideFast1344 + if (state->rate == 1344) { + for(j=0; jrate/8); + #endif + KeccakAbsorb1344bits(state->state, curData); + } + } + else +#endif + { + for(j=0; jrate/8) { + #ifdef KeccakReference + displayBytes(1, "Block to be absorbed", curData, state->rate/8); + #endif + KeccakAbsorb(state->state, curData, state->rate/64); + } + } + i += wholeBlocks*state->rate; + } + else { + partialBlock = (unsigned int)(databitlen - i); + if (partialBlock+state->bitsInQueue > state->rate) + partialBlock = state->rate-state->bitsInQueue; + partialByte = partialBlock % 8; + partialBlock -= 
partialByte; + memcpy(state->dataQueue+state->bitsInQueue/8, data+i/8, partialBlock/8); + state->bitsInQueue += partialBlock; + i += partialBlock; + if (state->bitsInQueue == state->rate) + AbsorbQueue(state); + if (partialByte > 0) { + unsigned char mask = (1 << partialByte)-1; + state->dataQueue[state->bitsInQueue/8] = data[i/8] & mask; + state->bitsInQueue += partialByte; + i += partialByte; + } + } + } + return 0; +} + +void PadAndSwitchToSqueezingPhase(spongeState *state) +{ + // Note: the bits are numbered from 0=LSB to 7=MSB + if (state->bitsInQueue + 1 == state->rate) { + state->dataQueue[state->bitsInQueue/8 ] |= 1 << (state->bitsInQueue % 8); + AbsorbQueue(state); + memset(state->dataQueue, 0, state->rate/8); + } + else { + memset(state->dataQueue + (state->bitsInQueue+7)/8, 0, state->rate/8 - (state->bitsInQueue+7)/8); + state->dataQueue[state->bitsInQueue/8 ] |= 1 << (state->bitsInQueue % 8); + } + state->dataQueue[(state->rate-1)/8] |= 1 << ((state->rate-1) % 8); + AbsorbQueue(state); + + #ifdef KeccakReference + displayText(1, "--- Switching to squeezing phase ---"); + #endif +#ifdef ProvideFast1024 + if (state->rate == 1024) { + KeccakExtract1024bits(state->state, state->dataQueue); + state->bitsAvailableForSqueezing = 1024; + } + else +#endif + { + KeccakExtract(state->state, state->dataQueue, state->rate/64); + state->bitsAvailableForSqueezing = state->rate; + } + #ifdef KeccakReference + displayBytes(1, "Block available for squeezing", state->dataQueue, state->bitsAvailableForSqueezing/8); + #endif + state->squeezing = 1; +} + +int Squeeze(spongeState *state, unsigned char *output, unsigned long long outputLength) +{ + unsigned long long i; + unsigned int partialBlock; + + if (!state->squeezing) + PadAndSwitchToSqueezingPhase(state); + if ((outputLength % 8) != 0) + return 1; // Only multiple of 8 bits are allowed, truncation can be done at user level + + i = 0; + while(i < outputLength) { + if (state->bitsAvailableForSqueezing == 0) { + 
KeccakPermutation(state->state); +#ifdef ProvideFast1024 + if (state->rate == 1024) { + KeccakExtract1024bits(state->state, state->dataQueue); + state->bitsAvailableForSqueezing = 1024; + } + else +#endif + { + KeccakExtract(state->state, state->dataQueue, state->rate/64); + state->bitsAvailableForSqueezing = state->rate; + } + #ifdef KeccakReference + displayBytes(1, "Block available for squeezing", state->dataQueue, state->bitsAvailableForSqueezing/8); + #endif + } + partialBlock = state->bitsAvailableForSqueezing; + if ((unsigned long long)partialBlock > outputLength - i) + partialBlock = (unsigned int)(outputLength - i); + memcpy(output+i/8, state->dataQueue+(state->rate-state->bitsAvailableForSqueezing)/8, partialBlock/8); + state->bitsAvailableForSqueezing -= partialBlock; + i += partialBlock; + } + return 0; +} diff --git a/c_src/KeccakSponge.h b/c_src/KeccakSponge.h new file mode 100755 index 0000000..df3d797 --- /dev/null +++ b/c_src/KeccakSponge.h @@ -0,0 +1,76 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by the designers, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _KeccakSponge_h_ +#define _KeccakSponge_h_ + +#define KeccakPermutationSize 1600 +#define KeccakPermutationSizeInBytes (KeccakPermutationSize/8) +#define KeccakMaximumRate 1536 +#define KeccakMaximumRateInBytes (KeccakMaximumRate/8) + +#if defined(__GNUC__) +#define ALIGN __attribute__ ((aligned(32))) +#elif defined(_MSC_VER) +#define ALIGN __declspec(align(32)) +#else +#define ALIGN +#endif + +ALIGN typedef struct spongeStateStruct { + ALIGN unsigned char state[KeccakPermutationSizeInBytes]; + ALIGN unsigned char dataQueue[KeccakMaximumRateInBytes]; + unsigned int rate; + unsigned int capacity; + unsigned int bitsInQueue; + unsigned int fixedOutputLength; + int squeezing; + unsigned int bitsAvailableForSqueezing; +} spongeState; + +/** + * Function to initialize the state of the Keccak[r, c] sponge function. + * The sponge function is set to the absorbing phase. + * @param state Pointer to the state of the sponge function to be initialized. + * @param rate The value of the rate r. + * @param capacity The value of the capacity c. + * @pre One must have r+c=1600 and the rate a multiple of 64 bits in this implementation. + * @return Zero if successful, 1 otherwise. + */ +int InitSponge(spongeState *state, unsigned int rate, unsigned int capacity); +/** + * Function to give input data for the sponge function to absorb. + * @param state Pointer to the state of the sponge function initialized by InitSponge(). + * @param data Pointer to the input data. + * When @a databitLen is not a multiple of 8, the last bits of data must be + * in the least significant bits of the last byte. + * @param databitLen The number of input bits provided in the input data. + * @pre In the previous call to Absorb(), databitLen was a multiple of 8. + * @pre The sponge function must be in the absorbing phase, + * i.e., Squeeze() must not have been called before. + * @return Zero if successful, 1 otherwise. 
+ */ +int Absorb(spongeState *state, const unsigned char *data, unsigned long long databitlen); +/** + * Function to squeeze output data from the sponge function. + * If the sponge function was in the absorbing phase, this function + * switches it to the squeezing phase. + * @param state Pointer to the state of the sponge function initialized by InitSponge(). + * @param output Pointer to the buffer where to store the output data. + * @param outputLength The number of output bits desired. + * It must be a multiple of 8. + * @return Zero if successful, 1 otherwise. + */ +int Squeeze(spongeState *state, unsigned char *output, unsigned long long outputLength); + +#endif diff --git a/c_src/brg_endian.h b/c_src/brg_endian.h new file mode 100755 index 0000000..7226eb3 --- /dev/null +++ b/c_src/brg_endian.h @@ -0,0 +1,142 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The redistribution and use of this software (with or without changes) + is allowed without the payment of fees or royalties provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. 
+ --------------------------------------------------------------------------- + Issue Date: 20/12/2007 + Changes for ARM 9/9/2010 +*/ + +#ifndef _BRG_ENDIAN_H +#define _BRG_ENDIAN_H + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +#if 0 +/* Include files where endian defines and byteswap functions may reside */ +#if defined( __sun ) +# include +#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) +# include +#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ + defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) +# include +#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) +# if !defined( __MINGW32__ ) && !defined( _AIX ) +# include +# if !defined( __BEOS__ ) +# include +# endif +# endif +#endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) +# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) +# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( _BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( _LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( 
__BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) +# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) +# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + +#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ + defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ + defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ + defined( vax ) || defined( vms ) || defined( VMS ) || \ + defined( __VMS ) || defined( _M_X64 ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + +#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ + defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ + defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ + defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ + defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ + defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ + defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + +#elif defined(__arm__) +# 
ifdef __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# else +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif 1 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#else +# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order +#endif + +#endif + +#endif diff --git a/c_src/displayIntermediateValues.c b/c_src/displayIntermediateValues.c new file mode 100755 index 0000000..f3bf9e2 --- /dev/null +++ b/c_src/displayIntermediateValues.c @@ -0,0 +1,117 @@ +/* +The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, +Michaël Peeters and Gilles Van Assche. For more information, feedback or +questions, please refer to our website: http://keccak.noekeon.org/ + +Implementation by the designers, +hereby denoted as "the implementer". + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include +#include "displayIntermediateValues.h" +#include "KeccakNISTInterface.h" + +FILE *intermediateValueFile = 0; +int displayLevel = 0; + +void displaySetIntermediateValueFile(FILE *f) +{ + intermediateValueFile = f; +} + +void displaySetLevel(int level) +{ + displayLevel = level; +} + +void displayBytes(int level, const char *text, const unsigned char *bytes, unsigned int size) +{ + unsigned int i; + + if ((intermediateValueFile) && (level <= displayLevel)) { + fprintf(intermediateValueFile, "%s:\n", text); + for(i=0; i> iBit) & 0x01) != 0); + } + fprintf(intermediateValueFile, "\n"); + fprintf(intermediateValueFile, "\n"); + } +} + +void displayStateAsBytes(int level, const char *text, const unsigned char *state) +{ + displayBytes(level, text, state, KeccakPermutationSizeInBytes); +} + +void displayStateAs32bitWords(int level, const char *text, const unsigned int *state) +{ + unsigned int i; + + if ((intermediateValueFile) && (level <= displayLevel)) { + fprintf(intermediateValueFile, "%s:\n", text); + for(i=0; i> 32)); + fprintf(intermediateValueFile, "%08X", (unsigned int)(state[i] & 0xFFFFFFFFULL)); + if ((i%5) == 4) + fprintf(intermediateValueFile, "\n"); + else + fprintf(intermediateValueFile, " "); + } + } +} + +void displayRoundNumber(int level, unsigned int i) +{ + if ((intermediateValueFile) && (level <= displayLevel)) { + fprintf(intermediateValueFile, "\n"); + fprintf(intermediateValueFile, "--- Round %d ---\n", i); + fprintf(intermediateValueFile, "\n"); + } +} + +void displayText(int level, const char *text) +{ + if ((intermediateValueFile) && (level <= displayLevel)) { + fprintf(intermediateValueFile, text); + fprintf(intermediateValueFile, "\n"); + fprintf(intermediateValueFile, "\n"); + } +} diff --git a/c_src/displayIntermediateValues.h b/c_src/displayIntermediateValues.h new file mode 100755 index 0000000..1d6c6c8 --- /dev/null +++ b/c_src/displayIntermediateValues.h @@ -0,0 +1,29 
@@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _displayIntermediateValues_h_
+#define _displayIntermediateValues_h_
+
+#include <stdio.h>
+
+void displaySetIntermediateValueFile(FILE *f);
+void displaySetLevel(int level);
+void displayBytes(int level, const char *text, const unsigned char *bytes, unsigned int size);
+void displayBits(int level, const char *text, const unsigned char *data, unsigned int size, int MSBfirst);
+void displayStateAsBytes(int level, const char *text, const unsigned char *state);
+void displayStateAs32bitWords(int level, const char *text, const unsigned int *state);
+void displayStateAs64bitWords(int level, const char *text, const unsigned long long int *state);
+void displayRoundNumber(int level, unsigned int i);
+void displayText(int level, const char *text);
+
+#endif
diff --git a/c_src/sha3_nif.c b/c_src/sha3_nif.c
new file mode 100644
index 0000000..d485250
--- /dev/null
+++ b/c_src/sha3_nif.c
@@ -0,0 +1,182 @@
+/*
+ * Erlang NIF bindings for the Keccak reference implementation
+ * (Init/Update/Final/Hash declared in KeccakNISTInterface.h).
+ *
+ * Invalid arguments raise badarg in the calling Erlang process. Contexts
+ * are never modified in place: hash_update/2 returns a fresh resource and
+ * hash_final/1 finalizes a private copy of the state.
+ */
+#include "erl_nif.h"
+#include "KeccakNISTInterface.h"
+
+typedef struct nif_hash_context nif_hash_context;
+
+/* Payload of a sha3_resource: digest length in bits and the Keccak state. */
+struct nif_hash_context {
+    int bitlen;
+    hashState state;
+};
+
+static int is_valid_bitlen(int bitlen);
+static void sha3_resource_cleanup(ErlNifEnv* env, void* arg);
+static ERL_NIF_TERM nif_hash_init(ErlNifEnv* env, int argc,
+        const ERL_NIF_TERM argv[]);
+static ERL_NIF_TERM nif_hash_update(ErlNifEnv* env, int argc,
+        const ERL_NIF_TERM argv[]);
+static ERL_NIF_TERM nif_hash_final(ErlNifEnv* env, int argc,
+        const ERL_NIF_TERM argv[]);
+static ERL_NIF_TERM nif_hash(ErlNifEnv* env, int argc,
+        const ERL_NIF_TERM argv[]);
+
+static ErlNifFunc nif_funcs[] =
+{
+    {"hash_init", 1, nif_hash_init},
+    {"hash_update", 2, nif_hash_update},
+    {"hash_final", 1, nif_hash_final},
+    {"hash", 2, nif_hash}
+};
+
+static ErlNifResourceType *sha3_resource_type;
+
+/* Returns nonzero when bitlen is one of the supported digest lengths. */
+static int
+is_valid_bitlen(int bitlen)
+{
+    return bitlen == 224 || bitlen == 256 || bitlen == 384 || bitlen == 512;
+}
+
+/* Resource destructor: a context holds nothing that needs releasing. */
+static void
+sha3_resource_cleanup(ErlNifEnv* env, void* arg)
+{
+    /* do nothing */
+}
+
+/*
+ * hash_init(BitLen) -> Context
+ * Creates a context resource for a BitLen-bit digest.
+ * Raises badarg unless BitLen is the integer 224, 256, 384 or 512.
+ */
+static ERL_NIF_TERM
+nif_hash_init(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+{
+    ERL_NIF_TERM ctxt_term;
+    nif_hash_context *ctxt;
+    int bitlen;
+
+    if (!enif_get_int(env, argv[0], &bitlen) || !is_valid_bitlen(bitlen))
+        return enif_make_badarg(env);
+
+    ctxt = enif_alloc_resource(sha3_resource_type, sizeof(nif_hash_context));
+    ctxt->bitlen = bitlen;
+    Init(&ctxt->state, bitlen);
+    ctxt_term = enif_make_resource(env, ctxt);
+    /* The term keeps its own reference; drop ours so GC can free it. */
+    enif_release_resource(ctxt);
+
+    return ctxt_term;
+}
+
+/*
+ * hash_update(Context, Binary) -> NewContext
+ * Absorbs Binary into a copy of Context's state and returns that copy as a
+ * new resource, leaving Context itself untouched.
+ * Raises badarg if Context is not a sha3 resource or Binary is not a binary.
+ */
+static ERL_NIF_TERM
+nif_hash_update(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+{
+    ERL_NIF_TERM next_term;
+    ErlNifBinary src_bin;
+    nif_hash_context *ctxt, *next;
+    hashState state;
+
+    if (!enif_get_resource(env, argv[0], sha3_resource_type, (void **)&ctxt) ||
+            !enif_inspect_binary(env, argv[1], &src_bin))
+        return enif_make_badarg(env);
+
+    state = ctxt->state;
+    Update(&state, src_bin.data, src_bin.size * 8);
+    next = enif_alloc_resource(sha3_resource_type, sizeof(nif_hash_context));
+    next->bitlen = ctxt->bitlen;
+    next->state = state;
+    next_term = enif_make_resource(env, next);
+    enif_release_resource(next);
+
+    return next_term;
+}
+
+/*
+ * hash_final(Context) -> Digest
+ * Finalizes a copy of Context's state, so Context stays usable, and returns
+ * the digest as a binary of bitlen / 8 bytes.
+ * Raises badarg if Context is not a sha3 resource; a failed digest
+ * allocation is reported the same way.
+ */
+static ERL_NIF_TERM
+nif_hash_final(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+{
+    ErlNifBinary digest_bin;
+    nif_hash_context *ctxt;
+    hashState state;
+
+    if (!enif_get_resource(env, argv[0], sha3_resource_type, (void **)&ctxt))
+        return enif_make_badarg(env);
+
+    if (!enif_alloc_binary(ctxt->bitlen / 8, &digest_bin))
+        return enif_make_badarg(env);
+
+    state = ctxt->state;
+    Final(&state, digest_bin.data);
+
+    /* enif_make_binary takes ownership of digest_bin; no release needed. */
+    return enif_make_binary(env, &digest_bin);
+}
+
+/*
+ * hash(BitLen, Binary) -> Digest
+ * One-shot digest of Binary.
+ * Raises badarg on an unsupported BitLen or a non-binary second argument;
+ * a failed digest allocation is reported the same way.
+ */
+static ERL_NIF_TERM
+nif_hash(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+{
+    ErlNifBinary src_bin, digest_bin;
+    int bitlen;
+
+    if (!enif_get_int(env, argv[0], &bitlen) || !is_valid_bitlen(bitlen) ||
+            !enif_inspect_binary(env, argv[1], &src_bin))
+        return enif_make_badarg(env);
+
+    if (!enif_alloc_binary(bitlen / 8, &digest_bin))
+        return enif_make_badarg(env);
+
+    Hash(bitlen, src_bin.data, src_bin.size * 8, digest_bin.data);
+
+    /* enif_make_binary takes ownership of digest_bin; no release needed. */
+    return enif_make_binary(env, &digest_bin);
+}
+
+/* Registers the sha3_resource type when the NIF library is loaded. */
+static int
+on_load(ErlNifEnv* env, void** priv_data, ERL_NIF_TERM load_info)
+{
+    ErlNifResourceFlags flags = ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER;
+
+    sha3_resource_type = enif_open_resource_type(env, NULL, "sha3_resource",
+            &sha3_resource_cleanup, flags, NULL);
+    if (sha3_resource_type == NULL)
+        return -1;
+    else
+        return 0;
+}
+
+static void
+on_unload(ErlNifEnv* env, void* priv_data)
+{
+    /* do nothing */
+}
+
+ERL_NIF_INIT(sha3, nif_funcs, &on_load, NULL, NULL, &on_unload);
+
diff --git a/doc/overview.edoc b/doc/overview.edoc
new file mode 100644
index 0000000..41de035
--- /dev/null
+++ b/doc/overview.edoc
@@ -0,0 +1,8 @@
+@title SHA-3 for Erlang
+@author SUZUKI Tetsuya
+@copyright 2012- SUZUKI Tetsuya
+@version 0.1.0
+@reference Wikipedia: SHA-3
+@reference NIST: Cryptographic Hash Algorithm Competition
+@reference The Keccak sponge function family
+
diff --git a/rebar b/rebar
new file mode 100755
index 0000000..e364a2b
Binary files /dev/null and b/rebar differ
diff --git a/rebar.config b/rebar.config
new file mode 100644
index 0000000..16377e2
--- /dev/null
+++ b/rebar.config
@@ -0,0 +1,33 @@
+{erl_opts, [{i, "src"},
+            warnings_as_errors,
+            {w, all},
+            warn_export_all]}.
+
+{clean_files, [".eunit",
+               "ebin/*.beam"]}.
+
+{port_env, [{"CFLAGS", "$CFLAGS -O2 -finline-functions -fomit-frame-pointer -fno-strict-aliasing -Wmissing-prototypes -Wall -std=c99"}]}.
+
+{port_specs, [
+    % TODO: support optimization
+    % {"i386", "priv/sha3_nif.so", ["c_src/sha3_nif.c",
+    %   "c_src/KeccakNISTInterface.c",
+    %   "c_src/KeccakSponge.c",
+    %   "c_src/KeccakF-1600-opt32.c",
+    %   "c_src/displayIntermediateValues.c"]},
+    % {"x86_64", "priv/sha3_nif.so", ["c_src/sha3_nif.c",
+    %   "c_src/KeccakNISTInterface.c",
+    %   "c_src/KeccakSponge.c",
+    %   "c_src/KeccakF-1600-opt64.c",
+    %   "c_src/displayIntermediateValues.c"]},
+    {"priv/sha3_nif.so", ["c_src/sha3_nif.c",
+                          "c_src/KeccakNISTInterface.c",
+                          "c_src/KeccakSponge.c",
+                          "c_src/KeccakF-1600-reference.c",
+                          "c_src/displayIntermediateValues.c"]}
+]}.
+
+{eunit_opts, [{report,{eunit_surefire,[{dir,"."}]}}]}.
+
+{xref_checks, [fail_on_warning, undefined_function_calls]}.
+
diff --git a/src/sha3.app.src b/src/sha3.app.src
new file mode 100644
index 0000000..aee6773
--- /dev/null
+++ b/src/sha3.app.src
@@ -0,0 +1,12 @@
+{application, sha3,
+ [
+  {description, "SHA3 for Erlang"},
+  {vsn, "0.1.0"},
+  {registered, []},
+  {applications, [
+                  kernel,
+                  stdlib
+                 ]},
+  {modules, [sha3]},
+  {env, []}
+ ]}.
diff --git a/src/sha3.erl b/src/sha3.erl
new file mode 100644
index 0000000..1b15118
--- /dev/null
+++ b/src/sha3.erl
@@ -0,0 +1,58 @@
+%% @doc SHA-3 (Keccak) hashing implemented by the sha3_nif native library.
+%%
+%% {@link hash/2} computes a digest in one call; {@link hash_init/1},
+%% {@link hash_update/2} and {@link hash_final/1} compute it incrementally.
+-module(sha3).
+
+-export([hash_init/1, hash_update/2, hash_final/1, hash/2]).
+-export_type([bitlen/0, context/0, digest/0]).
+
+-on_load(init/0).
+
+-type bitlen() :: 224 | 256 | 384 | 512.
+%% A context is a NIF resource handle: an empty binary on releases before
+%% OTP 20, a reference from OTP 20 on.
+-type context() :: binary() | reference().
+-type digest() :: <<_:224>> | <<_:256>> | <<_:384>> | <<_:512>>.
+
+%% Body of every NIF stub; it only runs when the native code is not loaded.
+-define(nif_stub, nif_stub_error(?LINE)).
+nif_stub_error(Line) ->
+    erlang:nif_error({nif_not_loaded,module,?MODULE,line,Line}).
+
+%% on_load hook: loads sha3_nif from the application's priv directory.
+init() ->
+    erlang:load_nif(filename:join(priv_dir(), sha3_nif), 0).
+
+%% Falls back to the priv directory beside this module's ebin directory
+%% when code:priv_dir/1 does not know the application.
+priv_dir() ->
+    case code:priv_dir(?MODULE) of
+        {error, bad_name} ->
+            EbinDir = filename:dirname(code:which(?MODULE)),
+            filename:join(filename:dirname(EbinDir), "priv");
+        Path ->
+            Path
+    end.
+
+%% @doc Creates a context for computing a digest of `BitLen' bits.
+-spec hash_init(bitlen()) -> context().
+hash_init(_BitLen) ->
+    ?nif_stub.
+
+%% @doc Returns a new context with `Binary' appended to the data hashed so
+%% far. `Context' is not modified and can be updated again.
+-spec hash_update(context(), binary()) -> context().
+hash_update(_Context, _Binary) ->
+    ?nif_stub.
+
+%% @doc Returns the digest of all data added to `Context'.
+-spec hash_final(context()) -> digest().
+hash_final(_Context) ->
+    ?nif_stub.
+
+%% @doc Returns the `BitLen'-bit digest of `Binary' in one call.
+-spec hash(bitlen(), binary()) -> digest().
+hash(_BitLen, _Binary) ->
+    ?nif_stub.
+
diff --git a/test/sha3_tests.erl b/test/sha3_tests.erl
new file mode 100644
index 0000000..c8b262f
--- /dev/null
+++ b/test/sha3_tests.erl
@@ -0,0 +1,48 @@
+-module(sha3_tests).
+
+-include_lib("eunit/include/eunit.hrl").
+
+simple_data() ->
+    <<16#00112233445566778899AABBCCDDEEFF:128>>.
+
+simple_digest() ->
+    <<16#038907E89C919CD8F90A7FBC5A88FF9278108DAEF3EBCDA0CEB383E1:224>>.
+
+simple_test() ->
+    Digest = sha3:hash(224, simple_data()),
+    Expected = simple_digest(),
+    ?assertEqual(Expected, Digest).
+
+update_test() ->
+    Context1 = sha3:hash_init(224),
+    Context2 = sha3:hash_update(Context1, simple_data()),
+    Digest = sha3:hash_final(Context2),
+    Expected = simple_digest(),
+    ?assertEqual(Expected, Digest).
+
+update_context_test() ->
+    Context1 = sha3:hash_init(224),
+    Context2 = sha3:hash_update(Context1, simple_data()),
+    Context3 = sha3:hash_update(Context1, simple_data()),
+    Digest1 = sha3:hash_final(Context2),
+    Digest2 = sha3:hash_final(Context3),
+    Expected = simple_digest(),
+    ?assertEqual(Expected, Digest1),
+    ?assertEqual(Expected, Digest2).
+
+hash_224_test() ->
+    ?assertEqual(<<16#038907E89C919CD8F90A7FBC5A88FF9278108DAEF3EBCDA0CEB383E1:224>>,
+                 sha3:hash(224, <<16#00112233445566778899AABBCCDDEEFF:128>>)).
+
+hash_256_test() ->
+    ?assertEqual(<<16#22BCE46032802AF0ABFACF3768F7BE04A34F5F01DF60F44FFD52D3CA937350C0:256>>,
+                 sha3:hash(256, <<16#00112233445566778899AABBCCDDEEFF:128>>)).
+
+hash_384_test() ->
+    ?assertEqual(<<16#25FAC1ADECBE1B254976FE32C2FE78829B23D7D84316141ECD208D6806A9DB4352A014ADA4106BA0D210DDA0FD18E150:384>>,
+                 sha3:hash(384, <<16#00112233445566778899AABBCCDDEEFF:128>>)).
+
+hash_512_test() ->
+    ?assertEqual(<<16#94EE7851163C39C3489373AA0BF885D95925EAD7484C586D2E0D01D9C8069D3C30E2EEA2DC63A91B517FE53E43A31D764A2154A2DA92876366B138ABC4406805:512>>,
+                 sha3:hash(512, <<16#00112233445566778899AABBCCDDEEFF:128>>)).
+