commit d9d541dce82e83d1d70e4427eb310a1f72189884
Author: SUZUKI Tetsuya <suzuki@spice-of-life.net>
Date:   Wed Oct 3 15:05:19 2012 +0900

    initial import

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..81fb1a5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,14 @@
+.DS_Store
+.eunit
+ebin
+deps
+priv
+*.o
+*.beam
+*.plt
+*.swp
+*.html
+*.png
+edoc-info
+stylesheet.css
+
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..cfd50bf
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,12 @@
+language: erlang
+notifications:
+  disabled: true
+branches:
+  only:
+    - develop
+    - 0.1.0
+otp_release:
+  - R15B02
+  - R15B01
+  - R15B
+
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3e8b822
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,23 @@
+.PHONY: all compile doc xref clean test  # none of these names a file; always run the recipe even if a same-named file or directory exists
+
+all:  # full pipeline: compile, generate docs, run xref, run the eunit suite
+	./rebar compile
+	./rebar doc
+	./rebar xref
+	./rebar eunit
+
+compile:  # build only
+	./rebar compile
+
+doc:  # generate documentation only
+	./rebar doc
+
+xref: compile  # cross-reference check; needs compiled output
+	./rebar xref
+
+clean:  # remove build output
+	./rebar clean
+
+test: xref  # compile, xref, then run the eunit suite
+	./rebar eunit
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9e95ae5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,21 @@
+erlang-sha3
+===========
+
+[![Build Status](https://secure.travis-ci.org/szktty/erlang-sha3.png?branch=develop)](http://travis-ci.org/szktty/erlang-sha3)
+
+SHA3 for Erlang
+
+
+Licenses
+--------
+
+This program is distributed under the Apache License 2.0.
+
+The Keccak source files in `c_src/` are distributed under the CC0 1.0 Universal (CC0 1.0) Public Domain Dedication.
+
+
+Author
+------
+
+SUZUKI Tetsuya <tetsuya.suzuki@gmail.com>
+
diff --git a/c_src/AVR8-rotate64.h b/c_src/AVR8-rotate64.h
new file mode 100755
index 0000000..4f921b9
--- /dev/null
+++ b/c_src/AVR8-rotate64.h
@@ -0,0 +1,27 @@
+/*
+File: AVR8-rotate64.h
+
+This code is originally by Daniel Otte (daniel.otte@rub.de) in 2006-2010 as part of the AVR-Crypto-Lib, and was then improved by Ronny Van Keer, STMicroelectronics, in 2010. 
+
+Implementation by Daniel Otte and Ronny Van Keer,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef ROTATE64_H_
+#define ROTATE64_H_
+
+#include <stdint.h>
+/* ROT_CODE(a): builds the `code` argument of rotate64left_code() for an a-bit left rotation. The high nibble is the byte count
+   (a/8, plus one when a%8 > 4); the low three bits are a%8, which the assembly maps to a 1-4 bit left or a 3-1 bit right shift. */
+#define ROT_CODE(a) ((((a)/8+((((a)%8)>4)?1:0))<<4) | ((a) & 7))
+/* Implemented in AVR8-rotate64.s. The first two rotate a by a single bit; the third rotates a left as described by code. */
+uint64_t rotate64_1bit_left(uint64_t a);
+uint64_t rotate64_1bit_right(uint64_t a);
+uint64_t rotate64left_code(uint64_t a, int8_t code);
+
+#endif /* ROTATE64_H_ */
+
diff --git a/c_src/AVR8-rotate64.s b/c_src/AVR8-rotate64.s
new file mode 100755
index 0000000..f30d030
--- /dev/null
+++ b/c_src/AVR8-rotate64.s
@@ -0,0 +1,285 @@
+/*
+File: AVR8-rotate64.s
+
+This code is originally by Daniel Otte (daniel.otte@rub.de) in 2006-2010 as part of the AVR-Crypto-Lib, and was then improved by Ronny Van Keer, STMicroelectronics, in 2010. 
+
+Implementation by Daniel Otte and Ronny Van Keer,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+.global rotate64_1bit_left
+rotate64_4bit_left:  /* file-local entry: rotate the value in r25..r18 left by 4 bits by falling through all four stages */
+	lsl r18  /* shift the lowest byte left; its bit 7 goes to carry */
+	rol r19  /* each rol takes the carry in at bit 0 and passes its own bit 7 on */
+	rol r20
+	rol r21
+	rol r22
+	rol r23
+	rol r24
+	rol r25
+	adc r18, r1  /* the bit shifted out of r25 wraps into bit 0 of r18; only correct while r1 == 0 */
+rotate64_3bit_left:  /* file-local entry: three single-bit stages remain */
+	lsl r18
+	rol r19
+	rol r20
+	rol r21
+	rol r22
+	rol r23
+	rol r24
+	rol r25
+	adc r18, r1  /* wrap the carry into bit 0 (r1 == 0 assumed) */
+rotate64_2bit_left:  /* file-local entry: two single-bit stages remain */
+	lsl r18
+	rol r19
+	rol r20
+	rol r21
+	rol r22
+	rol r23
+	rol r24
+	rol r25
+	adc r18, r1  /* wrap the carry into bit 0 (r1 == 0 assumed) */
+rotate64_1bit_left:  /* exported entry: the final single-bit stage */
+	lsl r18
+	rol r19
+	rol r20
+	rol r21
+	rol r22
+	rol r23
+	rol r24
+	rol r25
+	adc r18, r1  /* wrap the carry into bit 0 (r1 == 0 assumed) */
+	ret
+
+.global rotate64_1bit_right
+rotate64_3bit_right:  /* file-local entry: rotate the value in r25..r18 right by 3 bits by falling through all three stages */
+	bst r18, 0  /* remember bit 0 of the lowest byte in the T flag */
+	ror r25  /* shift right from the top byte down; bit 7 of r25 receives whatever carry was set on entry */
+	ror r24
+	ror r23
+	ror r22
+	ror r21
+	ror r20
+	ror r19
+	ror r18
+	bld r25, 7  /* overwrite bit 7 of r25 with the saved bit, completing the rotate */
+rotate64_2bit_right:  /* file-local entry: two single-bit stages remain */
+	bst r18, 0
+	ror r25
+	ror r24
+	ror r23
+	ror r22
+	ror r21
+	ror r20
+	ror r19
+	ror r18
+	bld r25, 7  /* saved bit 0 becomes the new top bit */
+rotate64_1bit_right:  /* exported entry: the final single-bit stage */
+	bst r18, 0
+	ror r25
+	ror r24
+	ror r23
+	ror r22
+	ror r21
+	ror r20
+	ror r19
+	ror r18
+	bld r25, 7  /* saved bit 0 becomes the new top bit */
+	ret
+
+/*
+**	Each byte rotate routine must be 16 instructions long: rotate64left_code jumps to rotate64_0byte_left + (code & 0x70), i.e. 16 words per byte count, and nop pads each routine to that size.
+*/
+rotate64_0byte_left:  /* no byte movement; only dispatch to the bit stage */
+	andi r16, 0x07  /* keep the bit-stage selector (low 3 bits of the code) */
+	ldi r30, pm_lo8(bit_rot_jmp_table)  /* Z (r31:r30) = program-memory address of the jump table */
+	ldi r31, pm_hi8(bit_rot_jmp_table)
+	add r30, r16  /* Z += selector */
+
+	adc r31, r1  /* propagate the carry into the high byte; assumes r1 == 0 */
+	ijmp  /* continue at the selected table entry, which finishes the rotate and returns */
+	nop  /* padding up to 16 instructions from here on */
+	nop
+
+	nop
+	nop
+	nop
+	nop
+
+	nop
+	nop
+	nop
+	nop
+
+rotate64_1byte_left:  /* rotate left by one byte: every byte moves up one register, r25 wraps to r18 */
+	mov r0, r25  /* save the top byte in scratch r0 */
+	mov r25, r24
+	mov r24, r23
+	mov r23, r22
+
+	mov r22, r21
+	mov r21, r20
+	mov r20, r19
+	mov r19, r18
+
+	mov r18, r0  /* old top byte becomes the lowest byte */
+	andi r16, 0x07  /* keep the bit-stage selector (low 3 bits of the code) */
+	ldi r30, pm_lo8(bit_rot_jmp_table)  /* Z (r31:r30) = program-memory address of the jump table */
+	ldi r31, pm_hi8(bit_rot_jmp_table)
+
+	add r30, r16  /* Z += selector */
+	adc r31, r1  /* propagate the carry; assumes r1 == 0 */
+	ijmp  /* continue at the selected table entry, which finishes the rotate and returns */
+	nop  /* padding: keeps the routine at 16 instructions */
+
+rotate64_2byte_left:  /* rotate left by two bytes: every register pair moves up one pair, r25:r24 wraps to r19:r18 */
+	movw r0, r24  /* r1:r0 = r25:r24; this overwrites r1 */
+	movw r24, r22
+	movw r22, r20
+	movw r20, r18
+
+	movw r18, r0  /* old top pair becomes the lowest pair */
+	clr r1  /* put zero back in r1 after using it as scratch; the adc below depends on it */
+	andi r16, 0x07  /* keep the bit-stage selector (low 3 bits of the code) */
+	ldi r30, pm_lo8(bit_rot_jmp_table)  /* Z (r31:r30) = program-memory address of the jump table */
+
+	ldi r31, pm_hi8(bit_rot_jmp_table)
+	add r30, r16  /* Z += selector */
+	adc r31, r1  /* propagate the carry (r1 == 0) */
+	ijmp  /* continue at the selected table entry, which finishes the rotate and returns */
+
+	nop  /* padding: keeps the routine at 16 instructions */
+	nop
+	nop
+	nop
+
+rotate64_3byte_left:  /* rotate left by three bytes: each byte moves up three registers, wrapping from r25 back to r18 */
+	mov r0, r25  /* save old r25; the moves below follow the single 8-register cycle of this permutation */
+	mov r25, r22
+	mov r22, r19
+	mov r19, r24
+
+	mov r24, r21
+	mov r21, r18
+	mov r18, r23
+	mov r23, r20
+
+	mov r20, r0  /* old r25 lands three positions up (wrapped) in r20 */
+	andi r16, 0x07  /* keep the bit-stage selector (low 3 bits of the code) */
+	ldi r30, pm_lo8(bit_rot_jmp_table)  /* Z (r31:r30) = program-memory address of the jump table */
+	ldi r31, pm_hi8(bit_rot_jmp_table)
+
+	add r30, r16  /* Z += selector */
+	adc r31, r1  /* propagate the carry; assumes r1 == 0 */
+	ijmp  /* continue at the selected table entry, which finishes the rotate and returns */
+	nop  /* padding: keeps the routine at 16 instructions */
+
+rotate64_4byte_left:  /* rotate left by four bytes: swap the upper and lower halves (r25..r22 <-> r21..r18) */
+	movw r0, r24  /* swap r25:r24 with r21:r20 through r1:r0; this overwrites r1 */
+	movw r24, r20
+	movw r20, r0
+	movw r0, r22  /* swap r23:r22 with r19:r18 through r1:r0 */
+
+	movw r22, r18
+	movw r18, r0
+	clr r1  /* put zero back in r1 after using it as scratch; the adc below depends on it */
+	andi r16, 0x07  /* keep the bit-stage selector (low 3 bits of the code) */
+
+	ldi r30, pm_lo8(bit_rot_jmp_table)  /* Z (r31:r30) = program-memory address of the jump table */
+	ldi r31, pm_hi8(bit_rot_jmp_table)
+	add r30, r16  /* Z += selector */
+	adc r31, r1  /* propagate the carry (r1 == 0) */
+
+	ijmp  /* continue at the selected table entry, which finishes the rotate and returns */
+	nop  /* padding: keeps the routine at 16 instructions */
+	nop
+	nop
+
+rotate64_5byte_left:  /* rotate left by five bytes: each byte moves up five registers, wrapping from r25 back to r18 */
+	mov r0, r25  /* save old r25; the moves below follow the single 8-register cycle of this permutation */
+	mov r25, r20
+	mov r20, r23
+	mov r23, r18
+
+	mov r18, r21
+	mov r21, r24
+	mov r24, r19
+	mov r19, r22
+
+	mov r22, r0  /* old r25 lands five positions up (wrapped) in r22 */
+	andi r16, 0x07  /* keep the bit-stage selector (low 3 bits of the code) */
+	ldi r30, pm_lo8(bit_rot_jmp_table)  /* Z (r31:r30) = program-memory address of the jump table */
+	ldi r31, pm_hi8(bit_rot_jmp_table)
+
+	add r30, r16  /* Z += selector */
+	adc r31, r1  /* propagate the carry; assumes r1 == 0 */
+	ijmp  /* continue at the selected table entry, which finishes the rotate and returns */
+	nop  /* padding: keeps the routine at 16 instructions */
+
+rotate64_6byte_left:  /* rotate left by six bytes: every register pair moves down one pair, r19:r18 wraps to r25:r24 */
+	movw r0, r18  /* r1:r0 = r19:r18; this overwrites r1 */
+	movw r18, r20
+	movw r20, r22
+	movw r22, r24
+
+	movw r24, r0  /* old lowest pair becomes the top pair */
+	clr r1  /* put zero back in r1 after using it as scratch; the adc below depends on it */
+	andi r16, 0x07  /* keep the bit-stage selector (low 3 bits of the code) */
+	ldi r30, pm_lo8(bit_rot_jmp_table)  /* Z (r31:r30) = program-memory address of the jump table */
+
+	ldi r31, pm_hi8(bit_rot_jmp_table)
+	add r30, r16  /* Z += selector */
+	adc r31, r1  /* propagate the carry (r1 == 0) */
+	ijmp  /* continue at the selected table entry, which finishes the rotate and returns */
+
+	nop  /* padding: keeps the routine at 16 instructions */
+	nop
+	nop
+	nop
+
+rotate64_7byte_left:  /* rotate left by seven bytes: every byte moves down one register, r18 wraps to r25 */
+	mov r0, r18  /* save the lowest byte in scratch r0 */
+	mov r18, r19
+	mov r19, r20
+	mov r20, r21
+
+	mov r21, r22
+	mov r22, r23
+	mov r23, r24
+	mov r24, r25
+
+	mov r25, r0  /* old lowest byte becomes the top byte */
+	andi r16, 0x07  /* keep the bit-stage selector (low 3 bits of the code) */
+	ldi r30, pm_lo8(bit_rot_jmp_table)  /* Z (r31:r30) = program-memory address of the jump table */
+	ldi r31, pm_hi8(bit_rot_jmp_table)
+
+	add r30, r16  /* Z += selector */
+	adc r31, r1  /* propagate the carry; assumes r1 == 0 */
+	ijmp  /* continue at the selected table entry, which finishes the rotate and returns */
+	nop  /* padding: keeps the routine at 16 instructions */
+
+
+bit_rot_jmp_table:  /* one single-word entry per value of (code & 7); the byte routines index it through Z and ijmp */
+	ret  /* 0: no bit rotation left to do */
+	rjmp rotate64_1bit_left  /* 1 */
+	rjmp rotate64_2bit_left  /* 2 */
+	rjmp rotate64_3bit_left  /* 3 */
+	rjmp rotate64_4bit_left  /* 4 */
+	rjmp rotate64_3bit_right  /* 5: ROT_CODE selected one extra byte for this remainder, so step back 3 bits */
+	rjmp rotate64_2bit_right  /* 6: one extra byte, back 2 bits */
+	rjmp rotate64_1bit_right  /* 7: one extra byte, back 1 bit */
+
+.global rotate64left_code
+rotate64left_code:  /* rotates the value in r25..r18 as described by the code in r16 (see ROT_CODE): byte stage first, then bit stage */
+	ldi r30, pm_lo8(rotate64_0byte_left)  /* Z (r31:r30) = program-memory address of the first byte routine */
+	ldi r31, pm_hi8(rotate64_0byte_left)
+	mov r0, r16  /* keep the full code; the mask below destroys it */
+	andi r16, 0x70  /* byte count * 16 = offset of the wanted 16-instruction routine; bit 7 is dropped, so a byte count of 8 wraps to 0 */
+	add r30, r16
+	adc r31, r1  /* propagate the carry; assumes r1 == 0 */
+	mov r16, r0  /* restore the code so the byte routine can extract the bit-stage selector */
+	ijmp  /* NOTE(review): the byte routines mask r16 in place, so r16 is not preserved across this call - confirm the calling convention allows that */
+	
\ No newline at end of file
diff --git a/c_src/Keccak-avr8-settings.h b/c_src/Keccak-avr8-settings.h
new file mode 100755
index 0000000..030e8eb
--- /dev/null
+++ b/c_src/Keccak-avr8-settings.h
@@ -0,0 +1,2 @@
+#define	cKeccakR							1088	/* presumably the sponge rate in bits - TODO confirm against the AVR8 Keccak sources that include this header */
+#define cKeccakFixedOutputLengthInBytes		32	/* fixed output length: 32 bytes (256 bits) */
diff --git a/c_src/KeccakF-1600-32-rvk.macros b/c_src/KeccakF-1600-32-rvk.macros
new file mode 100755
index 0000000..c0c9029
--- /dev/null
+++ b/c_src/KeccakF-1600-32-rvk.macros
@@ -0,0 +1,555 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by Ronny Van Keer,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+static const UINT32 KeccakF1600RoundConstants_int2[2*24] = /* 24 pairs of 32-bit words, read front to back by the rounds macro: the first word of a pair is XORed into the ..ba0 word, the second into ..ba1 */
+{
+    0x00000001UL,    0x00000000UL, /* pair used by the first round */
+    0x00000000UL,    0x00000089UL,
+    0x00000000UL,    0x8000008bUL,
+    0x00000000UL,    0x80008080UL,
+    0x00000001UL,    0x0000008bUL,
+    0x00000001UL,    0x00008000UL,
+    0x00000001UL,    0x80008088UL,
+    0x00000001UL,    0x80000082UL,
+    0x00000000UL,    0x0000000bUL,
+    0x00000000UL,    0x0000000aUL,
+    0x00000001UL,    0x00008082UL,
+    0x00000000UL,    0x00008003UL,
+    0x00000001UL,    0x0000808bUL,
+    0x00000001UL,    0x8000000bUL,
+    0x00000001UL,    0x8000008aUL,
+    0x00000001UL,    0x80000081UL,
+    0x00000000UL,    0x80000081UL,
+    0x00000000UL,    0x80000008UL,
+    0x00000000UL,    0x00000083UL,
+    0x00000000UL,    0x80008003UL,
+    0x00000001UL,    0x80008088UL,
+    0x00000000UL,    0x80000088UL,
+    0x00000001UL,    0x00008000UL,
+    0x00000000UL,    0x80008082UL /* pair used by the last (24th) round */
+};
+
+#undef rounds
+/* rounds: runs 24 rounds (12 loop iterations of two rounds each) over the 50-word array `state`, in place. The expansion site must provide UINT32, ROL32, copyFromState/copyToState and a variable named state. */
+#define rounds \
+{ \
+    UINT32 Da0, De0, Di0, Do0, Du0; \
+    UINT32 Da1, De1, Di1, Do1, Du1; \
+    UINT32 Ba, Be, Bi, Bo, Bu; \
+    UINT32 Aba0, Abe0, Abi0, Abo0, Abu0; \
+    UINT32 Aba1, Abe1, Abi1, Abo1, Abu1; \
+    UINT32 Aga0, Age0, Agi0, Ago0, Agu0; \
+    UINT32 Aga1, Age1, Agi1, Ago1, Agu1; \
+    UINT32 Aka0, Ake0, Aki0, Ako0, Aku0; \
+    UINT32 Aka1, Ake1, Aki1, Ako1, Aku1; \
+    UINT32 Ama0, Ame0, Ami0, Amo0, Amu0; \
+    UINT32 Ama1, Ame1, Ami1, Amo1, Amu1; \
+    UINT32 Asa0, Ase0, Asi0, Aso0, Asu0; \
+    UINT32 Asa1, Ase1, Asi1, Aso1, Asu1; \
+    UINT32 Cw, Cx, Cy, Cz; \
+    UINT32 Eba0, Ebe0, Ebi0, Ebo0, Ebu0; \
+    UINT32 Eba1, Ebe1, Ebi1, Ebo1, Ebu1; \
+    UINT32 Ega0, Ege0, Egi0, Ego0, Egu0; \
+    UINT32 Ega1, Ege1, Egi1, Ego1, Egu1; \
+    UINT32 Eka0, Eke0, Eki0, Eko0, Eku0; \
+    UINT32 Eka1, Eke1, Eki1, Eko1, Eku1; \
+    UINT32 Ema0, Eme0, Emi0, Emo0, Emu0; \
+    UINT32 Ema1, Eme1, Emi1, Emo1, Emu1; \
+    UINT32 Esa0, Ese0, Esi0, Eso0, Esu0; \
+    UINT32 Esa1, Ese1, Esi1, Eso1, Esu1; \
+	const UINT32 * pRoundConstants = KeccakF1600RoundConstants_int2; \
+    UINT32 i; \
+    /* Load the 50 state words into the A* locals. */ \
+    copyFromState(A, state) \
+    /* Each iteration runs two rounds: the first reads the A* words and writes E*, the second reads E* and writes A*. */ \
+    for( i = 12; i != 0; --i ) { \
+	    Cx = Abu0^Agu0^Aku0^Amu0^Asu0; /* C* temporaries: XOR of the five words sharing a column letter and half */ \
+	    Du1 = Abe1^Age1^Ake1^Ame1^Ase1; \
+	    Da0 = Cx^ROL32(Du1, 1); \
+	    Cz = Abu1^Agu1^Aku1^Amu1^Asu1; \
+	    Du0 = Abe0^Age0^Ake0^Ame0^Ase0; \
+	    Da1 = Cz^Du0; \
+\
+	    Cw = Abi0^Agi0^Aki0^Ami0^Asi0; \
+	    Do0 = Cw^ROL32(Cz, 1); \
+	    Cy = Abi1^Agi1^Aki1^Ami1^Asi1; \
+	    Do1 = Cy^Cx; \
+\
+	    Cx = Aba0^Aga0^Aka0^Ama0^Asa0; \
+	    De0 = Cx^ROL32(Cy, 1); \
+	    Cz = Aba1^Aga1^Aka1^Ama1^Asa1; \
+	    De1 = Cz^Cw; \
+\
+	    Cy = Abo1^Ago1^Ako1^Amo1^Aso1; \
+	    Di0 = Du0^ROL32(Cy, 1); \
+	    Cw = Abo0^Ago0^Ako0^Amo0^Aso0; \
+	    Di1 = Du1^Cw; \
+\
+	    Du0 = Cw^ROL32(Cz, 1); \
+	    Du1 = Cy^Cx; \
+	    /* Ten groups follow: each XORs D* into five A* words, rotates them into Ba..Bu, and writes five E* words as B ^ (~Bnext & Bnext2). */ \
+	    Aba0 ^= Da0; \
+	    Ba = Aba0; \
+	    Age0 ^= De0; \
+	    Be = ROL32(Age0, 22); \
+	    Aki1 ^= Di1; \
+	    Bi = ROL32(Aki1, 22); \
+	    Amo1 ^= Do1; \
+	    Bo = ROL32(Amo1, 11); \
+	    Asu0 ^= Du0; \
+	    Bu = ROL32(Asu0, 7); \
+	    Eba0 =   Ba ^((~Be)&  Bi ) ^ *(pRoundConstants++); /* first constant word of this round */ \
+	    Ebe0 =   Be ^((~Bi)&  Bo ); \
+	    Ebi0 =   Bi ^((~Bo)&  Bu ); \
+	    Ebo0 =   Bo ^((~Bu)&  Ba ); \
+	    Ebu0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Abo0 ^= Do0; \
+	    Ba = ROL32(Abo0, 14); \
+	    Agu0 ^= Du0; \
+	    Be = ROL32(Agu0, 10); \
+	    Aka1 ^= Da1; \
+	    Bi = ROL32(Aka1, 2); \
+	    Ame1 ^= De1; \
+	    Bo = ROL32(Ame1, 23); \
+	    Asi1 ^= Di1; \
+	    Bu = ROL32(Asi1, 31); \
+	    Ega0 =   Ba ^((~Be)&  Bi ); \
+	    Ege0 =   Be ^((~Bi)&  Bo ); \
+	    Egi0 =   Bi ^((~Bo)&  Bu ); \
+	    Ego0 =   Bo ^((~Bu)&  Ba ); \
+	    Egu0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Abe1 ^= De1; \
+	    Ba = ROL32(Abe1, 1); \
+	    Agi0 ^= Di0; \
+	    Be = ROL32(Agi0, 3); \
+	    Ako1 ^= Do1; \
+	    Bi = ROL32(Ako1, 13); \
+	    Amu0 ^= Du0; \
+	    Bo = ROL32(Amu0, 4); \
+	    Asa0 ^= Da0; \
+	    Bu = ROL32(Asa0, 9); \
+	    Eka0 =   Ba ^((~Be)&  Bi ); \
+	    Eke0 =   Be ^((~Bi)&  Bo ); \
+	    Eki0 =   Bi ^((~Bo)&  Bu ); \
+	    Eko0 =   Bo ^((~Bu)&  Ba ); \
+	    Eku0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Abu1 ^= Du1; \
+	    Ba = ROL32(Abu1, 14); \
+	    Aga0 ^= Da0; \
+	    Be = ROL32(Aga0, 18); \
+	    Ake0 ^= De0; \
+	    Bi = ROL32(Ake0, 5); \
+	    Ami1 ^= Di1; \
+	    Bo = ROL32(Ami1, 8); \
+	    Aso0 ^= Do0; \
+	    Bu = ROL32(Aso0, 28); \
+	    Ema0 =   Ba ^((~Be)&  Bi ); \
+	    Eme0 =   Be ^((~Bi)&  Bo ); \
+	    Emi0 =   Bi ^((~Bo)&  Bu ); \
+	    Emo0 =   Bo ^((~Bu)&  Ba ); \
+	    Emu0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Abi0 ^= Di0; \
+	    Ba = ROL32(Abi0, 31); \
+	    Ago1 ^= Do1; \
+	    Be = ROL32(Ago1, 28); \
+	    Aku1 ^= Du1; \
+	    Bi = ROL32(Aku1, 20); \
+	    Ama1 ^= Da1; \
+	    Bo = ROL32(Ama1, 21); \
+	    Ase0 ^= De0; \
+	    Bu = ROL32(Ase0, 1); \
+	    Esa0 =   Ba ^((~Be)&  Bi ); \
+	    Ese0 =   Be ^((~Bi)&  Bo ); \
+	    Esi0 =   Bi ^((~Bo)&  Bu ); \
+	    Eso0 =   Bo ^((~Bu)&  Ba ); \
+	    Esu0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Aba1 ^= Da1; \
+	    Ba = Aba1; \
+	    Age1 ^= De1; \
+	    Be = ROL32(Age1, 22); \
+	    Aki0 ^= Di0; \
+	    Bi = ROL32(Aki0, 21); \
+	    Amo0 ^= Do0; \
+	    Bo = ROL32(Amo0, 10); \
+	    Asu1 ^= Du1; \
+	    Bu = ROL32(Asu1, 7); \
+	    Eba1 =   Ba ^((~Be)&  Bi ); \
+	    Eba1 ^= *(pRoundConstants++); /* second constant word of this round */ \
+	    Ebe1 =   Be ^((~Bi)&  Bo ); \
+	    Ebi1 =   Bi ^((~Bo)&  Bu ); \
+	    Ebo1 =   Bo ^((~Bu)&  Ba ); \
+	    Ebu1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Abo1 ^= Do1; \
+	    Ba = ROL32(Abo1, 14); \
+	    Agu1 ^= Du1; \
+	    Be = ROL32(Agu1, 10); \
+	    Aka0 ^= Da0; \
+	    Bi = ROL32(Aka0, 1); \
+	    Ame0 ^= De0; \
+	    Bo = ROL32(Ame0, 22); \
+	    Asi0 ^= Di0; \
+	    Bu = ROL32(Asi0, 30); \
+	    Ega1 =   Ba ^((~Be)&  Bi ); \
+	    Ege1 =   Be ^((~Bi)&  Bo ); \
+	    Egi1 =   Bi ^((~Bo)&  Bu ); \
+	    Ego1 =   Bo ^((~Bu)&  Ba ); \
+	    Egu1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Abe0 ^= De0; \
+	    Ba = Abe0; \
+	    Agi1 ^= Di1; \
+	    Be = ROL32(Agi1, 3); \
+	    Ako0 ^= Do0; \
+	    Bi = ROL32(Ako0, 12); \
+	    Amu1 ^= Du1; \
+	    Bo = ROL32(Amu1, 4); \
+	    Asa1 ^= Da1; \
+	    Bu = ROL32(Asa1, 9); \
+	    Eka1 =   Ba ^((~Be)&  Bi ); \
+	    Eke1 =   Be ^((~Bi)&  Bo ); \
+	    Eki1 =   Bi ^((~Bo)&  Bu ); \
+	    Eko1 =   Bo ^((~Bu)&  Ba ); \
+	    Eku1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Abu0 ^= Du0; \
+	    Ba = ROL32(Abu0, 13); \
+	    Aga1 ^= Da1; \
+	    Be = ROL32(Aga1, 18); \
+	    Ake1 ^= De1; \
+	    Bi = ROL32(Ake1, 5); \
+	    Ami0 ^= Di0; \
+	    Bo = ROL32(Ami0, 7); \
+	    Aso1 ^= Do1; \
+	    Bu = ROL32(Aso1, 28); \
+	    Ema1 =   Ba ^((~Be)&  Bi ); \
+	    Eme1 =   Be ^((~Bi)&  Bo ); \
+	    Emi1 =   Bi ^((~Bo)&  Bu ); \
+	    Emo1 =   Bo ^((~Bu)&  Ba ); \
+	    Emu1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Abi1 ^= Di1; \
+	    Ba = ROL32(Abi1, 31); \
+	    Ago0 ^= Do0; \
+	    Be = ROL32(Ago0, 27); \
+	    Aku0 ^= Du0; \
+	    Bi = ROL32(Aku0, 19); \
+	    Ama0 ^= Da0; \
+	    Bo = ROL32(Ama0, 20); \
+	    Ase1 ^= De1; \
+	    Bu = ROL32(Ase1, 1); \
+	    Esa1 =   Ba ^((~Be)&  Bi ); \
+	    Ese1 =   Be ^((~Bi)&  Bo ); \
+	    Esi1 =   Bi ^((~Bo)&  Bu ); \
+	    Eso1 =   Bo ^((~Bu)&  Ba ); \
+	    Esu1 =   Bu ^((~Ba)&  Be ); \
+	    /* Second round of the iteration: the same steps, now reading the E* words and writing A*. */ \
+	    Cx = Ebu0^Egu0^Eku0^Emu0^Esu0; \
+	    Du1 = Ebe1^Ege1^Eke1^Eme1^Ese1; \
+	    Da0 = Cx^ROL32(Du1, 1); \
+	    Cz = Ebu1^Egu1^Eku1^Emu1^Esu1; \
+	    Du0 = Ebe0^Ege0^Eke0^Eme0^Ese0; \
+	    Da1 = Cz^Du0; \
+\
+	    Cw = Ebi0^Egi0^Eki0^Emi0^Esi0; \
+	    Do0 = Cw^ROL32(Cz, 1); \
+	    Cy = Ebi1^Egi1^Eki1^Emi1^Esi1; \
+	    Do1 = Cy^Cx; \
+\
+	    Cx = Eba0^Ega0^Eka0^Ema0^Esa0; \
+	    De0 = Cx^ROL32(Cy, 1); \
+	    Cz = Eba1^Ega1^Eka1^Ema1^Esa1; \
+	    De1 = Cz^Cw; \
+\
+	    Cy = Ebo1^Ego1^Eko1^Emo1^Eso1; \
+	    Di0 = Du0^ROL32(Cy, 1); \
+	    Cw = Ebo0^Ego0^Eko0^Emo0^Eso0; \
+	    Di1 = Du1^Cw; \
+\
+	    Du0 = Cw^ROL32(Cz, 1); \
+	    Du1 = Cy^Cx; \
+\
+	    Eba0 ^= Da0; \
+	    Ba = Eba0; \
+	    Ege0 ^= De0; \
+	    Be = ROL32(Ege0, 22); \
+	    Eki1 ^= Di1; \
+	    Bi = ROL32(Eki1, 22); \
+	    Emo1 ^= Do1; \
+	    Bo = ROL32(Emo1, 11); \
+	    Esu0 ^= Du0; \
+	    Bu = ROL32(Esu0, 7); \
+	    Aba0 =   Ba ^((~Be)&  Bi ); \
+	    Aba0 ^= *(pRoundConstants++); /* first constant word of this round */ \
+	    Abe0 =   Be ^((~Bi)&  Bo ); \
+	    Abi0 =   Bi ^((~Bo)&  Bu ); \
+	    Abo0 =   Bo ^((~Bu)&  Ba ); \
+	    Abu0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Ebo0 ^= Do0; \
+	    Ba = ROL32(Ebo0, 14); \
+	    Egu0 ^= Du0; \
+	    Be = ROL32(Egu0, 10); \
+	    Eka1 ^= Da1; \
+	    Bi = ROL32(Eka1, 2); \
+	    Eme1 ^= De1; \
+	    Bo = ROL32(Eme1, 23); \
+	    Esi1 ^= Di1; \
+	    Bu = ROL32(Esi1, 31); \
+	    Aga0 =   Ba ^((~Be)&  Bi ); \
+	    Age0 =   Be ^((~Bi)&  Bo ); \
+	    Agi0 =   Bi ^((~Bo)&  Bu ); \
+	    Ago0 =   Bo ^((~Bu)&  Ba ); \
+	    Agu0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Ebe1 ^= De1; \
+	    Ba = ROL32(Ebe1, 1); \
+	    Egi0 ^= Di0; \
+	    Be = ROL32(Egi0, 3); \
+	    Eko1 ^= Do1; \
+	    Bi = ROL32(Eko1, 13); \
+	    Emu0 ^= Du0; \
+	    Bo = ROL32(Emu0, 4); \
+	    Esa0 ^= Da0; \
+	    Bu = ROL32(Esa0, 9); \
+	    Aka0 =   Ba ^((~Be)&  Bi ); \
+	    Ake0 =   Be ^((~Bi)&  Bo ); \
+	    Aki0 =   Bi ^((~Bo)&  Bu ); \
+	    Ako0 =   Bo ^((~Bu)&  Ba ); \
+	    Aku0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Ebu1 ^= Du1; \
+	    Ba = ROL32(Ebu1, 14); \
+	    Ega0 ^= Da0; \
+	    Be = ROL32(Ega0, 18); \
+	    Eke0 ^= De0; \
+	    Bi = ROL32(Eke0, 5); \
+	    Emi1 ^= Di1; \
+	    Bo = ROL32(Emi1, 8); \
+	    Eso0 ^= Do0; \
+	    Bu = ROL32(Eso0, 28); \
+	    Ama0 =   Ba ^((~Be)&  Bi ); \
+	    Ame0 =   Be ^((~Bi)&  Bo ); \
+	    Ami0 =   Bi ^((~Bo)&  Bu ); \
+	    Amo0 =   Bo ^((~Bu)&  Ba ); \
+	    Amu0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Ebi0 ^= Di0; \
+	    Ba = ROL32(Ebi0, 31); \
+	    Ego1 ^= Do1; \
+	    Be = ROL32(Ego1, 28); \
+	    Eku1 ^= Du1; \
+	    Bi = ROL32(Eku1, 20); \
+	    Ema1 ^= Da1; \
+	    Bo = ROL32(Ema1, 21); \
+	    Ese0 ^= De0; \
+	    Bu = ROL32(Ese0, 1); \
+	    Asa0 =   Ba ^((~Be)&  Bi ); \
+	    Ase0 =   Be ^((~Bi)&  Bo ); \
+	    Asi0 =   Bi ^((~Bo)&  Bu ); \
+	    Aso0 =   Bo ^((~Bu)&  Ba ); \
+	    Asu0 =   Bu ^((~Ba)&  Be ); \
+\
+	    Eba1 ^= Da1; \
+	    Ba = Eba1; \
+	    Ege1 ^= De1; \
+	    Be = ROL32(Ege1, 22); \
+	    Eki0 ^= Di0; \
+	    Bi = ROL32(Eki0, 21); \
+	    Emo0 ^= Do0; \
+	    Bo = ROL32(Emo0, 10); \
+	    Esu1 ^= Du1; \
+	    Bu = ROL32(Esu1, 7); \
+	    Aba1 =   Ba ^((~Be)&  Bi ); \
+	    Aba1 ^= *(pRoundConstants++); /* second constant word of this round */ \
+	    Abe1 =   Be ^((~Bi)&  Bo ); \
+	    Abi1 =   Bi ^((~Bo)&  Bu ); \
+	    Abo1 =   Bo ^((~Bu)&  Ba ); \
+	    Abu1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Ebo1 ^= Do1; \
+	    Ba = ROL32(Ebo1, 14); \
+	    Egu1 ^= Du1; \
+	    Be = ROL32(Egu1, 10); \
+	    Eka0 ^= Da0; \
+	    Bi = ROL32(Eka0, 1); \
+	    Eme0 ^= De0; \
+	    Bo = ROL32(Eme0, 22); \
+	    Esi0 ^= Di0; \
+	    Bu = ROL32(Esi0, 30); \
+	    Aga1 =   Ba ^((~Be)&  Bi ); \
+	    Age1 =   Be ^((~Bi)&  Bo ); \
+	    Agi1 =   Bi ^((~Bo)&  Bu ); \
+	    Ago1 =   Bo ^((~Bu)&  Ba ); \
+	    Agu1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Ebe0 ^= De0; \
+	    Ba = Ebe0; \
+	    Egi1 ^= Di1; \
+	    Be = ROL32(Egi1, 3); \
+	    Eko0 ^= Do0; \
+	    Bi = ROL32(Eko0, 12); \
+	    Emu1 ^= Du1; \
+	    Bo = ROL32(Emu1, 4); \
+	    Esa1 ^= Da1; \
+	    Bu = ROL32(Esa1, 9); \
+	    Aka1 =   Ba ^((~Be)&  Bi ); \
+	    Ake1 =   Be ^((~Bi)&  Bo ); \
+	    Aki1 =   Bi ^((~Bo)&  Bu ); \
+	    Ako1 =   Bo ^((~Bu)&  Ba ); \
+	    Aku1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Ebu0 ^= Du0; \
+	    Ba = ROL32(Ebu0, 13); \
+	    Ega1 ^= Da1; \
+	    Be = ROL32(Ega1, 18); \
+	    Eke1 ^= De1; \
+	    Bi = ROL32(Eke1, 5); \
+	    Emi0 ^= Di0; \
+	    Bo = ROL32(Emi0, 7); \
+	    Eso1 ^= Do1; \
+	    Bu = ROL32(Eso1, 28); \
+	    Ama1 =   Ba ^((~Be)&  Bi ); \
+	    Ame1 =   Be ^((~Bi)&  Bo ); \
+	    Ami1 =   Bi ^((~Bo)&  Bu ); \
+	    Amo1 =   Bo ^((~Bu)&  Ba ); \
+	    Amu1 =   Bu ^((~Ba)&  Be ); \
+\
+	    Ebi1 ^= Di1; \
+	    Ba = ROL32(Ebi1, 31); \
+	    Ego0 ^= Do0; \
+	    Be = ROL32(Ego0, 27); \
+	    Eku0 ^= Du0; \
+	    Bi = ROL32(Eku0, 19); \
+	    Ema0 ^= Da0; \
+	    Bo = ROL32(Ema0, 20); \
+	    Ese1 ^= De1; \
+	    Bu = ROL32(Ese1, 1); \
+	    Asa1 =   Ba ^((~Be)&  Bi ); \
+	    Ase1 =   Be ^((~Bi)&  Bo ); \
+	    Asi1 =   Bi ^((~Bo)&  Bu ); \
+	    Aso1 =   Bo ^((~Bu)&  Ba ); \
+	    Asu1 =   Bu ^((~Ba)&  Be ); \
+    } \
+    copyToState(state, A) /* store the A* locals back into state */ \
+}
+
+#define copyFromState(X, state) \
+    X##ba0 = state[ 0]; \
+    X##ba1 = state[ 1]; \
+    X##be0 = state[ 2]; \
+    X##be1 = state[ 3]; \
+    X##bi0 = state[ 4]; \
+    X##bi1 = state[ 5]; \
+    X##bo0 = state[ 6]; \
+    X##bo1 = state[ 7]; \
+    X##bu0 = state[ 8]; \
+    X##bu1 = state[ 9]; \
+    X##ga0 = state[10]; \
+    X##ga1 = state[11]; \
+    X##ge0 = state[12]; \
+    X##ge1 = state[13]; \
+    X##gi0 = state[14]; \
+    X##gi1 = state[15]; \
+    X##go0 = state[16]; \
+    X##go1 = state[17]; \
+    X##gu0 = state[18]; \
+    X##gu1 = state[19]; \
+    X##ka0 = state[20]; \
+    X##ka1 = state[21]; \
+    X##ke0 = state[22]; \
+    X##ke1 = state[23]; \
+    X##ki0 = state[24]; \
+    X##ki1 = state[25]; \
+    X##ko0 = state[26]; \
+    X##ko1 = state[27]; \
+    X##ku0 = state[28]; \
+    X##ku1 = state[29]; \
+    X##ma0 = state[30]; \
+    X##ma1 = state[31]; \
+    X##me0 = state[32]; \
+    X##me1 = state[33]; \
+    X##mi0 = state[34]; \
+    X##mi1 = state[35]; \
+    X##mo0 = state[36]; \
+    X##mo1 = state[37]; \
+    X##mu0 = state[38]; \
+    X##mu1 = state[39]; \
+    X##sa0 = state[40]; \
+    X##sa1 = state[41]; \
+    X##se0 = state[42]; \
+    X##se1 = state[43]; \
+    X##si0 = state[44]; \
+    X##si1 = state[45]; \
+    X##so0 = state[46]; \
+    X##so1 = state[47]; \
+    X##su0 = state[48]; \
+    X##su1 = state[49]; \
+
+#define copyToState(state, X) /* Store the variables X##ba0 .. X##su1 into state[0..49] (two words per lane name: suffix 0, then 1). Expands to bare statements; state is parenthesized so any pointer expression may be passed. */ \
+    (state)[ 0] = X##ba0; \
+    (state)[ 1] = X##ba1; \
+    (state)[ 2] = X##be0; \
+    (state)[ 3] = X##be1; \
+    (state)[ 4] = X##bi0; \
+    (state)[ 5] = X##bi1; \
+    (state)[ 6] = X##bo0; \
+    (state)[ 7] = X##bo1; \
+    (state)[ 8] = X##bu0; \
+    (state)[ 9] = X##bu1; \
+    (state)[10] = X##ga0; \
+    (state)[11] = X##ga1; \
+    (state)[12] = X##ge0; \
+    (state)[13] = X##ge1; \
+    (state)[14] = X##gi0; \
+    (state)[15] = X##gi1; \
+    (state)[16] = X##go0; \
+    (state)[17] = X##go1; \
+    (state)[18] = X##gu0; \
+    (state)[19] = X##gu1; \
+    (state)[20] = X##ka0; \
+    (state)[21] = X##ka1; \
+    (state)[22] = X##ke0; \
+    (state)[23] = X##ke1; \
+    (state)[24] = X##ki0; \
+    (state)[25] = X##ki1; \
+    (state)[26] = X##ko0; \
+    (state)[27] = X##ko1; \
+    (state)[28] = X##ku0; \
+    (state)[29] = X##ku1; \
+    (state)[30] = X##ma0; \
+    (state)[31] = X##ma1; \
+    (state)[32] = X##me0; \
+    (state)[33] = X##me1; \
+    (state)[34] = X##mi0; \
+    (state)[35] = X##mi1; \
+    (state)[36] = X##mo0; \
+    (state)[37] = X##mo1; \
+    (state)[38] = X##mu0; \
+    (state)[39] = X##mu1; \
+    (state)[40] = X##sa0; \
+    (state)[41] = X##sa1; \
+    (state)[42] = X##se0; \
+    (state)[43] = X##se1; \
+    (state)[44] = X##si0; \
+    (state)[45] = X##si1; \
+    (state)[46] = X##so0; \
+    (state)[47] = X##so1; \
+    (state)[48] = X##su0; \
+    (state)[49] = X##su1;
+
diff --git a/c_src/KeccakF-1600-32-s1.macros b/c_src/KeccakF-1600-32-s1.macros
new file mode 100755
index 0000000..973cc19
--- /dev/null
+++ b/c_src/KeccakF-1600-32-s1.macros
@@ -0,0 +1,1187 @@
+/*
+Code automatically generated by KeccakTools!
+
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#define declareABCDE /* Declares every UINT32 local the round macros in this file use: state words A*, rotated words B*, column parities C*, theta offsets D* and second state copy E*, each as a 0 and a 1 half. UINT32 must be defined at the expansion site. */ \
+    UINT32 Aba0, Abe0, Abi0, Abo0, Abu0; \
+    UINT32 Aba1, Abe1, Abi1, Abo1, Abu1; \
+    UINT32 Aga0, Age0, Agi0, Ago0, Agu0; \
+    UINT32 Aga1, Age1, Agi1, Ago1, Agu1; \
+    UINT32 Aka0, Ake0, Aki0, Ako0, Aku0; \
+    UINT32 Aka1, Ake1, Aki1, Ako1, Aku1; \
+    UINT32 Ama0, Ame0, Ami0, Amo0, Amu0; \
+    UINT32 Ama1, Ame1, Ami1, Amo1, Amu1; \
+    UINT32 Asa0, Ase0, Asi0, Aso0, Asu0; \
+    UINT32 Asa1, Ase1, Asi1, Aso1, Asu1; \
+    UINT32 Bba0, Bbe0, Bbi0, Bbo0, Bbu0; \
+    UINT32 Bba1, Bbe1, Bbi1, Bbo1, Bbu1; \
+    UINT32 Bga0, Bge0, Bgi0, Bgo0, Bgu0; \
+    UINT32 Bga1, Bge1, Bgi1, Bgo1, Bgu1; \
+    UINT32 Bka0, Bke0, Bki0, Bko0, Bku0; \
+    UINT32 Bka1, Bke1, Bki1, Bko1, Bku1; \
+    UINT32 Bma0, Bme0, Bmi0, Bmo0, Bmu0; \
+    UINT32 Bma1, Bme1, Bmi1, Bmo1, Bmu1; \
+    UINT32 Bsa0, Bse0, Bsi0, Bso0, Bsu0; \
+    UINT32 Bsa1, Bse1, Bsi1, Bso1, Bsu1; \
+    UINT32 Ca0, Ce0, Ci0, Co0, Cu0; \
+    UINT32 Ca1, Ce1, Ci1, Co1, Cu1; \
+    UINT32 Da0, De0, Di0, Do0, Du0; \
+    UINT32 Da1, De1, Di1, Do1, Du1; \
+    UINT32 Eba0, Ebe0, Ebi0, Ebo0, Ebu0; \
+    UINT32 Eba1, Ebe1, Ebi1, Ebo1, Ebu1; \
+    UINT32 Ega0, Ege0, Egi0, Ego0, Egu0; \
+    UINT32 Ega1, Ege1, Egi1, Ego1, Egu1; \
+    UINT32 Eka0, Eke0, Eki0, Eko0, Eku0; \
+    UINT32 Eka1, Eke1, Eki1, Eko1, Eku1; \
+    UINT32 Ema0, Eme0, Emi0, Emo0, Emu0; \
+    UINT32 Ema1, Eme1, Emi1, Emo1, Emu1; \
+    UINT32 Esa0, Ese0, Esi0, Eso0, Esu0; \
+    UINT32 Esa1, Ese1, Esi1, Eso1, Esu1;
+
+#define prepareTheta /* Sets each C<col><half> to the XOR of the five A words of that column and half (Ab.., Ag.., Ak.., Am.., As..). Expands to bare statements and needs the A* and C* variables in scope. */ \
+    Ca0 = Aba0^Aga0^Aka0^Ama0^Asa0; \
+    Ca1 = Aba1^Aga1^Aka1^Ama1^Asa1; \
+    Ce0 = Abe0^Age0^Ake0^Ame0^Ase0; \
+    Ce1 = Abe1^Age1^Ake1^Ame1^Ase1; \
+    Ci0 = Abi0^Agi0^Aki0^Ami0^Asi0; \
+    Ci1 = Abi1^Agi1^Aki1^Ami1^Asi1; \
+    Co0 = Abo0^Ago0^Ako0^Amo0^Aso0; \
+    Co1 = Abo1^Ago1^Ako1^Amo1^Aso1; \
+    Cu0 = Abu0^Agu0^Aku0^Amu0^Asu0; \
+    Cu1 = Abu1^Agu1^Aku1^Amu1^Asu1;
+
+#ifdef UseBebigokimisa
+// --- Code for one round reading the A words and writing the E words, with prepare-theta (lane complementing pattern 'bebigokimisa'):
+// --- factor 2 interleaving, 64-bit lanes mapped to 32-bit words. On entry C*0/C*1 must hold the column XORs of A; on exit they hold the column XORs of E. i indexes the two round-constant tables.
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    Da0 = Cu0^ROL32(Ce1, 1); \
+    Da1 = Cu1^Ce0; \
+    De0 = Ca0^ROL32(Ci1, 1); \
+    De1 = Ca1^Ci0; \
+    Di0 = Ce0^ROL32(Co1, 1); \
+    Di1 = Ce1^Co0; \
+    Do0 = Ci0^ROL32(Cu1, 1); \
+    Do1 = Ci1^Cu0; \
+    Du0 = Co0^ROL32(Ca1, 1); \
+    Du1 = Co1^Ca0; \
+    /* Ten groups follow: each XORs D* into five A words, rotates them into B*, writes five E words and folds each one into its C* accumulator. */ \
+    A##ba0 ^= Da0; \
+    Bba0 = A##ba0; \
+    A##ge0 ^= De0; \
+    Bbe0 = ROL32(A##ge0, 22); \
+    A##ki1 ^= Di1; \
+    Bbi0 = ROL32(A##ki1, 22); \
+    A##mo1 ^= Do1; \
+    Bbo0 = ROL32(A##mo1, 11); \
+    A##su0 ^= Du0; \
+    Bbu0 = ROL32(A##su0, 7); \
+    E##ba0 =   Bba0 ^(  Bbe0 |  Bbi0 ); \
+    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; /* round constant for the 0 half */ \
+    Ca0 = E##ba0; /* the first group (re)starts the C*0 accumulators */ \
+    E##be0 =   Bbe0 ^((~Bbi0)|  Bbo0 ); \
+    Ce0 = E##be0; \
+    E##bi0 =   Bbi0 ^(  Bbo0 &  Bbu0 ); \
+    Ci0 = E##bi0; \
+    E##bo0 =   Bbo0 ^(  Bbu0 |  Bba0 ); \
+    Co0 = E##bo0; \
+    E##bu0 =   Bbu0 ^(  Bba0 &  Bbe0 ); \
+    Cu0 = E##bu0; \
+\
+    A##ba1 ^= Da1; \
+    Bba1 = A##ba1; \
+    A##ge1 ^= De1; \
+    Bbe1 = ROL32(A##ge1, 22); \
+    A##ki0 ^= Di0; \
+    Bbi1 = ROL32(A##ki0, 21); \
+    A##mo0 ^= Do0; \
+    Bbo1 = ROL32(A##mo0, 10); \
+    A##su1 ^= Du1; \
+    Bbu1 = ROL32(A##su1, 7); \
+    E##ba1 =   Bba1 ^(  Bbe1 |  Bbi1 ); \
+    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; /* round constant for the 1 half */ \
+    Ca1 = E##ba1; /* the second group (re)starts the C*1 accumulators */ \
+    E##be1 =   Bbe1 ^((~Bbi1)|  Bbo1 ); \
+    Ce1 = E##be1; \
+    E##bi1 =   Bbi1 ^(  Bbo1 &  Bbu1 ); \
+    Ci1 = E##bi1; \
+    E##bo1 =   Bbo1 ^(  Bbu1 |  Bba1 ); \
+    Co1 = E##bo1; \
+    E##bu1 =   Bbu1 ^(  Bba1 &  Bbe1 ); \
+    Cu1 = E##bu1; \
+\
+    A##bo0 ^= Do0; \
+    Bga0 = ROL32(A##bo0, 14); \
+    A##gu0 ^= Du0; \
+    Bge0 = ROL32(A##gu0, 10); \
+    A##ka1 ^= Da1; \
+    Bgi0 = ROL32(A##ka1, 2); \
+    A##me1 ^= De1; \
+    Bgo0 = ROL32(A##me1, 23); \
+    A##si1 ^= Di1; \
+    Bgu0 = ROL32(A##si1, 31); \
+    E##ga0 =   Bga0 ^(  Bge0 |  Bgi0 ); \
+    Ca0 ^= E##ga0; \
+    E##ge0 =   Bge0 ^(  Bgi0 &  Bgo0 ); \
+    Ce0 ^= E##ge0; \
+    E##gi0 =   Bgi0 ^(  Bgo0 |(~Bgu0)); \
+    Ci0 ^= E##gi0; \
+    E##go0 =   Bgo0 ^(  Bgu0 |  Bga0 ); \
+    Co0 ^= E##go0; \
+    E##gu0 =   Bgu0 ^(  Bga0 &  Bge0 ); \
+    Cu0 ^= E##gu0; \
+\
+    A##bo1 ^= Do1; \
+    Bga1 = ROL32(A##bo1, 14); \
+    A##gu1 ^= Du1; \
+    Bge1 = ROL32(A##gu1, 10); \
+    A##ka0 ^= Da0; \
+    Bgi1 = ROL32(A##ka0, 1); \
+    A##me0 ^= De0; \
+    Bgo1 = ROL32(A##me0, 22); \
+    A##si0 ^= Di0; \
+    Bgu1 = ROL32(A##si0, 30); \
+    E##ga1 =   Bga1 ^(  Bge1 |  Bgi1 ); \
+    Ca1 ^= E##ga1; \
+    E##ge1 =   Bge1 ^(  Bgi1 &  Bgo1 ); \
+    Ce1 ^= E##ge1; \
+    E##gi1 =   Bgi1 ^(  Bgo1 |(~Bgu1)); \
+    Ci1 ^= E##gi1; \
+    E##go1 =   Bgo1 ^(  Bgu1 |  Bga1 ); \
+    Co1 ^= E##go1; \
+    E##gu1 =   Bgu1 ^(  Bga1 &  Bge1 ); \
+    Cu1 ^= E##gu1; \
+\
+    A##be1 ^= De1; \
+    Bka0 = ROL32(A##be1, 1); \
+    A##gi0 ^= Di0; \
+    Bke0 = ROL32(A##gi0, 3); \
+    A##ko1 ^= Do1; \
+    Bki0 = ROL32(A##ko1, 13); \
+    A##mu0 ^= Du0; \
+    Bko0 = ROL32(A##mu0, 4); \
+    A##sa0 ^= Da0; \
+    Bku0 = ROL32(A##sa0, 9); \
+    E##ka0 =   Bka0 ^(  Bke0 |  Bki0 ); \
+    Ca0 ^= E##ka0; \
+    E##ke0 =   Bke0 ^(  Bki0 &  Bko0 ); \
+    Ce0 ^= E##ke0; \
+    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
+    Ci0 ^= E##ki0; \
+    E##ko0 = (~Bko0)^(  Bku0 |  Bka0 ); \
+    Co0 ^= E##ko0; \
+    E##ku0 =   Bku0 ^(  Bka0 &  Bke0 ); \
+    Cu0 ^= E##ku0; \
+\
+    A##be0 ^= De0; \
+    Bka1 = A##be0; \
+    A##gi1 ^= Di1; \
+    Bke1 = ROL32(A##gi1, 3); \
+    A##ko0 ^= Do0; \
+    Bki1 = ROL32(A##ko0, 12); \
+    A##mu1 ^= Du1; \
+    Bko1 = ROL32(A##mu1, 4); \
+    A##sa1 ^= Da1; \
+    Bku1 = ROL32(A##sa1, 9); \
+    E##ka1 =   Bka1 ^(  Bke1 |  Bki1 ); \
+    Ca1 ^= E##ka1; \
+    E##ke1 =   Bke1 ^(  Bki1 &  Bko1 ); \
+    Ce1 ^= E##ke1; \
+    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
+    Ci1 ^= E##ki1; \
+    E##ko1 = (~Bko1)^(  Bku1 |  Bka1 ); \
+    Co1 ^= E##ko1; \
+    E##ku1 =   Bku1 ^(  Bka1 &  Bke1 ); \
+    Cu1 ^= E##ku1; \
+\
+    A##bu1 ^= Du1; \
+    Bma0 = ROL32(A##bu1, 14); \
+    A##ga0 ^= Da0; \
+    Bme0 = ROL32(A##ga0, 18); \
+    A##ke0 ^= De0; \
+    Bmi0 = ROL32(A##ke0, 5); \
+    A##mi1 ^= Di1; \
+    Bmo0 = ROL32(A##mi1, 8); \
+    A##so0 ^= Do0; \
+    Bmu0 = ROL32(A##so0, 28); \
+    E##ma0 =   Bma0 ^(  Bme0 &  Bmi0 ); \
+    Ca0 ^= E##ma0; \
+    E##me0 =   Bme0 ^(  Bmi0 |  Bmo0 ); \
+    Ce0 ^= E##me0; \
+    E##mi0 =   Bmi0 ^((~Bmo0)|  Bmu0 ); \
+    Ci0 ^= E##mi0; \
+    E##mo0 = (~Bmo0)^(  Bmu0 &  Bma0 ); \
+    Co0 ^= E##mo0; \
+    E##mu0 =   Bmu0 ^(  Bma0 |  Bme0 ); \
+    Cu0 ^= E##mu0; \
+\
+    A##bu0 ^= Du0; \
+    Bma1 = ROL32(A##bu0, 13); \
+    A##ga1 ^= Da1; \
+    Bme1 = ROL32(A##ga1, 18); \
+    A##ke1 ^= De1; \
+    Bmi1 = ROL32(A##ke1, 5); \
+    A##mi0 ^= Di0; \
+    Bmo1 = ROL32(A##mi0, 7); \
+    A##so1 ^= Do1; \
+    Bmu1 = ROL32(A##so1, 28); \
+    E##ma1 =   Bma1 ^(  Bme1 &  Bmi1 ); \
+    Ca1 ^= E##ma1; \
+    E##me1 =   Bme1 ^(  Bmi1 |  Bmo1 ); \
+    Ce1 ^= E##me1; \
+    E##mi1 =   Bmi1 ^((~Bmo1)|  Bmu1 ); \
+    Ci1 ^= E##mi1; \
+    E##mo1 = (~Bmo1)^(  Bmu1 &  Bma1 ); \
+    Co1 ^= E##mo1; \
+    E##mu1 =   Bmu1 ^(  Bma1 |  Bme1 ); \
+    Cu1 ^= E##mu1; \
+\
+    A##bi0 ^= Di0; \
+    Bsa0 = ROL32(A##bi0, 31); \
+    A##go1 ^= Do1; \
+    Bse0 = ROL32(A##go1, 28); \
+    A##ku1 ^= Du1; \
+    Bsi0 = ROL32(A##ku1, 20); \
+    A##ma1 ^= Da1; \
+    Bso0 = ROL32(A##ma1, 21); \
+    A##se0 ^= De0; \
+    Bsu0 = ROL32(A##se0, 1); \
+    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
+    Ca0 ^= E##sa0; \
+    E##se0 = (~Bse0)^(  Bsi0 |  Bso0 ); \
+    Ce0 ^= E##se0; \
+    E##si0 =   Bsi0 ^(  Bso0 &  Bsu0 ); \
+    Ci0 ^= E##si0; \
+    E##so0 =   Bso0 ^(  Bsu0 |  Bsa0 ); \
+    Co0 ^= E##so0; \
+    E##su0 =   Bsu0 ^(  Bsa0 &  Bse0 ); \
+    Cu0 ^= E##su0; \
+\
+    A##bi1 ^= Di1; \
+    Bsa1 = ROL32(A##bi1, 31); \
+    A##go0 ^= Do0; \
+    Bse1 = ROL32(A##go0, 27); \
+    A##ku0 ^= Du0; \
+    Bsi1 = ROL32(A##ku0, 19); \
+    A##ma0 ^= Da0; \
+    Bso1 = ROL32(A##ma0, 20); \
+    A##se1 ^= De1; \
+    Bsu1 = ROL32(A##se1, 1); \
+    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
+    Ca1 ^= E##sa1; \
+    E##se1 = (~Bse1)^(  Bsi1 |  Bso1 ); \
+    Ce1 ^= E##se1; \
+    E##si1 =   Bsi1 ^(  Bso1 &  Bsu1 ); \
+    Ci1 ^= E##si1; \
+    E##so1 =   Bso1 ^(  Bsu1 |  Bsa1 ); \
+    Co1 ^= E##so1; \
+    E##su1 =   Bsu1 ^(  Bsa1 &  Bse1 ); \
+    Cu1 ^= E##su1; \
+\
+
+// --- Code for round (lane complementing pattern 'bebigokimisa'); i indexes the round-constant tables, the A-prefixed words are XORed with D in place, the E-prefixed words receive the output, the C words are only read
+// --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words (each lane name appears twice, with suffix 0 and with suffix 1)
+#define thetaRhoPiChiIota(i, A, E) \
+    Da0 = Cu0^ROL32(Ce1, 1); /* D words: each XORs two C words; Ca0..Cu1 are read here and never assigned in this macro */ \
+    Da1 = Cu1^Ce0; \
+    De0 = Ca0^ROL32(Ci1, 1); \
+    De1 = Ca1^Ci0; \
+    Di0 = Ce0^ROL32(Co1, 1); \
+    Di1 = Ce1^Co0; \
+    Do0 = Ci0^ROL32(Cu1, 1); \
+    Do1 = Ci1^Cu0; \
+    Du0 = Co0^ROL32(Ca1, 1); \
+    Du1 = Co1^Ca0; \
+\
+    A##ba0 ^= Da0; /* outputs E##ba0..E##bu0: XOR D into five A words, rotate them into B, then combine the B words */ \
+    Bba0 = A##ba0; /* taken as is, no rotation */ \
+    A##ge0 ^= De0; \
+    Bbe0 = ROL32(A##ge0, 22); \
+    A##ki1 ^= Di1; \
+    Bbi0 = ROL32(A##ki1, 22); \
+    A##mo1 ^= Do1; \
+    Bbo0 = ROL32(A##mo1, 11); \
+    A##su0 ^= Du0; \
+    Bbu0 = ROL32(A##su0, 7); \
+    E##ba0 =   Bba0 ^(  Bbe0 |  Bbi0 ); /* each E word is one B word XOR a two-input and/or term; where ~ appears differs from word to word in this variant */ \
+    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; /* round constant for round i; only E##ba0 gets it in this half */ \
+    E##be0 =   Bbe0 ^((~Bbi0)|  Bbo0 ); \
+    E##bi0 =   Bbi0 ^(  Bbo0 &  Bbu0 ); \
+    E##bo0 =   Bbo0 ^(  Bbu0 |  Bba0 ); \
+    E##bu0 =   Bbu0 ^(  Bba0 &  Bbe0 ); \
+\
+    A##ba1 ^= Da1; /* outputs E##ba1..E##bu1 */ \
+    Bba1 = A##ba1; /* taken as is, no rotation */ \
+    A##ge1 ^= De1; \
+    Bbe1 = ROL32(A##ge1, 22); \
+    A##ki0 ^= Di0; \
+    Bbi1 = ROL32(A##ki0, 21); \
+    A##mo0 ^= Do0; \
+    Bbo1 = ROL32(A##mo0, 10); \
+    A##su1 ^= Du1; \
+    Bbu1 = ROL32(A##su1, 7); \
+    E##ba1 =   Bba1 ^(  Bbe1 |  Bbi1 ); \
+    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; /* round constant for round i; only E##ba1 gets it in this half */ \
+    E##be1 =   Bbe1 ^((~Bbi1)|  Bbo1 ); \
+    E##bi1 =   Bbi1 ^(  Bbo1 &  Bbu1 ); \
+    E##bo1 =   Bbo1 ^(  Bbu1 |  Bba1 ); \
+    E##bu1 =   Bbu1 ^(  Bba1 &  Bbe1 ); \
+\
+    A##bo0 ^= Do0; /* outputs E##ga0..E##gu0 */ \
+    Bga0 = ROL32(A##bo0, 14); \
+    A##gu0 ^= Du0; \
+    Bge0 = ROL32(A##gu0, 10); \
+    A##ka1 ^= Da1; \
+    Bgi0 = ROL32(A##ka1, 2); \
+    A##me1 ^= De1; \
+    Bgo0 = ROL32(A##me1, 23); \
+    A##si1 ^= Di1; \
+    Bgu0 = ROL32(A##si1, 31); \
+    E##ga0 =   Bga0 ^(  Bge0 |  Bgi0 ); \
+    E##ge0 =   Bge0 ^(  Bgi0 &  Bgo0 ); \
+    E##gi0 =   Bgi0 ^(  Bgo0 |(~Bgu0)); \
+    E##go0 =   Bgo0 ^(  Bgu0 |  Bga0 ); \
+    E##gu0 =   Bgu0 ^(  Bga0 &  Bge0 ); \
+\
+    A##bo1 ^= Do1; /* outputs E##ga1..E##gu1 */ \
+    Bga1 = ROL32(A##bo1, 14); \
+    A##gu1 ^= Du1; \
+    Bge1 = ROL32(A##gu1, 10); \
+    A##ka0 ^= Da0; \
+    Bgi1 = ROL32(A##ka0, 1); \
+    A##me0 ^= De0; \
+    Bgo1 = ROL32(A##me0, 22); \
+    A##si0 ^= Di0; \
+    Bgu1 = ROL32(A##si0, 30); \
+    E##ga1 =   Bga1 ^(  Bge1 |  Bgi1 ); \
+    E##ge1 =   Bge1 ^(  Bgi1 &  Bgo1 ); \
+    E##gi1 =   Bgi1 ^(  Bgo1 |(~Bgu1)); \
+    E##go1 =   Bgo1 ^(  Bgu1 |  Bga1 ); \
+    E##gu1 =   Bgu1 ^(  Bga1 &  Bge1 ); \
+\
+    A##be1 ^= De1; /* outputs E##ka0..E##ku0 */ \
+    Bka0 = ROL32(A##be1, 1); \
+    A##gi0 ^= Di0; \
+    Bke0 = ROL32(A##gi0, 3); \
+    A##ko1 ^= Do1; \
+    Bki0 = ROL32(A##ko1, 13); \
+    A##mu0 ^= Du0; \
+    Bko0 = ROL32(A##mu0, 4); \
+    A##sa0 ^= Da0; \
+    Bku0 = ROL32(A##sa0, 9); \
+    E##ka0 =   Bka0 ^(  Bke0 |  Bki0 ); \
+    E##ke0 =   Bke0 ^(  Bki0 &  Bko0 ); \
+    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
+    E##ko0 = (~Bko0)^(  Bku0 |  Bka0 ); \
+    E##ku0 =   Bku0 ^(  Bka0 &  Bke0 ); \
+\
+    A##be0 ^= De0; /* outputs E##ka1..E##ku1 */ \
+    Bka1 = A##be0; /* taken as is, no rotation */ \
+    A##gi1 ^= Di1; \
+    Bke1 = ROL32(A##gi1, 3); \
+    A##ko0 ^= Do0; \
+    Bki1 = ROL32(A##ko0, 12); \
+    A##mu1 ^= Du1; \
+    Bko1 = ROL32(A##mu1, 4); \
+    A##sa1 ^= Da1; \
+    Bku1 = ROL32(A##sa1, 9); \
+    E##ka1 =   Bka1 ^(  Bke1 |  Bki1 ); \
+    E##ke1 =   Bke1 ^(  Bki1 &  Bko1 ); \
+    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
+    E##ko1 = (~Bko1)^(  Bku1 |  Bka1 ); \
+    E##ku1 =   Bku1 ^(  Bka1 &  Bke1 ); \
+\
+    A##bu1 ^= Du1; /* outputs E##ma0..E##mu0 */ \
+    Bma0 = ROL32(A##bu1, 14); \
+    A##ga0 ^= Da0; \
+    Bme0 = ROL32(A##ga0, 18); \
+    A##ke0 ^= De0; \
+    Bmi0 = ROL32(A##ke0, 5); \
+    A##mi1 ^= Di1; \
+    Bmo0 = ROL32(A##mi1, 8); \
+    A##so0 ^= Do0; \
+    Bmu0 = ROL32(A##so0, 28); \
+    E##ma0 =   Bma0 ^(  Bme0 &  Bmi0 ); \
+    E##me0 =   Bme0 ^(  Bmi0 |  Bmo0 ); \
+    E##mi0 =   Bmi0 ^((~Bmo0)|  Bmu0 ); \
+    E##mo0 = (~Bmo0)^(  Bmu0 &  Bma0 ); \
+    E##mu0 =   Bmu0 ^(  Bma0 |  Bme0 ); \
+\
+    A##bu0 ^= Du0; /* outputs E##ma1..E##mu1 */ \
+    Bma1 = ROL32(A##bu0, 13); \
+    A##ga1 ^= Da1; \
+    Bme1 = ROL32(A##ga1, 18); \
+    A##ke1 ^= De1; \
+    Bmi1 = ROL32(A##ke1, 5); \
+    A##mi0 ^= Di0; \
+    Bmo1 = ROL32(A##mi0, 7); \
+    A##so1 ^= Do1; \
+    Bmu1 = ROL32(A##so1, 28); \
+    E##ma1 =   Bma1 ^(  Bme1 &  Bmi1 ); \
+    E##me1 =   Bme1 ^(  Bmi1 |  Bmo1 ); \
+    E##mi1 =   Bmi1 ^((~Bmo1)|  Bmu1 ); \
+    E##mo1 = (~Bmo1)^(  Bmu1 &  Bma1 ); \
+    E##mu1 =   Bmu1 ^(  Bma1 |  Bme1 ); \
+\
+    A##bi0 ^= Di0; /* outputs E##sa0..E##su0 */ \
+    Bsa0 = ROL32(A##bi0, 31); \
+    A##go1 ^= Do1; \
+    Bse0 = ROL32(A##go1, 28); \
+    A##ku1 ^= Du1; \
+    Bsi0 = ROL32(A##ku1, 20); \
+    A##ma1 ^= Da1; \
+    Bso0 = ROL32(A##ma1, 21); \
+    A##se0 ^= De0; \
+    Bsu0 = ROL32(A##se0, 1); \
+    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
+    E##se0 = (~Bse0)^(  Bsi0 |  Bso0 ); \
+    E##si0 =   Bsi0 ^(  Bso0 &  Bsu0 ); \
+    E##so0 =   Bso0 ^(  Bsu0 |  Bsa0 ); \
+    E##su0 =   Bsu0 ^(  Bsa0 &  Bse0 ); \
+\
+    A##bi1 ^= Di1; /* outputs E##sa1..E##su1 */ \
+    Bsa1 = ROL32(A##bi1, 31); \
+    A##go0 ^= Do0; \
+    Bse1 = ROL32(A##go0, 27); \
+    A##ku0 ^= Du0; \
+    Bsi1 = ROL32(A##ku0, 19); \
+    A##ma0 ^= Da0; \
+    Bso1 = ROL32(A##ma0, 20); \
+    A##se1 ^= De1; \
+    Bsu1 = ROL32(A##se1, 1); \
+    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
+    E##se1 = (~Bse1)^(  Bsi1 |  Bso1 ); \
+    E##si1 =   Bsi1 ^(  Bso1 &  Bsu1 ); \
+    E##so1 =   Bso1 ^(  Bsu1 |  Bsa1 ); \
+    E##su1 =   Bsu1 ^(  Bsa1 &  Bse1 ); \
+\
+
+#else // UseBebigokimisa
+// --- Code for round, with prepare-theta: besides writing the E-prefixed words it rebuilds Ca0..Cu1, each as the XOR of five E words (e.g. Ca0 = E##ba0^E##ga0^E##ka0^E##ma0^E##sa0)
+// --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words; i indexes the round-constant tables, the A-prefixed words are XORed with D in place
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    Da0 = Cu0^ROL32(Ce1, 1); /* D words: each XORs two C words; all ten are computed from the incoming C values before any C word is overwritten below */ \
+    Da1 = Cu1^Ce0; \
+    De0 = Ca0^ROL32(Ci1, 1); \
+    De1 = Ca1^Ci0; \
+    Di0 = Ce0^ROL32(Co1, 1); \
+    Di1 = Ce1^Co0; \
+    Do0 = Ci0^ROL32(Cu1, 1); \
+    Do1 = Ci1^Cu0; \
+    Du0 = Co0^ROL32(Ca1, 1); \
+    Du1 = Co1^Ca0; \
+\
+    A##ba0 ^= Da0; /* outputs E##ba0..E##bu0: XOR D into five A words, rotate them into B, then combine the B words */ \
+    Bba0 = A##ba0; /* taken as is, no rotation */ \
+    A##ge0 ^= De0; \
+    Bbe0 = ROL32(A##ge0, 22); \
+    A##ki1 ^= Di1; \
+    Bbi0 = ROL32(A##ki1, 22); \
+    A##mo1 ^= Do1; \
+    Bbo0 = ROL32(A##mo1, 11); \
+    A##su0 ^= Du0; \
+    Bbu0 = ROL32(A##su0, 7); \
+    E##ba0 =   Bba0 ^((~Bbe0)&  Bbi0 ); /* every E word in this variant has the same form: B ^ ((~next B) & B after that), cycling through the group's five B words */ \
+    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; /* round constant for round i; only E##ba0 gets it in this half */ \
+    Ca0 = E##ba0; /* the first group (re)initialises the suffix-0 C words by plain assignment */ \
+    E##be0 =   Bbe0 ^((~Bbi0)&  Bbo0 ); \
+    Ce0 = E##be0; \
+    E##bi0 =   Bbi0 ^((~Bbo0)&  Bbu0 ); \
+    Ci0 = E##bi0; \
+    E##bo0 =   Bbo0 ^((~Bbu0)&  Bba0 ); \
+    Co0 = E##bo0; \
+    E##bu0 =   Bbu0 ^((~Bba0)&  Bbe0 ); \
+    Cu0 = E##bu0; \
+\
+    A##ba1 ^= Da1; /* outputs E##ba1..E##bu1 */ \
+    Bba1 = A##ba1; /* taken as is, no rotation */ \
+    A##ge1 ^= De1; \
+    Bbe1 = ROL32(A##ge1, 22); \
+    A##ki0 ^= Di0; \
+    Bbi1 = ROL32(A##ki0, 21); \
+    A##mo0 ^= Do0; \
+    Bbo1 = ROL32(A##mo0, 10); \
+    A##su1 ^= Du1; \
+    Bbu1 = ROL32(A##su1, 7); \
+    E##ba1 =   Bba1 ^((~Bbe1)&  Bbi1 ); \
+    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; /* round constant for round i; only E##ba1 gets it in this half */ \
+    Ca1 = E##ba1; /* same (re)initialisation for the suffix-1 C words */ \
+    E##be1 =   Bbe1 ^((~Bbi1)&  Bbo1 ); \
+    Ce1 = E##be1; \
+    E##bi1 =   Bbi1 ^((~Bbo1)&  Bbu1 ); \
+    Ci1 = E##bi1; \
+    E##bo1 =   Bbo1 ^((~Bbu1)&  Bba1 ); \
+    Co1 = E##bo1; \
+    E##bu1 =   Bbu1 ^((~Bba1)&  Bbe1 ); \
+    Cu1 = E##bu1; \
+\
+    A##bo0 ^= Do0; /* outputs E##ga0..E##gu0 */ \
+    Bga0 = ROL32(A##bo0, 14); \
+    A##gu0 ^= Du0; \
+    Bge0 = ROL32(A##gu0, 10); \
+    A##ka1 ^= Da1; \
+    Bgi0 = ROL32(A##ka1, 2); \
+    A##me1 ^= De1; \
+    Bgo0 = ROL32(A##me1, 23); \
+    A##si1 ^= Di1; \
+    Bgu0 = ROL32(A##si1, 31); \
+    E##ga0 =   Bga0 ^((~Bge0)&  Bgi0 ); \
+    Ca0 ^= E##ga0; /* this and all later groups XOR their E words into the C words */ \
+    E##ge0 =   Bge0 ^((~Bgi0)&  Bgo0 ); \
+    Ce0 ^= E##ge0; \
+    E##gi0 =   Bgi0 ^((~Bgo0)&  Bgu0 ); \
+    Ci0 ^= E##gi0; \
+    E##go0 =   Bgo0 ^((~Bgu0)&  Bga0 ); \
+    Co0 ^= E##go0; \
+    E##gu0 =   Bgu0 ^((~Bga0)&  Bge0 ); \
+    Cu0 ^= E##gu0; \
+\
+    A##bo1 ^= Do1; /* outputs E##ga1..E##gu1 */ \
+    Bga1 = ROL32(A##bo1, 14); \
+    A##gu1 ^= Du1; \
+    Bge1 = ROL32(A##gu1, 10); \
+    A##ka0 ^= Da0; \
+    Bgi1 = ROL32(A##ka0, 1); \
+    A##me0 ^= De0; \
+    Bgo1 = ROL32(A##me0, 22); \
+    A##si0 ^= Di0; \
+    Bgu1 = ROL32(A##si0, 30); \
+    E##ga1 =   Bga1 ^((~Bge1)&  Bgi1 ); \
+    Ca1 ^= E##ga1; \
+    E##ge1 =   Bge1 ^((~Bgi1)&  Bgo1 ); \
+    Ce1 ^= E##ge1; \
+    E##gi1 =   Bgi1 ^((~Bgo1)&  Bgu1 ); \
+    Ci1 ^= E##gi1; \
+    E##go1 =   Bgo1 ^((~Bgu1)&  Bga1 ); \
+    Co1 ^= E##go1; \
+    E##gu1 =   Bgu1 ^((~Bga1)&  Bge1 ); \
+    Cu1 ^= E##gu1; \
+\
+    A##be1 ^= De1; /* outputs E##ka0..E##ku0 */ \
+    Bka0 = ROL32(A##be1, 1); \
+    A##gi0 ^= Di0; \
+    Bke0 = ROL32(A##gi0, 3); \
+    A##ko1 ^= Do1; \
+    Bki0 = ROL32(A##ko1, 13); \
+    A##mu0 ^= Du0; \
+    Bko0 = ROL32(A##mu0, 4); \
+    A##sa0 ^= Da0; \
+    Bku0 = ROL32(A##sa0, 9); \
+    E##ka0 =   Bka0 ^((~Bke0)&  Bki0 ); \
+    Ca0 ^= E##ka0; \
+    E##ke0 =   Bke0 ^((~Bki0)&  Bko0 ); \
+    Ce0 ^= E##ke0; \
+    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
+    Ci0 ^= E##ki0; \
+    E##ko0 =   Bko0 ^((~Bku0)&  Bka0 ); \
+    Co0 ^= E##ko0; \
+    E##ku0 =   Bku0 ^((~Bka0)&  Bke0 ); \
+    Cu0 ^= E##ku0; \
+\
+    A##be0 ^= De0; /* outputs E##ka1..E##ku1 */ \
+    Bka1 = A##be0; /* taken as is, no rotation */ \
+    A##gi1 ^= Di1; \
+    Bke1 = ROL32(A##gi1, 3); \
+    A##ko0 ^= Do0; \
+    Bki1 = ROL32(A##ko0, 12); \
+    A##mu1 ^= Du1; \
+    Bko1 = ROL32(A##mu1, 4); \
+    A##sa1 ^= Da1; \
+    Bku1 = ROL32(A##sa1, 9); \
+    E##ka1 =   Bka1 ^((~Bke1)&  Bki1 ); \
+    Ca1 ^= E##ka1; \
+    E##ke1 =   Bke1 ^((~Bki1)&  Bko1 ); \
+    Ce1 ^= E##ke1; \
+    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
+    Ci1 ^= E##ki1; \
+    E##ko1 =   Bko1 ^((~Bku1)&  Bka1 ); \
+    Co1 ^= E##ko1; \
+    E##ku1 =   Bku1 ^((~Bka1)&  Bke1 ); \
+    Cu1 ^= E##ku1; \
+\
+    A##bu1 ^= Du1; /* outputs E##ma0..E##mu0 */ \
+    Bma0 = ROL32(A##bu1, 14); \
+    A##ga0 ^= Da0; \
+    Bme0 = ROL32(A##ga0, 18); \
+    A##ke0 ^= De0; \
+    Bmi0 = ROL32(A##ke0, 5); \
+    A##mi1 ^= Di1; \
+    Bmo0 = ROL32(A##mi1, 8); \
+    A##so0 ^= Do0; \
+    Bmu0 = ROL32(A##so0, 28); \
+    E##ma0 =   Bma0 ^((~Bme0)&  Bmi0 ); \
+    Ca0 ^= E##ma0; \
+    E##me0 =   Bme0 ^((~Bmi0)&  Bmo0 ); \
+    Ce0 ^= E##me0; \
+    E##mi0 =   Bmi0 ^((~Bmo0)&  Bmu0 ); \
+    Ci0 ^= E##mi0; \
+    E##mo0 =   Bmo0 ^((~Bmu0)&  Bma0 ); \
+    Co0 ^= E##mo0; \
+    E##mu0 =   Bmu0 ^((~Bma0)&  Bme0 ); \
+    Cu0 ^= E##mu0; \
+\
+    A##bu0 ^= Du0; /* outputs E##ma1..E##mu1 */ \
+    Bma1 = ROL32(A##bu0, 13); \
+    A##ga1 ^= Da1; \
+    Bme1 = ROL32(A##ga1, 18); \
+    A##ke1 ^= De1; \
+    Bmi1 = ROL32(A##ke1, 5); \
+    A##mi0 ^= Di0; \
+    Bmo1 = ROL32(A##mi0, 7); \
+    A##so1 ^= Do1; \
+    Bmu1 = ROL32(A##so1, 28); \
+    E##ma1 =   Bma1 ^((~Bme1)&  Bmi1 ); \
+    Ca1 ^= E##ma1; \
+    E##me1 =   Bme1 ^((~Bmi1)&  Bmo1 ); \
+    Ce1 ^= E##me1; \
+    E##mi1 =   Bmi1 ^((~Bmo1)&  Bmu1 ); \
+    Ci1 ^= E##mi1; \
+    E##mo1 =   Bmo1 ^((~Bmu1)&  Bma1 ); \
+    Co1 ^= E##mo1; \
+    E##mu1 =   Bmu1 ^((~Bma1)&  Bme1 ); \
+    Cu1 ^= E##mu1; \
+\
+    A##bi0 ^= Di0; /* outputs E##sa0..E##su0 */ \
+    Bsa0 = ROL32(A##bi0, 31); \
+    A##go1 ^= Do1; \
+    Bse0 = ROL32(A##go1, 28); \
+    A##ku1 ^= Du1; \
+    Bsi0 = ROL32(A##ku1, 20); \
+    A##ma1 ^= Da1; \
+    Bso0 = ROL32(A##ma1, 21); \
+    A##se0 ^= De0; \
+    Bsu0 = ROL32(A##se0, 1); \
+    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
+    Ca0 ^= E##sa0; \
+    E##se0 =   Bse0 ^((~Bsi0)&  Bso0 ); \
+    Ce0 ^= E##se0; \
+    E##si0 =   Bsi0 ^((~Bso0)&  Bsu0 ); \
+    Ci0 ^= E##si0; \
+    E##so0 =   Bso0 ^((~Bsu0)&  Bsa0 ); \
+    Co0 ^= E##so0; \
+    E##su0 =   Bsu0 ^((~Bsa0)&  Bse0 ); \
+    Cu0 ^= E##su0; \
+\
+    A##bi1 ^= Di1; /* outputs E##sa1..E##su1 */ \
+    Bsa1 = ROL32(A##bi1, 31); \
+    A##go0 ^= Do0; \
+    Bse1 = ROL32(A##go0, 27); \
+    A##ku0 ^= Du0; \
+    Bsi1 = ROL32(A##ku0, 19); \
+    A##ma0 ^= Da0; \
+    Bso1 = ROL32(A##ma0, 20); \
+    A##se1 ^= De1; \
+    Bsu1 = ROL32(A##se1, 1); \
+    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
+    Ca1 ^= E##sa1; \
+    E##se1 =   Bse1 ^((~Bsi1)&  Bso1 ); \
+    Ce1 ^= E##se1; \
+    E##si1 =   Bsi1 ^((~Bso1)&  Bsu1 ); \
+    Ci1 ^= E##si1; \
+    E##so1 =   Bso1 ^((~Bsu1)&  Bsa1 ); \
+    Co1 ^= E##so1; \
+    E##su1 =   Bsu1 ^((~Bsa1)&  Bse1 ); \
+    Cu1 ^= E##su1; \
+\
+
+// --- Code for round without C update: i indexes the round-constant tables, the A-prefixed words are XORed with D in place, the E-prefixed words receive the output, the C words are only read
+// --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words (each lane name appears twice, with suffix 0 and with suffix 1)
+#define thetaRhoPiChiIota(i, A, E) \
+    Da0 = Cu0^ROL32(Ce1, 1); /* D words: each XORs two C words; Ca0..Cu1 are read here and never assigned in this macro */ \
+    Da1 = Cu1^Ce0; \
+    De0 = Ca0^ROL32(Ci1, 1); \
+    De1 = Ca1^Ci0; \
+    Di0 = Ce0^ROL32(Co1, 1); \
+    Di1 = Ce1^Co0; \
+    Do0 = Ci0^ROL32(Cu1, 1); \
+    Do1 = Ci1^Cu0; \
+    Du0 = Co0^ROL32(Ca1, 1); \
+    Du1 = Co1^Ca0; \
+\
+    A##ba0 ^= Da0; /* outputs E##ba0..E##bu0: XOR D into five A words, rotate them into B, then combine the B words */ \
+    Bba0 = A##ba0; /* taken as is, no rotation */ \
+    A##ge0 ^= De0; \
+    Bbe0 = ROL32(A##ge0, 22); \
+    A##ki1 ^= Di1; \
+    Bbi0 = ROL32(A##ki1, 22); \
+    A##mo1 ^= Do1; \
+    Bbo0 = ROL32(A##mo1, 11); \
+    A##su0 ^= Du0; \
+    Bbu0 = ROL32(A##su0, 7); \
+    E##ba0 =   Bba0 ^((~Bbe0)&  Bbi0 ); /* every E word in this variant has the same form: B ^ ((~next B) & B after that), cycling through the group's five B words */ \
+    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; /* round constant for round i; only E##ba0 gets it in this half */ \
+    E##be0 =   Bbe0 ^((~Bbi0)&  Bbo0 ); \
+    E##bi0 =   Bbi0 ^((~Bbo0)&  Bbu0 ); \
+    E##bo0 =   Bbo0 ^((~Bbu0)&  Bba0 ); \
+    E##bu0 =   Bbu0 ^((~Bba0)&  Bbe0 ); \
+\
+    A##ba1 ^= Da1; /* outputs E##ba1..E##bu1 */ \
+    Bba1 = A##ba1; /* taken as is, no rotation */ \
+    A##ge1 ^= De1; \
+    Bbe1 = ROL32(A##ge1, 22); \
+    A##ki0 ^= Di0; \
+    Bbi1 = ROL32(A##ki0, 21); \
+    A##mo0 ^= Do0; \
+    Bbo1 = ROL32(A##mo0, 10); \
+    A##su1 ^= Du1; \
+    Bbu1 = ROL32(A##su1, 7); \
+    E##ba1 =   Bba1 ^((~Bbe1)&  Bbi1 ); \
+    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; /* round constant for round i; only E##ba1 gets it in this half */ \
+    E##be1 =   Bbe1 ^((~Bbi1)&  Bbo1 ); \
+    E##bi1 =   Bbi1 ^((~Bbo1)&  Bbu1 ); \
+    E##bo1 =   Bbo1 ^((~Bbu1)&  Bba1 ); \
+    E##bu1 =   Bbu1 ^((~Bba1)&  Bbe1 ); \
+\
+    A##bo0 ^= Do0; /* outputs E##ga0..E##gu0 */ \
+    Bga0 = ROL32(A##bo0, 14); \
+    A##gu0 ^= Du0; \
+    Bge0 = ROL32(A##gu0, 10); \
+    A##ka1 ^= Da1; \
+    Bgi0 = ROL32(A##ka1, 2); \
+    A##me1 ^= De1; \
+    Bgo0 = ROL32(A##me1, 23); \
+    A##si1 ^= Di1; \
+    Bgu0 = ROL32(A##si1, 31); \
+    E##ga0 =   Bga0 ^((~Bge0)&  Bgi0 ); \
+    E##ge0 =   Bge0 ^((~Bgi0)&  Bgo0 ); \
+    E##gi0 =   Bgi0 ^((~Bgo0)&  Bgu0 ); \
+    E##go0 =   Bgo0 ^((~Bgu0)&  Bga0 ); \
+    E##gu0 =   Bgu0 ^((~Bga0)&  Bge0 ); \
+\
+    A##bo1 ^= Do1; /* outputs E##ga1..E##gu1 */ \
+    Bga1 = ROL32(A##bo1, 14); \
+    A##gu1 ^= Du1; \
+    Bge1 = ROL32(A##gu1, 10); \
+    A##ka0 ^= Da0; \
+    Bgi1 = ROL32(A##ka0, 1); \
+    A##me0 ^= De0; \
+    Bgo1 = ROL32(A##me0, 22); \
+    A##si0 ^= Di0; \
+    Bgu1 = ROL32(A##si0, 30); \
+    E##ga1 =   Bga1 ^((~Bge1)&  Bgi1 ); \
+    E##ge1 =   Bge1 ^((~Bgi1)&  Bgo1 ); \
+    E##gi1 =   Bgi1 ^((~Bgo1)&  Bgu1 ); \
+    E##go1 =   Bgo1 ^((~Bgu1)&  Bga1 ); \
+    E##gu1 =   Bgu1 ^((~Bga1)&  Bge1 ); \
+\
+    A##be1 ^= De1; /* outputs E##ka0..E##ku0 */ \
+    Bka0 = ROL32(A##be1, 1); \
+    A##gi0 ^= Di0; \
+    Bke0 = ROL32(A##gi0, 3); \
+    A##ko1 ^= Do1; \
+    Bki0 = ROL32(A##ko1, 13); \
+    A##mu0 ^= Du0; \
+    Bko0 = ROL32(A##mu0, 4); \
+    A##sa0 ^= Da0; \
+    Bku0 = ROL32(A##sa0, 9); \
+    E##ka0 =   Bka0 ^((~Bke0)&  Bki0 ); \
+    E##ke0 =   Bke0 ^((~Bki0)&  Bko0 ); \
+    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
+    E##ko0 =   Bko0 ^((~Bku0)&  Bka0 ); \
+    E##ku0 =   Bku0 ^((~Bka0)&  Bke0 ); \
+\
+    A##be0 ^= De0; /* outputs E##ka1..E##ku1 */ \
+    Bka1 = A##be0; /* taken as is, no rotation */ \
+    A##gi1 ^= Di1; \
+    Bke1 = ROL32(A##gi1, 3); \
+    A##ko0 ^= Do0; \
+    Bki1 = ROL32(A##ko0, 12); \
+    A##mu1 ^= Du1; \
+    Bko1 = ROL32(A##mu1, 4); \
+    A##sa1 ^= Da1; \
+    Bku1 = ROL32(A##sa1, 9); \
+    E##ka1 =   Bka1 ^((~Bke1)&  Bki1 ); \
+    E##ke1 =   Bke1 ^((~Bki1)&  Bko1 ); \
+    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
+    E##ko1 =   Bko1 ^((~Bku1)&  Bka1 ); \
+    E##ku1 =   Bku1 ^((~Bka1)&  Bke1 ); \
+\
+    A##bu1 ^= Du1; /* outputs E##ma0..E##mu0 */ \
+    Bma0 = ROL32(A##bu1, 14); \
+    A##ga0 ^= Da0; \
+    Bme0 = ROL32(A##ga0, 18); \
+    A##ke0 ^= De0; \
+    Bmi0 = ROL32(A##ke0, 5); \
+    A##mi1 ^= Di1; \
+    Bmo0 = ROL32(A##mi1, 8); \
+    A##so0 ^= Do0; \
+    Bmu0 = ROL32(A##so0, 28); \
+    E##ma0 =   Bma0 ^((~Bme0)&  Bmi0 ); \
+    E##me0 =   Bme0 ^((~Bmi0)&  Bmo0 ); \
+    E##mi0 =   Bmi0 ^((~Bmo0)&  Bmu0 ); \
+    E##mo0 =   Bmo0 ^((~Bmu0)&  Bma0 ); \
+    E##mu0 =   Bmu0 ^((~Bma0)&  Bme0 ); \
+\
+    A##bu0 ^= Du0; /* outputs E##ma1..E##mu1 */ \
+    Bma1 = ROL32(A##bu0, 13); \
+    A##ga1 ^= Da1; \
+    Bme1 = ROL32(A##ga1, 18); \
+    A##ke1 ^= De1; \
+    Bmi1 = ROL32(A##ke1, 5); \
+    A##mi0 ^= Di0; \
+    Bmo1 = ROL32(A##mi0, 7); \
+    A##so1 ^= Do1; \
+    Bmu1 = ROL32(A##so1, 28); \
+    E##ma1 =   Bma1 ^((~Bme1)&  Bmi1 ); \
+    E##me1 =   Bme1 ^((~Bmi1)&  Bmo1 ); \
+    E##mi1 =   Bmi1 ^((~Bmo1)&  Bmu1 ); \
+    E##mo1 =   Bmo1 ^((~Bmu1)&  Bma1 ); \
+    E##mu1 =   Bmu1 ^((~Bma1)&  Bme1 ); \
+\
+    A##bi0 ^= Di0; /* outputs E##sa0..E##su0 */ \
+    Bsa0 = ROL32(A##bi0, 31); \
+    A##go1 ^= Do1; \
+    Bse0 = ROL32(A##go1, 28); \
+    A##ku1 ^= Du1; \
+    Bsi0 = ROL32(A##ku1, 20); \
+    A##ma1 ^= Da1; \
+    Bso0 = ROL32(A##ma1, 21); \
+    A##se0 ^= De0; \
+    Bsu0 = ROL32(A##se0, 1); \
+    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
+    E##se0 =   Bse0 ^((~Bsi0)&  Bso0 ); \
+    E##si0 =   Bsi0 ^((~Bso0)&  Bsu0 ); \
+    E##so0 =   Bso0 ^((~Bsu0)&  Bsa0 ); \
+    E##su0 =   Bsu0 ^((~Bsa0)&  Bse0 ); \
+\
+    A##bi1 ^= Di1; /* outputs E##sa1..E##su1 */ \
+    Bsa1 = ROL32(A##bi1, 31); \
+    A##go0 ^= Do0; \
+    Bse1 = ROL32(A##go0, 27); \
+    A##ku0 ^= Du0; \
+    Bsi1 = ROL32(A##ku0, 19); \
+    A##ma0 ^= Da0; \
+    Bso1 = ROL32(A##ma0, 20); \
+    A##se1 ^= De1; \
+    Bsu1 = ROL32(A##se1, 1); \
+    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
+    E##se1 =   Bse1 ^((~Bsi1)&  Bso1 ); \
+    E##si1 =   Bsi1 ^((~Bso1)&  Bsu1 ); \
+    E##so1 =   Bso1 ^((~Bsu1)&  Bsa1 ); \
+    E##su1 =   Bsu1 ^((~Bsa1)&  Bse1 ); \
+\
+
+#endif // UseBebigokimisa
+
+const UINT32 KeccakF1600RoundConstants_int2_0[24] = { // 24 per-round words, indexed by the round macros' i argument and XORed into E##ba0
+    0x00000001UL, // i = 0; every entry in this table is either 0 or 1
+    0x00000000UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000000UL }; // i = 23
+
+const UINT32 KeccakF1600RoundConstants_int2_1[24] = { // 24 per-round words, indexed by the round macros' i argument and XORed into E##ba1
+    0x00000000UL, // i = 0
+    0x00000089UL,
+    0x8000008bUL,
+    0x80008080UL,
+    0x0000008bUL,
+    0x00008000UL,
+    0x80008088UL,
+    0x80000082UL,
+    0x0000000bUL,
+    0x0000000aUL,
+    0x00008082UL,
+    0x00008003UL,
+    0x0000808bUL,
+    0x8000000bUL,
+    0x8000008aUL,
+    0x80000081UL,
+    0x80000081UL,
+    0x80000008UL,
+    0x00000083UL,
+    0x80008003UL,
+    0x80008088UL,
+    0x80000088UL,
+    0x00008000UL,
+    0x80008082UL }; // i = 23
+
+#define copyFromStateAndXor1024bits(X, state, input) /* load state[0..49] into the 50 X-prefixed words, XORing input[0..31] (32 words, 1024 bits at 32 bits each) into the first 32; state and input are only read */ \
+    X##ba0 = state[ 0]^input[ 0]; /* index order is ba0, ba1, be0, be1, ... su0, su1 */ \
+    X##ba1 = state[ 1]^input[ 1]; \
+    X##be0 = state[ 2]^input[ 2]; \
+    X##be1 = state[ 3]^input[ 3]; \
+    X##bi0 = state[ 4]^input[ 4]; \
+    X##bi1 = state[ 5]^input[ 5]; \
+    X##bo0 = state[ 6]^input[ 6]; \
+    X##bo1 = state[ 7]^input[ 7]; \
+    X##bu0 = state[ 8]^input[ 8]; \
+    X##bu1 = state[ 9]^input[ 9]; \
+    X##ga0 = state[10]^input[10]; \
+    X##ga1 = state[11]^input[11]; \
+    X##ge0 = state[12]^input[12]; \
+    X##ge1 = state[13]^input[13]; \
+    X##gi0 = state[14]^input[14]; \
+    X##gi1 = state[15]^input[15]; \
+    X##go0 = state[16]^input[16]; \
+    X##go1 = state[17]^input[17]; \
+    X##gu0 = state[18]^input[18]; \
+    X##gu1 = state[19]^input[19]; \
+    X##ka0 = state[20]^input[20]; \
+    X##ka1 = state[21]^input[21]; \
+    X##ke0 = state[22]^input[22]; \
+    X##ke1 = state[23]^input[23]; \
+    X##ki0 = state[24]^input[24]; \
+    X##ki1 = state[25]^input[25]; \
+    X##ko0 = state[26]^input[26]; \
+    X##ko1 = state[27]^input[27]; \
+    X##ku0 = state[28]^input[28]; \
+    X##ku1 = state[29]^input[29]; \
+    X##ma0 = state[30]^input[30]; \
+    X##ma1 = state[31]^input[31]; /* last word that takes input */ \
+    X##me0 = state[32]; /* words 32..49 are copied from state unchanged */ \
+    X##me1 = state[33]; \
+    X##mi0 = state[34]; \
+    X##mi1 = state[35]; \
+    X##mo0 = state[36]; \
+    X##mo1 = state[37]; \
+    X##mu0 = state[38]; \
+    X##mu1 = state[39]; \
+    X##sa0 = state[40]; \
+    X##sa1 = state[41]; \
+    X##se0 = state[42]; \
+    X##se1 = state[43]; \
+    X##si0 = state[44]; \
+    X##si1 = state[45]; \
+    X##so0 = state[46]; \
+    X##so1 = state[47]; \
+    X##su0 = state[48]; \
+    X##su1 = state[49]; \
+
+#define copyFromStateAndXor1088bits(X, state, input) /* load state[0..49] into the 50 X-prefixed words, XORing input[0..33] (34 words, 1088 bits at 32 bits each) into the first 34; state and input are only read */ \
+    X##ba0 = state[ 0]^input[ 0]; /* index order is ba0, ba1, be0, be1, ... su0, su1 */ \
+    X##ba1 = state[ 1]^input[ 1]; \
+    X##be0 = state[ 2]^input[ 2]; \
+    X##be1 = state[ 3]^input[ 3]; \
+    X##bi0 = state[ 4]^input[ 4]; \
+    X##bi1 = state[ 5]^input[ 5]; \
+    X##bo0 = state[ 6]^input[ 6]; \
+    X##bo1 = state[ 7]^input[ 7]; \
+    X##bu0 = state[ 8]^input[ 8]; \
+    X##bu1 = state[ 9]^input[ 9]; \
+    X##ga0 = state[10]^input[10]; \
+    X##ga1 = state[11]^input[11]; \
+    X##ge0 = state[12]^input[12]; \
+    X##ge1 = state[13]^input[13]; \
+    X##gi0 = state[14]^input[14]; \
+    X##gi1 = state[15]^input[15]; \
+    X##go0 = state[16]^input[16]; \
+    X##go1 = state[17]^input[17]; \
+    X##gu0 = state[18]^input[18]; \
+    X##gu1 = state[19]^input[19]; \
+    X##ka0 = state[20]^input[20]; \
+    X##ka1 = state[21]^input[21]; \
+    X##ke0 = state[22]^input[22]; \
+    X##ke1 = state[23]^input[23]; \
+    X##ki0 = state[24]^input[24]; \
+    X##ki1 = state[25]^input[25]; \
+    X##ko0 = state[26]^input[26]; \
+    X##ko1 = state[27]^input[27]; \
+    X##ku0 = state[28]^input[28]; \
+    X##ku1 = state[29]^input[29]; \
+    X##ma0 = state[30]^input[30]; \
+    X##ma1 = state[31]^input[31]; \
+    X##me0 = state[32]^input[32]; \
+    X##me1 = state[33]^input[33]; /* last word that takes input */ \
+    X##mi0 = state[34]; /* words 34..49 are copied from state unchanged */ \
+    X##mi1 = state[35]; \
+    X##mo0 = state[36]; \
+    X##mo1 = state[37]; \
+    X##mu0 = state[38]; \
+    X##mu1 = state[39]; \
+    X##sa0 = state[40]; \
+    X##sa1 = state[41]; \
+    X##se0 = state[42]; \
+    X##se1 = state[43]; \
+    X##si0 = state[44]; \
+    X##si1 = state[45]; \
+    X##so0 = state[46]; \
+    X##so1 = state[47]; \
+    X##su0 = state[48]; \
+    X##su1 = state[49]; \
+
+#define copyFromState(X, state) /* load state[0..49] into the 50 X-prefixed words without modification; state is only read */ \
+    X##ba0 = state[ 0]; /* index order is ba0, ba1, be0, be1, ... su0, su1 (the same order copyToState writes back) */ \
+    X##ba1 = state[ 1]; \
+    X##be0 = state[ 2]; \
+    X##be1 = state[ 3]; \
+    X##bi0 = state[ 4]; \
+    X##bi1 = state[ 5]; \
+    X##bo0 = state[ 6]; \
+    X##bo1 = state[ 7]; \
+    X##bu0 = state[ 8]; \
+    X##bu1 = state[ 9]; \
+    X##ga0 = state[10]; \
+    X##ga1 = state[11]; \
+    X##ge0 = state[12]; \
+    X##ge1 = state[13]; \
+    X##gi0 = state[14]; \
+    X##gi1 = state[15]; \
+    X##go0 = state[16]; \
+    X##go1 = state[17]; \
+    X##gu0 = state[18]; \
+    X##gu1 = state[19]; \
+    X##ka0 = state[20]; \
+    X##ka1 = state[21]; \
+    X##ke0 = state[22]; \
+    X##ke1 = state[23]; \
+    X##ki0 = state[24]; \
+    X##ki1 = state[25]; \
+    X##ko0 = state[26]; \
+    X##ko1 = state[27]; \
+    X##ku0 = state[28]; \
+    X##ku1 = state[29]; \
+    X##ma0 = state[30]; \
+    X##ma1 = state[31]; \
+    X##me0 = state[32]; \
+    X##me1 = state[33]; \
+    X##mi0 = state[34]; \
+    X##mi1 = state[35]; \
+    X##mo0 = state[36]; \
+    X##mo1 = state[37]; \
+    X##mu0 = state[38]; \
+    X##mu1 = state[39]; \
+    X##sa0 = state[40]; \
+    X##sa1 = state[41]; \
+    X##se0 = state[42]; \
+    X##se1 = state[43]; \
+    X##si0 = state[44]; \
+    X##si1 = state[45]; \
+    X##so0 = state[46]; \
+    X##so1 = state[47]; \
+    X##su0 = state[48]; \
+    X##su1 = state[49]; \
+
+#define copyToState(state, X) /* store the 50 X-prefixed words into state[0..49]; the X words are only read */ \
+    state[ 0] = X##ba0; /* index order is ba0, ba1, be0, be1, ... su0, su1 (the same order copyFromState reads) */ \
+    state[ 1] = X##ba1; \
+    state[ 2] = X##be0; \
+    state[ 3] = X##be1; \
+    state[ 4] = X##bi0; \
+    state[ 5] = X##bi1; \
+    state[ 6] = X##bo0; \
+    state[ 7] = X##bo1; \
+    state[ 8] = X##bu0; \
+    state[ 9] = X##bu1; \
+    state[10] = X##ga0; \
+    state[11] = X##ga1; \
+    state[12] = X##ge0; \
+    state[13] = X##ge1; \
+    state[14] = X##gi0; \
+    state[15] = X##gi1; \
+    state[16] = X##go0; \
+    state[17] = X##go1; \
+    state[18] = X##gu0; \
+    state[19] = X##gu1; \
+    state[20] = X##ka0; \
+    state[21] = X##ka1; \
+    state[22] = X##ke0; \
+    state[23] = X##ke1; \
+    state[24] = X##ki0; \
+    state[25] = X##ki1; \
+    state[26] = X##ko0; \
+    state[27] = X##ko1; \
+    state[28] = X##ku0; \
+    state[29] = X##ku1; \
+    state[30] = X##ma0; \
+    state[31] = X##ma1; \
+    state[32] = X##me0; \
+    state[33] = X##me1; \
+    state[34] = X##mi0; \
+    state[35] = X##mi1; \
+    state[36] = X##mo0; \
+    state[37] = X##mo1; \
+    state[38] = X##mu0; \
+    state[39] = X##mu1; \
+    state[40] = X##sa0; \
+    state[41] = X##sa1; \
+    state[42] = X##se0; \
+    state[43] = X##se1; \
+    state[44] = X##si0; \
+    state[45] = X##si1; \
+    state[46] = X##so0; \
+    state[47] = X##so1; \
+    state[48] = X##su0; \
+    state[49] = X##su1; \
+
+#define copyStateVariables(X, Y) /* assign each of the 50 Y-prefixed words to the X-prefixed word with the same suffix; the Y words are only read */ \
+    X##ba0 = Y##ba0; /* X is the destination prefix, Y the source prefix */ \
+    X##ba1 = Y##ba1; \
+    X##be0 = Y##be0; \
+    X##be1 = Y##be1; \
+    X##bi0 = Y##bi0; \
+    X##bi1 = Y##bi1; \
+    X##bo0 = Y##bo0; \
+    X##bo1 = Y##bo1; \
+    X##bu0 = Y##bu0; \
+    X##bu1 = Y##bu1; \
+    X##ga0 = Y##ga0; \
+    X##ga1 = Y##ga1; \
+    X##ge0 = Y##ge0; \
+    X##ge1 = Y##ge1; \
+    X##gi0 = Y##gi0; \
+    X##gi1 = Y##gi1; \
+    X##go0 = Y##go0; \
+    X##go1 = Y##go1; \
+    X##gu0 = Y##gu0; \
+    X##gu1 = Y##gu1; \
+    X##ka0 = Y##ka0; \
+    X##ka1 = Y##ka1; \
+    X##ke0 = Y##ke0; \
+    X##ke1 = Y##ke1; \
+    X##ki0 = Y##ki0; \
+    X##ki1 = Y##ki1; \
+    X##ko0 = Y##ko0; \
+    X##ko1 = Y##ko1; \
+    X##ku0 = Y##ku0; \
+    X##ku1 = Y##ku1; \
+    X##ma0 = Y##ma0; \
+    X##ma1 = Y##ma1; \
+    X##me0 = Y##me0; \
+    X##me1 = Y##me1; \
+    X##mi0 = Y##mi0; \
+    X##mi1 = Y##mi1; \
+    X##mo0 = Y##mo0; \
+    X##mo1 = Y##mo1; \
+    X##mu0 = Y##mu0; \
+    X##mu1 = Y##mu1; \
+    X##sa0 = Y##sa0; \
+    X##sa1 = Y##sa1; \
+    X##se0 = Y##se0; \
+    X##se1 = Y##se1; \
+    X##si0 = Y##si0; \
+    X##si1 = Y##si1; \
+    X##so0 = Y##so0; \
+    X##so1 = Y##so1; \
+    X##su0 = Y##su0; \
+    X##su1 = Y##su1; \
+
diff --git a/c_src/KeccakF-1600-32-s2.macros b/c_src/KeccakF-1600-32-s2.macros
new file mode 100755
index 0000000..3c27a34
--- /dev/null
+++ b/c_src/KeccakF-1600-32-s2.macros
@@ -0,0 +1,1187 @@
+/*
+Code automatically generated by KeccakTools!
+
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#define declareABCDE /* Declare every UINT32 local used by the macros in this file; names ending in 0 and 1 are the two 32-bit words kept per lane. */ \
+    UINT32 Aba0, Abe0, Abi0, Abo0, Abu0; /* A: 25 lanes x 2 words, one of the two state sets */ \
+    UINT32 Aba1, Abe1, Abi1, Abo1, Abu1; \
+    UINT32 Aga0, Age0, Agi0, Ago0, Agu0; \
+    UINT32 Aga1, Age1, Agi1, Ago1, Agu1; \
+    UINT32 Aka0, Ake0, Aki0, Ako0, Aku0; \
+    UINT32 Aka1, Ake1, Aki1, Ako1, Aku1; \
+    UINT32 Ama0, Ame0, Ami0, Amo0, Amu0; \
+    UINT32 Ama1, Ame1, Ami1, Amo1, Amu1; \
+    UINT32 Asa0, Ase0, Asi0, Aso0, Asu0; \
+    UINT32 Asa1, Ase1, Asi1, Aso1, Asu1; \
+    UINT32 Bba0, Bbe0, Bbi0, Bbo0, Bbu0; /* B: temporaries written and read inside the round macros */ \
+    UINT32 Bba1, Bbe1, Bbi1, Bbo1, Bbu1; \
+    UINT32 Bga0, Bge0, Bgi0, Bgo0, Bgu0; \
+    UINT32 Bga1, Bge1, Bgi1, Bgo1, Bgu1; \
+    UINT32 Bka0, Bke0, Bki0, Bko0, Bku0; \
+    UINT32 Bka1, Bke1, Bki1, Bko1, Bku1; \
+    UINT32 Bma0, Bme0, Bmi0, Bmo0, Bmu0; \
+    UINT32 Bma1, Bme1, Bmi1, Bmo1, Bmu1; \
+    UINT32 Bsa0, Bse0, Bsi0, Bso0, Bsu0; \
+    UINT32 Bsa1, Bse1, Bsi1, Bso1, Bsu1; \
+    UINT32 Ca0, Ce0, Ci0, Co0, Cu0; /* C: per-column XOR sums (written by prepareTheta and the PrepareTheta round macro) */ \
+    UINT32 Ca1, Ce1, Ci1, Co1, Cu1; \
+    UINT32 Da0, De0, Di0, Do0, Du0; /* D: values derived from C and XORed into the A words by the round macros */ \
+    UINT32 Da1, De1, Di1, Do1, Du1; \
+    UINT32 Eba0, Ebe0, Ebi0, Ebo0, Ebu0; /* E: 25 lanes x 2 words, the other state set */ \
+    UINT32 Eba1, Ebe1, Ebi1, Ebo1, Ebu1; \
+    UINT32 Ega0, Ege0, Egi0, Ego0, Egu0; \
+    UINT32 Ega1, Ege1, Egi1, Ego1, Egu1; \
+    UINT32 Eka0, Eke0, Eki0, Eko0, Eku0; \
+    UINT32 Eka1, Eke1, Eki1, Eko1, Eku1; \
+    UINT32 Ema0, Eme0, Emi0, Emo0, Emu0; \
+    UINT32 Ema1, Eme1, Emi1, Emo1, Emu1; \
+    UINT32 Esa0, Ese0, Esi0, Eso0, Esu0; \
+    UINT32 Esa1, Ese1, Esi1, Eso1, Esu1; \
+
+#define prepareTheta /* Fill C with the column XOR sums of the A state: for each of a,e,i,o,u and each word 0/1, XOR the five words b?, g?, k?, m?, s?. */ \
+    Ca0 = Aba0^Aga0^Aka0^Ama0^Asa0; \
+    Ca1 = Aba1^Aga1^Aka1^Ama1^Asa1; \
+    Ce0 = Abe0^Age0^Ake0^Ame0^Ase0; \
+    Ce1 = Abe1^Age1^Ake1^Ame1^Ase1; \
+    Ci0 = Abi0^Agi0^Aki0^Ami0^Asi0; \
+    Ci1 = Abi1^Agi1^Aki1^Ami1^Asi1; \
+    Co0 = Abo0^Ago0^Ako0^Amo0^Aso0; \
+    Co1 = Abo1^Ago1^Ako1^Amo1^Aso1; \
+    Cu0 = Abu0^Agu0^Aku0^Amu0^Asu0; \
+    Cu1 = Abu1^Agu1^Aku1^Amu1^Asu1; /* note: A is hard-coded here, unlike the round macros, which take the state prefix as a parameter */ \
+
+#ifdef UseBebigokimisa
+// --- Code for one round, with prepare-theta (lane complementing pattern 'bebigokimisa'): reads C and state A (A is modified in place by the D XORs), writes state E, and leaves in C the column XOR sums of E.
+// --- Uses factor 2 interleaving, 64-bit lanes mapped to 32-bit words; i indexes KeccakF1600RoundConstants_int2_0/_1. C must already be set on entry (presumably by prepareTheta or a previous round - confirm at the call site).
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    Da0 = Cu0^ROL32(Ce1, 1); /* word-0 D values XOR in the word-1 C value of another column, rotated left by 1 */ \
+    Da1 = Cu1^Ce0; /* word-1 D values XOR in the word-0 C value of that column, unrotated */ \
+    De0 = Ca0^ROL32(Ci1, 1); \
+    De1 = Ca1^Ci0; \
+    Di0 = Ce0^ROL32(Co1, 1); \
+    Di1 = Ce1^Co0; \
+    Do0 = Ci0^ROL32(Cu1, 1); \
+    Do1 = Ci1^Cu0; \
+    Du0 = Co0^ROL32(Ca1, 1); \
+    Du1 = Co1^Ca0; \
+    /* E##b[aeiou]0: XOR D into five A words, rotate them into B, combine B into E; the round constant is XORed into E##ba0; C word-0 sums are restarted by plain assignment */ \
+    A##ba0 ^= Da0; \
+    Bba0 = A##ba0; \
+    A##ge0 ^= De0; \
+    Bbe0 = ROL32(A##ge0, 22); \
+    A##ki1 ^= Di1; \
+    Bbi0 = ROL32(A##ki1, 22); \
+    E##ba0 =   Bba0 ^(  Bbe0 |  Bbi0 ); \
+    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \
+    Ca0 = E##ba0; \
+    A##mo1 ^= Do1; \
+    Bbo0 = ROL32(A##mo1, 11); \
+    E##be0 =   Bbe0 ^((~Bbi0)|  Bbo0 ); \
+    Ce0 = E##be0; \
+    A##su0 ^= Du0; \
+    Bbu0 = ROL32(A##su0, 7); \
+    E##bi0 =   Bbi0 ^(  Bbo0 &  Bbu0 ); \
+    Ci0 = E##bi0; \
+    E##bo0 =   Bbo0 ^(  Bbu0 |  Bba0 ); \
+    Co0 = E##bo0; \
+    E##bu0 =   Bbu0 ^(  Bba0 &  Bbe0 ); \
+    Cu0 = E##bu0; \
+    /* E##b[aeiou]1: same steps for the word-1 outputs; the round constant is XORed into E##ba1; C word-1 sums are restarted */ \
+    A##ba1 ^= Da1; \
+    Bba1 = A##ba1; \
+    A##ge1 ^= De1; \
+    Bbe1 = ROL32(A##ge1, 22); \
+    A##ki0 ^= Di0; \
+    Bbi1 = ROL32(A##ki0, 21); \
+    E##ba1 =   Bba1 ^(  Bbe1 |  Bbi1 ); \
+    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \
+    Ca1 = E##ba1; \
+    A##mo0 ^= Do0; \
+    Bbo1 = ROL32(A##mo0, 10); \
+    E##be1 =   Bbe1 ^((~Bbi1)|  Bbo1 ); \
+    Ce1 = E##be1; \
+    A##su1 ^= Du1; \
+    Bbu1 = ROL32(A##su1, 7); \
+    E##bi1 =   Bbi1 ^(  Bbo1 &  Bbu1 ); \
+    Ci1 = E##bi1; \
+    E##bo1 =   Bbo1 ^(  Bbu1 |  Bba1 ); \
+    Co1 = E##bo1; \
+    E##bu1 =   Bbu1 ^(  Bba1 &  Bbe1 ); \
+    Cu1 = E##bu1; \
+    /* E##g[aeiou]0: from here on each new E word is XORed into the matching C sum */ \
+    A##bo0 ^= Do0; \
+    Bga0 = ROL32(A##bo0, 14); \
+    A##gu0 ^= Du0; \
+    Bge0 = ROL32(A##gu0, 10); \
+    A##ka1 ^= Da1; \
+    Bgi0 = ROL32(A##ka1, 2); \
+    E##ga0 =   Bga0 ^(  Bge0 |  Bgi0 ); \
+    Ca0 ^= E##ga0; \
+    A##me1 ^= De1; \
+    Bgo0 = ROL32(A##me1, 23); \
+    E##ge0 =   Bge0 ^(  Bgi0 &  Bgo0 ); \
+    Ce0 ^= E##ge0; \
+    A##si1 ^= Di1; \
+    Bgu0 = ROL32(A##si1, 31); \
+    E##gi0 =   Bgi0 ^(  Bgo0 |(~Bgu0)); \
+    Ci0 ^= E##gi0; \
+    E##go0 =   Bgo0 ^(  Bgu0 |  Bga0 ); \
+    Co0 ^= E##go0; \
+    E##gu0 =   Bgu0 ^(  Bga0 &  Bge0 ); \
+    Cu0 ^= E##gu0; \
+    /* E##g[aeiou]1 */ \
+    A##bo1 ^= Do1; \
+    Bga1 = ROL32(A##bo1, 14); \
+    A##gu1 ^= Du1; \
+    Bge1 = ROL32(A##gu1, 10); \
+    A##ka0 ^= Da0; \
+    Bgi1 = ROL32(A##ka0, 1); \
+    E##ga1 =   Bga1 ^(  Bge1 |  Bgi1 ); \
+    Ca1 ^= E##ga1; \
+    A##me0 ^= De0; \
+    Bgo1 = ROL32(A##me0, 22); \
+    E##ge1 =   Bge1 ^(  Bgi1 &  Bgo1 ); \
+    Ce1 ^= E##ge1; \
+    A##si0 ^= Di0; \
+    Bgu1 = ROL32(A##si0, 30); \
+    E##gi1 =   Bgi1 ^(  Bgo1 |(~Bgu1)); \
+    Ci1 ^= E##gi1; \
+    E##go1 =   Bgo1 ^(  Bgu1 |  Bga1 ); \
+    Co1 ^= E##go1; \
+    E##gu1 =   Bgu1 ^(  Bga1 &  Bge1 ); \
+    Cu1 ^= E##gu1; \
+    /* E##k[aeiou]0 */ \
+    A##be1 ^= De1; \
+    Bka0 = ROL32(A##be1, 1); \
+    A##gi0 ^= Di0; \
+    Bke0 = ROL32(A##gi0, 3); \
+    A##ko1 ^= Do1; \
+    Bki0 = ROL32(A##ko1, 13); \
+    E##ka0 =   Bka0 ^(  Bke0 |  Bki0 ); \
+    Ca0 ^= E##ka0; \
+    A##mu0 ^= Du0; \
+    Bko0 = ROL32(A##mu0, 4); \
+    E##ke0 =   Bke0 ^(  Bki0 &  Bko0 ); \
+    Ce0 ^= E##ke0; \
+    A##sa0 ^= Da0; \
+    Bku0 = ROL32(A##sa0, 9); \
+    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
+    Ci0 ^= E##ki0; \
+    E##ko0 = (~Bko0)^(  Bku0 |  Bka0 ); \
+    Co0 ^= E##ko0; \
+    E##ku0 =   Bku0 ^(  Bka0 &  Bke0 ); \
+    Cu0 ^= E##ku0; \
+    /* E##k[aeiou]1 (Bka1 takes A##be0 without any rotation) */ \
+    A##be0 ^= De0; \
+    Bka1 = A##be0; \
+    A##gi1 ^= Di1; \
+    Bke1 = ROL32(A##gi1, 3); \
+    A##ko0 ^= Do0; \
+    Bki1 = ROL32(A##ko0, 12); \
+    E##ka1 =   Bka1 ^(  Bke1 |  Bki1 ); \
+    Ca1 ^= E##ka1; \
+    A##mu1 ^= Du1; \
+    Bko1 = ROL32(A##mu1, 4); \
+    E##ke1 =   Bke1 ^(  Bki1 &  Bko1 ); \
+    Ce1 ^= E##ke1; \
+    A##sa1 ^= Da1; \
+    Bku1 = ROL32(A##sa1, 9); \
+    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
+    Ci1 ^= E##ki1; \
+    E##ko1 = (~Bko1)^(  Bku1 |  Bka1 ); \
+    Co1 ^= E##ko1; \
+    E##ku1 =   Bku1 ^(  Bka1 &  Bke1 ); \
+    Cu1 ^= E##ku1; \
+    /* E##m[aeiou]0 */ \
+    A##bu1 ^= Du1; \
+    Bma0 = ROL32(A##bu1, 14); \
+    A##ga0 ^= Da0; \
+    Bme0 = ROL32(A##ga0, 18); \
+    A##ke0 ^= De0; \
+    Bmi0 = ROL32(A##ke0, 5); \
+    E##ma0 =   Bma0 ^(  Bme0 &  Bmi0 ); \
+    Ca0 ^= E##ma0; \
+    A##mi1 ^= Di1; \
+    Bmo0 = ROL32(A##mi1, 8); \
+    E##me0 =   Bme0 ^(  Bmi0 |  Bmo0 ); \
+    Ce0 ^= E##me0; \
+    A##so0 ^= Do0; \
+    Bmu0 = ROL32(A##so0, 28); \
+    E##mi0 =   Bmi0 ^((~Bmo0)|  Bmu0 ); \
+    Ci0 ^= E##mi0; \
+    E##mo0 = (~Bmo0)^(  Bmu0 &  Bma0 ); \
+    Co0 ^= E##mo0; \
+    E##mu0 =   Bmu0 ^(  Bma0 |  Bme0 ); \
+    Cu0 ^= E##mu0; \
+    /* E##m[aeiou]1 */ \
+    A##bu0 ^= Du0; \
+    Bma1 = ROL32(A##bu0, 13); \
+    A##ga1 ^= Da1; \
+    Bme1 = ROL32(A##ga1, 18); \
+    A##ke1 ^= De1; \
+    Bmi1 = ROL32(A##ke1, 5); \
+    E##ma1 =   Bma1 ^(  Bme1 &  Bmi1 ); \
+    Ca1 ^= E##ma1; \
+    A##mi0 ^= Di0; \
+    Bmo1 = ROL32(A##mi0, 7); \
+    E##me1 =   Bme1 ^(  Bmi1 |  Bmo1 ); \
+    Ce1 ^= E##me1; \
+    A##so1 ^= Do1; \
+    Bmu1 = ROL32(A##so1, 28); \
+    E##mi1 =   Bmi1 ^((~Bmo1)|  Bmu1 ); \
+    Ci1 ^= E##mi1; \
+    E##mo1 = (~Bmo1)^(  Bmu1 &  Bma1 ); \
+    Co1 ^= E##mo1; \
+    E##mu1 =   Bmu1 ^(  Bma1 |  Bme1 ); \
+    Cu1 ^= E##mu1; \
+    /* E##s[aeiou]0 */ \
+    A##bi0 ^= Di0; \
+    Bsa0 = ROL32(A##bi0, 31); \
+    A##go1 ^= Do1; \
+    Bse0 = ROL32(A##go1, 28); \
+    A##ku1 ^= Du1; \
+    Bsi0 = ROL32(A##ku1, 20); \
+    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
+    Ca0 ^= E##sa0; \
+    A##ma1 ^= Da1; \
+    Bso0 = ROL32(A##ma1, 21); \
+    E##se0 = (~Bse0)^(  Bsi0 |  Bso0 ); \
+    Ce0 ^= E##se0; \
+    A##se0 ^= De0; \
+    Bsu0 = ROL32(A##se0, 1); \
+    E##si0 =   Bsi0 ^(  Bso0 &  Bsu0 ); \
+    Ci0 ^= E##si0; \
+    E##so0 =   Bso0 ^(  Bsu0 |  Bsa0 ); \
+    Co0 ^= E##so0; \
+    E##su0 =   Bsu0 ^(  Bsa0 &  Bse0 ); \
+    Cu0 ^= E##su0; \
+    /* E##s[aeiou]1: last group; afterwards every C word is the XOR of the five E words of its column */ \
+    A##bi1 ^= Di1; \
+    Bsa1 = ROL32(A##bi1, 31); \
+    A##go0 ^= Do0; \
+    Bse1 = ROL32(A##go0, 27); \
+    A##ku0 ^= Du0; \
+    Bsi1 = ROL32(A##ku0, 19); \
+    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
+    Ca1 ^= E##sa1; \
+    A##ma0 ^= Da0; \
+    Bso1 = ROL32(A##ma0, 20); \
+    E##se1 = (~Bse1)^(  Bsi1 |  Bso1 ); \
+    Ce1 ^= E##se1; \
+    A##se1 ^= De1; \
+    Bsu1 = ROL32(A##se1, 1); \
+    E##si1 =   Bsi1 ^(  Bso1 &  Bsu1 ); \
+    Ci1 ^= E##si1; \
+    E##so1 =   Bso1 ^(  Bsu1 |  Bsa1 ); \
+    Co1 ^= E##so1; \
+    E##su1 =   Bsu1 ^(  Bsa1 &  Bse1 ); \
+    Cu1 ^= E##su1; \
+\
+
+// --- Code for one round (lane complementing pattern 'bebigokimisa'): reads C and state A (A is modified in place by the D XORs) and writes state E; unlike thetaRhoPiChiIotaPrepareTheta, C is not rewritten.
+// --- Uses factor 2 interleaving, 64-bit lanes mapped to 32-bit words; i indexes KeccakF1600RoundConstants_int2_0/_1. C must already be set on entry.
+#define thetaRhoPiChiIota(i, A, E) \
+    Da0 = Cu0^ROL32(Ce1, 1); /* word-0 D values XOR in the word-1 C value of another column, rotated left by 1 */ \
+    Da1 = Cu1^Ce0; /* word-1 D values XOR in the word-0 C value of that column, unrotated */ \
+    De0 = Ca0^ROL32(Ci1, 1); \
+    De1 = Ca1^Ci0; \
+    Di0 = Ce0^ROL32(Co1, 1); \
+    Di1 = Ce1^Co0; \
+    Do0 = Ci0^ROL32(Cu1, 1); \
+    Do1 = Ci1^Cu0; \
+    Du0 = Co0^ROL32(Ca1, 1); \
+    Du1 = Co1^Ca0; \
+    /* E##b[aeiou]0: XOR D into five A words, rotate them into B, combine B into E; the round constant is XORed into E##ba0 */ \
+    A##ba0 ^= Da0; \
+    Bba0 = A##ba0; \
+    A##ge0 ^= De0; \
+    Bbe0 = ROL32(A##ge0, 22); \
+    A##ki1 ^= Di1; \
+    Bbi0 = ROL32(A##ki1, 22); \
+    E##ba0 =   Bba0 ^(  Bbe0 |  Bbi0 ); \
+    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \
+    A##mo1 ^= Do1; \
+    Bbo0 = ROL32(A##mo1, 11); \
+    E##be0 =   Bbe0 ^((~Bbi0)|  Bbo0 ); \
+    A##su0 ^= Du0; \
+    Bbu0 = ROL32(A##su0, 7); \
+    E##bi0 =   Bbi0 ^(  Bbo0 &  Bbu0 ); \
+    E##bo0 =   Bbo0 ^(  Bbu0 |  Bba0 ); \
+    E##bu0 =   Bbu0 ^(  Bba0 &  Bbe0 ); \
+    /* E##b[aeiou]1: same steps for the word-1 outputs; the round constant is XORed into E##ba1 */ \
+    A##ba1 ^= Da1; \
+    Bba1 = A##ba1; \
+    A##ge1 ^= De1; \
+    Bbe1 = ROL32(A##ge1, 22); \
+    A##ki0 ^= Di0; \
+    Bbi1 = ROL32(A##ki0, 21); \
+    E##ba1 =   Bba1 ^(  Bbe1 |  Bbi1 ); \
+    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \
+    A##mo0 ^= Do0; \
+    Bbo1 = ROL32(A##mo0, 10); \
+    E##be1 =   Bbe1 ^((~Bbi1)|  Bbo1 ); \
+    A##su1 ^= Du1; \
+    Bbu1 = ROL32(A##su1, 7); \
+    E##bi1 =   Bbi1 ^(  Bbo1 &  Bbu1 ); \
+    E##bo1 =   Bbo1 ^(  Bbu1 |  Bba1 ); \
+    E##bu1 =   Bbu1 ^(  Bba1 &  Bbe1 ); \
+    /* E##g[aeiou]0 */ \
+    A##bo0 ^= Do0; \
+    Bga0 = ROL32(A##bo0, 14); \
+    A##gu0 ^= Du0; \
+    Bge0 = ROL32(A##gu0, 10); \
+    A##ka1 ^= Da1; \
+    Bgi0 = ROL32(A##ka1, 2); \
+    E##ga0 =   Bga0 ^(  Bge0 |  Bgi0 ); \
+    A##me1 ^= De1; \
+    Bgo0 = ROL32(A##me1, 23); \
+    E##ge0 =   Bge0 ^(  Bgi0 &  Bgo0 ); \
+    A##si1 ^= Di1; \
+    Bgu0 = ROL32(A##si1, 31); \
+    E##gi0 =   Bgi0 ^(  Bgo0 |(~Bgu0)); \
+    E##go0 =   Bgo0 ^(  Bgu0 |  Bga0 ); \
+    E##gu0 =   Bgu0 ^(  Bga0 &  Bge0 ); \
+    /* E##g[aeiou]1 */ \
+    A##bo1 ^= Do1; \
+    Bga1 = ROL32(A##bo1, 14); \
+    A##gu1 ^= Du1; \
+    Bge1 = ROL32(A##gu1, 10); \
+    A##ka0 ^= Da0; \
+    Bgi1 = ROL32(A##ka0, 1); \
+    E##ga1 =   Bga1 ^(  Bge1 |  Bgi1 ); \
+    A##me0 ^= De0; \
+    Bgo1 = ROL32(A##me0, 22); \
+    E##ge1 =   Bge1 ^(  Bgi1 &  Bgo1 ); \
+    A##si0 ^= Di0; \
+    Bgu1 = ROL32(A##si0, 30); \
+    E##gi1 =   Bgi1 ^(  Bgo1 |(~Bgu1)); \
+    E##go1 =   Bgo1 ^(  Bgu1 |  Bga1 ); \
+    E##gu1 =   Bgu1 ^(  Bga1 &  Bge1 ); \
+    /* E##k[aeiou]0 */ \
+    A##be1 ^= De1; \
+    Bka0 = ROL32(A##be1, 1); \
+    A##gi0 ^= Di0; \
+    Bke0 = ROL32(A##gi0, 3); \
+    A##ko1 ^= Do1; \
+    Bki0 = ROL32(A##ko1, 13); \
+    E##ka0 =   Bka0 ^(  Bke0 |  Bki0 ); \
+    A##mu0 ^= Du0; \
+    Bko0 = ROL32(A##mu0, 4); \
+    E##ke0 =   Bke0 ^(  Bki0 &  Bko0 ); \
+    A##sa0 ^= Da0; \
+    Bku0 = ROL32(A##sa0, 9); \
+    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
+    E##ko0 = (~Bko0)^(  Bku0 |  Bka0 ); \
+    E##ku0 =   Bku0 ^(  Bka0 &  Bke0 ); \
+    /* E##k[aeiou]1 (Bka1 takes A##be0 without any rotation) */ \
+    A##be0 ^= De0; \
+    Bka1 = A##be0; \
+    A##gi1 ^= Di1; \
+    Bke1 = ROL32(A##gi1, 3); \
+    A##ko0 ^= Do0; \
+    Bki1 = ROL32(A##ko0, 12); \
+    E##ka1 =   Bka1 ^(  Bke1 |  Bki1 ); \
+    A##mu1 ^= Du1; \
+    Bko1 = ROL32(A##mu1, 4); \
+    E##ke1 =   Bke1 ^(  Bki1 &  Bko1 ); \
+    A##sa1 ^= Da1; \
+    Bku1 = ROL32(A##sa1, 9); \
+    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
+    E##ko1 = (~Bko1)^(  Bku1 |  Bka1 ); \
+    E##ku1 =   Bku1 ^(  Bka1 &  Bke1 ); \
+    /* E##m[aeiou]0 */ \
+    A##bu1 ^= Du1; \
+    Bma0 = ROL32(A##bu1, 14); \
+    A##ga0 ^= Da0; \
+    Bme0 = ROL32(A##ga0, 18); \
+    A##ke0 ^= De0; \
+    Bmi0 = ROL32(A##ke0, 5); \
+    E##ma0 =   Bma0 ^(  Bme0 &  Bmi0 ); \
+    A##mi1 ^= Di1; \
+    Bmo0 = ROL32(A##mi1, 8); \
+    E##me0 =   Bme0 ^(  Bmi0 |  Bmo0 ); \
+    A##so0 ^= Do0; \
+    Bmu0 = ROL32(A##so0, 28); \
+    E##mi0 =   Bmi0 ^((~Bmo0)|  Bmu0 ); \
+    E##mo0 = (~Bmo0)^(  Bmu0 &  Bma0 ); \
+    E##mu0 =   Bmu0 ^(  Bma0 |  Bme0 ); \
+    /* E##m[aeiou]1 */ \
+    A##bu0 ^= Du0; \
+    Bma1 = ROL32(A##bu0, 13); \
+    A##ga1 ^= Da1; \
+    Bme1 = ROL32(A##ga1, 18); \
+    A##ke1 ^= De1; \
+    Bmi1 = ROL32(A##ke1, 5); \
+    E##ma1 =   Bma1 ^(  Bme1 &  Bmi1 ); \
+    A##mi0 ^= Di0; \
+    Bmo1 = ROL32(A##mi0, 7); \
+    E##me1 =   Bme1 ^(  Bmi1 |  Bmo1 ); \
+    A##so1 ^= Do1; \
+    Bmu1 = ROL32(A##so1, 28); \
+    E##mi1 =   Bmi1 ^((~Bmo1)|  Bmu1 ); \
+    E##mo1 = (~Bmo1)^(  Bmu1 &  Bma1 ); \
+    E##mu1 =   Bmu1 ^(  Bma1 |  Bme1 ); \
+    /* E##s[aeiou]0 */ \
+    A##bi0 ^= Di0; \
+    Bsa0 = ROL32(A##bi0, 31); \
+    A##go1 ^= Do1; \
+    Bse0 = ROL32(A##go1, 28); \
+    A##ku1 ^= Du1; \
+    Bsi0 = ROL32(A##ku1, 20); \
+    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
+    A##ma1 ^= Da1; \
+    Bso0 = ROL32(A##ma1, 21); \
+    E##se0 = (~Bse0)^(  Bsi0 |  Bso0 ); \
+    A##se0 ^= De0; \
+    Bsu0 = ROL32(A##se0, 1); \
+    E##si0 =   Bsi0 ^(  Bso0 &  Bsu0 ); \
+    E##so0 =   Bso0 ^(  Bsu0 |  Bsa0 ); \
+    E##su0 =   Bsu0 ^(  Bsa0 &  Bse0 ); \
+    /* E##s[aeiou]1 */ \
+    A##bi1 ^= Di1; \
+    Bsa1 = ROL32(A##bi1, 31); \
+    A##go0 ^= Do0; \
+    Bse1 = ROL32(A##go0, 27); \
+    A##ku0 ^= Du0; \
+    Bsi1 = ROL32(A##ku0, 19); \
+    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
+    A##ma0 ^= Da0; \
+    Bso1 = ROL32(A##ma0, 20); \
+    E##se1 = (~Bse1)^(  Bsi1 |  Bso1 ); \
+    A##se1 ^= De1; \
+    Bsu1 = ROL32(A##se1, 1); \
+    E##si1 =   Bsi1 ^(  Bso1 &  Bsu1 ); \
+    E##so1 =   Bso1 ^(  Bsu1 |  Bsa1 ); \
+    E##su1 =   Bsu1 ^(  Bsa1 &  Bse1 ); \
+\
+
+#else // UseBebigokimisa
+// --- Code for one round, with prepare-theta (variant used when UseBebigokimisa is not defined): reads C and state A (A is modified in place by the D XORs), writes state E, and leaves in C the column XOR sums of E.
+// --- Uses factor 2 interleaving, 64-bit lanes mapped to 32-bit words; i indexes KeccakF1600RoundConstants_int2_0/_1. C must already be set on entry (presumably by prepareTheta or a previous round - confirm at the call site).
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    Da0 = Cu0^ROL32(Ce1, 1); /* word-0 D values XOR in the word-1 C value of another column, rotated left by 1 */ \
+    Da1 = Cu1^Ce0; /* word-1 D values XOR in the word-0 C value of that column, unrotated */ \
+    De0 = Ca0^ROL32(Ci1, 1); \
+    De1 = Ca1^Ci0; \
+    Di0 = Ce0^ROL32(Co1, 1); \
+    Di1 = Ce1^Co0; \
+    Do0 = Ci0^ROL32(Cu1, 1); \
+    Do1 = Ci1^Cu0; \
+    Du0 = Co0^ROL32(Ca1, 1); \
+    Du1 = Co1^Ca0; \
+    /* E##b[aeiou]0: XOR D into five A words, rotate them into B, then E = B ^ (~next & next-but-one) along the group; the round constant is XORed into E##ba0; C word-0 sums are restarted by plain assignment */ \
+    A##ba0 ^= Da0; \
+    Bba0 = A##ba0; \
+    A##ge0 ^= De0; \
+    Bbe0 = ROL32(A##ge0, 22); \
+    A##ki1 ^= Di1; \
+    Bbi0 = ROL32(A##ki1, 22); \
+    E##ba0 =   Bba0 ^((~Bbe0)&  Bbi0 ); \
+    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \
+    Ca0 = E##ba0; \
+    A##mo1 ^= Do1; \
+    Bbo0 = ROL32(A##mo1, 11); \
+    E##be0 =   Bbe0 ^((~Bbi0)&  Bbo0 ); \
+    Ce0 = E##be0; \
+    A##su0 ^= Du0; \
+    Bbu0 = ROL32(A##su0, 7); \
+    E##bi0 =   Bbi0 ^((~Bbo0)&  Bbu0 ); \
+    Ci0 = E##bi0; \
+    E##bo0 =   Bbo0 ^((~Bbu0)&  Bba0 ); \
+    Co0 = E##bo0; \
+    E##bu0 =   Bbu0 ^((~Bba0)&  Bbe0 ); \
+    Cu0 = E##bu0; \
+    /* E##b[aeiou]1: same steps for the word-1 outputs; the round constant is XORed into E##ba1; C word-1 sums are restarted */ \
+    A##ba1 ^= Da1; \
+    Bba1 = A##ba1; \
+    A##ge1 ^= De1; \
+    Bbe1 = ROL32(A##ge1, 22); \
+    A##ki0 ^= Di0; \
+    Bbi1 = ROL32(A##ki0, 21); \
+    E##ba1 =   Bba1 ^((~Bbe1)&  Bbi1 ); \
+    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \
+    Ca1 = E##ba1; \
+    A##mo0 ^= Do0; \
+    Bbo1 = ROL32(A##mo0, 10); \
+    E##be1 =   Bbe1 ^((~Bbi1)&  Bbo1 ); \
+    Ce1 = E##be1; \
+    A##su1 ^= Du1; \
+    Bbu1 = ROL32(A##su1, 7); \
+    E##bi1 =   Bbi1 ^((~Bbo1)&  Bbu1 ); \
+    Ci1 = E##bi1; \
+    E##bo1 =   Bbo1 ^((~Bbu1)&  Bba1 ); \
+    Co1 = E##bo1; \
+    E##bu1 =   Bbu1 ^((~Bba1)&  Bbe1 ); \
+    Cu1 = E##bu1; \
+    /* E##g[aeiou]0: from here on each new E word is XORed into the matching C sum */ \
+    A##bo0 ^= Do0; \
+    Bga0 = ROL32(A##bo0, 14); \
+    A##gu0 ^= Du0; \
+    Bge0 = ROL32(A##gu0, 10); \
+    A##ka1 ^= Da1; \
+    Bgi0 = ROL32(A##ka1, 2); \
+    E##ga0 =   Bga0 ^((~Bge0)&  Bgi0 ); \
+    Ca0 ^= E##ga0; \
+    A##me1 ^= De1; \
+    Bgo0 = ROL32(A##me1, 23); \
+    E##ge0 =   Bge0 ^((~Bgi0)&  Bgo0 ); \
+    Ce0 ^= E##ge0; \
+    A##si1 ^= Di1; \
+    Bgu0 = ROL32(A##si1, 31); \
+    E##gi0 =   Bgi0 ^((~Bgo0)&  Bgu0 ); \
+    Ci0 ^= E##gi0; \
+    E##go0 =   Bgo0 ^((~Bgu0)&  Bga0 ); \
+    Co0 ^= E##go0; \
+    E##gu0 =   Bgu0 ^((~Bga0)&  Bge0 ); \
+    Cu0 ^= E##gu0; \
+    /* E##g[aeiou]1 */ \
+    A##bo1 ^= Do1; \
+    Bga1 = ROL32(A##bo1, 14); \
+    A##gu1 ^= Du1; \
+    Bge1 = ROL32(A##gu1, 10); \
+    A##ka0 ^= Da0; \
+    Bgi1 = ROL32(A##ka0, 1); \
+    E##ga1 =   Bga1 ^((~Bge1)&  Bgi1 ); \
+    Ca1 ^= E##ga1; \
+    A##me0 ^= De0; \
+    Bgo1 = ROL32(A##me0, 22); \
+    E##ge1 =   Bge1 ^((~Bgi1)&  Bgo1 ); \
+    Ce1 ^= E##ge1; \
+    A##si0 ^= Di0; \
+    Bgu1 = ROL32(A##si0, 30); \
+    E##gi1 =   Bgi1 ^((~Bgo1)&  Bgu1 ); \
+    Ci1 ^= E##gi1; \
+    E##go1 =   Bgo1 ^((~Bgu1)&  Bga1 ); \
+    Co1 ^= E##go1; \
+    E##gu1 =   Bgu1 ^((~Bga1)&  Bge1 ); \
+    Cu1 ^= E##gu1; \
+    /* E##k[aeiou]0 */ \
+    A##be1 ^= De1; \
+    Bka0 = ROL32(A##be1, 1); \
+    A##gi0 ^= Di0; \
+    Bke0 = ROL32(A##gi0, 3); \
+    A##ko1 ^= Do1; \
+    Bki0 = ROL32(A##ko1, 13); \
+    E##ka0 =   Bka0 ^((~Bke0)&  Bki0 ); \
+    Ca0 ^= E##ka0; \
+    A##mu0 ^= Du0; \
+    Bko0 = ROL32(A##mu0, 4); \
+    E##ke0 =   Bke0 ^((~Bki0)&  Bko0 ); \
+    Ce0 ^= E##ke0; \
+    A##sa0 ^= Da0; \
+    Bku0 = ROL32(A##sa0, 9); \
+    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
+    Ci0 ^= E##ki0; \
+    E##ko0 =   Bko0 ^((~Bku0)&  Bka0 ); \
+    Co0 ^= E##ko0; \
+    E##ku0 =   Bku0 ^((~Bka0)&  Bke0 ); \
+    Cu0 ^= E##ku0; \
+    /* E##k[aeiou]1 (Bka1 takes A##be0 without any rotation) */ \
+    A##be0 ^= De0; \
+    Bka1 = A##be0; \
+    A##gi1 ^= Di1; \
+    Bke1 = ROL32(A##gi1, 3); \
+    A##ko0 ^= Do0; \
+    Bki1 = ROL32(A##ko0, 12); \
+    E##ka1 =   Bka1 ^((~Bke1)&  Bki1 ); \
+    Ca1 ^= E##ka1; \
+    A##mu1 ^= Du1; \
+    Bko1 = ROL32(A##mu1, 4); \
+    E##ke1 =   Bke1 ^((~Bki1)&  Bko1 ); \
+    Ce1 ^= E##ke1; \
+    A##sa1 ^= Da1; \
+    Bku1 = ROL32(A##sa1, 9); \
+    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
+    Ci1 ^= E##ki1; \
+    E##ko1 =   Bko1 ^((~Bku1)&  Bka1 ); \
+    Co1 ^= E##ko1; \
+    E##ku1 =   Bku1 ^((~Bka1)&  Bke1 ); \
+    Cu1 ^= E##ku1; \
+    /* E##m[aeiou]0 */ \
+    A##bu1 ^= Du1; \
+    Bma0 = ROL32(A##bu1, 14); \
+    A##ga0 ^= Da0; \
+    Bme0 = ROL32(A##ga0, 18); \
+    A##ke0 ^= De0; \
+    Bmi0 = ROL32(A##ke0, 5); \
+    E##ma0 =   Bma0 ^((~Bme0)&  Bmi0 ); \
+    Ca0 ^= E##ma0; \
+    A##mi1 ^= Di1; \
+    Bmo0 = ROL32(A##mi1, 8); \
+    E##me0 =   Bme0 ^((~Bmi0)&  Bmo0 ); \
+    Ce0 ^= E##me0; \
+    A##so0 ^= Do0; \
+    Bmu0 = ROL32(A##so0, 28); \
+    E##mi0 =   Bmi0 ^((~Bmo0)&  Bmu0 ); \
+    Ci0 ^= E##mi0; \
+    E##mo0 =   Bmo0 ^((~Bmu0)&  Bma0 ); \
+    Co0 ^= E##mo0; \
+    E##mu0 =   Bmu0 ^((~Bma0)&  Bme0 ); \
+    Cu0 ^= E##mu0; \
+    /* E##m[aeiou]1 */ \
+    A##bu0 ^= Du0; \
+    Bma1 = ROL32(A##bu0, 13); \
+    A##ga1 ^= Da1; \
+    Bme1 = ROL32(A##ga1, 18); \
+    A##ke1 ^= De1; \
+    Bmi1 = ROL32(A##ke1, 5); \
+    E##ma1 =   Bma1 ^((~Bme1)&  Bmi1 ); \
+    Ca1 ^= E##ma1; \
+    A##mi0 ^= Di0; \
+    Bmo1 = ROL32(A##mi0, 7); \
+    E##me1 =   Bme1 ^((~Bmi1)&  Bmo1 ); \
+    Ce1 ^= E##me1; \
+    A##so1 ^= Do1; \
+    Bmu1 = ROL32(A##so1, 28); \
+    E##mi1 =   Bmi1 ^((~Bmo1)&  Bmu1 ); \
+    Ci1 ^= E##mi1; \
+    E##mo1 =   Bmo1 ^((~Bmu1)&  Bma1 ); \
+    Co1 ^= E##mo1; \
+    E##mu1 =   Bmu1 ^((~Bma1)&  Bme1 ); \
+    Cu1 ^= E##mu1; \
+    /* E##s[aeiou]0 */ \
+    A##bi0 ^= Di0; \
+    Bsa0 = ROL32(A##bi0, 31); \
+    A##go1 ^= Do1; \
+    Bse0 = ROL32(A##go1, 28); \
+    A##ku1 ^= Du1; \
+    Bsi0 = ROL32(A##ku1, 20); \
+    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
+    Ca0 ^= E##sa0; \
+    A##ma1 ^= Da1; \
+    Bso0 = ROL32(A##ma1, 21); \
+    E##se0 =   Bse0 ^((~Bsi0)&  Bso0 ); \
+    Ce0 ^= E##se0; \
+    A##se0 ^= De0; \
+    Bsu0 = ROL32(A##se0, 1); \
+    E##si0 =   Bsi0 ^((~Bso0)&  Bsu0 ); \
+    Ci0 ^= E##si0; \
+    E##so0 =   Bso0 ^((~Bsu0)&  Bsa0 ); \
+    Co0 ^= E##so0; \
+    E##su0 =   Bsu0 ^((~Bsa0)&  Bse0 ); \
+    Cu0 ^= E##su0; \
+    /* E##s[aeiou]1: last group; afterwards every C word is the XOR of the five E words of its column */ \
+    A##bi1 ^= Di1; \
+    Bsa1 = ROL32(A##bi1, 31); \
+    A##go0 ^= Do0; \
+    Bse1 = ROL32(A##go0, 27); \
+    A##ku0 ^= Du0; \
+    Bsi1 = ROL32(A##ku0, 19); \
+    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
+    Ca1 ^= E##sa1; \
+    A##ma0 ^= Da0; \
+    Bso1 = ROL32(A##ma0, 20); \
+    E##se1 =   Bse1 ^((~Bsi1)&  Bso1 ); \
+    Ce1 ^= E##se1; \
+    A##se1 ^= De1; \
+    Bsu1 = ROL32(A##se1, 1); \
+    E##si1 =   Bsi1 ^((~Bso1)&  Bsu1 ); \
+    Ci1 ^= E##si1; \
+    E##so1 =   Bso1 ^((~Bsu1)&  Bsa1 ); \
+    Co1 ^= E##so1; \
+    E##su1 =   Bsu1 ^((~Bsa1)&  Bse1 ); \
+    Cu1 ^= E##su1; \
+\
+
+// --- Code for one round (variant used when UseBebigokimisa is not defined): reads C and state A (A is modified in place by the D XORs) and writes state E; unlike thetaRhoPiChiIotaPrepareTheta, C is not rewritten.
+// --- Uses factor 2 interleaving, 64-bit lanes mapped to 32-bit words; i indexes KeccakF1600RoundConstants_int2_0/_1. C must already be set on entry.
+#define thetaRhoPiChiIota(i, A, E) \
+    Da0 = Cu0^ROL32(Ce1, 1); /* word-0 D values XOR in the word-1 C value of another column, rotated left by 1 */ \
+    Da1 = Cu1^Ce0; /* word-1 D values XOR in the word-0 C value of that column, unrotated */ \
+    De0 = Ca0^ROL32(Ci1, 1); \
+    De1 = Ca1^Ci0; \
+    Di0 = Ce0^ROL32(Co1, 1); \
+    Di1 = Ce1^Co0; \
+    Do0 = Ci0^ROL32(Cu1, 1); \
+    Do1 = Ci1^Cu0; \
+    Du0 = Co0^ROL32(Ca1, 1); \
+    Du1 = Co1^Ca0; \
+    /* E##b[aeiou]0: XOR D into five A words, rotate them into B, then E = B ^ (~next & next-but-one) along the group; the round constant is XORed into E##ba0 */ \
+    A##ba0 ^= Da0; \
+    Bba0 = A##ba0; \
+    A##ge0 ^= De0; \
+    Bbe0 = ROL32(A##ge0, 22); \
+    A##ki1 ^= Di1; \
+    Bbi0 = ROL32(A##ki1, 22); \
+    E##ba0 =   Bba0 ^((~Bbe0)&  Bbi0 ); \
+    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \
+    A##mo1 ^= Do1; \
+    Bbo0 = ROL32(A##mo1, 11); \
+    E##be0 =   Bbe0 ^((~Bbi0)&  Bbo0 ); \
+    A##su0 ^= Du0; \
+    Bbu0 = ROL32(A##su0, 7); \
+    E##bi0 =   Bbi0 ^((~Bbo0)&  Bbu0 ); \
+    E##bo0 =   Bbo0 ^((~Bbu0)&  Bba0 ); \
+    E##bu0 =   Bbu0 ^((~Bba0)&  Bbe0 ); \
+    /* E##b[aeiou]1: same steps for the word-1 outputs; the round constant is XORed into E##ba1 */ \
+    A##ba1 ^= Da1; \
+    Bba1 = A##ba1; \
+    A##ge1 ^= De1; \
+    Bbe1 = ROL32(A##ge1, 22); \
+    A##ki0 ^= Di0; \
+    Bbi1 = ROL32(A##ki0, 21); \
+    E##ba1 =   Bba1 ^((~Bbe1)&  Bbi1 ); \
+    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \
+    A##mo0 ^= Do0; \
+    Bbo1 = ROL32(A##mo0, 10); \
+    E##be1 =   Bbe1 ^((~Bbi1)&  Bbo1 ); \
+    A##su1 ^= Du1; \
+    Bbu1 = ROL32(A##su1, 7); \
+    E##bi1 =   Bbi1 ^((~Bbo1)&  Bbu1 ); \
+    E##bo1 =   Bbo1 ^((~Bbu1)&  Bba1 ); \
+    E##bu1 =   Bbu1 ^((~Bba1)&  Bbe1 ); \
+    /* E##g[aeiou]0 */ \
+    A##bo0 ^= Do0; \
+    Bga0 = ROL32(A##bo0, 14); \
+    A##gu0 ^= Du0; \
+    Bge0 = ROL32(A##gu0, 10); \
+    A##ka1 ^= Da1; \
+    Bgi0 = ROL32(A##ka1, 2); \
+    E##ga0 =   Bga0 ^((~Bge0)&  Bgi0 ); \
+    A##me1 ^= De1; \
+    Bgo0 = ROL32(A##me1, 23); \
+    E##ge0 =   Bge0 ^((~Bgi0)&  Bgo0 ); \
+    A##si1 ^= Di1; \
+    Bgu0 = ROL32(A##si1, 31); \
+    E##gi0 =   Bgi0 ^((~Bgo0)&  Bgu0 ); \
+    E##go0 =   Bgo0 ^((~Bgu0)&  Bga0 ); \
+    E##gu0 =   Bgu0 ^((~Bga0)&  Bge0 ); \
+    /* E##g[aeiou]1 */ \
+    A##bo1 ^= Do1; \
+    Bga1 = ROL32(A##bo1, 14); \
+    A##gu1 ^= Du1; \
+    Bge1 = ROL32(A##gu1, 10); \
+    A##ka0 ^= Da0; \
+    Bgi1 = ROL32(A##ka0, 1); \
+    E##ga1 =   Bga1 ^((~Bge1)&  Bgi1 ); \
+    A##me0 ^= De0; \
+    Bgo1 = ROL32(A##me0, 22); \
+    E##ge1 =   Bge1 ^((~Bgi1)&  Bgo1 ); \
+    A##si0 ^= Di0; \
+    Bgu1 = ROL32(A##si0, 30); \
+    E##gi1 =   Bgi1 ^((~Bgo1)&  Bgu1 ); \
+    E##go1 =   Bgo1 ^((~Bgu1)&  Bga1 ); \
+    E##gu1 =   Bgu1 ^((~Bga1)&  Bge1 ); \
+    /* E##k[aeiou]0 */ \
+    A##be1 ^= De1; \
+    Bka0 = ROL32(A##be1, 1); \
+    A##gi0 ^= Di0; \
+    Bke0 = ROL32(A##gi0, 3); \
+    A##ko1 ^= Do1; \
+    Bki0 = ROL32(A##ko1, 13); \
+    E##ka0 =   Bka0 ^((~Bke0)&  Bki0 ); \
+    A##mu0 ^= Du0; \
+    Bko0 = ROL32(A##mu0, 4); \
+    E##ke0 =   Bke0 ^((~Bki0)&  Bko0 ); \
+    A##sa0 ^= Da0; \
+    Bku0 = ROL32(A##sa0, 9); \
+    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
+    E##ko0 =   Bko0 ^((~Bku0)&  Bka0 ); \
+    E##ku0 =   Bku0 ^((~Bka0)&  Bke0 ); \
+    /* E##k[aeiou]1 (Bka1 takes A##be0 without any rotation) */ \
+    A##be0 ^= De0; \
+    Bka1 = A##be0; \
+    A##gi1 ^= Di1; \
+    Bke1 = ROL32(A##gi1, 3); \
+    A##ko0 ^= Do0; \
+    Bki1 = ROL32(A##ko0, 12); \
+    E##ka1 =   Bka1 ^((~Bke1)&  Bki1 ); \
+    A##mu1 ^= Du1; \
+    Bko1 = ROL32(A##mu1, 4); \
+    E##ke1 =   Bke1 ^((~Bki1)&  Bko1 ); \
+    A##sa1 ^= Da1; \
+    Bku1 = ROL32(A##sa1, 9); \
+    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
+    E##ko1 =   Bko1 ^((~Bku1)&  Bka1 ); \
+    E##ku1 =   Bku1 ^((~Bka1)&  Bke1 ); \
+    /* E##m[aeiou]0 */ \
+    A##bu1 ^= Du1; \
+    Bma0 = ROL32(A##bu1, 14); \
+    A##ga0 ^= Da0; \
+    Bme0 = ROL32(A##ga0, 18); \
+    A##ke0 ^= De0; \
+    Bmi0 = ROL32(A##ke0, 5); \
+    E##ma0 =   Bma0 ^((~Bme0)&  Bmi0 ); \
+    A##mi1 ^= Di1; \
+    Bmo0 = ROL32(A##mi1, 8); \
+    E##me0 =   Bme0 ^((~Bmi0)&  Bmo0 ); \
+    A##so0 ^= Do0; \
+    Bmu0 = ROL32(A##so0, 28); \
+    E##mi0 =   Bmi0 ^((~Bmo0)&  Bmu0 ); \
+    E##mo0 =   Bmo0 ^((~Bmu0)&  Bma0 ); \
+    E##mu0 =   Bmu0 ^((~Bma0)&  Bme0 ); \
+    /* E##m[aeiou]1 */ \
+    A##bu0 ^= Du0; \
+    Bma1 = ROL32(A##bu0, 13); \
+    A##ga1 ^= Da1; \
+    Bme1 = ROL32(A##ga1, 18); \
+    A##ke1 ^= De1; \
+    Bmi1 = ROL32(A##ke1, 5); \
+    E##ma1 =   Bma1 ^((~Bme1)&  Bmi1 ); \
+    A##mi0 ^= Di0; \
+    Bmo1 = ROL32(A##mi0, 7); \
+    E##me1 =   Bme1 ^((~Bmi1)&  Bmo1 ); \
+    A##so1 ^= Do1; \
+    Bmu1 = ROL32(A##so1, 28); \
+    E##mi1 =   Bmi1 ^((~Bmo1)&  Bmu1 ); \
+    E##mo1 =   Bmo1 ^((~Bmu1)&  Bma1 ); \
+    E##mu1 =   Bmu1 ^((~Bma1)&  Bme1 ); \
+    /* E##s[aeiou]0 */ \
+    A##bi0 ^= Di0; \
+    Bsa0 = ROL32(A##bi0, 31); \
+    A##go1 ^= Do1; \
+    Bse0 = ROL32(A##go1, 28); \
+    A##ku1 ^= Du1; \
+    Bsi0 = ROL32(A##ku1, 20); \
+    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
+    A##ma1 ^= Da1; \
+    Bso0 = ROL32(A##ma1, 21); \
+    E##se0 =   Bse0 ^((~Bsi0)&  Bso0 ); \
+    A##se0 ^= De0; \
+    Bsu0 = ROL32(A##se0, 1); \
+    E##si0 =   Bsi0 ^((~Bso0)&  Bsu0 ); \
+    E##so0 =   Bso0 ^((~Bsu0)&  Bsa0 ); \
+    E##su0 =   Bsu0 ^((~Bsa0)&  Bse0 ); \
+    /* E##s[aeiou]1 */ \
+    A##bi1 ^= Di1; \
+    Bsa1 = ROL32(A##bi1, 31); \
+    A##go0 ^= Do0; \
+    Bse1 = ROL32(A##go0, 27); \
+    A##ku0 ^= Du0; \
+    Bsi1 = ROL32(A##ku0, 19); \
+    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
+    A##ma0 ^= Da0; \
+    Bso1 = ROL32(A##ma0, 20); \
+    E##se1 =   Bse1 ^((~Bsi1)&  Bso1 ); \
+    A##se1 ^= De1; \
+    Bsu1 = ROL32(A##se1, 1); \
+    E##si1 =   Bsi1 ^((~Bso1)&  Bsu1 ); \
+    E##so1 =   Bso1 ^((~Bsu1)&  Bsa1 ); \
+    E##su1 =   Bsu1 ^((~Bsa1)&  Bse1 ); \
+\
+
+#endif // UseBebigokimisa
+
+const UINT32 KeccakF1600RoundConstants_int2_0[24] = { /* 24 entries; the round macros XOR entry [i] into E##ba0 */
+    0x00000001UL, /* NOTE(review): every entry is 0 or 1; presumably the even-position bits of the 64-bit Keccak-f[1600] round constants under factor-2 interleaving - confirm against the spec */
+    0x00000000UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000001UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000000UL,
+    0x00000001UL,
+    0x00000000UL };
+
+const UINT32 KeccakF1600RoundConstants_int2_1[24] = { /* 24 entries; the round macros XOR entry [i] into E##ba1 */
+    0x00000000UL, /* NOTE(review): presumably the odd-position bits of the 64-bit Keccak-f[1600] round constants under factor-2 interleaving - confirm against the spec */
+    0x00000089UL,
+    0x8000008bUL,
+    0x80008080UL,
+    0x0000008bUL,
+    0x00008000UL,
+    0x80008088UL,
+    0x80000082UL,
+    0x0000000bUL,
+    0x0000000aUL,
+    0x00008082UL,
+    0x00008003UL,
+    0x0000808bUL,
+    0x8000000bUL,
+    0x8000008aUL,
+    0x80000081UL,
+    0x80000081UL,
+    0x80000008UL,
+    0x00000083UL,
+    0x80008003UL,
+    0x80008088UL,
+    0x80000088UL,
+    0x00008000UL,
+    0x80008082UL };
+
+#define copyFromStateAndXor1024bits(X, state, input) /* Load the 50 X-prefixed words from state[0..49], XORing input[0..31] (32 words = 1024 bits) into the first 32. */ \
+    X##ba0 = state[ 0]^input[ 0]; \
+    X##ba1 = state[ 1]^input[ 1]; \
+    X##be0 = state[ 2]^input[ 2]; \
+    X##be1 = state[ 3]^input[ 3]; \
+    X##bi0 = state[ 4]^input[ 4]; \
+    X##bi1 = state[ 5]^input[ 5]; \
+    X##bo0 = state[ 6]^input[ 6]; \
+    X##bo1 = state[ 7]^input[ 7]; \
+    X##bu0 = state[ 8]^input[ 8]; \
+    X##bu1 = state[ 9]^input[ 9]; \
+    X##ga0 = state[10]^input[10]; \
+    X##ga1 = state[11]^input[11]; \
+    X##ge0 = state[12]^input[12]; \
+    X##ge1 = state[13]^input[13]; \
+    X##gi0 = state[14]^input[14]; \
+    X##gi1 = state[15]^input[15]; \
+    X##go0 = state[16]^input[16]; \
+    X##go1 = state[17]^input[17]; \
+    X##gu0 = state[18]^input[18]; \
+    X##gu1 = state[19]^input[19]; \
+    X##ka0 = state[20]^input[20]; \
+    X##ka1 = state[21]^input[21]; \
+    X##ke0 = state[22]^input[22]; \
+    X##ke1 = state[23]^input[23]; \
+    X##ki0 = state[24]^input[24]; \
+    X##ki1 = state[25]^input[25]; \
+    X##ko0 = state[26]^input[26]; \
+    X##ko1 = state[27]^input[27]; \
+    X##ku0 = state[28]^input[28]; \
+    X##ku1 = state[29]^input[29]; \
+    X##ma0 = state[30]^input[30]; \
+    X##ma1 = state[31]^input[31]; \
+    X##me0 = state[32]; /* from here on: plain copy, input is not read */ \
+    X##me1 = state[33]; \
+    X##mi0 = state[34]; \
+    X##mi1 = state[35]; \
+    X##mo0 = state[36]; \
+    X##mo1 = state[37]; \
+    X##mu0 = state[38]; \
+    X##mu1 = state[39]; \
+    X##sa0 = state[40]; \
+    X##sa1 = state[41]; \
+    X##se0 = state[42]; \
+    X##se1 = state[43]; \
+    X##si0 = state[44]; \
+    X##si1 = state[45]; \
+    X##so0 = state[46]; \
+    X##so1 = state[47]; \
+    X##su0 = state[48]; \
+    X##su1 = state[49]; \
+
+#define copyFromStateAndXor1088bits(X, state, input) /* Load the 50 X-prefixed words from state[0..49], XORing input[0..33] (34 words = 1088 bits) into the first 34. */ \
+    X##ba0 = state[ 0]^input[ 0]; \
+    X##ba1 = state[ 1]^input[ 1]; \
+    X##be0 = state[ 2]^input[ 2]; \
+    X##be1 = state[ 3]^input[ 3]; \
+    X##bi0 = state[ 4]^input[ 4]; \
+    X##bi1 = state[ 5]^input[ 5]; \
+    X##bo0 = state[ 6]^input[ 6]; \
+    X##bo1 = state[ 7]^input[ 7]; \
+    X##bu0 = state[ 8]^input[ 8]; \
+    X##bu1 = state[ 9]^input[ 9]; \
+    X##ga0 = state[10]^input[10]; \
+    X##ga1 = state[11]^input[11]; \
+    X##ge0 = state[12]^input[12]; \
+    X##ge1 = state[13]^input[13]; \
+    X##gi0 = state[14]^input[14]; \
+    X##gi1 = state[15]^input[15]; \
+    X##go0 = state[16]^input[16]; \
+    X##go1 = state[17]^input[17]; \
+    X##gu0 = state[18]^input[18]; \
+    X##gu1 = state[19]^input[19]; \
+    X##ka0 = state[20]^input[20]; \
+    X##ka1 = state[21]^input[21]; \
+    X##ke0 = state[22]^input[22]; \
+    X##ke1 = state[23]^input[23]; \
+    X##ki0 = state[24]^input[24]; \
+    X##ki1 = state[25]^input[25]; \
+    X##ko0 = state[26]^input[26]; \
+    X##ko1 = state[27]^input[27]; \
+    X##ku0 = state[28]^input[28]; \
+    X##ku1 = state[29]^input[29]; \
+    X##ma0 = state[30]^input[30]; \
+    X##ma1 = state[31]^input[31]; \
+    X##me0 = state[32]^input[32]; \
+    X##me1 = state[33]^input[33]; \
+    X##mi0 = state[34]; /* from here on: plain copy, input is not read */ \
+    X##mi1 = state[35]; \
+    X##mo0 = state[36]; \
+    X##mo1 = state[37]; \
+    X##mu0 = state[38]; \
+    X##mu1 = state[39]; \
+    X##sa0 = state[40]; \
+    X##sa1 = state[41]; \
+    X##se0 = state[42]; \
+    X##se1 = state[43]; \
+    X##si0 = state[44]; \
+    X##si1 = state[45]; \
+    X##so0 = state[46]; \
+    X##so1 = state[47]; \
+    X##su0 = state[48]; \
+    X##su1 = state[49]; \
+
+#define copyFromState(X, state) /* Load the 50 X-prefixed words from state[0..49]: lane order ba, be, bi, bo, bu, ga, ... su, with the 0 word at the even index and the 1 word at the odd index. */ \
+    X##ba0 = state[ 0]; \
+    X##ba1 = state[ 1]; \
+    X##be0 = state[ 2]; \
+    X##be1 = state[ 3]; \
+    X##bi0 = state[ 4]; \
+    X##bi1 = state[ 5]; \
+    X##bo0 = state[ 6]; \
+    X##bo1 = state[ 7]; \
+    X##bu0 = state[ 8]; \
+    X##bu1 = state[ 9]; \
+    X##ga0 = state[10]; \
+    X##ga1 = state[11]; \
+    X##ge0 = state[12]; \
+    X##ge1 = state[13]; \
+    X##gi0 = state[14]; \
+    X##gi1 = state[15]; \
+    X##go0 = state[16]; \
+    X##go1 = state[17]; \
+    X##gu0 = state[18]; \
+    X##gu1 = state[19]; \
+    X##ka0 = state[20]; \
+    X##ka1 = state[21]; \
+    X##ke0 = state[22]; \
+    X##ke1 = state[23]; \
+    X##ki0 = state[24]; \
+    X##ki1 = state[25]; \
+    X##ko0 = state[26]; \
+    X##ko1 = state[27]; \
+    X##ku0 = state[28]; \
+    X##ku1 = state[29]; \
+    X##ma0 = state[30]; \
+    X##ma1 = state[31]; \
+    X##me0 = state[32]; \
+    X##me1 = state[33]; \
+    X##mi0 = state[34]; \
+    X##mi1 = state[35]; \
+    X##mo0 = state[36]; \
+    X##mo1 = state[37]; \
+    X##mu0 = state[38]; \
+    X##mu1 = state[39]; \
+    X##sa0 = state[40]; \
+    X##sa1 = state[41]; \
+    X##se0 = state[42]; \
+    X##se1 = state[43]; \
+    X##si0 = state[44]; \
+    X##si1 = state[45]; \
+    X##so0 = state[46]; \
+    X##so1 = state[47]; \
+    X##su0 = state[48]; \
+    X##su1 = state[49]; \
+
+#define copyToState(state, X) /* Store the 50 X-prefixed words into state[0..49], using the same index layout that copyFromState reads. */ \
+    state[ 0] = X##ba0; \
+    state[ 1] = X##ba1; \
+    state[ 2] = X##be0; \
+    state[ 3] = X##be1; \
+    state[ 4] = X##bi0; \
+    state[ 5] = X##bi1; \
+    state[ 6] = X##bo0; \
+    state[ 7] = X##bo1; \
+    state[ 8] = X##bu0; \
+    state[ 9] = X##bu1; \
+    state[10] = X##ga0; \
+    state[11] = X##ga1; \
+    state[12] = X##ge0; \
+    state[13] = X##ge1; \
+    state[14] = X##gi0; \
+    state[15] = X##gi1; \
+    state[16] = X##go0; \
+    state[17] = X##go1; \
+    state[18] = X##gu0; \
+    state[19] = X##gu1; \
+    state[20] = X##ka0; \
+    state[21] = X##ka1; \
+    state[22] = X##ke0; \
+    state[23] = X##ke1; \
+    state[24] = X##ki0; \
+    state[25] = X##ki1; \
+    state[26] = X##ko0; \
+    state[27] = X##ko1; \
+    state[28] = X##ku0; \
+    state[29] = X##ku1; \
+    state[30] = X##ma0; \
+    state[31] = X##ma1; \
+    state[32] = X##me0; \
+    state[33] = X##me1; \
+    state[34] = X##mi0; \
+    state[35] = X##mi1; \
+    state[36] = X##mo0; \
+    state[37] = X##mo1; \
+    state[38] = X##mu0; \
+    state[39] = X##mu1; \
+    state[40] = X##sa0; \
+    state[41] = X##sa1; \
+    state[42] = X##se0; \
+    state[43] = X##se1; \
+    state[44] = X##si0; \
+    state[45] = X##si1; \
+    state[46] = X##so0; \
+    state[47] = X##so1; \
+    state[48] = X##su0; \
+    state[49] = X##su1; \
+
+#define copyStateVariables(X, Y) /* Copy all 50 state words: each Y-prefixed word is assigned to the X-prefixed word of the same name. */ \
+    X##ba0 = Y##ba0; \
+    X##ba1 = Y##ba1; \
+    X##be0 = Y##be0; \
+    X##be1 = Y##be1; \
+    X##bi0 = Y##bi0; \
+    X##bi1 = Y##bi1; \
+    X##bo0 = Y##bo0; \
+    X##bo1 = Y##bo1; \
+    X##bu0 = Y##bu0; \
+    X##bu1 = Y##bu1; \
+    X##ga0 = Y##ga0; \
+    X##ga1 = Y##ga1; \
+    X##ge0 = Y##ge0; \
+    X##ge1 = Y##ge1; \
+    X##gi0 = Y##gi0; \
+    X##gi1 = Y##gi1; \
+    X##go0 = Y##go0; \
+    X##go1 = Y##go1; \
+    X##gu0 = Y##gu0; \
+    X##gu1 = Y##gu1; \
+    X##ka0 = Y##ka0; \
+    X##ka1 = Y##ka1; \
+    X##ke0 = Y##ke0; \
+    X##ke1 = Y##ke1; \
+    X##ki0 = Y##ki0; \
+    X##ki1 = Y##ki1; \
+    X##ko0 = Y##ko0; \
+    X##ko1 = Y##ko1; \
+    X##ku0 = Y##ku0; \
+    X##ku1 = Y##ku1; \
+    X##ma0 = Y##ma0; \
+    X##ma1 = Y##ma1; \
+    X##me0 = Y##me0; \
+    X##me1 = Y##me1; \
+    X##mi0 = Y##mi0; \
+    X##mi1 = Y##mi1; \
+    X##mo0 = Y##mo0; \
+    X##mo1 = Y##mo1; \
+    X##mu0 = Y##mu0; \
+    X##mu1 = Y##mu1; \
+    X##sa0 = Y##sa0; \
+    X##sa1 = Y##sa1; \
+    X##se0 = Y##se0; \
+    X##se1 = Y##se1; \
+    X##si0 = Y##si0; \
+    X##si1 = Y##si1; \
+    X##so0 = Y##so0; \
+    X##so1 = Y##so1; \
+    X##su0 = Y##su0; \
+    X##su1 = Y##su1; \
+
diff --git a/c_src/KeccakF-1600-32.macros b/c_src/KeccakF-1600-32.macros
new file mode 100755
index 0000000..9ade600
--- /dev/null
+++ b/c_src/KeccakF-1600-32.macros
@@ -0,0 +1,26 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+/* Pull in the round macros for the selected 32-bit schedule.
+   UseSchedule may be 1, 2 or 3; when it is not defined, schedule 1 is used. */
+#if !defined(UseSchedule)
+    #include "KeccakF-1600-32-s1.macros"
+#elif (UseSchedule == 1)
+    #include "KeccakF-1600-32-s1.macros"
+#elif (UseSchedule == 2)
+    #include "KeccakF-1600-32-s2.macros"
+#elif (UseSchedule == 3)
+    #include "KeccakF-1600-32-rvk.macros"
+#else
+    #error "This schedule is not supported."
+#endif
diff --git a/c_src/KeccakF-1600-64.macros b/c_src/KeccakF-1600-64.macros
new file mode 100755
index 0000000..0c20bca
--- /dev/null
+++ b/c_src/KeccakF-1600-64.macros
@@ -0,0 +1,728 @@
+/*
+Code automatically generated by KeccakTools!
+
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+/* Declares every local the round macros work on, all of type UINT64:
+   A** and E** are the two copies of the 25-lane state, B** holds the
+   rotated lanes of the current round, C* the column parities and D* the
+   theta correction values. */
+#define declareABCDE \
+    UINT64 Aba, Abe, Abi, Abo, Abu, \
+           Aga, Age, Agi, Ago, Agu, \
+           Aka, Ake, Aki, Ako, Aku, \
+           Ama, Ame, Ami, Amo, Amu, \
+           Asa, Ase, Asi, Aso, Asu; \
+    UINT64 Bba, Bbe, Bbi, Bbo, Bbu, \
+           Bga, Bge, Bgi, Bgo, Bgu, \
+           Bka, Bke, Bki, Bko, Bku, \
+           Bma, Bme, Bmi, Bmo, Bmu, \
+           Bsa, Bse, Bsi, Bso, Bsu; \
+    UINT64 Ca, Ce, Ci, Co, Cu, \
+           Da, De, Di, Do, Du; \
+    UINT64 Eba, Ebe, Ebi, Ebo, Ebu, \
+           Ega, Ege, Egi, Ego, Egu, \
+           Eka, Eke, Eki, Eko, Eku, \
+           Ema, Eme, Emi, Emo, Emu, \
+           Esa, Ese, Esi, Eso, Esu; \
+
+#define prepareTheta \
+    Ca = Aba^Aga^Aka^Ama^Asa; \
+    Ce = Abe^Age^Ake^Ame^Ase; \
+    Ci = Abi^Agi^Aki^Ami^Asi; \
+    Co = Abo^Ago^Ako^Amo^Aso; \
+    Cu = Abu^Agu^Aku^Amu^Asu; \
+
+#ifdef UseBebigokimisa
+// --- Code for round, with prepare-theta (lane complementing pattern 'bebigokimisa')
+// --- 64-bit lanes mapped to 64-bit words
+// One round: reads lanes A##xx and the column parities Ca..Cu, writes the new
+// state to lanes E##xx, and rebuilds Ca..Cu from the E lanes so the next round
+// can start without a separate prepareTheta.
+//   i : index into KeccakF1600RoundConstants for this round
+//   A : prefix of the input lane variables; they are modified in place (the D
+//       values are XORed into them)
+//   E : prefix of the output lane variables
+// Uses the temporaries Da..Du and Bba..Bsu (see declareABCDE). In this variant
+// the chi expressions mix |, & and selective ~, per the lane complementing
+// pattern named above.
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    /* theta: D[x] = C[x-1] ^ ROL64(C[x+1], 1), columns a,e,i,o,u taken cyclically */ \
+    Da = Cu^ROL64(Ce, 1); \
+    De = Ca^ROL64(Ci, 1); \
+    Di = Ce^ROL64(Co, 1); \
+    Do = Ci^ROL64(Cu, 1); \
+    Du = Co^ROL64(Ca, 1); \
+\
+    /* output plane b: fold D into five A lanes, rotate them into Bb*, combine into E##b*; */ \
+    /* the round constant is XORed into E##ba and Ca..Cu restart from this plane            */ \
+    A##ba ^= Da; \
+    Bba = A##ba; \
+    A##ge ^= De; \
+    Bbe = ROL64(A##ge, 44); \
+    A##ki ^= Di; \
+    Bbi = ROL64(A##ki, 43); \
+    A##mo ^= Do; \
+    Bbo = ROL64(A##mo, 21); \
+    A##su ^= Du; \
+    Bbu = ROL64(A##su, 14); \
+    E##ba =   Bba ^(  Bbe |  Bbi ); \
+    E##ba ^= KeccakF1600RoundConstants[i]; \
+    Ca = E##ba; \
+    E##be =   Bbe ^((~Bbi)|  Bbo ); \
+    Ce = E##be; \
+    E##bi =   Bbi ^(  Bbo &  Bbu ); \
+    Ci = E##bi; \
+    E##bo =   Bbo ^(  Bbu |  Bba ); \
+    Co = E##bo; \
+    E##bu =   Bbu ^(  Bba &  Bbe ); \
+    Cu = E##bu; \
+\
+    /* output plane g: same pattern; Ca..Cu accumulate E##g* */ \
+    A##bo ^= Do; \
+    Bga = ROL64(A##bo, 28); \
+    A##gu ^= Du; \
+    Bge = ROL64(A##gu, 20); \
+    A##ka ^= Da; \
+    Bgi = ROL64(A##ka, 3); \
+    A##me ^= De; \
+    Bgo = ROL64(A##me, 45); \
+    A##si ^= Di; \
+    Bgu = ROL64(A##si, 61); \
+    E##ga =   Bga ^(  Bge |  Bgi ); \
+    Ca ^= E##ga; \
+    E##ge =   Bge ^(  Bgi &  Bgo ); \
+    Ce ^= E##ge; \
+    E##gi =   Bgi ^(  Bgo |(~Bgu)); \
+    Ci ^= E##gi; \
+    E##go =   Bgo ^(  Bgu |  Bga ); \
+    Co ^= E##go; \
+    E##gu =   Bgu ^(  Bga &  Bge ); \
+    Cu ^= E##gu; \
+\
+    /* output plane k */ \
+    A##be ^= De; \
+    Bka = ROL64(A##be, 1); \
+    A##gi ^= Di; \
+    Bke = ROL64(A##gi, 6); \
+    A##ko ^= Do; \
+    Bki = ROL64(A##ko, 25); \
+    A##mu ^= Du; \
+    Bko = ROL64(A##mu, 8); \
+    A##sa ^= Da; \
+    Bku = ROL64(A##sa, 18); \
+    E##ka =   Bka ^(  Bke |  Bki ); \
+    Ca ^= E##ka; \
+    E##ke =   Bke ^(  Bki &  Bko ); \
+    Ce ^= E##ke; \
+    E##ki =   Bki ^((~Bko)&  Bku ); \
+    Ci ^= E##ki; \
+    E##ko = (~Bko)^(  Bku |  Bka ); \
+    Co ^= E##ko; \
+    E##ku =   Bku ^(  Bka &  Bke ); \
+    Cu ^= E##ku; \
+\
+    /* output plane m */ \
+    A##bu ^= Du; \
+    Bma = ROL64(A##bu, 27); \
+    A##ga ^= Da; \
+    Bme = ROL64(A##ga, 36); \
+    A##ke ^= De; \
+    Bmi = ROL64(A##ke, 10); \
+    A##mi ^= Di; \
+    Bmo = ROL64(A##mi, 15); \
+    A##so ^= Do; \
+    Bmu = ROL64(A##so, 56); \
+    E##ma =   Bma ^(  Bme &  Bmi ); \
+    Ca ^= E##ma; \
+    E##me =   Bme ^(  Bmi |  Bmo ); \
+    Ce ^= E##me; \
+    E##mi =   Bmi ^((~Bmo)|  Bmu ); \
+    Ci ^= E##mi; \
+    E##mo = (~Bmo)^(  Bmu &  Bma ); \
+    Co ^= E##mo; \
+    E##mu =   Bmu ^(  Bma |  Bme ); \
+    Cu ^= E##mu; \
+\
+    /* output plane s */ \
+    A##bi ^= Di; \
+    Bsa = ROL64(A##bi, 62); \
+    A##go ^= Do; \
+    Bse = ROL64(A##go, 55); \
+    A##ku ^= Du; \
+    Bsi = ROL64(A##ku, 39); \
+    A##ma ^= Da; \
+    Bso = ROL64(A##ma, 41); \
+    A##se ^= De; \
+    Bsu = ROL64(A##se, 2); \
+    E##sa =   Bsa ^((~Bse)&  Bsi ); \
+    Ca ^= E##sa; \
+    E##se = (~Bse)^(  Bsi |  Bso ); \
+    Ce ^= E##se; \
+    E##si =   Bsi ^(  Bso &  Bsu ); \
+    Ci ^= E##si; \
+    E##so =   Bso ^(  Bsu |  Bsa ); \
+    Co ^= E##so; \
+    E##su =   Bsu ^(  Bsa &  Bse ); \
+    Cu ^= E##su; \
+\
+
+// --- Code for round (lane complementing pattern 'bebigokimisa')
+// --- 64-bit lanes mapped to 64-bit words
+// One round without the prepare-theta bookkeeping: reads lanes A##xx and the
+// column parities Ca..Cu, writes the new state to lanes E##xx, and leaves
+// Ca..Cu untouched.
+//   i : index into KeccakF1600RoundConstants for this round
+//   A : prefix of the input lane variables; they are modified in place (the D
+//       values are XORed into them)
+//   E : prefix of the output lane variables
+// Uses the temporaries Da..Du and Bba..Bsu (see declareABCDE).
+#define thetaRhoPiChiIota(i, A, E) \
+    /* theta: D[x] = C[x-1] ^ ROL64(C[x+1], 1), columns a,e,i,o,u taken cyclically */ \
+    Da = Cu^ROL64(Ce, 1); \
+    De = Ca^ROL64(Ci, 1); \
+    Di = Ce^ROL64(Co, 1); \
+    Do = Ci^ROL64(Cu, 1); \
+    Du = Co^ROL64(Ca, 1); \
+\
+    /* output plane b: fold D into five A lanes, rotate them into Bb*, combine into E##b*; */ \
+    /* the round constant is XORed into E##ba                                               */ \
+    A##ba ^= Da; \
+    Bba = A##ba; \
+    A##ge ^= De; \
+    Bbe = ROL64(A##ge, 44); \
+    A##ki ^= Di; \
+    Bbi = ROL64(A##ki, 43); \
+    A##mo ^= Do; \
+    Bbo = ROL64(A##mo, 21); \
+    A##su ^= Du; \
+    Bbu = ROL64(A##su, 14); \
+    E##ba =   Bba ^(  Bbe |  Bbi ); \
+    E##ba ^= KeccakF1600RoundConstants[i]; \
+    E##be =   Bbe ^((~Bbi)|  Bbo ); \
+    E##bi =   Bbi ^(  Bbo &  Bbu ); \
+    E##bo =   Bbo ^(  Bbu |  Bba ); \
+    E##bu =   Bbu ^(  Bba &  Bbe ); \
+\
+    /* output plane g */ \
+    A##bo ^= Do; \
+    Bga = ROL64(A##bo, 28); \
+    A##gu ^= Du; \
+    Bge = ROL64(A##gu, 20); \
+    A##ka ^= Da; \
+    Bgi = ROL64(A##ka, 3); \
+    A##me ^= De; \
+    Bgo = ROL64(A##me, 45); \
+    A##si ^= Di; \
+    Bgu = ROL64(A##si, 61); \
+    E##ga =   Bga ^(  Bge |  Bgi ); \
+    E##ge =   Bge ^(  Bgi &  Bgo ); \
+    E##gi =   Bgi ^(  Bgo |(~Bgu)); \
+    E##go =   Bgo ^(  Bgu |  Bga ); \
+    E##gu =   Bgu ^(  Bga &  Bge ); \
+\
+    /* output plane k */ \
+    A##be ^= De; \
+    Bka = ROL64(A##be, 1); \
+    A##gi ^= Di; \
+    Bke = ROL64(A##gi, 6); \
+    A##ko ^= Do; \
+    Bki = ROL64(A##ko, 25); \
+    A##mu ^= Du; \
+    Bko = ROL64(A##mu, 8); \
+    A##sa ^= Da; \
+    Bku = ROL64(A##sa, 18); \
+    E##ka =   Bka ^(  Bke |  Bki ); \
+    E##ke =   Bke ^(  Bki &  Bko ); \
+    E##ki =   Bki ^((~Bko)&  Bku ); \
+    E##ko = (~Bko)^(  Bku |  Bka ); \
+    E##ku =   Bku ^(  Bka &  Bke ); \
+\
+    /* output plane m */ \
+    A##bu ^= Du; \
+    Bma = ROL64(A##bu, 27); \
+    A##ga ^= Da; \
+    Bme = ROL64(A##ga, 36); \
+    A##ke ^= De; \
+    Bmi = ROL64(A##ke, 10); \
+    A##mi ^= Di; \
+    Bmo = ROL64(A##mi, 15); \
+    A##so ^= Do; \
+    Bmu = ROL64(A##so, 56); \
+    E##ma =   Bma ^(  Bme &  Bmi ); \
+    E##me =   Bme ^(  Bmi |  Bmo ); \
+    E##mi =   Bmi ^((~Bmo)|  Bmu ); \
+    E##mo = (~Bmo)^(  Bmu &  Bma ); \
+    E##mu =   Bmu ^(  Bma |  Bme ); \
+\
+    /* output plane s */ \
+    A##bi ^= Di; \
+    Bsa = ROL64(A##bi, 62); \
+    A##go ^= Do; \
+    Bse = ROL64(A##go, 55); \
+    A##ku ^= Du; \
+    Bsi = ROL64(A##ku, 39); \
+    A##ma ^= Da; \
+    Bso = ROL64(A##ma, 41); \
+    A##se ^= De; \
+    Bsu = ROL64(A##se, 2); \
+    E##sa =   Bsa ^((~Bse)&  Bsi ); \
+    E##se = (~Bse)^(  Bsi |  Bso ); \
+    E##si =   Bsi ^(  Bso &  Bsu ); \
+    E##so =   Bso ^(  Bsu |  Bsa ); \
+    E##su =   Bsu ^(  Bsa &  Bse ); \
+\
+
+#else // UseBebigokimisa
+// --- Code for round, with prepare-theta
+// --- 64-bit lanes mapped to 64-bit words
+// One round: reads lanes A##xx and the column parities Ca..Cu, writes the new
+// state to lanes E##xx, and rebuilds Ca..Cu from the E lanes so the next round
+// can start without a separate prepareTheta.
+//   i : index into KeccakF1600RoundConstants for this round
+//   A : prefix of the input lane variables; they are modified in place (the D
+//       values are XORed into them)
+//   E : prefix of the output lane variables
+// Uses the temporaries Da..Du and Bba..Bsu (see declareABCDE). Every output
+// lane here has the same form: E = B[x] ^ ((~B[x+1]) & B[x+2]).
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    /* theta: D[x] = C[x-1] ^ ROL64(C[x+1], 1), columns a,e,i,o,u taken cyclically */ \
+    Da = Cu^ROL64(Ce, 1); \
+    De = Ca^ROL64(Ci, 1); \
+    Di = Ce^ROL64(Co, 1); \
+    Do = Ci^ROL64(Cu, 1); \
+    Du = Co^ROL64(Ca, 1); \
+\
+    /* output plane b: fold D into five A lanes, rotate them into Bb*, combine into E##b*; */ \
+    /* the round constant is XORed into E##ba and Ca..Cu restart from this plane            */ \
+    A##ba ^= Da; \
+    Bba = A##ba; \
+    A##ge ^= De; \
+    Bbe = ROL64(A##ge, 44); \
+    A##ki ^= Di; \
+    Bbi = ROL64(A##ki, 43); \
+    A##mo ^= Do; \
+    Bbo = ROL64(A##mo, 21); \
+    A##su ^= Du; \
+    Bbu = ROL64(A##su, 14); \
+    E##ba =   Bba ^((~Bbe)&  Bbi ); \
+    E##ba ^= KeccakF1600RoundConstants[i]; \
+    Ca = E##ba; \
+    E##be =   Bbe ^((~Bbi)&  Bbo ); \
+    Ce = E##be; \
+    E##bi =   Bbi ^((~Bbo)&  Bbu ); \
+    Ci = E##bi; \
+    E##bo =   Bbo ^((~Bbu)&  Bba ); \
+    Co = E##bo; \
+    E##bu =   Bbu ^((~Bba)&  Bbe ); \
+    Cu = E##bu; \
+\
+    /* output plane g: same pattern; Ca..Cu accumulate E##g* */ \
+    A##bo ^= Do; \
+    Bga = ROL64(A##bo, 28); \
+    A##gu ^= Du; \
+    Bge = ROL64(A##gu, 20); \
+    A##ka ^= Da; \
+    Bgi = ROL64(A##ka, 3); \
+    A##me ^= De; \
+    Bgo = ROL64(A##me, 45); \
+    A##si ^= Di; \
+    Bgu = ROL64(A##si, 61); \
+    E##ga =   Bga ^((~Bge)&  Bgi ); \
+    Ca ^= E##ga; \
+    E##ge =   Bge ^((~Bgi)&  Bgo ); \
+    Ce ^= E##ge; \
+    E##gi =   Bgi ^((~Bgo)&  Bgu ); \
+    Ci ^= E##gi; \
+    E##go =   Bgo ^((~Bgu)&  Bga ); \
+    Co ^= E##go; \
+    E##gu =   Bgu ^((~Bga)&  Bge ); \
+    Cu ^= E##gu; \
+\
+    /* output plane k */ \
+    A##be ^= De; \
+    Bka = ROL64(A##be, 1); \
+    A##gi ^= Di; \
+    Bke = ROL64(A##gi, 6); \
+    A##ko ^= Do; \
+    Bki = ROL64(A##ko, 25); \
+    A##mu ^= Du; \
+    Bko = ROL64(A##mu, 8); \
+    A##sa ^= Da; \
+    Bku = ROL64(A##sa, 18); \
+    E##ka =   Bka ^((~Bke)&  Bki ); \
+    Ca ^= E##ka; \
+    E##ke =   Bke ^((~Bki)&  Bko ); \
+    Ce ^= E##ke; \
+    E##ki =   Bki ^((~Bko)&  Bku ); \
+    Ci ^= E##ki; \
+    E##ko =   Bko ^((~Bku)&  Bka ); \
+    Co ^= E##ko; \
+    E##ku =   Bku ^((~Bka)&  Bke ); \
+    Cu ^= E##ku; \
+\
+    /* output plane m */ \
+    A##bu ^= Du; \
+    Bma = ROL64(A##bu, 27); \
+    A##ga ^= Da; \
+    Bme = ROL64(A##ga, 36); \
+    A##ke ^= De; \
+    Bmi = ROL64(A##ke, 10); \
+    A##mi ^= Di; \
+    Bmo = ROL64(A##mi, 15); \
+    A##so ^= Do; \
+    Bmu = ROL64(A##so, 56); \
+    E##ma =   Bma ^((~Bme)&  Bmi ); \
+    Ca ^= E##ma; \
+    E##me =   Bme ^((~Bmi)&  Bmo ); \
+    Ce ^= E##me; \
+    E##mi =   Bmi ^((~Bmo)&  Bmu ); \
+    Ci ^= E##mi; \
+    E##mo =   Bmo ^((~Bmu)&  Bma ); \
+    Co ^= E##mo; \
+    E##mu =   Bmu ^((~Bma)&  Bme ); \
+    Cu ^= E##mu; \
+\
+    /* output plane s */ \
+    A##bi ^= Di; \
+    Bsa = ROL64(A##bi, 62); \
+    A##go ^= Do; \
+    Bse = ROL64(A##go, 55); \
+    A##ku ^= Du; \
+    Bsi = ROL64(A##ku, 39); \
+    A##ma ^= Da; \
+    Bso = ROL64(A##ma, 41); \
+    A##se ^= De; \
+    Bsu = ROL64(A##se, 2); \
+    E##sa =   Bsa ^((~Bse)&  Bsi ); \
+    Ca ^= E##sa; \
+    E##se =   Bse ^((~Bsi)&  Bso ); \
+    Ce ^= E##se; \
+    E##si =   Bsi ^((~Bso)&  Bsu ); \
+    Ci ^= E##si; \
+    E##so =   Bso ^((~Bsu)&  Bsa ); \
+    Co ^= E##so; \
+    E##su =   Bsu ^((~Bsa)&  Bse ); \
+    Cu ^= E##su; \
+\
+
+// --- Code for round
+// --- 64-bit lanes mapped to 64-bit words
+// One round without the prepare-theta bookkeeping: reads lanes A##xx and the
+// column parities Ca..Cu, writes the new state to lanes E##xx, and leaves
+// Ca..Cu untouched.
+//   i : index into KeccakF1600RoundConstants for this round
+//   A : prefix of the input lane variables; they are modified in place (the D
+//       values are XORed into them)
+//   E : prefix of the output lane variables
+// Uses the temporaries Da..Du and Bba..Bsu (see declareABCDE). Every output
+// lane here has the same form: E = B[x] ^ ((~B[x+1]) & B[x+2]).
+#define thetaRhoPiChiIota(i, A, E) \
+    /* theta: D[x] = C[x-1] ^ ROL64(C[x+1], 1), columns a,e,i,o,u taken cyclically */ \
+    Da = Cu^ROL64(Ce, 1); \
+    De = Ca^ROL64(Ci, 1); \
+    Di = Ce^ROL64(Co, 1); \
+    Do = Ci^ROL64(Cu, 1); \
+    Du = Co^ROL64(Ca, 1); \
+\
+    /* output plane b: fold D into five A lanes, rotate them into Bb*, combine into E##b*; */ \
+    /* the round constant is XORed into E##ba                                               */ \
+    A##ba ^= Da; \
+    Bba = A##ba; \
+    A##ge ^= De; \
+    Bbe = ROL64(A##ge, 44); \
+    A##ki ^= Di; \
+    Bbi = ROL64(A##ki, 43); \
+    A##mo ^= Do; \
+    Bbo = ROL64(A##mo, 21); \
+    A##su ^= Du; \
+    Bbu = ROL64(A##su, 14); \
+    E##ba =   Bba ^((~Bbe)&  Bbi ); \
+    E##ba ^= KeccakF1600RoundConstants[i]; \
+    E##be =   Bbe ^((~Bbi)&  Bbo ); \
+    E##bi =   Bbi ^((~Bbo)&  Bbu ); \
+    E##bo =   Bbo ^((~Bbu)&  Bba ); \
+    E##bu =   Bbu ^((~Bba)&  Bbe ); \
+\
+    /* output plane g */ \
+    A##bo ^= Do; \
+    Bga = ROL64(A##bo, 28); \
+    A##gu ^= Du; \
+    Bge = ROL64(A##gu, 20); \
+    A##ka ^= Da; \
+    Bgi = ROL64(A##ka, 3); \
+    A##me ^= De; \
+    Bgo = ROL64(A##me, 45); \
+    A##si ^= Di; \
+    Bgu = ROL64(A##si, 61); \
+    E##ga =   Bga ^((~Bge)&  Bgi ); \
+    E##ge =   Bge ^((~Bgi)&  Bgo ); \
+    E##gi =   Bgi ^((~Bgo)&  Bgu ); \
+    E##go =   Bgo ^((~Bgu)&  Bga ); \
+    E##gu =   Bgu ^((~Bga)&  Bge ); \
+\
+    /* output plane k */ \
+    A##be ^= De; \
+    Bka = ROL64(A##be, 1); \
+    A##gi ^= Di; \
+    Bke = ROL64(A##gi, 6); \
+    A##ko ^= Do; \
+    Bki = ROL64(A##ko, 25); \
+    A##mu ^= Du; \
+    Bko = ROL64(A##mu, 8); \
+    A##sa ^= Da; \
+    Bku = ROL64(A##sa, 18); \
+    E##ka =   Bka ^((~Bke)&  Bki ); \
+    E##ke =   Bke ^((~Bki)&  Bko ); \
+    E##ki =   Bki ^((~Bko)&  Bku ); \
+    E##ko =   Bko ^((~Bku)&  Bka ); \
+    E##ku =   Bku ^((~Bka)&  Bke ); \
+\
+    /* output plane m */ \
+    A##bu ^= Du; \
+    Bma = ROL64(A##bu, 27); \
+    A##ga ^= Da; \
+    Bme = ROL64(A##ga, 36); \
+    A##ke ^= De; \
+    Bmi = ROL64(A##ke, 10); \
+    A##mi ^= Di; \
+    Bmo = ROL64(A##mi, 15); \
+    A##so ^= Do; \
+    Bmu = ROL64(A##so, 56); \
+    E##ma =   Bma ^((~Bme)&  Bmi ); \
+    E##me =   Bme ^((~Bmi)&  Bmo ); \
+    E##mi =   Bmi ^((~Bmo)&  Bmu ); \
+    E##mo =   Bmo ^((~Bmu)&  Bma ); \
+    E##mu =   Bmu ^((~Bma)&  Bme ); \
+\
+    /* output plane s */ \
+    A##bi ^= Di; \
+    Bsa = ROL64(A##bi, 62); \
+    A##go ^= Do; \
+    Bse = ROL64(A##go, 55); \
+    A##ku ^= Du; \
+    Bsi = ROL64(A##ku, 39); \
+    A##ma ^= Da; \
+    Bso = ROL64(A##ma, 41); \
+    A##se ^= De; \
+    Bsu = ROL64(A##se, 2); \
+    E##sa =   Bsa ^((~Bse)&  Bsi ); \
+    E##se =   Bse ^((~Bsi)&  Bso ); \
+    E##si =   Bsi ^((~Bso)&  Bsu ); \
+    E##so =   Bso ^((~Bsu)&  Bsa ); \
+    E##su =   Bsu ^((~Bsa)&  Bse ); \
+\
+
+#endif // UseBebigokimisa
+
+/* The 24 round constants. The round macros XOR entry [i] into lane E##ba,
+   where i is the macro's first argument. */
+const UINT64 KeccakF1600RoundConstants[24] = {
+    /*  0 */ 0x0000000000000001ULL, 0x0000000000008082ULL,
+    /*  2 */ 0x800000000000808AULL, 0x8000000080008000ULL,
+    /*  4 */ 0x000000000000808BULL, 0x0000000080000001ULL,
+    /*  6 */ 0x8000000080008081ULL, 0x8000000000008009ULL,
+    /*  8 */ 0x000000000000008AULL, 0x0000000000000088ULL,
+    /* 10 */ 0x0000000080008009ULL, 0x000000008000000AULL,
+    /* 12 */ 0x000000008000808BULL, 0x800000000000008BULL,
+    /* 14 */ 0x8000000000008089ULL, 0x8000000000008003ULL,
+    /* 16 */ 0x8000000000008002ULL, 0x8000000000000080ULL,
+    /* 18 */ 0x000000000000800AULL, 0x800000008000000AULL,
+    /* 20 */ 0x8000000080008081ULL, 0x8000000000008080ULL,
+    /* 22 */ 0x0000000080000001ULL, 0x8000000080008008ULL
+};
+
+/* Loads the 25 lanes of state into the variables prefixed X, XORing the first
+   9 lanes (576 bits) with input[0..8]; lanes 9..24 are copied unchanged.
+   state and input are indexed as arrays of lanes. They are parenthesized so
+   that an expression argument (e.g. base + offset) is indexed as a whole.
+   Expands to plain statements, so it can be used without a trailing ';'. */
+#define copyFromStateAndXor576bits(X, state, input) \
+    X##ba = (state)[ 0]^(input)[ 0]; \
+    X##be = (state)[ 1]^(input)[ 1]; \
+    X##bi = (state)[ 2]^(input)[ 2]; \
+    X##bo = (state)[ 3]^(input)[ 3]; \
+    X##bu = (state)[ 4]^(input)[ 4]; \
+    X##ga = (state)[ 5]^(input)[ 5]; \
+    X##ge = (state)[ 6]^(input)[ 6]; \
+    X##gi = (state)[ 7]^(input)[ 7]; \
+    X##go = (state)[ 8]^(input)[ 8]; \
+    X##gu = (state)[ 9]; \
+    X##ka = (state)[10]; \
+    X##ke = (state)[11]; \
+    X##ki = (state)[12]; \
+    X##ko = (state)[13]; \
+    X##ku = (state)[14]; \
+    X##ma = (state)[15]; \
+    X##me = (state)[16]; \
+    X##mi = (state)[17]; \
+    X##mo = (state)[18]; \
+    X##mu = (state)[19]; \
+    X##sa = (state)[20]; \
+    X##se = (state)[21]; \
+    X##si = (state)[22]; \
+    X##so = (state)[23]; \
+    X##su = (state)[24]; \
+
+/* Loads the 25 lanes of state into the variables prefixed X, XORing the first
+   13 lanes (832 bits) with input[0..12]; lanes 13..24 are copied unchanged.
+   state and input are indexed as arrays of lanes. They are parenthesized so
+   that an expression argument (e.g. base + offset) is indexed as a whole.
+   Expands to plain statements, so it can be used without a trailing ';'. */
+#define copyFromStateAndXor832bits(X, state, input) \
+    X##ba = (state)[ 0]^(input)[ 0]; \
+    X##be = (state)[ 1]^(input)[ 1]; \
+    X##bi = (state)[ 2]^(input)[ 2]; \
+    X##bo = (state)[ 3]^(input)[ 3]; \
+    X##bu = (state)[ 4]^(input)[ 4]; \
+    X##ga = (state)[ 5]^(input)[ 5]; \
+    X##ge = (state)[ 6]^(input)[ 6]; \
+    X##gi = (state)[ 7]^(input)[ 7]; \
+    X##go = (state)[ 8]^(input)[ 8]; \
+    X##gu = (state)[ 9]^(input)[ 9]; \
+    X##ka = (state)[10]^(input)[10]; \
+    X##ke = (state)[11]^(input)[11]; \
+    X##ki = (state)[12]^(input)[12]; \
+    X##ko = (state)[13]; \
+    X##ku = (state)[14]; \
+    X##ma = (state)[15]; \
+    X##me = (state)[16]; \
+    X##mi = (state)[17]; \
+    X##mo = (state)[18]; \
+    X##mu = (state)[19]; \
+    X##sa = (state)[20]; \
+    X##se = (state)[21]; \
+    X##si = (state)[22]; \
+    X##so = (state)[23]; \
+    X##su = (state)[24]; \
+
+/* Loads the 25 lanes of state into the variables prefixed X, XORing the first
+   16 lanes (1024 bits) with input[0..15]; lanes 16..24 are copied unchanged.
+   state and input are indexed as arrays of lanes. They are parenthesized so
+   that an expression argument (e.g. base + offset) is indexed as a whole.
+   Expands to plain statements, so it can be used without a trailing ';'. */
+#define copyFromStateAndXor1024bits(X, state, input) \
+    X##ba = (state)[ 0]^(input)[ 0]; \
+    X##be = (state)[ 1]^(input)[ 1]; \
+    X##bi = (state)[ 2]^(input)[ 2]; \
+    X##bo = (state)[ 3]^(input)[ 3]; \
+    X##bu = (state)[ 4]^(input)[ 4]; \
+    X##ga = (state)[ 5]^(input)[ 5]; \
+    X##ge = (state)[ 6]^(input)[ 6]; \
+    X##gi = (state)[ 7]^(input)[ 7]; \
+    X##go = (state)[ 8]^(input)[ 8]; \
+    X##gu = (state)[ 9]^(input)[ 9]; \
+    X##ka = (state)[10]^(input)[10]; \
+    X##ke = (state)[11]^(input)[11]; \
+    X##ki = (state)[12]^(input)[12]; \
+    X##ko = (state)[13]^(input)[13]; \
+    X##ku = (state)[14]^(input)[14]; \
+    X##ma = (state)[15]^(input)[15]; \
+    X##me = (state)[16]; \
+    X##mi = (state)[17]; \
+    X##mo = (state)[18]; \
+    X##mu = (state)[19]; \
+    X##sa = (state)[20]; \
+    X##se = (state)[21]; \
+    X##si = (state)[22]; \
+    X##so = (state)[23]; \
+    X##su = (state)[24]; \
+
+/* Loads the 25 lanes of state into the variables prefixed X, XORing the first
+   17 lanes (1088 bits) with input[0..16]; lanes 17..24 are copied unchanged.
+   state and input are indexed as arrays of lanes. They are parenthesized so
+   that an expression argument (e.g. base + offset) is indexed as a whole.
+   Expands to plain statements, so it can be used without a trailing ';'. */
+#define copyFromStateAndXor1088bits(X, state, input) \
+    X##ba = (state)[ 0]^(input)[ 0]; \
+    X##be = (state)[ 1]^(input)[ 1]; \
+    X##bi = (state)[ 2]^(input)[ 2]; \
+    X##bo = (state)[ 3]^(input)[ 3]; \
+    X##bu = (state)[ 4]^(input)[ 4]; \
+    X##ga = (state)[ 5]^(input)[ 5]; \
+    X##ge = (state)[ 6]^(input)[ 6]; \
+    X##gi = (state)[ 7]^(input)[ 7]; \
+    X##go = (state)[ 8]^(input)[ 8]; \
+    X##gu = (state)[ 9]^(input)[ 9]; \
+    X##ka = (state)[10]^(input)[10]; \
+    X##ke = (state)[11]^(input)[11]; \
+    X##ki = (state)[12]^(input)[12]; \
+    X##ko = (state)[13]^(input)[13]; \
+    X##ku = (state)[14]^(input)[14]; \
+    X##ma = (state)[15]^(input)[15]; \
+    X##me = (state)[16]^(input)[16]; \
+    X##mi = (state)[17]; \
+    X##mo = (state)[18]; \
+    X##mu = (state)[19]; \
+    X##sa = (state)[20]; \
+    X##se = (state)[21]; \
+    X##si = (state)[22]; \
+    X##so = (state)[23]; \
+    X##su = (state)[24]; \
+
+/* Loads the 25 lanes of state into the variables prefixed X, XORing the first
+   18 lanes (1152 bits) with input[0..17]; lanes 18..24 are copied unchanged.
+   state and input are indexed as arrays of lanes. They are parenthesized so
+   that an expression argument (e.g. base + offset) is indexed as a whole.
+   Expands to plain statements, so it can be used without a trailing ';'. */
+#define copyFromStateAndXor1152bits(X, state, input) \
+    X##ba = (state)[ 0]^(input)[ 0]; \
+    X##be = (state)[ 1]^(input)[ 1]; \
+    X##bi = (state)[ 2]^(input)[ 2]; \
+    X##bo = (state)[ 3]^(input)[ 3]; \
+    X##bu = (state)[ 4]^(input)[ 4]; \
+    X##ga = (state)[ 5]^(input)[ 5]; \
+    X##ge = (state)[ 6]^(input)[ 6]; \
+    X##gi = (state)[ 7]^(input)[ 7]; \
+    X##go = (state)[ 8]^(input)[ 8]; \
+    X##gu = (state)[ 9]^(input)[ 9]; \
+    X##ka = (state)[10]^(input)[10]; \
+    X##ke = (state)[11]^(input)[11]; \
+    X##ki = (state)[12]^(input)[12]; \
+    X##ko = (state)[13]^(input)[13]; \
+    X##ku = (state)[14]^(input)[14]; \
+    X##ma = (state)[15]^(input)[15]; \
+    X##me = (state)[16]^(input)[16]; \
+    X##mi = (state)[17]^(input)[17]; \
+    X##mo = (state)[18]; \
+    X##mu = (state)[19]; \
+    X##sa = (state)[20]; \
+    X##se = (state)[21]; \
+    X##si = (state)[22]; \
+    X##so = (state)[23]; \
+    X##su = (state)[24]; \
+
+/* Loads the 25 lanes of state into the variables prefixed X, XORing the first
+   21 lanes (1344 bits) with input[0..20]; lanes 21..24 are copied unchanged.
+   state and input are indexed as arrays of lanes. They are parenthesized so
+   that an expression argument (e.g. base + offset) is indexed as a whole.
+   Expands to plain statements, so it can be used without a trailing ';'. */
+#define copyFromStateAndXor1344bits(X, state, input) \
+    X##ba = (state)[ 0]^(input)[ 0]; \
+    X##be = (state)[ 1]^(input)[ 1]; \
+    X##bi = (state)[ 2]^(input)[ 2]; \
+    X##bo = (state)[ 3]^(input)[ 3]; \
+    X##bu = (state)[ 4]^(input)[ 4]; \
+    X##ga = (state)[ 5]^(input)[ 5]; \
+    X##ge = (state)[ 6]^(input)[ 6]; \
+    X##gi = (state)[ 7]^(input)[ 7]; \
+    X##go = (state)[ 8]^(input)[ 8]; \
+    X##gu = (state)[ 9]^(input)[ 9]; \
+    X##ka = (state)[10]^(input)[10]; \
+    X##ke = (state)[11]^(input)[11]; \
+    X##ki = (state)[12]^(input)[12]; \
+    X##ko = (state)[13]^(input)[13]; \
+    X##ku = (state)[14]^(input)[14]; \
+    X##ma = (state)[15]^(input)[15]; \
+    X##me = (state)[16]^(input)[16]; \
+    X##mi = (state)[17]^(input)[17]; \
+    X##mo = (state)[18]^(input)[18]; \
+    X##mu = (state)[19]^(input)[19]; \
+    X##sa = (state)[20]^(input)[20]; \
+    X##se = (state)[21]; \
+    X##si = (state)[22]; \
+    X##so = (state)[23]; \
+    X##su = (state)[24]; \
+
+/* Loads state[0..24] into the 25 lane variables prefixed X (ba..su).
+   state is indexed as an array of lanes. It is parenthesized so that an
+   expression argument (e.g. base + offset) is indexed as a whole.
+   Expands to plain statements, so it can be used without a trailing ';'. */
+#define copyFromState(X, state) \
+    X##ba = (state)[ 0]; \
+    X##be = (state)[ 1]; \
+    X##bi = (state)[ 2]; \
+    X##bo = (state)[ 3]; \
+    X##bu = (state)[ 4]; \
+    X##ga = (state)[ 5]; \
+    X##ge = (state)[ 6]; \
+    X##gi = (state)[ 7]; \
+    X##go = (state)[ 8]; \
+    X##gu = (state)[ 9]; \
+    X##ka = (state)[10]; \
+    X##ke = (state)[11]; \
+    X##ki = (state)[12]; \
+    X##ko = (state)[13]; \
+    X##ku = (state)[14]; \
+    X##ma = (state)[15]; \
+    X##me = (state)[16]; \
+    X##mi = (state)[17]; \
+    X##mo = (state)[18]; \
+    X##mu = (state)[19]; \
+    X##sa = (state)[20]; \
+    X##se = (state)[21]; \
+    X##si = (state)[22]; \
+    X##so = (state)[23]; \
+    X##su = (state)[24]; \
+
+/* Stores the 25 lane variables prefixed X (ba..su) into state[0..24].
+   state is indexed as an array of lanes. It is parenthesized so that an
+   expression argument (e.g. base + offset) is indexed as a whole.
+   Expands to plain statements, so it can be used without a trailing ';'. */
+#define copyToState(state, X) \
+    (state)[ 0] = X##ba; \
+    (state)[ 1] = X##be; \
+    (state)[ 2] = X##bi; \
+    (state)[ 3] = X##bo; \
+    (state)[ 4] = X##bu; \
+    (state)[ 5] = X##ga; \
+    (state)[ 6] = X##ge; \
+    (state)[ 7] = X##gi; \
+    (state)[ 8] = X##go; \
+    (state)[ 9] = X##gu; \
+    (state)[10] = X##ka; \
+    (state)[11] = X##ke; \
+    (state)[12] = X##ki; \
+    (state)[13] = X##ko; \
+    (state)[14] = X##ku; \
+    (state)[15] = X##ma; \
+    (state)[16] = X##me; \
+    (state)[17] = X##mi; \
+    (state)[18] = X##mo; \
+    (state)[19] = X##mu; \
+    (state)[20] = X##sa; \
+    (state)[21] = X##se; \
+    (state)[22] = X##si; \
+    (state)[23] = X##so; \
+    (state)[24] = X##su; \
+
+#define copyStateVariables(X, Y) \
+    X##ba = Y##ba; \
+    X##be = Y##be; \
+    X##bi = Y##bi; \
+    X##bo = Y##bo; \
+    X##bu = Y##bu; \
+    X##ga = Y##ga; \
+    X##ge = Y##ge; \
+    X##gi = Y##gi; \
+    X##go = Y##go; \
+    X##gu = Y##gu; \
+    X##ka = Y##ka; \
+    X##ke = Y##ke; \
+    X##ki = Y##ki; \
+    X##ko = Y##ko; \
+    X##ku = Y##ku; \
+    X##ma = Y##ma; \
+    X##me = Y##me; \
+    X##mi = Y##mi; \
+    X##mo = Y##mo; \
+    X##mu = Y##mu; \
+    X##sa = Y##sa; \
+    X##se = Y##se; \
+    X##si = Y##si; \
+    X##so = Y##so; \
+    X##su = Y##su; \
+
diff --git a/c_src/KeccakF-1600-arm.c b/c_src/KeccakF-1600-arm.c
new file mode 100755
index 0000000..abd6dc9
--- /dev/null
+++ b/c_src/KeccakF-1600-arm.c
@@ -0,0 +1,123 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by Ronny Van Keer,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include "KeccakF-1600-interface.h"
+#include "KeccakSponge.h"
+#include <string.h>
+
+/* Local integer aliases. NOTE(review): the names imply 8/16/32/64-bit widths,
+   which holds only if the target ABI gives char/short/int/long long those
+   sizes - confirm for every supported toolchain. */
+typedef unsigned char UINT8;
+typedef unsigned short UINT16;
+typedef unsigned int UINT32;
+typedef unsigned long long int UINT64;
+
+/* Assembly routine (KeccakF-1600-armcc.s exports this symbol). Every wrapper
+   in this file forwards to it: state is passed as 32-bit words, input is the
+   data to absorb, and laneCount is the number of 64-bit lanes of input
+   (0 when there is no input). */
+void KeccakPermutationOnWordsAfterXoring_ARM_asm(UINT32 *state, const UINT8 *input, int laneCount);
+
+/* Global initialization hook; this implementation has nothing to set up. */
+void KeccakInitialize(void)
+{
+    /* intentionally empty */
+}
+
+/* Clears the whole permutation state (KeccakPermutationSizeInBytes bytes at
+   state) to zero. */
+void KeccakInitializeState(unsigned char *state)
+{
+    const size_t stateBytes = KeccakPermutationSizeInBytes;
+
+    (void)memset(state, 0, stateBytes);
+}
+
+/* Runs the assembly routine on state with no input: the lane count is 0 and
+   the input pointer is null. */
+void KeccakPermutation(unsigned char *state)
+{
+    UINT32 *stateWords = (UINT32*)state;
+
+    KeccakPermutationOnWordsAfterXoring_ARM_asm(stateWords, (const UINT8 *)0, 0);
+}
+
+#ifdef ProvideFast576
+/* Absorbs one 576-bit block (9 lanes of 64 bits) from data into state via the
+   assembly routine. */
+void KeccakAbsorb576bits(unsigned char *state, const unsigned char *data)
+{
+    UINT32 *stateWords = (UINT32*)state;
+
+    KeccakPermutationOnWordsAfterXoring_ARM_asm(stateWords, data, 576/64);
+}
+#endif
+
+#ifdef ProvideFast832
+/* Absorbs one 832-bit block (13 lanes of 64 bits) from data into state via the
+   assembly routine. */
+void KeccakAbsorb832bits(unsigned char *state, const unsigned char *data)
+{
+    UINT32 *stateWords = (UINT32*)state;
+
+    KeccakPermutationOnWordsAfterXoring_ARM_asm(stateWords, data, 832/64);
+}
+#endif
+
+#ifdef ProvideFast1024
+/* Absorbs one 1024-bit block (16 lanes of 64 bits) from data into state via
+   the assembly routine. */
+void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data)
+{
+    UINT32 *stateWords = (UINT32*)state;
+
+    KeccakPermutationOnWordsAfterXoring_ARM_asm(stateWords, data, 1024/64);
+}
+#endif
+
+#ifdef ProvideFast1088
+/* Absorbs one 1088-bit block (17 lanes of 64 bits) from data into state via
+   the assembly routine. */
+void KeccakAbsorb1088bits(unsigned char *state, const unsigned char *data)
+{
+    UINT32 *stateWords = (UINT32*)state;
+
+    KeccakPermutationOnWordsAfterXoring_ARM_asm(stateWords, data, 1088/64);
+}
+#endif
+
+#ifdef ProvideFast1152
+/* Absorbs one 1152-bit block (18 lanes of 64 bits) from data into state via
+   the assembly routine. */
+void KeccakAbsorb1152bits(unsigned char *state, const unsigned char *data)
+{
+    UINT32 *stateWords = (UINT32*)state;
+
+    KeccakPermutationOnWordsAfterXoring_ARM_asm(stateWords, data, 1152/64);
+}
+#endif
+
+#ifdef ProvideFast1344
+/* Absorbs one 1344-bit block (21 lanes of 64 bits) from data into state via
+   the assembly routine. */
+void KeccakAbsorb1344bits(unsigned char *state, const unsigned char *data)
+{
+    UINT32 *stateWords = (UINT32*)state;
+
+    KeccakPermutationOnWordsAfterXoring_ARM_asm(stateWords, data, 1344/64);
+}
+#endif
+
+
+/* Absorbs laneCount 64-bit lanes from data into state via the assembly
+   routine. laneCount is converted to the routine's int parameter. */
+void KeccakAbsorb(unsigned char *state, const unsigned char *data, unsigned int laneCount)
+{
+    UINT32 *stateWords = (UINT32*)state;
+
+    KeccakPermutationOnWordsAfterXoring_ARM_asm(stateWords, data, (int)laneCount);
+}
+
+// Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
+// Permutes the bits of x with five swap steps at distances 16, 8, 4, 2 and 1:
+// each step exchanges the bit groups selected by its mask with the groups
+// `shift` bits above them. NOTE(review): judging by the name, this converts a
+// lane from the interleaved layout back to normal bit order - confirm.
+UINT64 fromInterleaving(UINT64 x)
+{
+    static const UINT64 swapMask[5] = {
+        0x00000000FFFF0000ULL,  /* distance 16 */
+        0x0000FF000000FF00ULL,  /* distance  8 */
+        0x00F000F000F000F0ULL,  /* distance  4 */
+        0x0C0C0C0C0C0C0C0CULL,  /* distance  2 */
+        0x2222222222222222ULL   /* distance  1 */
+    };
+    unsigned int shift = 16;
+    int step;
+
+    for (step = 0; step < 5; step++) {
+        UINT64 t = (x ^ (x >> shift)) & swapMask[step];
+
+        x ^= t ^ (t << shift);
+        shift >>= 1;
+    }
+
+    return x;
+}
+
+/* Reads the two 32-bit words at evenAndOdd as one 64-bit value, passes it
+   through fromInterleaving() and writes the resulting 8 bytes to dest.
+
+   dest      : output buffer, at least 8 bytes; no alignment required
+   evenAndOdd: pointer to the two state words of one lane
+
+   The copies go through memcpy rather than UINT64* casts: dest comes from a
+   caller-supplied byte buffer with no 8-byte alignment guarantee, and storing
+   through a misaligned or differently typed UINT64 pointer is undefined
+   behaviour (and can fault on ARM cores). The bytes written are the same as
+   the direct store produced whenever that store was valid. */
+void setInterleavedWordsInto8bytes(UINT8* dest, UINT32* evenAndOdd)
+{
+    UINT64 lane;
+
+    memcpy(&lane, evenAndOdd, sizeof lane);
+    lane = fromInterleaving(lane);
+    memcpy(dest, &lane, sizeof lane);
+}
+
+/* Writes the first laneCount lanes of state to data, 8 bytes per lane, by
+   calling setInterleavedWordsInto8bytes() on each pair of 32-bit state words.
+   Expands to a braced block, so it is used without a trailing ';'.
+   - state and data are parenthesized so that expression arguments keep their
+     meaning (the UINT32* cast previously bound only to the first token).
+   - The index is unsigned, matching the unsigned lane counts passed in, and
+     has a macro-specific name so it cannot capture a caller variable `i`. */
+#define extractLanes(laneCount, state, data) \
+    { \
+        unsigned int extractLanes_i; \
+        for(extractLanes_i=0; extractLanes_i<(unsigned int)(laneCount); extractLanes_i++) \
+            setInterleavedWordsInto8bytes((data)+extractLanes_i*8, (UINT32*)(state)+extractLanes_i*2); \
+    }
+
+#ifdef ProvideFast1024
+/* Writes the first 1024 bits (16 lanes) of state to data, 8 bytes per lane. */
+void KeccakExtract1024bits(const unsigned char *state, unsigned char *data)
+{
+    int lane;
+
+    for (lane = 0; lane < 16; lane++)
+        setInterleavedWordsInto8bytes(data + lane*8, (UINT32*)state + lane*2);
+}
+#endif
+
+/* Writes the first laneCount lanes of state to data, 8 bytes per lane. */
+void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount)
+{
+    int lane;
+
+    for (lane = 0; lane < laneCount; lane++)
+        setInterleavedWordsInto8bytes(data + lane*8, (UINT32*)state + lane*2);
+}
diff --git a/c_src/KeccakF-1600-armcc.s b/c_src/KeccakF-1600-armcc.s
new file mode 100755
index 0000000..b87d0ba
--- /dev/null
+++ b/c_src/KeccakF-1600-armcc.s
@@ -0,0 +1,653 @@
+;// The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+;// Michaël Peeters and Gilles Van Assche. For more information, feedback or
+;// questions, please refer to our website: http://keccak.noekeon.org/
+;// 
+;// Implementation by Ronny Van Keer,
+;// hereby denoted as "the implementer".
+;// 
+;// To the extent possible under law, the implementer has waived all copyright
+;// and related or neighboring rights to the source code in this file.
+;// http://creativecommons.org/publicdomain/zero/1.0/
+
+
+		PRESERVE8
+		THUMB
+		AREA    |.text|, CODE, READONLY
+
+;// --- defines
+;// Byte offsets of the 50 32-bit state words: each of the 25 lanes (ba..su)
+;// has two words, suffixed 0 and 1. The same offsets are applied to the
+;// state pointer in r0 and to the copy of the state kept on the stack at sp.
+
+_ba0	equ  0*4
+_ba1	equ  1*4
+_be0	equ  2*4
+_be1	equ  3*4
+_bi0	equ  4*4
+_bi1	equ  5*4
+_bo0	equ  6*4
+_bo1	equ  7*4
+_bu0	equ  8*4
+_bu1	equ  9*4
+_ga0	equ 10*4
+_ga1	equ 11*4
+_ge0	equ 12*4
+_ge1	equ 13*4
+_gi0	equ 14*4
+_gi1	equ 15*4
+_go0	equ 16*4
+_go1	equ 17*4
+_gu0	equ 18*4
+_gu1	equ 19*4
+_ka0	equ 20*4
+_ka1	equ 21*4
+_ke0	equ 22*4
+_ke1	equ 23*4
+_ki0	equ 24*4
+_ki1	equ 25*4
+_ko0	equ 26*4
+_ko1	equ 27*4
+_ku0	equ 28*4
+_ku1	equ 29*4
+_ma0	equ 30*4
+_ma1	equ 31*4
+_me0	equ 32*4
+_me1	equ 33*4
+_mi0	equ 34*4
+_mi1	equ 35*4
+_mo0	equ 36*4
+_mo1	equ 37*4
+_mu0	equ 38*4
+_mu1	equ 39*4
+_sa0	equ 40*4
+_sa1	equ 41*4
+_se0	equ 42*4
+_se1	equ 43*4
+_si0	equ 44*4
+_si1	equ 45*4
+_so0	equ 46*4
+_so1	equ 47*4
+_su0	equ 48*4
+_su1	equ 49*4
+
+;// Four extra stack slots directly above the 50 state words (the frame is
+;// 4*(50+4) bytes); they hold the theta D words that are spilled to memory.
+mDe1	equ 50*4
+mDi0	equ 51*4
+mDo0	equ 52*4
+mDo1	equ 53*4
+
+;// --- macros
+
+		;// xor5: $result = XOR of the five words at [$ptr + $b], [$ptr + $g],
+		;// [$ptr + $k], [$ptr + $m] and [$ptr + $s].
+		;// Uses r1 and r2 as scratch (both are overwritten), so $result and
+		;// $ptr must not be r1 or r2.
+		MACRO
+		xor5		$result,$ptr,$b,$g,$k,$m,$s
+
+		ldr			$result, [$ptr, #$b]
+		ldr			r1, [$ptr, #$g]
+		ldr			r2, [$ptr, #$k]
+		eor			$result, $result, r1				
+		ldr			r1, [$ptr, #$m]
+		eor			$result, $result, r2
+		ldr			r2, [$ptr, #$s]
+		eor			$result, $result, r1				
+		eor			$result, $result, r2
+		MEND
+
+		;// xorrol: $b = ($b XOR $yy) rotated left by $rr bits.
+		;// The rotate is written as a rotate right by 32-$rr.
+		MACRO
+		xorrol 		$b, $yy, $rr
+
+		eor			$b, $b, $yy
+		ror			$b, #32-$rr
+		MEND
+
+
+		;// xandnot: stores $aa XOR ($cc AND NOT $bb) to [$resptr + $resofs].
+		;// Uses r1 as scratch; $aa, $bb and $cc are left unchanged.
+		MACRO
+		xandnot 	$resptr, $resofs, $aa, $bb, $cc
+
+		bic			r1, $cc, $bb
+		eor			r1, r1, $aa
+		str			r1, [$resptr, #$resofs]
+		MEND
+
+		;// xandnotRC: like xandnot, but also XORs in the next 32-bit word read
+		;// through r3: stores $aa XOR ($cc AND NOT $bb) XOR [r3] to
+		;// [$resptr + $resofs].
+		;// Side effects: r3 is advanced by 4 (post-indexed load), r1 is
+		;// overwritten, and $cc is overwritten with the stored value.
+		MACRO
+		xandnotRC 	$resptr, $resofs, $aa, $bb, $cc
+
+		ldr			r1, [r3], #4
+		bic			$cc, $cc, $bb
+		eor			$cc, $cc, r1
+		eor			$cc, $cc, $aa
+		str			$cc, [$resptr, #$resofs]
+		MEND
+
+
+		EXPORT  KeccakPermutationOnWordsAfterXoring_ARM_asm
+KeccakPermutationOnWordsAfterXoring_ARM_asm   PROC
+
+		push	{r4-r12,lr}
+		sub      sp,sp,#4*(50+4)
+
+		movs	r9, r2
+		beq		interleaveDone
+		mov		r8,r0
+interleaveLoop
+
+		ldr		r4, [r1], #4
+		ldr		r5, [r1], #4
+		ldrd    r6, r7, [r8]
+
+		;// Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
+		and		r3,r4,#0x55555555
+		orr		r3,r3,r3, LSR #1
+		and		r3,r3,#0x33333333
+		orr		r3,r3,r3, LSR #2
+		and		r3,r3,#0x0F0F0F0F
+		orr		r3,r3,r3, LSR #4
+		and		r3,r3,#0x00FF00FF
+		bfi		r3,r3,#8, #8
+		eor		r6,r6,r3, LSR #8
+
+		and		r3,r5,#0x55555555
+		orr		r3,r3,r3, LSR #1
+		and		r3,r3,#0x33333333
+		orr		r3,r3,r3, LSR #2
+		and		r3,r3,#0x0F0F0F0F
+		orr		r3,r3,r3, LSR #4
+		and		r3,r3,#0x00FF00FF
+		orr		r3,r3,r3, LSR #8
+		eor		r6,r6,r3, LSL #16
+
+		and		r3,r4,#0xAAAAAAAA
+		orr		r3,r3,r3, LSL #1
+		and		r3,r3,#0xCCCCCCCC
+		orr		r3,r3,r3, LSL #2
+		and		r3,r3,#0xF0F0F0F0
+		orr		r3,r3,r3, LSL #4
+		and		r3,r3,#0xFF00FF00
+		orr		r3,r3,r3, LSL #8
+		eor		r7,r7,r3, LSR #16
+
+		and		r3,r5,#0xAAAAAAAA
+		orr		r3,r3,r3, LSL #1
+		and		r3,r3,#0xCCCCCCCC
+		orr		r3,r3,r3, LSL #2
+		and		r3,r3,#0xF0F0F0F0
+		orr		r3,r3,r3, LSL #4
+		and		r3,r3,#0xFF00FF00
+		orr		r3,r3,r3, LSL #8
+		bfc		r3, #0, #16
+		eor		r7,r7,r3
+
+		strd	r6,r7,[r8], #8
+
+		subs	r9,r9,#1
+		bne		interleaveLoop
+
+interleaveDone
+
+		ldr		r3, =KeccakF1600RoundConstantsWithTerminator
+		b		roundLoop	;//jump over the table
+		LTORG
+
+		ALIGN
+
+KeccakF1600RoundConstantsWithTerminator
+		;//		0			1
+		dcd		0x00000001,	0x00000000
+		dcd		0x00000000,	0x00000089
+		dcd		0x00000000,	0x8000008b
+		dcd		0x00000000,	0x80008080
+		dcd		0x00000001,	0x0000008b
+		dcd		0x00000001,	0x00008000
+		dcd		0x00000001,	0x80008088
+		dcd		0x00000001,	0x80000082
+		dcd		0x00000000,	0x0000000b
+		dcd		0x00000000,	0x0000000a
+		dcd		0x00000001,	0x00008082
+		dcd		0x00000000,	0x00008003
+		dcd		0x00000001,	0x0000808b
+		dcd		0x00000001,	0x8000000b
+		dcd		0x00000001,	0x8000008a
+		dcd		0x00000001,	0x80000081
+		dcd		0x00000000,	0x80000081
+		dcd		0x00000000,	0x80000008
+		dcd		0x00000000,	0x00000083
+		dcd		0x00000000,	0x80008003
+		dcd		0x00000001,	0x80008088
+		dcd		0x00000000,	0x80000088
+		dcd		0x00000001,	0x00008000
+		dcd		0x00000000,	0x80008082
+		dcd		0xFFFFFFFF	;//terminator
+
+roundLoop
+
+		;//prepTheta	A		
+	    xor5		r10, r0,_bu0, _gu0, _ku0, _mu0, _su0
+	    xor5		r6, r0,_be1, _ge1, _ke1, _me1, _se1
+		eor			r5, r10, r6, ROR #31
+	    xor5	  	r11, r0,_bu1, _gu1, _ku1, _mu1, _su1
+	    xor5		r7, r0,_be0, _ge0, _ke0, _me0, _se0
+	    eor			r4, r11, r7
+
+	    xor5		r8, r0,_bi0, _gi0, _ki0, _mi0, _si0
+	    eor			r1, r8, r11, ROR #31
+		str			r1, [sp, #mDo0]
+	    xor5		r9, r0,_bi1, _gi1, _ki1, _mi1, _si1
+	    eor			r1, r9, r10
+		str			r1, [sp, #mDo1]
+
+	    xor5		r10, r0,_ba0, _ga0, _ka0, _ma0, _sa0
+	    eor			lr, r10, r9, ROR #31
+	    xor5		r11, r0,_ba1, _ga1, _ka1, _ma1, _sa1
+	    eor			r1, r11, r8
+		str			r1, [sp, #mDe1]
+
+	    xor5		r9, r0,_bo1, _go1, _ko1, _mo1, _so1
+	    eor			r1, r7, r9, ROR #31
+		str			r1, [sp, #mDi0]
+	    xor5		r8, r0,_bo0, _go0, _ko0, _mo0, _so0
+	    eor			r2, r6, r8
+
+	    eor			r7, r8, r11, ROR #31
+	    eor			r6, r9, r10
+
+		;//thetaRhoPiChiIota 0, in A, out E
+		ldr			r8, [r0, #_ba0]
+		ldr			r9, [r0, #_ge0]
+		ldr			r10, [r0, #_ki1]
+		ldr			r11, [r0, #_mo1]
+		ldr			r12, [r0, #_su0]
+		ldr			r1, [sp, #mDo1]
+		eor			r8, r8, r5
+		xorrol 		r9, lr, 22
+		xorrol 		r10, r2, 22
+		xorrol 		r11, r1,  11
+		xorrol 		r12, r7,  7
+		xandnot		sp, _be0, r9, r10, r11
+		xandnot		sp, _bi0, r10, r11, r12
+		xandnot		sp, _bo0, r11, r12, r8
+		xandnot		sp, _bu0, r12, r8, r9
+		xandnotRC	sp, _ba0, r8, r9, r10
+
+		ldr			r8, [r0, #_bo0]
+		ldr			r1, [sp, #mDo0]
+		ldr			r9, [r0, #_gu0]
+	    xorrol 		r8, r1, 14
+		ldr			r1, [sp, #mDe1]
+		ldr			r10, [r0, #_ka1]
+		ldr			r11, [r0, #_me1]
+		ldr			r12, [r0, #_si1]
+	    xorrol 		r9, r7, 10
+	    xorrol 		r10, r4,  2
+	    xorrol 		r11, r1,  23
+	    xorrol 		r12, r2, 31
+		xandnot		sp, _ga0, r8, r9, r10
+		xandnot		sp, _ge0, r9, r10, r11
+		xandnot		sp, _gi0, r10, r11, r12
+		xandnot		sp, _go0, r11, r12, r8
+		xandnot		sp, _gu0, r12, r8, r9
+
+		ldr			r8, [r0, #_be1]
+		ldr			r1, [sp, #mDe1]
+		ldr			r9, [r0, #_gi0]
+	    xorrol 		r8, r1,   1
+		ldr			r1, [sp, #mDi0]
+		ldr			r10, [r0, #_ko1]
+	    xorrol 		r9, r1,   3
+		ldr			r1, [sp, #mDo1]
+		ldr			r11, [r0, #_mu0]
+		ldr			r12, [r0, #_sa0]
+	    xorrol 		r10, r1,  13
+	    xorrol 		r11, r7,  4
+	    xorrol 		r12, r5,  9
+		xandnot		sp, _ka0, r8, r9, r10
+		xandnot		sp, _ke0, r9, r10, r11
+		xandnot		sp, _ki0, r10, r11, r12
+		xandnot		sp, _ko0, r11, r12, r8
+		xandnot		sp, _ku0, r12, r8, r9
+
+		ldr			r8, [r0, #_bu1]
+		ldr			r9, [r0, #_ga0]
+		ldr			r10, [r0, #_ke0]
+		ldr			r11, [r0, #_mi1]
+		ldr			r12, [r0, #_so0]
+		ldr			r1, [sp, #mDo0]
+	    xorrol 		r8, r6, 14
+	    xorrol 		r9, r5, 18
+	    xorrol 		r10, lr,  5
+	    xorrol 		r11, r2,  8
+	    xorrol 		r12, r1,  28
+		xandnot		sp, _ma0, r8, r9, r10
+		xandnot		sp, _me0, r9, r10, r11
+		xandnot		sp, _mi0, r10, r11, r12
+		xandnot		sp, _mo0, r11, r12, r8
+		xandnot		sp, _mu0, r12, r8, r9
+
+		ldr			r1, [sp, #mDi0]
+		ldr			r8, [r0, #_bi0]
+		ldr			r9, [r0, #_go1]
+	    xorrol 		r8, r1,  31
+		ldr			r1, [sp, #mDo1]
+		ldr			r10, [r0, #_ku1]
+	    xorrol 		r9, r1,  28
+		ldr			r11, [r0, #_ma1]
+		ldr			r12, [r0, #_se0]
+	    xorrol 		r10, r6, 20
+	    xorrol 		r11, r4, 21
+	    xorrol 		r12, lr,  1
+		xandnot		sp, _sa0, r8, r9, r10
+		xandnot		sp, _se0, r9, r10, r11
+		xandnot		sp, _si0, r10, r11, r12
+		xandnot		sp, _so0, r11, r12, r8
+		xandnot		sp, _su0, r12, r8, r9
+
+		;//	thetaRhoPiChiIota 1, in A, out E
+		ldr			r1, [sp, #mDe1]
+		ldr			r9, [r0, #_ge1]
+		ldr			r8, [r0, #_ba1]
+	    xorrol 		r9, r1,  22
+		ldr			r1, [sp, #mDi0]
+		ldr			r10, [r0, #_ki0]
+	    eor			r8, r8, r4    
+	    xorrol 		r10, r1,  21
+		ldr			r1, [sp, #mDo0]
+		ldr			r11, [r0, #_mo0]
+		ldr			r12, [r0, #_su1]
+	    xorrol 		r11, r1,  10
+	    xorrol 		r12, r6,  7
+		xandnot		sp, _be1, r9, r10, r11
+		xandnot		sp, _bi1, r10, r11, r12
+		xandnot		sp, _bo1, r11, r12, r8
+		xandnot		sp, _bu1, r12, r8, r9
+		xandnotRC	sp, _ba1, r8, r9, r10
+
+		ldr			r1, [sp, #mDo1]
+		ldr			r8, [r0, #_bo1]
+		ldr			r12, [r0, #_si0]
+	    xorrol 		r8, r1,  14
+		ldr			r1, [sp, #mDi0]
+		ldr			r9, [r0, #_gu1]
+	    xorrol 		r12, r1,  30
+		ldr			r10, [r0, #_ka0]
+		ldr			r11, [r0, #_me0]
+	    xorrol 		r9, r6, 10
+	    xorrol 		r10, r5,  1
+	    xorrol 		r11, lr, 22
+		xandnot		sp, _ga1, r8, r9, r10
+		xandnot		sp, _ge1, r9, r10, r11
+		xandnot		sp, _gi1, r10, r11, r12
+		xandnot		sp, _go1, r11, r12, r8
+		xandnot		sp, _gu1, r12, r8, r9
+
+		ldr			r1, [sp, #mDo0]
+		ldr			r10, [r0, #_ko0]
+		ldr			r8, [r0, #_be0]
+	    xorrol 		r10, r1,  12
+		ldr			r9, [r0, #_gi1]
+		ldr			r11, [r0, #_mu1]
+		ldr			r12, [r0, #_sa1]
+	    eor			r8, r8, lr    
+	    xorrol 		r9, r2,  3
+	    xorrol 		r11, r6,  4
+	    xorrol 		r12, r4,  9
+		xandnot		sp, _ka1, r8, r9, r10
+		xandnot		sp, _ke1, r9, r10, r11
+		xandnot		sp, _ki1, r10, r11, r12
+		xandnot		sp, _ko1, r11, r12, r8
+		xandnot		sp, _ku1, r12, r8, r9
+
+		ldr			r1, [sp, #mDe1]
+		ldr			r10, [r0, #_ke1]
+		ldr			r11, [r0, #_mi0]
+	    xorrol 		r10, r1,   5
+		ldr			r1, [sp, #mDi0]
+		ldr			r12, [r0, #_so1]
+	    xorrol 		r11, r1,   7
+		ldr			r1, [sp, #mDo1]
+		ldr			r8, [r0, #_bu0]
+		ldr			r9, [r0, #_ga1]
+	    xorrol 		r8, r7, 13
+	    xorrol 		r9, r4, 18
+	    xorrol 		r12, r1,  28
+		xandnot		sp, _ma1, r8, r9, r10
+		xandnot		sp, _me1, r9, r10, r11
+		xandnot		sp, _mi1, r10, r11, r12
+		xandnot		sp, _mo1, r11, r12, r8
+		xandnot		sp, _mu1, r12, r8, r9
+
+		ldr			r1, [sp, #mDo0]
+		ldr			r9, [r0, #_go0]
+		ldr			r8, [r0, #_bi1]
+	    xorrol 		r9, r1,  27
+		ldr			r10, [r0, #_ku0]
+		ldr			r11, [r0, #_ma0]
+		ldr			r12, [r0, #_se1]
+		ldr			r1, [sp, #mDe1]
+	    xorrol 		r8, r2, 31
+	    xorrol 		r10, r7, 19
+	    xorrol 		r11, r5, 20
+	    xorrol 		r12, r1,  1
+		xandnot		sp, _sa1, r8, r9, r10
+		xandnot		sp, _se1, r9, r10, r11
+		xandnot		sp, _si1, r10, r11, r12
+		xandnot		sp, _so1, r11, r12, r8
+		xandnot		sp, _su1, r12, r8, r9
+
+		;//prepTheta	E		
+	    xor5		r10, sp,_bu0, _gu0, _ku0, _mu0, _su0
+	    xor5		r6, sp,_be1, _ge1, _ke1, _me1, _se1
+		eor			r5, r10, r6, ROR #31
+	    xor5	  	r11, sp,_bu1, _gu1, _ku1, _mu1, _su1
+	    xor5		r7, sp,_be0, _ge0, _ke0, _me0, _se0
+	    eor			r4, r11, r7
+
+	    xor5		r8, sp,_bi0, _gi0, _ki0, _mi0, _si0
+	    eor			r1, r8, r11, ROR #31
+		str			r1, [sp, #mDo0]
+	    xor5		r9, sp,_bi1, _gi1, _ki1, _mi1, _si1
+	    eor			r1, r9, r10
+		str			r1, [sp, #mDo1]
+
+	    xor5		r10, sp,_ba0, _ga0, _ka0, _ma0, _sa0
+	    eor			lr, r10, r9, ROR #31
+	    xor5		r11, sp,_ba1, _ga1, _ka1, _ma1, _sa1
+	    eor			r1, r11, r8
+		str			r1, [sp, #mDe1]
+
+	    xor5		r9, sp,_bo1, _go1, _ko1, _mo1, _so1
+	    eor			r1, r7, r9, ROR #31
+		str			r1, [sp, #mDi0]
+	    xor5		r8, sp,_bo0, _go0, _ko0, _mo0, _so0
+	    eor			r2, r6, r8
+
+	    eor			r7, r8, r11, ROR #31
+	    eor			r6, r9, r10
+
+		;//thetaRhoPiChiIota 0, in E, out A
+		ldr			r8, [sp, #_ba0]
+		ldr			r9, [sp, #_ge0]
+		ldr			r10, [sp, #_ki1]
+		ldr			r11, [sp, #_mo1]
+		ldr			r12, [sp, #_su0]
+		ldr			r1, [sp, #mDo1]
+		eor			r8, r8, r5
+		xorrol 		r9, lr, 22
+		xorrol 		r10, r2, 22
+		xorrol 		r11, r1,  11
+		xorrol 		r12, r7,  7
+		xandnot		r0, _be0, r9, r10, r11
+		xandnot		r0, _bi0, r10, r11, r12
+		xandnot		r0, _bo0, r11, r12, r8
+		xandnot		r0, _bu0, r12, r8, r9
+		xandnotRC	r0, _ba0, r8, r9, r10
+
+		ldr			r8, [sp, #_bo0]
+		ldr			r1, [sp, #mDo0]
+		ldr			r9, [sp, #_gu0]
+	    xorrol 		r8, r1, 14
+		ldr			r1, [sp, #mDe1]
+		ldr			r10, [sp, #_ka1]
+		ldr			r11, [sp, #_me1]
+		ldr			r12, [sp, #_si1]
+	    xorrol 		r9, r7, 10
+	    xorrol 		r10, r4,  2
+	    xorrol 		r11, r1,  23
+	    xorrol 		r12, r2, 31
+		xandnot		r0, _ga0, r8, r9, r10
+		xandnot		r0, _ge0, r9, r10, r11
+		xandnot		r0, _gi0, r10, r11, r12
+		xandnot		r0, _go0, r11, r12, r8
+		xandnot		r0, _gu0, r12, r8, r9
+
+		ldr			r8, [sp, #_be1]
+		ldr			r1, [sp, #mDe1]
+		ldr			r9, [sp, #_gi0]
+	    xorrol 		r8, r1,   1
+		ldr			r1, [sp, #mDi0]
+		ldr			r10, [sp, #_ko1]
+	    xorrol 		r9, r1,   3
+		ldr			r1, [sp, #mDo1]
+		ldr			r11, [sp, #_mu0]
+		ldr			r12, [sp, #_sa0]
+	    xorrol 		r10, r1,  13
+	    xorrol 		r11, r7,  4
+	    xorrol 		r12, r5,  9
+		xandnot		r0, _ka0, r8, r9, r10
+		xandnot		r0, _ke0, r9, r10, r11
+		xandnot		r0, _ki0, r10, r11, r12
+		xandnot		r0, _ko0, r11, r12, r8
+		xandnot		r0, _ku0, r12, r8, r9
+
+		ldr			r8, [sp, #_bu1]
+		ldr			r9, [sp, #_ga0]
+		ldr			r10, [sp, #_ke0]
+		ldr			r11, [sp, #_mi1]
+		ldr			r12, [sp, #_so0]
+		ldr			r1, [sp, #mDo0]
+	    xorrol 		r8, r6, 14
+	    xorrol 		r9, r5, 18
+	    xorrol 		r10, lr,  5
+	    xorrol 		r11, r2,  8
+	    xorrol 		r12, r1,  28
+		xandnot		r0, _ma0, r8, r9, r10
+		xandnot		r0, _me0, r9, r10, r11
+		xandnot		r0, _mi0, r10, r11, r12
+		xandnot		r0, _mo0, r11, r12, r8
+		xandnot		r0, _mu0, r12, r8, r9
+
+		ldr			r1, [sp, #mDi0]
+		ldr			r8, [sp, #_bi0]
+		ldr			r9, [sp, #_go1]
+	    xorrol 		r8, r1,  31
+		ldr			r1, [sp, #mDo1]
+		ldr			r10, [sp, #_ku1]
+	    xorrol 		r9, r1,  28
+		ldr			r11, [sp, #_ma1]
+		ldr			r12, [sp, #_se0]
+	    xorrol 		r10, r6, 20
+	    xorrol 		r11, r4, 21
+	    xorrol 		r12, lr,  1
+		xandnot		r0, _sa0, r8, r9, r10
+		xandnot		r0, _se0, r9, r10, r11
+		xandnot		r0, _si0, r10, r11, r12
+		xandnot		r0, _so0, r11, r12, r8
+		xandnot		r0, _su0, r12, r8, r9
+
+		;//	thetaRhoPiChiIota 1, in E, out A
+		ldr			r1, [sp, #mDe1]
+		ldr			r9, [sp, #_ge1]
+		ldr			r8, [sp, #_ba1]
+	    xorrol 		r9, r1,  22
+		ldr			r1, [sp, #mDi0]
+		ldr			r10, [sp, #_ki0]
+	    eor			r8, r8, r4    
+	    xorrol 		r10, r1,  21
+		ldr			r1, [sp, #mDo0]
+		ldr			r11, [sp, #_mo0]
+		ldr			r12, [sp, #_su1]
+	    xorrol 		r11, r1,  10
+	    xorrol 		r12, r6,  7
+		xandnot		r0, _be1, r9, r10, r11
+		xandnot		r0, _bi1, r10, r11, r12
+		xandnot		r0, _bo1, r11, r12, r8
+		xandnot		r0, _bu1, r12, r8, r9
+		xandnotRC	r0, _ba1, r8, r9, r10
+
+		ldr			r1, [sp, #mDo1]
+		ldr			r8, [sp, #_bo1]
+		ldr			r12, [sp, #_si0]
+	    xorrol 		r8, r1,  14
+		ldr			r1, [sp, #mDi0]
+		ldr			r9, [sp, #_gu1]
+	    xorrol 		r12, r1,  30
+		ldr			r10, [sp, #_ka0]
+		ldr			r11, [sp, #_me0]
+	    xorrol 		r9, r6, 10
+	    xorrol 		r10, r5,  1
+	    xorrol 		r11, lr, 22
+		xandnot		r0, _ga1, r8, r9, r10
+		xandnot		r0, _ge1, r9, r10, r11
+		xandnot		r0, _gi1, r10, r11, r12
+		xandnot		r0, _go1, r11, r12, r8
+		xandnot		r0, _gu1, r12, r8, r9
+
+		ldr			r1, [sp, #mDo0]
+		ldr			r10, [sp, #_ko0]
+		ldr			r8, [sp, #_be0]
+	    xorrol 		r10, r1,  12
+		ldr			r9, [sp, #_gi1]
+		ldr			r11, [sp, #_mu1]
+		ldr			r12, [sp, #_sa1]
+	    eor			r8, r8, lr    
+	    xorrol 		r9, r2,  3
+	    xorrol 		r11, r6,  4
+	    xorrol 		r12, r4,  9
+		xandnot		r0, _ka1, r8, r9, r10
+		xandnot		r0, _ke1, r9, r10, r11
+		xandnot		r0, _ki1, r10, r11, r12
+		xandnot		r0, _ko1, r11, r12, r8
+		xandnot		r0, _ku1, r12, r8, r9
+
+		ldr			r1, [sp, #mDe1]
+		ldr			r10, [sp, #_ke1]
+		ldr			r11, [sp, #_mi0]
+	    xorrol 		r10, r1,   5
+		ldr			r1, [sp, #mDi0]
+		ldr			r12, [sp, #_so1]
+	    xorrol 		r11, r1,   7
+		ldr			r1, [sp, #mDo1]
+		ldr			r8, [sp, #_bu0]
+		ldr			r9, [sp, #_ga1]
+	    xorrol 		r8, r7, 13
+	    xorrol 		r9, r4, 18
+	    xorrol 		r12, r1,  28
+		xandnot		r0, _ma1, r8, r9, r10
+		xandnot		r0, _me1, r9, r10, r11
+		xandnot		r0, _mi1, r10, r11, r12
+		xandnot		r0, _mo1, r11, r12, r8
+		xandnot		r0, _mu1, r12, r8, r9
+
+		ldr			r1, [sp, #mDo0]
+		ldr			r9, [sp, #_go0]
+		ldr			r8, [sp, #_bi1]
+	    xorrol 		r9, r1,  27
+		ldr			r10, [sp, #_ku0]
+		ldr			r11, [sp, #_ma0]
+		ldr			r12, [sp, #_se1]
+		ldr			r1, [sp, #mDe1]
+	    xorrol 		r8, r2, 31
+	    xorrol 		r10, r7, 19
+	    xorrol 		r11, r5, 20
+	    xorrol 		r12, r1,  1
+		xandnot		r0, _sa1, r8, r9, r10
+		xandnot		r0, _se1, r9, r10, r11
+		xandnot		r0, _si1, r10, r11, r12
+		xandnot		r0, _so1, r11, r12, r8
+		ldr			r10, [r3]
+		xandnot		r0, _su1, r12, r8, r9
+
+		cmp			r10, #0xFFFFFFFF
+		bne			roundLoop
+
+		add			sp,sp,#4*(50+4)
+		pop			{r4-r12,pc}
+
+		ENDP
+
+		ALIGN
+
+		END
diff --git a/c_src/KeccakF-1600-armgcc.s b/c_src/KeccakF-1600-armgcc.s
new file mode 100755
index 0000000..d16594b
--- /dev/null
+++ b/c_src/KeccakF-1600-armgcc.s
@@ -0,0 +1,686 @@
+@ The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+@ Michaël Peeters and Gilles Van Assche. For more information, feedback or
+@ questions, please refer to our website: http://keccak.noekeon.org/
+@ 
+@ Implementation by Ronny Van Keer,
+@ hereby denoted as "the implementer".
+@ 
+@ To the extent possible under law, the implementer has waived all copyright
+@ and related or neighboring rights to the source code in this file.
+@ http://creativecommons.org/publicdomain/zero/1.0/
+
+@ This file was created from a .asm file
+@  using the ads2gas.pl script.
+	.equ DO1STROUNDING, 0
+
+	@ PRESERVE8
+	@	THUMB
+	.syntax unified
+	.cpu cortex-m3
+	.thumb
+
+
+@// --- defines
+
+.equ _ba0	,   0*4
+.equ _ba1	,   1*4
+.equ _be0	,   2*4
+.equ _be1	,   3*4
+.equ _bi0	,   4*4
+.equ _bi1	,   5*4
+.equ _bo0	,   6*4
+.equ _bo1	,   7*4
+.equ _bu0	,   8*4
+.equ _bu1	,   9*4
+.equ _ga0	,  10*4
+.equ _ga1	,  11*4
+.equ _ge0	,  12*4
+.equ _ge1	,  13*4
+.equ _gi0	,  14*4
+.equ _gi1	,  15*4
+.equ _go0	,  16*4
+.equ _go1	,  17*4
+.equ _gu0	,  18*4
+.equ _gu1	,  19*4
+.equ _ka0	,  20*4
+.equ _ka1	,  21*4
+.equ _ke0	,  22*4
+.equ _ke1	,  23*4
+.equ _ki0	,  24*4
+.equ _ki1	,  25*4
+.equ _ko0	,  26*4
+.equ _ko1	,  27*4
+.equ _ku0	,  28*4
+.equ _ku1	,  29*4
+.equ _ma0	,  30*4
+.equ _ma1	,  31*4
+.equ _me0	,  32*4
+.equ _me1	,  33*4
+.equ _mi0	,  34*4
+.equ _mi1	,  35*4
+.equ _mo0	,  36*4
+.equ _mo1	,  37*4
+.equ _mu0	,  38*4
+.equ _mu1	,  39*4
+.equ _sa0	,  40*4
+.equ _sa1	,  41*4
+.equ _se0	,  42*4
+.equ _se1	,  43*4
+.equ _si0	,  44*4
+.equ _si1	,  45*4
+.equ _so0	,  46*4
+.equ _so1	,  47*4
+.equ _su0	,  48*4
+.equ _su1	,  49*4
+
+.equ mDe1	,  50*4
+.equ mDi0	,  51*4
+.equ mDo0	,  52*4
+.equ mDo1	,  53*4
+
+@// --- macros
+
+.macro		xor5		result,ptr,b,g,k,m,s
+		@// result = XOR of the five words at ptr+b, ptr+g, ptr+k, ptr+m and ptr+s. Clobbers r1 and r2.
+		ldr			\result, [\ptr, #\b]
+		ldr			r1, [\ptr, #\g]
+		ldr			r2, [\ptr, #\k]
+		eor			\result, \result, r1				
+		ldr			r1, [\ptr, #\m]
+		eor			\result, \result, r2
+		ldr			r2, [\ptr, #\s]
+		eor			\result, \result, r1				
+		eor			\result, \result, r2
+		.endm
+
+.macro		xorrol 		b, yy, rr
+		@// b = (b XOR yy) rotated left by rr bits, coded as a rotate right by 32-rr. Call sites use rr in 1..31 and a plain eor when no rotation is wanted.
+		eor			\b, \b, \yy
+		ror			\b, #32-\rr
+		.endm
+
+
+.macro		xandnot 	resptr, resofs, aa, bb, cc
+		@// Stores aa XOR (cc AND NOT bb) at resptr+resofs. Clobbers r1 and leaves aa, bb, cc untouched.
+		bic			r1, \cc, \bb
+		eor			r1, r1, \aa
+		str			r1, [\resptr, #\resofs]
+		.endm
+
+.macro		xandnotRC 	resptr, resofs, aa, bb, cc
+		@// Like xandnot, but also XORs in the next table word read through r3 (r3 is post-incremented by 4). Clobbers r1 and cc.
+		ldr			r1, [r3], #4
+		bic			\cc, \cc, \bb
+		eor			\cc, \cc, r1
+		eor			\cc, \cc, \aa
+		str			\cc, [\resptr, #\resofs]
+		.endm
+
+
+	.size	KeccakPermutationOnWords, .-KeccakPermutationOnWords
+	.align	2
+	.global	KeccakPermutationOnWordsAfterXoring_ARM_asm
+	.thumb
+	.thumb_func
+	.type	KeccakPermutationOnWordsAfterXoring_ARM_asm, %function
+KeccakPermutationOnWordsAfterXoring_ARM_asm:
+	@ Inputs as used below: r0 = state (50 words), r1 = input data, r2 = number of 8-byte lanes to XOR in (0 skips the loop).
+	@ Stack frame: 4*(50+4) = 216 bytes (state copy plus four scratch words). r4-r12 and lr are saved and restored.
+	@ NOTE(review): the register-to-argument mapping is inferred from usage here - confirm against the C prototype.
+
+    push	{r4-r12,lr}
+		sub   sp,sp,#4*(50+4)
+
+		movs	r9, r2		@// r9 = remaining lane count, Z flag set when there is nothing to XOR in
+		beq		interleaveDone
+		mov		r8,r0		@// r8 walks the state, r0 stays at its base for the round loop
+interleaveLoop:
+		@// Each pass XORs one 8-byte input lane (r4 = first word, r5 = second word) into state words r6/r7 in bit-interleaved form.
+		ldr		r4, [r1], #4
+		ldr		r5, [r1], #4
+		ldrd    r6, r7, [r8]
+
+		@// Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
+		and		r3,r4,#0x55555555	@// even-numbered bits of r4, compacted into bits 0..15 of r6
+		orr		r3,r3,r3, LSR #1
+		and		r3,r3,#0x33333333
+		orr		r3,r3,r3, LSR #2
+		and		r3,r3,#0x0F0F0F0F
+		orr		r3,r3,r3, LSR #4
+		and		r3,r3,#0x00FF00FF
+		bfi		r3,r3,#8, #8
+		eor		r6,r6,r3, LSR #8
+		@// even-numbered bits of r5, compacted into bits 16..31 of r6
+		and		r3,r5,#0x55555555
+		orr		r3,r3,r3, LSR #1
+		and		r3,r3,#0x33333333
+		orr		r3,r3,r3, LSR #2
+		and		r3,r3,#0x0F0F0F0F
+		orr		r3,r3,r3, LSR #4
+		and		r3,r3,#0x00FF00FF
+		orr		r3,r3,r3, LSR #8
+		eor		r6,r6,r3, LSL #16
+		@// odd-numbered bits of r4, compacted into bits 0..15 of r7
+		and		r3,r4,#0xAAAAAAAA
+		orr		r3,r3,r3, LSL #1
+		and		r3,r3,#0xCCCCCCCC
+		orr		r3,r3,r3, LSL #2
+		and		r3,r3,#0xF0F0F0F0
+		orr		r3,r3,r3, LSL #4
+		and		r3,r3,#0xFF00FF00
+		orr		r3,r3,r3, LSL #8
+		eor		r7,r7,r3, LSR #16
+		@// odd-numbered bits of r5, compacted into bits 16..31 of r7
+		and		r3,r5,#0xAAAAAAAA
+		orr		r3,r3,r3, LSL #1
+		and		r3,r3,#0xCCCCCCCC
+		orr		r3,r3,r3, LSL #2
+		and		r3,r3,#0xF0F0F0F0
+		orr		r3,r3,r3, LSL #4
+		and		r3,r3,#0xFF00FF00
+		orr		r3,r3,r3, LSL #8
+		bfc		r3, #0, #16
+		eor		r7,r7,r3
+
+		strd	r6,r7,[r8], #8		@// write both words back and step to the next lane
+
+		subs	r9,r9,#1
+		bne		interleaveLoop
+
+interleaveDone:
+
+		ldr		r3, =KeccakF1600RoundConstantsWithTerminator	@// r3 = cursor into the round-constant table (advanced by xandnotRC)
+		b		roundLoop	@//jump over the table
+		.ltorg
+
+	@ ALIGN
+
+KeccakF1600RoundConstantsWithTerminator:
+		@// Two words per round, in the order xandnotRC consumes them (first for _ba0, then for _ba1), ended by 0xFFFFFFFF.
+		.word	0x00000001	@// round 0
+    .word 0x00000000
+		.word 0x00000000	@// round 1
+    .word 0x00000089
+		.word 0x00000000	@// round 2
+    .word 0x8000008b
+		.word 0x00000000	@// round 3
+    .word 0x80008080
+		.word 0x00000001	@// round 4
+    .word 0x0000008b
+		.word 0x00000001	@// round 5
+		.word 0x00008000
+		.word 0x00000001	@// round 6
+		.word 0x80008088
+		.word 0x00000001	@// round 7
+		.word 0x80000082
+		.word 0x00000000	@// round 8
+		.word 0x0000000b
+		.word 0x00000000	@// round 9
+		.word 0x0000000a
+		.word 0x00000001	@// round 10
+		.word 0x00008082
+		.word 0x00000000	@// round 11
+		.word 0x00008003
+		.word 0x00000001	@// round 12
+		.word 0x0000808b
+		.word 0x00000001	@// round 13
+ 		.word 0x8000000b
+		.word 0x00000001	@// round 14
+		.word 0x8000008a
+		.word 0x00000001	@// round 15
+		.word 0x80000081
+		.word 0x00000000	@// round 16
+		.word 0x80000081
+		.word 0x00000000	@// round 17
+		.word 0x80000008
+		.word 0x00000000	@// round 18
+		.word 0x00000083
+		.word 0x00000000	@// round 19
+		.word 0x80008003
+		.word 0x00000001	@// round 20
+		.word 0x80008088
+		.word 0x00000000	@// round 21
+		.word 0x80000088
+		.word 0x00000001	@// round 22
+		.word 0x00008000
+		.word 0x00000000	@// round 23
+		.word 0x80008082
+		.word 0xFFFFFFFF	@//terminator
+
+roundLoop:
+		@// One pass = two rounds: state at r0 -> copy at sp (A to E), then copy at sp -> state at r0 (E to A).
+		@//prepTheta	A: column XOR sums of the state at r0, combined into ten D words (registers plus four stack slots)
+	    xor5		r10, r0,_bu0, _gu0, _ku0, _mu0, _su0
+	    xor5		r6, r0,_be1, _ge1, _ke1, _me1, _se1
+		eor			r5, r10, r6, ROR #31		@// r5 = D word applied to the *a0 words (ROR #31 is a rotate left by 1)
+	    xor5	  	r11, r0,_bu1, _gu1, _ku1, _mu1, _su1
+	    xor5		r7, r0,_be0, _ge0, _ke0, _me0, _se0
+	    eor			r4, r11, r7		@// r4 = D word applied to the *a1 words
+
+	    xor5		r8, r0,_bi0, _gi0, _ki0, _mi0, _si0
+	    eor			r1, r8, r11, ROR #31
+		str			r1, [sp, #mDo0]		@// D word for the *o0 words lives on the stack
+	    xor5		r9, r0,_bi1, _gi1, _ki1, _mi1, _si1
+	    eor			r1, r9, r10
+		str			r1, [sp, #mDo1]		@// D word for the *o1 words lives on the stack
+
+	    xor5		r10, r0,_ba0, _ga0, _ka0, _ma0, _sa0
+	    eor			lr, r10, r9, ROR #31		@// lr = D word applied to the *e0 words
+	    xor5		r11, r0,_ba1, _ga1, _ka1, _ma1, _sa1
+	    eor			r1, r11, r8
+		str			r1, [sp, #mDe1]		@// D word for the *e1 words lives on the stack
+
+	    xor5		r9, r0,_bo1, _go1, _ko1, _mo1, _so1
+	    eor			r1, r7, r9, ROR #31
+		str			r1, [sp, #mDi0]		@// D word for the *i0 words lives on the stack
+	    xor5		r8, r0,_bo0, _go0, _ko0, _mo0, _so0
+	    eor			r2, r6, r8		@// r2 = D word applied to the *i1 words (set after the last xor5, which clobbers r2)
+
+	    eor			r7, r8, r11, ROR #31		@// r7 = D word applied to the *u0 words
+	    eor			r6, r9, r10		@// r6 = D word applied to the *u1 words
+
+		@//thetaRhoPiChiIota 0, in A, out E: reads the state at r0 and writes word 0 of every lane of the copy at sp
+		ldr			r8, [r0, #_ba0]		@// inputs for outputs _ba0.._bu0 (_ba0 is stored last, by xandnotRC, which overwrites r10)
+		ldr			r9, [r0, #_ge0]
+		ldr			r10, [r0, #_ki1]
+		ldr			r11, [r0, #_mo1]
+		ldr			r12, [r0, #_su0]
+		ldr			r1, [sp, #mDo1]		@// stack-held D words are fetched into r1 just before use (xandnot clobbers r1)
+		eor			r8, r8, r5		@// this word needs no rotation, so a plain eor replaces xorrol
+		xorrol 		r9, lr, 22
+		xorrol 		r10, r2, 22
+		xorrol 		r11, r1,  11
+		xorrol 		r12, r7,  7
+		xandnot		sp, _be0, r9, r10, r11
+		xandnot		sp, _bi0, r10, r11, r12
+		xandnot		sp, _bo0, r11, r12, r8
+		xandnot		sp, _bu0, r12, r8, r9
+		xandnotRC	sp, _ba0, r8, r9, r10
+
+		ldr			r8, [r0, #_bo0]		@// inputs for outputs _ga0.._gu0
+		ldr			r1, [sp, #mDo0]
+		ldr			r9, [r0, #_gu0]
+	    xorrol 		r8, r1, 14
+		ldr			r1, [sp, #mDe1]
+		ldr			r10, [r0, #_ka1]
+		ldr			r11, [r0, #_me1]
+		ldr			r12, [r0, #_si1]
+	    xorrol 		r9, r7, 10
+	    xorrol 		r10, r4,  2
+	    xorrol 		r11, r1,  23
+	    xorrol 		r12, r2, 31
+		xandnot		sp, _ga0, r8, r9, r10
+		xandnot		sp, _ge0, r9, r10, r11
+		xandnot		sp, _gi0, r10, r11, r12
+		xandnot		sp, _go0, r11, r12, r8
+		xandnot		sp, _gu0, r12, r8, r9
+
+		ldr			r8, [r0, #_be1]		@// inputs for outputs _ka0.._ku0
+		ldr			r1, [sp, #mDe1]
+		ldr			r9, [r0, #_gi0]
+	    xorrol 		r8, r1,   1
+		ldr			r1, [sp, #mDi0]
+		ldr			r10, [r0, #_ko1]
+	    xorrol 		r9, r1,   3
+		ldr			r1, [sp, #mDo1]
+		ldr			r11, [r0, #_mu0]
+		ldr			r12, [r0, #_sa0]
+	    xorrol 		r10, r1,  13
+	    xorrol 		r11, r7,  4
+	    xorrol 		r12, r5,  9
+		xandnot		sp, _ka0, r8, r9, r10
+		xandnot		sp, _ke0, r9, r10, r11
+		xandnot		sp, _ki0, r10, r11, r12
+		xandnot		sp, _ko0, r11, r12, r8
+		xandnot		sp, _ku0, r12, r8, r9
+
+		ldr			r8, [r0, #_bu1]		@// inputs for outputs _ma0.._mu0
+		ldr			r9, [r0, #_ga0]
+		ldr			r10, [r0, #_ke0]
+		ldr			r11, [r0, #_mi1]
+		ldr			r12, [r0, #_so0]
+		ldr			r1, [sp, #mDo0]
+	    xorrol 		r8, r6, 14
+	    xorrol 		r9, r5, 18
+	    xorrol 		r10, lr,  5
+	    xorrol 		r11, r2,  8
+	    xorrol 		r12, r1,  28
+		xandnot		sp, _ma0, r8, r9, r10
+		xandnot		sp, _me0, r9, r10, r11
+		xandnot		sp, _mi0, r10, r11, r12
+		xandnot		sp, _mo0, r11, r12, r8
+		xandnot		sp, _mu0, r12, r8, r9
+
+		ldr			r1, [sp, #mDi0]		@// inputs for outputs _sa0.._su0
+		ldr			r8, [r0, #_bi0]
+		ldr			r9, [r0, #_go1]
+	    xorrol 		r8, r1,  31
+		ldr			r1, [sp, #mDo1]
+		ldr			r10, [r0, #_ku1]
+	    xorrol 		r9, r1,  28
+		ldr			r11, [r0, #_ma1]
+		ldr			r12, [r0, #_se0]
+	    xorrol 		r10, r6, 20
+	    xorrol 		r11, r4, 21
+	    xorrol 		r12, lr,  1
+		xandnot		sp, _sa0, r8, r9, r10
+		xandnot		sp, _se0, r9, r10, r11
+		xandnot		sp, _si0, r10, r11, r12
+		xandnot		sp, _so0, r11, r12, r8
+		xandnot		sp, _su0, r12, r8, r9
+
+		@//	thetaRhoPiChiIota 1, in A, out E: reads the state at r0 and writes word 1 of every lane of the copy at sp
+		ldr			r1, [sp, #mDe1]		@// inputs for outputs _ba1.._bu1 (_ba1 is stored last, by xandnotRC, which overwrites r10)
+		ldr			r9, [r0, #_ge1]
+		ldr			r8, [r0, #_ba1]
+	    xorrol 		r9, r1,  22
+		ldr			r1, [sp, #mDi0]
+		ldr			r10, [r0, #_ki0]
+	    eor			r8, r8, r4    
+	    xorrol 		r10, r1,  21
+		ldr			r1, [sp, #mDo0]
+		ldr			r11, [r0, #_mo0]
+		ldr			r12, [r0, #_su1]
+	    xorrol 		r11, r1,  10
+	    xorrol 		r12, r6,  7
+		xandnot		sp, _be1, r9, r10, r11
+		xandnot		sp, _bi1, r10, r11, r12
+		xandnot		sp, _bo1, r11, r12, r8
+		xandnot		sp, _bu1, r12, r8, r9
+		xandnotRC	sp, _ba1, r8, r9, r10
+
+		ldr			r1, [sp, #mDo1]		@// inputs for outputs _ga1.._gu1
+		ldr			r8, [r0, #_bo1]
+		ldr			r12, [r0, #_si0]
+	    xorrol 		r8, r1,  14
+		ldr			r1, [sp, #mDi0]
+		ldr			r9, [r0, #_gu1]
+	    xorrol 		r12, r1,  30
+		ldr			r10, [r0, #_ka0]
+		ldr			r11, [r0, #_me0]
+	    xorrol 		r9, r6, 10
+	    xorrol 		r10, r5,  1
+	    xorrol 		r11, lr, 22
+		xandnot		sp, _ga1, r8, r9, r10
+		xandnot		sp, _ge1, r9, r10, r11
+		xandnot		sp, _gi1, r10, r11, r12
+		xandnot		sp, _go1, r11, r12, r8
+		xandnot		sp, _gu1, r12, r8, r9
+
+		ldr			r1, [sp, #mDo0]		@// inputs for outputs _ka1.._ku1
+		ldr			r10, [r0, #_ko0]
+		ldr			r8, [r0, #_be0]
+	    xorrol 		r10, r1,  12
+		ldr			r9, [r0, #_gi1]
+		ldr			r11, [r0, #_mu1]
+		ldr			r12, [r0, #_sa1]
+	    eor			r8, r8, lr    
+	    xorrol 		r9, r2,  3
+	    xorrol 		r11, r6,  4
+	    xorrol 		r12, r4,  9
+		xandnot		sp, _ka1, r8, r9, r10
+		xandnot		sp, _ke1, r9, r10, r11
+		xandnot		sp, _ki1, r10, r11, r12
+		xandnot		sp, _ko1, r11, r12, r8
+		xandnot		sp, _ku1, r12, r8, r9
+
+		ldr			r1, [sp, #mDe1]		@// inputs for outputs _ma1.._mu1
+		ldr			r10, [r0, #_ke1]
+		ldr			r11, [r0, #_mi0]
+	    xorrol 		r10, r1,   5
+		ldr			r1, [sp, #mDi0]
+		ldr			r12, [r0, #_so1]
+	    xorrol 		r11, r1,   7
+		ldr			r1, [sp, #mDo1]
+		ldr			r8, [r0, #_bu0]
+		ldr			r9, [r0, #_ga1]
+	    xorrol 		r8, r7, 13
+	    xorrol 		r9, r4, 18
+	    xorrol 		r12, r1,  28
+		xandnot		sp, _ma1, r8, r9, r10
+		xandnot		sp, _me1, r9, r10, r11
+		xandnot		sp, _mi1, r10, r11, r12
+		xandnot		sp, _mo1, r11, r12, r8
+		xandnot		sp, _mu1, r12, r8, r9
+
+		ldr			r1, [sp, #mDo0]		@// inputs for outputs _sa1.._su1
+		ldr			r9, [r0, #_go0]
+		ldr			r8, [r0, #_bi1]
+	    xorrol 		r9, r1,  27
+		ldr			r10, [r0, #_ku0]
+		ldr			r11, [r0, #_ma0]
+		ldr			r12, [r0, #_se1]
+		ldr			r1, [sp, #mDe1]
+	    xorrol 		r8, r2, 31
+	    xorrol 		r10, r7, 19
+	    xorrol 		r11, r5, 20
+	    xorrol 		r12, r1,  1
+		xandnot		sp, _sa1, r8, r9, r10
+		xandnot		sp, _se1, r9, r10, r11
+		xandnot		sp, _si1, r10, r11, r12
+		xandnot		sp, _so1, r11, r12, r8
+		xandnot		sp, _su1, r12, r8, r9
+
+		@//prepTheta	E: same computation as prepTheta A, but summing the copy at sp. D words land in the same registers and stack slots.
+	    xor5		r10, sp,_bu0, _gu0, _ku0, _mu0, _su0
+	    xor5		r6, sp,_be1, _ge1, _ke1, _me1, _se1
+		eor			r5, r10, r6, ROR #31		@// r5 = D word applied to the *a0 words (ROR #31 is a rotate left by 1)
+	    xor5	  	r11, sp,_bu1, _gu1, _ku1, _mu1, _su1
+	    xor5		r7, sp,_be0, _ge0, _ke0, _me0, _se0
+	    eor			r4, r11, r7		@// r4 = D word applied to the *a1 words
+
+	    xor5		r8, sp,_bi0, _gi0, _ki0, _mi0, _si0
+	    eor			r1, r8, r11, ROR #31
+		str			r1, [sp, #mDo0]		@// D word for the *o0 words lives on the stack
+	    xor5		r9, sp,_bi1, _gi1, _ki1, _mi1, _si1
+	    eor			r1, r9, r10
+		str			r1, [sp, #mDo1]		@// D word for the *o1 words lives on the stack
+
+	    xor5		r10, sp,_ba0, _ga0, _ka0, _ma0, _sa0
+	    eor			lr, r10, r9, ROR #31		@// lr = D word applied to the *e0 words
+	    xor5		r11, sp,_ba1, _ga1, _ka1, _ma1, _sa1
+	    eor			r1, r11, r8
+		str			r1, [sp, #mDe1]		@// D word for the *e1 words lives on the stack
+
+	    xor5		r9, sp,_bo1, _go1, _ko1, _mo1, _so1
+	    eor			r1, r7, r9, ROR #31
+		str			r1, [sp, #mDi0]		@// D word for the *i0 words lives on the stack
+	    xor5		r8, sp,_bo0, _go0, _ko0, _mo0, _so0
+	    eor			r2, r6, r8		@// r2 = D word applied to the *i1 words (set after the last xor5, which clobbers r2)
+
+	    eor			r7, r8, r11, ROR #31		@// r7 = D word applied to the *u0 words
+	    eor			r6, r9, r10		@// r6 = D word applied to the *u1 words
+
+		@//thetaRhoPiChiIota 0, in E, out A: reads the copy at sp and writes word 0 of every lane of the state at r0
+		ldr			r8, [sp, #_ba0]		@// inputs for outputs _ba0.._bu0 (_ba0 is stored last, by xandnotRC, which overwrites r10)
+		ldr			r9, [sp, #_ge0]
+		ldr			r10, [sp, #_ki1]
+		ldr			r11, [sp, #_mo1]
+		ldr			r12, [sp, #_su0]
+		ldr			r1, [sp, #mDo1]		@// stack-held D words are fetched into r1 just before use (xandnot clobbers r1)
+		eor			r8, r8, r5		@// this word needs no rotation, so a plain eor replaces xorrol
+		xorrol 		r9, lr, 22
+		xorrol 		r10, r2, 22
+		xorrol 		r11, r1,  11
+		xorrol 		r12, r7,  7
+		xandnot		r0, _be0, r9, r10, r11
+		xandnot		r0, _bi0, r10, r11, r12
+		xandnot		r0, _bo0, r11, r12, r8
+		xandnot		r0, _bu0, r12, r8, r9
+		xandnotRC	r0, _ba0, r8, r9, r10
+
+		ldr			r8, [sp, #_bo0]		@// inputs for outputs _ga0.._gu0
+		ldr			r1, [sp, #mDo0]
+		ldr			r9, [sp, #_gu0]
+	    xorrol 		r8, r1, 14
+		ldr			r1, [sp, #mDe1]
+		ldr			r10, [sp, #_ka1]
+		ldr			r11, [sp, #_me1]
+		ldr			r12, [sp, #_si1]
+	    xorrol 		r9, r7, 10
+	    xorrol 		r10, r4,  2
+	    xorrol 		r11, r1,  23
+	    xorrol 		r12, r2, 31
+		xandnot		r0, _ga0, r8, r9, r10
+		xandnot		r0, _ge0, r9, r10, r11
+		xandnot		r0, _gi0, r10, r11, r12
+		xandnot		r0, _go0, r11, r12, r8
+		xandnot		r0, _gu0, r12, r8, r9
+
+		ldr			r8, [sp, #_be1]		@// inputs for outputs _ka0.._ku0
+		ldr			r1, [sp, #mDe1]
+		ldr			r9, [sp, #_gi0]
+	    xorrol 		r8, r1,   1
+		ldr			r1, [sp, #mDi0]
+		ldr			r10, [sp, #_ko1]
+	    xorrol 		r9, r1,   3
+		ldr			r1, [sp, #mDo1]
+		ldr			r11, [sp, #_mu0]
+		ldr			r12, [sp, #_sa0]
+	    xorrol 		r10, r1,  13
+	    xorrol 		r11, r7,  4
+	    xorrol 		r12, r5,  9
+		xandnot		r0, _ka0, r8, r9, r10
+		xandnot		r0, _ke0, r9, r10, r11
+		xandnot		r0, _ki0, r10, r11, r12
+		xandnot		r0, _ko0, r11, r12, r8
+		xandnot		r0, _ku0, r12, r8, r9
+
+		ldr			r8, [sp, #_bu1]		@// inputs for outputs _ma0.._mu0
+		ldr			r9, [sp, #_ga0]
+		ldr			r10, [sp, #_ke0]
+		ldr			r11, [sp, #_mi1]
+		ldr			r12, [sp, #_so0]
+		ldr			r1, [sp, #mDo0]
+	    xorrol 		r8, r6, 14
+	    xorrol 		r9, r5, 18
+	    xorrol 		r10, lr,  5
+	    xorrol 		r11, r2,  8
+	    xorrol 		r12, r1,  28
+		xandnot		r0, _ma0, r8, r9, r10
+		xandnot		r0, _me0, r9, r10, r11
+		xandnot		r0, _mi0, r10, r11, r12
+		xandnot		r0, _mo0, r11, r12, r8
+		xandnot		r0, _mu0, r12, r8, r9
+
+		ldr			r1, [sp, #mDi0]		@// inputs for outputs _sa0.._su0
+		ldr			r8, [sp, #_bi0]
+		ldr			r9, [sp, #_go1]
+	    xorrol 		r8, r1,  31
+		ldr			r1, [sp, #mDo1]
+		ldr			r10, [sp, #_ku1]
+	    xorrol 		r9, r1,  28
+		ldr			r11, [sp, #_ma1]
+		ldr			r12, [sp, #_se0]
+	    xorrol 		r10, r6, 20
+	    xorrol 		r11, r4, 21
+	    xorrol 		r12, lr,  1
+		xandnot		r0, _sa0, r8, r9, r10
+		xandnot		r0, _se0, r9, r10, r11
+		xandnot		r0, _si0, r10, r11, r12
+		xandnot		r0, _so0, r11, r12, r8
+		xandnot		r0, _su0, r12, r8, r9
+
+		@//	thetaRhoPiChiIota 1, in E, out A: reads the copy at sp and writes word 1 of every lane of the state at r0
+		ldr			r1, [sp, #mDe1]		@// inputs for outputs _ba1.._bu1 (_ba1 is stored last, by xandnotRC, which overwrites r10)
+		ldr			r9, [sp, #_ge1]
+		ldr			r8, [sp, #_ba1]
+	    xorrol 		r9, r1,  22
+		ldr			r1, [sp, #mDi0]
+		ldr			r10, [sp, #_ki0]
+	    eor			r8, r8, r4    
+	    xorrol 		r10, r1,  21
+		ldr			r1, [sp, #mDo0]
+		ldr			r11, [sp, #_mo0]
+		ldr			r12, [sp, #_su1]
+	    xorrol 		r11, r1,  10
+	    xorrol 		r12, r6,  7
+		xandnot		r0, _be1, r9, r10, r11
+		xandnot		r0, _bi1, r10, r11, r12
+		xandnot		r0, _bo1, r11, r12, r8
+		xandnot		r0, _bu1, r12, r8, r9
+		xandnotRC	r0, _ba1, r8, r9, r10
+
+		ldr			r1, [sp, #mDo1]		@// inputs for outputs _ga1.._gu1
+		ldr			r8, [sp, #_bo1]
+		ldr			r12, [sp, #_si0]
+	    xorrol 		r8, r1,  14
+		ldr			r1, [sp, #mDi0]
+		ldr			r9, [sp, #_gu1]
+	    xorrol 		r12, r1,  30
+		ldr			r10, [sp, #_ka0]
+		ldr			r11, [sp, #_me0]
+	    xorrol 		r9, r6, 10
+	    xorrol 		r10, r5,  1
+	    xorrol 		r11, lr, 22
+		xandnot		r0, _ga1, r8, r9, r10
+		xandnot		r0, _ge1, r9, r10, r11
+		xandnot		r0, _gi1, r10, r11, r12
+		xandnot		r0, _go1, r11, r12, r8
+		xandnot		r0, _gu1, r12, r8, r9
+
+		ldr			r1, [sp, #mDo0]		@// inputs for outputs _ka1.._ku1
+		ldr			r10, [sp, #_ko0]
+		ldr			r8, [sp, #_be0]
+	    xorrol 		r10, r1,  12
+		ldr			r9, [sp, #_gi1]
+		ldr			r11, [sp, #_mu1]
+		ldr			r12, [sp, #_sa1]
+	    eor			r8, r8, lr    
+	    xorrol 		r9, r2,  3
+	    xorrol 		r11, r6,  4
+	    xorrol 		r12, r4,  9
+		xandnot		r0, _ka1, r8, r9, r10
+		xandnot		r0, _ke1, r9, r10, r11
+		xandnot		r0, _ki1, r10, r11, r12
+		xandnot		r0, _ko1, r11, r12, r8
+		xandnot		r0, _ku1, r12, r8, r9
+
+		ldr			r1, [sp, #mDe1]		@// inputs for outputs _ma1.._mu1
+		ldr			r10, [sp, #_ke1]
+		ldr			r11, [sp, #_mi0]
+	    xorrol 		r10, r1,   5
+		ldr			r1, [sp, #mDi0]
+		ldr			r12, [sp, #_so1]
+	    xorrol 		r11, r1,   7
+		ldr			r1, [sp, #mDo1]
+		ldr			r8, [sp, #_bu0]
+		ldr			r9, [sp, #_ga1]
+	    xorrol 		r8, r7, 13
+	    xorrol 		r9, r4, 18
+	    xorrol 		r12, r1,  28
+		xandnot		r0, _ma1, r8, r9, r10
+		xandnot		r0, _me1, r9, r10, r11
+		xandnot		r0, _mi1, r10, r11, r12
+		xandnot		r0, _mo1, r11, r12, r8
+		xandnot		r0, _mu1, r12, r8, r9
+
+		ldr			r1, [sp, #mDo0]		@// inputs for outputs _sa1.._su1
+		ldr			r9, [sp, #_go0]
+		ldr			r8, [sp, #_bi1]
+	    xorrol 		r9, r1,  27
+		ldr			r10, [sp, #_ku0]
+		ldr			r11, [sp, #_ma0]
+		ldr			r12, [sp, #_se1]
+		ldr			r1, [sp, #mDe1]
+	    xorrol 		r8, r2, 31
+	    xorrol 		r10, r7, 19
+	    xorrol 		r11, r5, 20
+	    xorrol 		r12, r1,  1
+		xandnot		r0, _sa1, r8, r9, r10
+		xandnot		r0, _se1, r9, r10, r11
+		xandnot		r0, _si1, r10, r11, r12
+		xandnot		r0, _so1, r11, r12, r8
+		ldr			r10, [r3]		@// peek at the next table word without advancing r3 (r10 is no longer needed by the last xandnot)
+		xandnot		r0, _su1, r12, r8, r9
+
+		cmp			r10, #0xFFFFFFFF		@// the terminator word marks the end of the round-constant table
+		bne			roundLoop
+
+		add			sp,sp,#4*(50+4)		@// release the frame reserved in the prologue
+		pop			{r4-r12,pc}		@// restore callee-saved registers and return via the saved lr
+
+	@
+
+	@ ALIGN
+
diff --git a/c_src/KeccakF-1600-avr8.c b/c_src/KeccakF-1600-avr8.c
new file mode 100755
index 0000000..7ea2679
--- /dev/null
+++ b/c_src/KeccakF-1600-avr8.c
@@ -0,0 +1,163 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by Ronny Van Keer,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include <avr/pgmspace.h>
+#include "AVR8-rotate64.h"
+
+typedef unsigned char				UINT8;	/* single byte, used for byte-wise access to lanes */
+typedef UINT8								tSmallUInt;	/* loop counters and table indices */
+typedef unsigned long long  UINT64;
+typedef UINT64 							tKeccakLane;	/* one state lane */
+
+#define cKeccakLaneSizeInBits   (sizeof(tKeccakLane) * 8)	/* not referenced elsewhere in this file */
+
+#define cKeccakNumberOfRounds   24	/* loop bound in KeccakF and size of KeccakF_RoundConstants */
+
+static const tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] PROGMEM = /* const: avr-gcc 4.6+ rejects non-const PROGMEM data */
+{	/* one 64-bit constant per round, read from program memory one byte at a time by KeccakF */
+    (tKeccakLane)0x0000000000000001ULL,
+    (tKeccakLane)0x0000000000008082ULL,
+    (tKeccakLane)0x800000000000808aULL,
+    (tKeccakLane)0x8000000080008000ULL,
+    (tKeccakLane)0x000000000000808bULL,
+    (tKeccakLane)0x0000000080000001ULL,
+    (tKeccakLane)0x8000000080008081ULL,
+    (tKeccakLane)0x8000000000008009ULL,
+    (tKeccakLane)0x000000000000008aULL,
+    (tKeccakLane)0x0000000000000088ULL,
+    (tKeccakLane)0x0000000080008009ULL,
+    (tKeccakLane)0x000000008000000aULL,
+    (tKeccakLane)0x000000008000808bULL,
+    (tKeccakLane)0x800000000000008bULL,
+    (tKeccakLane)0x8000000000008089ULL,
+    (tKeccakLane)0x8000000000008003ULL,
+    (tKeccakLane)0x8000000000008002ULL,
+    (tKeccakLane)0x8000000000000080ULL,
+    (tKeccakLane)0x000000000000800aULL,
+    (tKeccakLane)0x800000008000000aULL,
+    (tKeccakLane)0x8000000080008081ULL,
+    (tKeccakLane)0x8000000000008080ULL,
+    (tKeccakLane)0x0000000080000001ULL,
+    (tKeccakLane)0x8000000080008008ULL
+};
+
+static const tSmallUInt KeccakF_RotationConstants[24] PROGMEM = /* const: avr-gcc 4.6+ rejects non-const PROGMEM data */
+{	/* rotation for each of the 24 Rho-Pi steps, in the order KeccakF walks them; ROT_CODE comes from AVR8-rotate64.h */
+	 ROT_CODE( 1), ROT_CODE( 3), ROT_CODE( 6), ROT_CODE(10), ROT_CODE(15), 
+	 ROT_CODE(21), ROT_CODE(28), ROT_CODE(36), ROT_CODE(45), ROT_CODE(55),  
+	 ROT_CODE( 2), ROT_CODE(14), ROT_CODE(27), ROT_CODE(41), ROT_CODE(56),  
+	 ROT_CODE( 8), ROT_CODE(25), ROT_CODE(43), ROT_CODE(62), ROT_CODE(18), 
+	 ROT_CODE(39), ROT_CODE(61), ROT_CODE(20), ROT_CODE(44)
+};
+
+static const tSmallUInt KeccakF_PiLane[24] PROGMEM = /* const: avr-gcc 4.6+ rejects non-const PROGMEM data */
+{	/* destination lane index for each of the 24 Rho-Pi steps in KeccakF (the walk starts from state[1]) */
+    10,  7, 11, 17, 18,  3,  5, 16,  8, 21, 24,  4, 15, 23, 19, 13, 12,  2, 20, 14, 22,  9,  6,  1 
+};
+
+static const tSmallUInt KeccakF_Mod5[10] PROGMEM = /* const: avr-gcc 4.6+ rejects non-const PROGMEM data */
+{	/* i mod 5 for i = 0..9, so KeccakF can look up (x+1) mod 5 and (x+4) mod 5 without a division */
+    0, 1, 2, 3, 4, 0, 1, 2, 3, 4
+};
+
+
+void KeccakF( tKeccakLane * state )	/* applies cKeccakNumberOfRounds rounds in place to the 25 lanes at state */
+{	/* lanes are indexed state[5*row + x]: x selects the column, rows are 5 lanes apart */
+	tSmallUInt	round;
+	tKeccakLane	C[5];	/* per-column XOR of the five rows; rebuilt for the next round inside the Chi loop */
+
+	// prepare Theta: C[x] = XOR of the five lanes in column x (done here only for the first round)
+	{
+		tSmallUInt x;
+		tKeccakLane	* pC;
+		for ( x = 0, pC = C; x < 5; ++x, ++pC )
+		{
+			*pC = state[x] ^ state[5 + x] ^ state[10 + x] ^ state[15 + x] ^ state[20 + x];
+		}
+	}
+
+  for( round = 0; round < cKeccakNumberOfRounds; ++round )
+  {
+		// Theta: XOR every lane of column x with rotl1(C[(x+1) mod 5]) ^ C[(x+4) mod 5]
+  	{
+			tSmallUInt x;
+			for ( x = 0; x < 5; ++x )
+			{
+				tKeccakLane temp;
+				tSmallUInt y;
+				temp = rotate64_1bit_left( C[pgm_read_byte((KeccakF_Mod5+1)+x)] );	// KeccakF_Mod5[x+1]; helper is from AVR8-rotate64.h
+				temp ^= C[pgm_read_byte((KeccakF_Mod5+4)+x)];	// KeccakF_Mod5[x+4]
+				for ( y = 0; y < 25; y += 5 )	// y is the row offset (0, 5, 10, 15, 20)
+				{
+					state[y + x] ^= temp;
+				}
+			}
+		}
+
+    // Rho Pi: starting from state[1], each step writes the rotated carried lane into the next lane of KeccakF_PiLane
+    {
+			tKeccakLane temp;
+			tSmallUInt x;
+	
+			temp = state[1];
+			for ( x = 0; x < 24; ++x )
+			{
+				tSmallUInt t;
+				tKeccakLane T[1];	// one-element array used as a plain temporary
+				t = pgm_read_byte(&KeccakF_PiLane[x]);
+				T[0] = state[t];	// save the lane about to be overwritten; it is carried into the next step
+				state[t] = rotate64left_code( temp, pgm_read_byte(&KeccakF_RotationConstants[x]) );
+				temp = T[0];
+			}
+		}
+
+		// Chi Iota Prepare Theta: done one byte position z at a time across all 25 lanes
+		{
+			tSmallUInt z;
+			UINT8 * p = (unsigned char *)state;
+			UINT8 * pC = (unsigned char *)C;
+
+			for( z = 0; z < 8; ++z, ++p, ++pC ) 	// p / pC address byte z of lane 0 / C[0]
+			{
+				tSmallUInt y;
+				UINT8 c0, c1, c2, c3, c4, t;
+
+				c0 = c1 = c2 = c3 = c4 = 0;	// running column parities for byte z
+				for( y = 5; y != 0; --y, p += 40 ) 	// one row per pass; 40 = 5 lanes * 8 bytes
+				{
+					UINT8 a0 = *p;
+					UINT8 a1 = *(p+8);
+					UINT8 a2 = *(p+16);
+					UINT8 a3 = *(p+24);
+					UINT8 a4 = *(p+32);
+
+					*p			= t = a0 ^ ((~a1) & a2); c0 ^= t;
+					*(p+8)	= t = a1 ^ ((~a2) & a3); c1 ^= t;
+					*(p+16) = a2 ^= ((~a3) & a4); c2 ^= a2;
+					*(p+24) = a3 ^= ((~a4) & a0); c3 ^= a3;
+					*(p+32) = a4 ^= ((~a0) & a1); c4 ^= a4;
+				}
+				p -= 5 * 5 * 8;	// rewind the 5 * 40 bytes stepped by the row loop, back to byte z of lane 0
+				y = pgm_read_byte( (UINT8 *)(KeccakF_RoundConstants+round) + z );	// y is reused: byte z of this round's constant
+				*p ^= y;	// Iota: the round constant is XORed into lane 0 only
+				*pC 			= c0 ^ y;	// keep C[0] in step with the Iota change just made to lane 0
+				*(pC+ 8)	= c1;
+				*(pC+16)	= c2;
+				*(pC+24)	= c3;
+				*(pC+32)	= c4;
+			}
+		}
+  }
+
+}
diff --git a/c_src/KeccakF-1600-avr8asm-compact.s b/c_src/KeccakF-1600-avr8asm-compact.s
new file mode 100755
index 0000000..c87920f
--- /dev/null
+++ b/c_src/KeccakF-1600-avr8asm-compact.s
@@ -0,0 +1,647 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by Ronny Van Keer, hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include "Keccak-avr8-settings.h"
+#include "crypto_hash.h"
+
+#define	cKeccakR_SizeInBytes	(cKeccakR/8)
+
+#ifndef crypto_hash_BYTES
+    #ifdef cKeccakFixedOutputLengthInBytes
+        #define crypto_hash_BYTES cKeccakFixedOutputLengthInBytes
+    #else
+        #define crypto_hash_BYTES cKeccakR_SizeInBytes
+    #endif
+#endif
+
+//	Registers used in all routines
+#define	zero			1
+#define	rpState		24
+#define	rX				26
+#define	rY				28
+#define	rZ				30
+
+
+/*
+ * int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen )
+ * Absorbs inlen bytes of in into a zeroed state on the stack, pads, permutes, copies crypto_hash_BYTES state bytes to out; returns 0.
+ * argument out   is passed in r24:r25
+ * argument in    is passed in r22:r23
+ * argument inlen is passed in r14:r21, only lowest 16-bits (r14-r15) are used
+ */
+.global crypto_hash		// populate.py, please update crypto_hash
+crypto_hash:			// populate.py, please update crypto_hash
+
+	//	crypto_hash only registers
+	#define	rT1				16
+	#define	rT2				17
+	#define	rT3				18
+	#define	rInLen		22 //(2 regs)
+	#define	sp				0x3D
+
+	push	r2										// save r2-r17 and r28:r29; popped in reverse order before returning
+	push	r3
+	push	r4
+	push	r5
+	push	r6
+	push	r7
+	push	r8
+	push	r9
+	push	r10
+	push	r11
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	push	r16
+	push	r17
+	push	r28
+	push	r29
+
+	//	Allocate state (25*8) + C variables (5*8)
+	in		rZ,			 			sp
+	in		rZ+1, 				sp+1
+	subi  rZ,			  		240			// 240 = 200 state bytes + 40 bytes for the 5 C lanes
+	sbci  rZ+1,					0
+	in		r0,						0x3F			// save the status register, then disable interrupts while SP is written in two halves
+	cli         				
+	out		sp+1, 				rZ+1
+	out		sp,						rZ
+	out		0x3F,					r0				// restore the saved status register
+	adiw	rZ,						41			// pointer to start of state, end of C, compensate post decrement
+
+	push	r24										// save out pointer
+	push	r25
+
+	movw	rpState,			rZ				// r24:r25 now hold the state pointer expected by KeccakF
+	movw	rY, 					r22				//y contains in pointer
+	movw	rInLen,				r14				// low 16 bits of inlen; r22:r23 are free now that in was copied to Y
+
+	ldi		rT3,					5*5*8			//clear state
+clearStateLoop:
+	st		z+,						zero
+	dec		rT3
+	brne	clearStateLoop
+
+	//	Full blocks
+	cpi		rInLen,				cKeccakR_SizeInBytes	// 16-bit compare of the remaining length with the rate
+	cpc		rInLen+1,			zero         
+	brcs	ch_lastblock									// less than one full block left
+
+ch_FullRateLoop:
+	ldi		rT3, 					cKeccakR_SizeInBytes
+	movw	rZ, 					rpState
+ch_XorLanesLoop:
+	ld		rT1,					Y+			// XOR one input byte into the state
+	ld		rT2, 					Z
+	eor		rT1, 					rT2
+	st		Z+, 					rT1
+	subi	rT3, 					1
+	brne	ch_XorLanesLoop
+	
+	push	rY										// KeccakF overwrites Y; keep the input pointer across the call
+	push	rY+1
+	call  KeccakF
+	pop		rY+1
+	pop		rY
+
+	subi  rInLen,   		cKeccakR_SizeInBytes	// one block consumed
+	sbci  rInLen+1, 		0
+	cpi   rInLen,   		cKeccakR_SizeInBytes
+	cpc   rInLen+1, 		zero         
+	brcc	ch_FullRateLoop								// loop while at least one full block remains
+
+ch_lastblock:					//	XOR last incomplete block into state
+	movw	rZ, 					rpState
+
+	subi	rInLen,				0				// only the low byte is tested: the remaining length is below the rate here
+	breq	ch_Padding
+ch_xorBytesLoop:
+	ld		rT1, 					Y+
+	ld		rT2, 					Z
+	eor		rT1, 					rT2
+	st		Z+, 					rT1
+	subi	rInLen,				1
+	brne	ch_xorBytesLoop
+
+ch_Padding:
+	ldi		rT1,					1				// XOR 0x01 into the state byte that follows the message bytes
+	ld		rT2,					Z
+	eor		rT1,					rT2
+	st		Z,						rT1
+
+	ldi		rZ,						cKeccakR_SizeInBytes-1	// Z = rpState + rate - 1 (last byte of the rate part)
+	add		rZ,						rpState
+	mov		rZ+1,					rpState+1		// mov leaves the carry of the add above intact for adc
+	adc		rZ+1,					zero
+	ld		rT1,					Z
+	subi	rT1,					0x80			// subtracting 0x80 flips bit 7, same result as XOR 0x80
+	st		Z,						rT1
+
+	call  KeccakF
+
+	//output 
+	ldi		rT3,			 		crypto_hash_BYTES
+	movw	rY, 					rpState
+	pop		rZ+1																	; restore out pointer
+	pop		rZ
+outputLoop:
+	ld		rT1,					Y+			// copy state bytes to out
+	st		Z+, 					rT1
+	dec		rT3
+	brne  outputLoop
+
+
+	//	Free state and pop registers
+	ldi		rZ,						199			// rpState + 199 equals the stack pointer value read before the 240-byte allocation
+	add		rpState,			rZ
+	adc		rpState+1,		zero
+	in		r0,						0x3F
+	cli         				
+	out		sp+1, 				rpState+1
+	out		sp,						rpState
+	out		0x3F,					r0
+
+	pop		r29
+	pop		r28
+	pop		r17
+	pop		r16
+	pop		r15
+	pop		r14
+	pop		r13
+	pop		r12
+	pop		r11
+	pop		r10
+	pop		r9
+	pop		r8
+	pop		r7
+	pop		r6
+	pop		r5
+	pop		r4
+	pop		r3
+	pop		r2
+
+	// return 0
+	mov		r24, 					zero
+	mov		r25, 					zero
+
+	#undef	rInLen
+	#undef	rT1
+	#undef	rT2
+	#undef	rT3
+	#undef	sp				
+
+	ret
+
+
+//#define ROT_BIT(a)	 (a <= 4) ? ((a == 0) ? 0x80 : (a & 7)) : (0x80 | (8-a))
+// ROT_BIT: low 3 bits of the rotation offset; KeccakF treats 0-4 as a left rotation and 5-7 as a right rotation by 8-code bits.
+#define ROT_BIT(a)	 ((a) & 7)
+#define ROT_BYTE(a)	 (((a)/8 + !!(((a)%8) > 4)) & 7)	// whole bytes to rotate left; one more when the bit part is above 4 (done as a right rotation)
+// One 3-byte entry per Rho/Pi step: bit code for the lane being moved, byte rotation for the lane read at the destination, destination byte offset.
+KeccakF_RhoPiConstants:
+	.BYTE	 ROT_BIT( 1), ROT_BYTE( 3),	10 * 8
+	.BYTE	 ROT_BIT( 3), ROT_BYTE( 6),	 7 * 8
+	.BYTE	 ROT_BIT( 6), ROT_BYTE(10),	11 * 8
+	.BYTE	 ROT_BIT(10), ROT_BYTE(15),	17 * 8
+	.BYTE	 ROT_BIT(15), ROT_BYTE(21),	18 * 8
+	.BYTE	 ROT_BIT(21), ROT_BYTE(28),	 3 * 8
+	.BYTE	 ROT_BIT(28), ROT_BYTE(36),	 5 * 8
+	.BYTE	 ROT_BIT(36), ROT_BYTE(45),	16 * 8
+	.BYTE	 ROT_BIT(45), ROT_BYTE(55),	 8 * 8
+	.BYTE	 ROT_BIT(55), ROT_BYTE( 2),	21 * 8 
+	.BYTE	 ROT_BIT( 2), ROT_BYTE(14),	24 * 8
+	.BYTE	 ROT_BIT(14), ROT_BYTE(27),	 4 * 8
+	.BYTE	 ROT_BIT(27), ROT_BYTE(41),	15 * 8
+	.BYTE	 ROT_BIT(41), ROT_BYTE(56),	23 * 8
+	.BYTE	 ROT_BIT(56), ROT_BYTE( 8),	19 * 8 
+	.BYTE	 ROT_BIT( 8), ROT_BYTE(25),	13 * 8
+	.BYTE	 ROT_BIT(25), ROT_BYTE(43),	12 * 8
+	.BYTE	 ROT_BIT(43), ROT_BYTE(62),	 2 * 8
+	.BYTE	 ROT_BIT(62), ROT_BYTE(18),	20 * 8
+	.BYTE	 ROT_BIT(18), ROT_BYTE(39),	14 * 8
+	.BYTE	 ROT_BIT(39), ROT_BYTE(61),	22 * 8
+	.BYTE	 ROT_BIT(61), ROT_BYTE(20),	 9 * 8
+	.BYTE	 ROT_BIT(20), ROT_BYTE(44),	 6 * 8
+	.BYTE	 ROT_BIT(44), ROT_BYTE( 1),	 1 * 8
+
+// 8 bytes per round, read one byte at a time with lpm and XORed into the matching byte of the first state lane.
+KeccakF_RoundConstants:
+	.BYTE	   0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x82, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x8a, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x00, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x8b, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x01, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x81, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x09, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x8a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x88, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x09, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x0a, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x8b, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x89, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x03, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x02, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x0a, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x0a, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x81, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x01, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x08, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0xFF, 0		//terminator: KeccakF stops when the byte following a round's constants is 0xFF
+
+	.text
+
+
+ 
+// KeccakF: runs the rounds listed in KeccakF_RoundConstants in place on the state at rpState.
+// Not callable from C!
+// Uses r0, r2-r21 and X, Y, Z without saving them; the 40 bytes below rpState hold the 5 C lanes.
+// argument rpState is passed in r24:r25
+//
+KeccakF:
+
+	//	Variables used in multiple operations
+	#define	rTemp			2			// 8 regs (2-9)
+	#define	rTempBis	10		// 8 regs (10-17)
+	#define	rTempTer	18		// 2 regs (18-19)
+	#define	pRound		20		// 2 regs (20-21)
+
+	//	Initial Prepare Theta
+	#define	TCIPx				rTempTer
+	
+	movw	rZ, 					rpState				// Z points to 5 C lanes
+	sbiw	rZ,						40
+	movw	rY,						rpState
+	ldi		TCIPx,				5*8					// one iteration per byte of the 5 C lanes
+KeccakInitialPrepTheta_Loop:
+	ld		r0,						Y				// byte from row 0
+	adiw	rY,						40				// rows are 40 bytes apart
+	ld		rTemp,				Y
+	adiw	rY,						40
+	eor		r0,						rTemp
+	ld		rTemp,				Y
+	adiw	rY,						40
+	eor		r0,						rTemp
+	ld		rTemp,				Y
+	eor		r0,						rTemp
+	ldd		rTemp,				Y+40			// byte from row 4, read with a displacement instead of moving Y
+	eor		r0,						rTemp
+	st		Z+, 					r0				// C byte = XOR of the five bytes of this column
+	subi	rY,						119				// Y was advanced by 120; net effect is +1 to the next byte of row 0
+	sbc		rY+1,					zero
+	dec		TCIPx
+	brne	KeccakInitialPrepTheta_Loop
+	#undef	TCIPx
+
+	ldi		pRound,				lo8(KeccakF_RoundConstants)	// pRound -> constants of the first round
+	ldi		pRound+1,			hi8(KeccakF_RoundConstants)
+Keccak_RoundLoop:
+
+	//	Theta
+	#define	TCplus			rX
+	#define	TCminus			rZ
+	#define	TCcoordX		rTempTer
+	#define	TCcoordY		rTempTer+1
+
+	movw	TCminus,			rpState
+	sbiw	TCminus,			1*8				// Z -> C[4], the last of the 5 lanes stored below the state
+	movw	TCplus,				rpState
+	sbiw	TCplus,				4*8				// X -> C[1]
+	movw	rY,						rpState
+
+	ldi		TCcoordX,			0x16			// bit pattern shifted right once per column; drives the pointer wrap-around and loop exit below
+KeccakTheta_Loop1:
+	ld		rTemp+0,			X+				// rTemp = C lane at X
+	ld		rTemp+1,			X+	
+	ld		rTemp+2,			X+	
+	ld		rTemp+3,			X+	
+	ld		rTemp+4,			X+	
+	ld		rTemp+5,			X+	
+	ld		rTemp+6,			X+	
+	ld		rTemp+7,			X+	
+
+	lsl		rTemp+0								// rotate rTemp left by one bit ...
+	rol		rTemp+1
+	rol		rTemp+2
+	rol		rTemp+3
+	rol		rTemp+4
+	rol		rTemp+5
+	rol		rTemp+6
+	rol		rTemp+7
+	adc		rTemp+0, 			zero			// ... re-inserting the bit shifted out of byte 7 into bit 0
+
+	ld		r0,						Z+				// rTemp ^= C lane at Z
+	eor		rTemp+0,			r0
+	ld		r0,						Z+	
+	eor		rTemp+1,			r0
+	ld		r0,						Z+	
+	eor		rTemp+2,			r0
+	ld		r0,						Z+	
+	eor		rTemp+3,			r0
+	ld		r0,						Z+	
+	eor		rTemp+4,			r0
+	ld		r0,						Z+	
+	eor		rTemp+5,			r0
+	ld		r0,						Z+	
+	eor		rTemp+6,			r0
+	ld		r0,						Z+	
+	eor		rTemp+7,			r0
+
+	ldi		TCcoordY,			5				// apply rTemp to the lane of this column in each of the 5 rows
+KeccakTheta_Loop2:
+	ld		r0,						Y
+	eor		r0, 					rTemp+0
+	st		Y+, 					r0
+	ld		r0,						Y
+	eor		r0, 					rTemp+1
+	st		Y+, 					r0
+	ld		r0,						Y
+	eor		r0, 					rTemp+2
+	st		Y+, 					r0
+	ld		r0,						Y
+	eor		r0, 					rTemp+3
+	st		Y+, 					r0
+	ld		r0,						Y
+	eor		r0, 					rTemp+4
+	st		Y+, 					r0
+	ld		r0,						Y
+	eor		r0, 					rTemp+5
+	st		Y+, 					r0
+	ld		r0,						Y
+	eor		r0, 					rTemp+6
+	st		Y+, 					r0
+	ld		r0,						Y
+	eor		r0, 					rTemp+7
+	st		Y+, 					r0
+	adiw	rY, 					32				// 8 bytes done + 32 skipped = same column in the next row
+
+	dec		TCcoordY
+	brne	KeccakTheta_Loop2
+
+	subi	rY, 					200-8			// back to row 0, next column
+	sbc		rY+1, 				zero
+
+	lsr		TCcoordX							// carry = next pattern bit, Z flag = pattern exhausted
+	brcc	1f										// carry clear: one of the C pointers has run past C[4] and must wrap
+	breq	KeccakTheta_End				// carry set and nothing left: all 5 columns done
+	rjmp	KeccakTheta_Loop1
+1:	
+	cpi		TCcoordX, 		0x0B			// 0x16 >> 1: true only after the first column
+	brne	2f
+	sbiw	TCminus, 			40				// Z ran past C[4]; wrap it back to C[0]
+	rjmp	KeccakTheta_Loop1
+2:
+	sbiw	TCplus,				40				// X ran past C[4]; wrap it back to C[0]
+	rjmp	KeccakTheta_Loop1
+
+KeccakTheta_End:	
+	#undef	TCplus
+	#undef	TCminus
+	#undef	TCcoordX
+	#undef	TCcoordY
+
+
+	//	Rho Pi
+	#define	RPindex			rTempTer+0
+	#define	RPTemp			rTempTer+1
+
+	sbiw	rY, 					32				// Theta leaves Y at rpState+40; step back to lane 1 (rpState+8)
+
+	ld		rTemp+0,			Y+				// rTemp = lane 1, the first lane to be moved
+	ld		rTemp+1,			Y+	
+	ld		rTemp+2,			Y+	
+	ld		rTemp+3,			Y+	
+	ld		rTemp+4,			Y+	
+	ld		rTemp+5,			Y+	
+	ld		rTemp+6,			Y+	
+	ld		rTemp+7,			Y+	
+	
+	ldi		rZ,						lo8(KeccakF_RhoPiConstants)
+	ldi		rZ+1,					hi8(KeccakF_RhoPiConstants)
+
+KeccakRhoPi_Loop:
+	;	do bit rotation
+	lpm		RPTemp,				Z+		;get number of bits to rotate
+	cpi		RPTemp,				5
+	brcs	rotate64_nbit_leftOrNot		; codes 0-4: rotate left by that many bits
+	neg		RPTemp									; codes 5-7: rotate right instead ...
+	andi	RPTemp,				3				; ... by 8-code bits (3, 2 or 1)
+
+rotate64_nbit_right:
+	bst		rTemp, 				0				; remember bit 0, it must become bit 63
+	ror		rTemp+7
+	ror		rTemp+6
+	ror		rTemp+5
+	ror		rTemp+4
+	ror		rTemp+3
+	ror		rTemp+2
+	ror		rTemp+1
+	ror		rTemp
+	bld		rTemp+7, 			7				; overwrite the bit the first ror pulled in from the stale carry
+	dec		RPTemp
+	brne	rotate64_nbit_right
+	rjmp	KeccakRhoPi_RhoBitRotateDone
+
+rotate64_nbit_leftOrNot:
+	tst		RPTemp
+	breq	KeccakRhoPi_RhoBitRotateDone	; code 0: no bit rotation
+rotate64_nbit_left:
+	lsl		rTemp
+	rol		rTemp+1
+	rol		rTemp+2
+	rol		rTemp+3
+	rol 	rTemp+4
+	rol		rTemp+5
+	rol		rTemp+6
+	rol		rTemp+7
+	adc		rTemp, 				r1				; r1 is the zero register: adds only the carried-out top bit
+	dec		RPTemp
+	brne	rotate64_nbit_left
+
+KeccakRhoPi_RhoBitRotateDone:
+	lpm		r0,						Z+		;get number of bytes to rotate
+	lpm		RPindex,			Z+		;get index in state
+	movw	rY,						rpState
+	add		rY,						RPindex		; Y -> destination lane
+	adc		rY+1, 				zero
+	
+	ldi		rX,						rTempBis	; X = register number of rTempBis used as a data address (relies on registers being addressable through X)
+	add		rX,						r0				; start at the byte-rotation offset
+	mov		rX+1,					zero
+	ldi		RPTemp,				8
+KeccakRhoPi_PiByteRotLoop:
+	ld		r0,						Y+				; copy the destination lane into rTempBis, rotated left by whole bytes
+	st		X+,						r0
+	cpi		rX,						rTempBis+8
+	brne	KeccakRhoPi_PiByteRotFirst
+	ldi		rX,						rTempBis	; wrap around after the 8th register
+KeccakRhoPi_PiByteRotFirst:
+	dec		RPTemp
+	brne	KeccakRhoPi_PiByteRotLoop
+
+	sbiw	rY, 					8				; back to the start of the destination lane
+	st		Y+,						rTemp+0		; store the rotated lane there
+	st		Y+,						rTemp+1
+	st		Y+,						rTemp+2
+	st		Y+,						rTemp+3
+	st		Y+,						rTemp+4
+	st		Y+,						rTemp+5
+	st		Y+,						rTemp+6
+	st		Y+,						rTemp+7
+
+	movw	rTemp+0,			rTempBis+0	; the lane just read becomes the next lane to move
+	movw	rTemp+2,			rTempBis+2
+	movw	rTemp+4,			rTempBis+4
+	movw	rTemp+6,			rTempBis+6
+KeccakRhoPi_RhoDone:
+	subi	RPindex, 			8				; the last table entry has offset 1*8, which ends the loop
+	brne	KeccakRhoPi_Loop
+
+	#undef	RPindex			
+	#undef	RPTemp
+
+
+	//	Chi Iota prepare Theta
+	#define	CIPTa0			rTemp
+	#define	CIPTa1			rTemp+1
+	#define	CIPTa2			rTemp+2
+	#define	CIPTa3			rTemp+3
+	#define	CIPTa4			rTemp+4
+	#define	CIPTc0			rTempBis
+	#define	CIPTc1			rTempBis+1
+	#define	CIPTc2			rTempBis+2
+	#define	CIPTc3			rTempBis+3
+	#define	CIPTc4			rTempBis+4
+	#define	CIPTz				rTempBis+6
+	#define	CIPTy				rTempBis+7
+
+	movw	rY,						rpState			; Y walks the state one byte position at a time
+	movw	rX,			 			rpState			; 5 * C
+	sbiw	rX,						40
+	movw	rZ, 					pRound			; Z -> constant bytes of this round
+
+	ldi		CIPTz,				8				; 8 byte positions per lane
+KeccakChiIotaPrepareTheta_zLoop:
+	mov		CIPTc0,				zero
+	mov		CIPTc1,				zero
+	movw	CIPTc2,				CIPTc0		; clears c2 and c3 by copying the zeroed c0:c1 pair
+	mov		CIPTc4,				zero
+
+	ldi		CIPTy,				5				; 5 rows
+KeccakChiIotaPrepareTheta_yLoop:
+	ld		CIPTa0,				Y				; same byte position of the 5 lanes in this row
+	ldd		CIPTa1,				Y+8
+	ldd		CIPTa2,				Y+16
+	ldd		CIPTa3,				Y+24
+	ldd		CIPTa4,				Y+32
+	
+	;*p			= t = a0 ^ ((~a1) & a2); c0 ^= t;
+	mov		r0, 					CIPTa1
+	com		r0      			
+	and		r0,						CIPTa2
+	eor		r0,						CIPTa0
+	eor		CIPTc0,				r0
+	st		Y,  					r0
+	
+	;*(p+8)	= t = a1 ^ ((~a2) & a3); c1 ^= t;
+	mov		r0, 					CIPTa2
+	com		r0      			
+	and		r0,						CIPTa3
+	eor		r0,						CIPTa1
+	eor		CIPTc1,				r0
+	std		Y+8, 					r0
+
+	;*(p+16) = a2 ^= ((~a3) & a4); c2 ^= a2;
+	mov		r0, 					CIPTa3
+	com		r0      			
+	and		r0,						CIPTa4
+	eor		r0,						CIPTa2
+	eor		CIPTc2,				r0
+	std		Y+16,					r0
+
+	;*(p+24) = a3 ^= ((~a4) & a0); c3 ^= a3;
+	mov		r0, 					CIPTa4
+	com		r0      			
+	and		r0,						CIPTa0
+	eor		r0,						CIPTa3
+	eor		CIPTc3,				r0
+	std		Y+24,					r0
+
+	;*(p+32) = a4 ^= ((~a0) & a1); c4 ^= a4;
+	com		CIPTa0									; a0 is not needed afterwards, so it is complemented in place
+	and		CIPTa0,				CIPTa1
+	eor		CIPTa0,				CIPTa4
+	eor		CIPTc4,				CIPTa0
+	std		Y+32,					CIPTa0
+	
+	adiw	rY,						40				; next row
+	dec		CIPTy
+	brne	KeccakChiIotaPrepareTheta_yLoop
+
+	subi	rY,						200				; back to row 0
+	sbc		rY+1,					zero	
+
+	lpm		r0, 					Z+		;Round Constant
+	ld		CIPTa0,				Y				; Iota: XOR the constant byte into the first lane
+	eor		CIPTa0,				r0
+	st		Y+,						CIPTa0		; Y moves on to the next byte position
+                			
+	movw	pRound,				rZ				; park the constant pointer so Z can address the C lanes
+	movw	rZ,						rX
+	eor		CIPTc0,				r0				; include the Iota change in the column parity
+	st		Z+,						CIPTc0	
+	std		Z+7,					CIPTc1		; Z was already incremented, so +7 is 8 bytes after the c0 byte
+	std		Z+15,					CIPTc2
+	std		Z+23,					CIPTc3
+	std		Z+31,					CIPTc4
+	movw	rX,						rZ				; X -> next byte position in C
+	movw	rZ,						pRound
+
+	dec		CIPTz
+	brne	KeccakChiIotaPrepareTheta_zLoop
+
+	#undef	CIPTa0
+	#undef	CIPTa1
+	#undef	CIPTa2
+	#undef	CIPTa3
+	#undef	CIPTa4
+	#undef	CIPTc0
+	#undef	CIPTc1
+	#undef	CIPTc2
+	#undef	CIPTc3
+	#undef	CIPTc4
+	#undef	CIPTz
+	#undef	CIPTy
+
+
+	;Check for terminator
+	lpm		r0,						Z				; first byte after the constants just used
+	inc		r0											; 0xFF + 1 wraps to 0 and sets the Z flag
+	breq	Keccak_Done
+	rjmp	Keccak_RoundLoop
+Keccak_Done:
+	ret
+
+	#undef	rTemp			
+	#undef	rTempBis	
+	#undef	rTempTer
+	#undef	pRound		
+
+	#undef	rpState		
+	#undef	zero			
+	#undef	rX				
+	#undef	rY				
+	#undef	rZ				
diff --git a/c_src/KeccakF-1600-avr8asm-fast.s b/c_src/KeccakF-1600-avr8asm-fast.s
new file mode 100755
index 0000000..e27f174
--- /dev/null
+++ b/c_src/KeccakF-1600-avr8asm-fast.s
@@ -0,0 +1,934 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by Ronny Van Keer, hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include "Keccak-avr8-settings.h"
+#include "crypto_hash.h"
+
+#define	cKeccakR_SizeInBytes	(cKeccakR/8)
+
+#ifndef crypto_hash_BYTES
+    #ifdef cKeccakFixedOutputLengthInBytes
+        #define crypto_hash_BYTES cKeccakFixedOutputLengthInBytes
+    #else
+        #define crypto_hash_BYTES cKeccakR_SizeInBytes
+    #endif
+#endif
+
+//	Registers used in all routines
+#define	zero			1
+#define	rpState		24
+#define	rX				26
+#define	rY				28
+#define	rZ				30
+
+
+/*
+ * int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen )
+ * Absorbs inlen bytes of in into a zeroed state on the stack, pads, permutes, copies crypto_hash_BYTES state bytes to out; returns 0.
+ * argument out   is passed in r24:r25
+ * argument in    is passed in r22:r23
+ * argument inlen is passed in r14:r21, only lowest 16-bits (r14-r15) are used
+ */
+.global crypto_hash		// populate.py, please update crypto_hash
+crypto_hash:			// populate.py, please update crypto_hash
+
+	//	crypto_hash only registers
+	#define	rInLen		16 //(2 regs)
+	#define	rT1				18
+	#define	rT2				19
+	#define	rT3				20
+	#define	sp				0x3D
+
+	push	r2										// save r2-r17 and r28:r29; popped in reverse order before returning
+	push	r3
+	push	r4
+	push	r5
+	push	r6
+	push	r7
+	push	r8
+	push	r9
+	push	r10
+	push	r11
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	push	r16
+	push	r17
+	push	r28
+	push	r29
+
+	//	Allocate state (25*8) + C variables (5*8)
+	in		rZ,			 			sp
+	in		rZ+1, 				sp+1
+	subi  rZ,			  		240			// 240 = 200 state bytes + 40 bytes for the 5 C lanes
+	sbci  rZ+1,					0
+	in		r0,						0x3F			// save the status register, then disable interrupts while SP is written in two halves
+	cli         				
+	out		sp+1, 				rZ+1
+	out		sp,						rZ
+	out		0x3F,					r0				// restore the saved status register
+	adiw	rZ,						41			// pointer to start of state, end of C, compensate post decrement
+
+	push	r24										// save out pointer
+	push	r25
+
+	movw	rpState,			rZ				// r24:r25 now hold the state pointer expected by KeccakF
+	movw	rY, 					r22				//y contains in pointer
+	movw	rInLen,				r14				// low 16 bits of inlen
+
+	ldi		rT3,					5*5*2			//clear state (4 bytes each iteration)
+clearStateLoop:
+	st		z+,						zero
+	st		z+,						zero
+	st		z+,						zero
+	st		z+,						zero
+	dec		rT3
+	brne	clearStateLoop
+
+	//	Full blocks
+	cpi		rInLen,				cKeccakR_SizeInBytes	// 16-bit compare of the remaining length with the rate
+	cpc		rInLen+1,			zero         
+	brcs	ch_lastblock									// less than one full block left
+
+ch_FullRateLoop:
+	ldi		rT3, 					cKeccakR_SizeInBytes/8	// 8 bytes are XORed per loop iteration
+	movw	rZ, 					rpState
+ch_XorLanesLoop:
+	ld		rT1,					Y+
+	ld		rT2, 					Z
+	eor		rT1, 					rT2
+	st		Z+, 					rT1
+	ld		rT1,					Y+
+	ld		rT2, 					Z
+	eor		rT1, 					rT2
+	st		Z+, 					rT1
+	ld		rT1,					Y+
+	ld		rT2, 					Z
+	eor		rT1, 					rT2
+	st		Z+, 					rT1
+	ld		rT1,					Y+
+	ld		rT2, 					Z
+	eor		rT1, 					rT2
+	st		Z+, 					rT1
+	ld		rT1,					Y+
+	ld		rT2, 					Z
+	eor		rT1, 					rT2
+	st		Z+, 					rT1
+	ld		rT1,					Y+
+	ld		rT2, 					Z
+	eor		rT1, 					rT2
+	st		Z+, 					rT1
+	ld		rT1,					Y+
+	ld		rT2, 					Z
+	eor		rT1, 					rT2
+	st		Z+, 					rT1
+	ld		rT1,					Y+
+	ld		rT2, 					Z
+	eor		rT1, 					rT2
+	st		Z+, 					rT1
+
+	subi	rT3, 					1
+	brne	ch_XorLanesLoop
+	
+	push	rY										// KeccakF overwrites Y ...
+	push	rY+1
+	push	rInLen								// ... and r16:r17, which it uses as part of rTempBis
+	push	rInLen+1
+	call  KeccakF
+	pop		rInLen+1
+	pop		rInLen
+	pop		rY+1
+	pop		rY
+
+	subi  rInLen,   		cKeccakR_SizeInBytes	// one block consumed
+	sbci  rInLen+1, 		0
+	cpi   rInLen,   		cKeccakR_SizeInBytes
+	cpc   rInLen+1, 		zero         
+	brcc	ch_FullRateLoop								// loop while at least one full block remains
+
+ch_lastblock:					//	XOR last incomplete block into state
+	movw	rZ, 					rpState
+
+	lsr 	rInLen								// halve the byte count: carry = odd byte pending, Z flag = no byte pairs
+	brcc	ch_xorBytes2
+	ld		rT1, 					Y+			// odd length: handle one byte first
+	ld		rT2, 					Z
+	eor		rT1, 					rT2
+	st		Z+, 					rT1
+	subi	rInLen,				0				// eor changed the flags; set the Z flag from the pair count again
+ch_xorBytes2:
+	breq	ch_Padding
+ch_xorBytes2Loop:
+	ld		rT1, 					Y+			// two bytes per iteration
+	ld		rT2, 					Z
+	eor		rT1, 					rT2
+	st		Z+, 					rT1
+	ld		rT1, 					Y+
+	ld		rT2, 					Z
+	eor		rT1, 					rT2
+	st		Z+, 					rT1
+	subi	rInLen,				1
+	brne	ch_xorBytes2Loop
+
+ch_Padding:
+	ldi		rT1,					1				// XOR 0x01 into the state byte that follows the message bytes
+	ld		rT2,					Z
+	eor		rT1,					rT2
+	st		Z,						rT1
+
+	ldi		rZ,						cKeccakR_SizeInBytes-1	// Z = rpState + rate - 1 (last byte of the rate part)
+	add		rZ,						rpState
+	mov		rZ+1,					rpState+1		// mov leaves the carry of the add above intact for adc
+	adc		rZ+1,					zero
+	ld		rT1,					Z
+	subi	rT1,					0x80			// subtracting 0x80 flips bit 7, same result as XOR 0x80
+	st		Z,						rT1
+
+	call  KeccakF
+
+	//output 
+	ldi		rT3,			 		crypto_hash_BYTES/4			; copy 4 bytes per iteration
+	movw	rY, 					rpState
+	pop		rZ+1																	; restore out pointer
+	pop		rZ
+outputLoop:
+	ld		rT1,					Y+
+	st		Z+, 					rT1
+	ld		rT1,					Y+
+	st		Z+, 					rT1
+	ld		rT1,					Y+
+	st		Z+, 					rT1
+	ld		rT1,					Y+
+	st		Z+, 					rT1
+	dec		rT3
+	brne  outputLoop
+
+
+	//	Free state and pop registers
+	ldi		rZ,						199			// rpState + 199 equals the stack pointer value read before the 240-byte allocation
+	add		rpState,			rZ
+	adc		rpState+1,		zero
+	in		r0,						0x3F
+	cli         				
+	out		sp+1, 				rpState+1
+	out		sp,						rpState
+	out		0x3F,					r0
+
+	pop		r29
+	pop		r28
+	pop		r17
+	pop		r16
+	pop		r15
+	pop		r14
+	pop		r13
+	pop		r12
+	pop		r11
+	pop		r10
+	pop		r9
+	pop		r8
+	pop		r7
+	pop		r6
+	pop		r5
+	pop		r4
+	pop		r3
+	pop		r2
+
+	// return 0
+	mov		r24, 					zero
+	mov		r25, 					zero
+
+	#undef	rInLen
+	#undef	rT1
+	#undef	rT2
+	#undef	rT3
+	#undef	sp				
+
+	ret
+
+
+#define ROT_BIT(a)	 ((a) & 7)	// index into bit_rot_jmp_table (0 none, 1-4 left, 5-7 right by 8-code bits)
+#define ROT_BYTE(a)	 ((((a)/8 + !!(((a)%8) > 4)) & 7) * 9)	// byte rotation times 9 = instruction offset of the matching rotate64_Nbyte_left routine
+// One 3-byte entry per Rho/Pi step: bit code for the lane being moved, byte-rotate offset for the lane read at the destination, destination byte offset.
+KeccakF_RhoPiConstants:
+	.BYTE	 ROT_BIT( 1), ROT_BYTE( 3),	10 * 8
+	.BYTE	 ROT_BIT( 3), ROT_BYTE( 6),	 7 * 8
+	.BYTE	 ROT_BIT( 6), ROT_BYTE(10),	11 * 8
+	.BYTE	 ROT_BIT(10), ROT_BYTE(15),	17 * 8
+	.BYTE	 ROT_BIT(15), ROT_BYTE(21),	18 * 8
+	.BYTE	 ROT_BIT(21), ROT_BYTE(28),	 3 * 8
+	.BYTE	 ROT_BIT(28), ROT_BYTE(36),	 5 * 8
+	.BYTE	 ROT_BIT(36), ROT_BYTE(45),	16 * 8
+	.BYTE	 ROT_BIT(45), ROT_BYTE(55),	 8 * 8
+	.BYTE	 ROT_BIT(55), ROT_BYTE( 2),	21 * 8 
+	.BYTE	 ROT_BIT( 2), ROT_BYTE(14),	24 * 8
+	.BYTE	 ROT_BIT(14), ROT_BYTE(27),	 4 * 8
+	.BYTE	 ROT_BIT(27), ROT_BYTE(41),	15 * 8
+	.BYTE	 ROT_BIT(41), ROT_BYTE(56),	23 * 8
+	.BYTE	 ROT_BIT(56), ROT_BYTE( 8),	19 * 8 
+	.BYTE	 ROT_BIT( 8), ROT_BYTE(25),	13 * 8
+	.BYTE	 ROT_BIT(25), ROT_BYTE(43),	12 * 8
+	.BYTE	 ROT_BIT(43), ROT_BYTE(62),	 2 * 8
+	.BYTE	 ROT_BIT(62), ROT_BYTE(18),	20 * 8
+	.BYTE	 ROT_BIT(18), ROT_BYTE(39),	14 * 8
+	.BYTE	 ROT_BIT(39), ROT_BYTE(61),	22 * 8
+	.BYTE	 ROT_BIT(61), ROT_BYTE(20),	 9 * 8
+	.BYTE	 ROT_BIT(20), ROT_BYTE(44),	 6 * 8
+	.BYTE	 ROT_BIT(44), ROT_BYTE( 1),	 1 * 8
+
+// 8 bytes per round, read one byte at a time with lpm and XORed into the matching byte of the first state lane.
+KeccakF_RoundConstants:
+	.BYTE	   0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x82, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x8a, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x00, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x8b, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x01, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x81, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x09, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x8a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x88, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x09, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x0a, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x8b, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x89, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x03, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x02, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x0a, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x0a, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x81, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0x01, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00
+	.BYTE	   0x08, 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80
+	.BYTE	   0xFF, 0		//terminator: KeccakF stops when the byte following a round's constants is 0xFF
+
+	.text
+
+
+ 
+// KeccakF: runs the rounds listed in KeccakF_RoundConstants in place on the state at rpState.
+// Not callable from C!
+// Uses r0, the rTemp/rTempBis/rTempTer/pRound registers below and X, Y, Z; the 40 bytes below rpState hold the 5 C lanes.
+// argument rpState is passed in r24:r25
+//
+KeccakF:
+
+	//	Variables used in multiple operations
+	#define	rTemp			2			// 8 regs (2-9)
+	#define	rTempBis	10		// 8 regs (10-17)
+	#define	rTempTer	18		// 4 regs (18-21)
+	#define	pRound		22		// 2 regs (22-23)
+
+	//	Initial Prepare Theta
+	#define	TCIPx				rTempTer
+	
+	movw	rZ, 					rpState				// Z points to the 5 C lanes (40 bytes below the state)
+	sbiw	rZ,						40
+	ldi		TCIPx,				5						// one iteration per column, a whole 8-byte lane at a time
+	movw	rY,						rpState
+KeccakInitialPrepTheta_Loop:
+	ld		rTemp+0,			Y+	;state[x]
+	ld		rTemp+1,			Y+
+	ld		rTemp+2,			Y+
+	ld		rTemp+3,			Y+
+	ld		rTemp+4,			Y+
+	ld		rTemp+5,			Y+
+	ld		rTemp+6,			Y+
+	ld		rTemp+7,			Y+
+                    	
+	adiw	rY,						32				; 8 bytes read + 32 skipped = same column in the next row
+	ld		r0,						Y+	;state[5+x]
+	eor		rTemp+0,			r0
+	ld		r0,						Y+
+	eor		rTemp+1,			r0
+	ld		r0,						Y+
+	eor		rTemp+2,			r0
+	ld		r0,						Y+
+	eor		rTemp+3,			r0
+	ld		r0,						Y+
+	eor		rTemp+4,			r0
+	ld		r0,						Y+
+	eor		rTemp+5,			r0
+	ld		r0,						Y+
+	eor		rTemp+6,			r0
+	ld		r0,						Y+
+	eor		rTemp+7,			r0
+                    	
+	adiw	rY,						32
+	ld		r0,						Y+	;state[10+x]
+	eor		rTemp+0,			r0
+	ld		r0,						Y+
+	eor		rTemp+1,			r0
+	ld		r0,						Y+
+	eor		rTemp+2,			r0
+	ld		r0,						Y+
+	eor		rTemp+3,			r0
+	ld		r0,						Y+
+	eor		rTemp+4,			r0
+	ld		r0,						Y+
+	eor		rTemp+5,			r0
+	ld		r0,						Y+
+	eor		rTemp+6,			r0
+	ld		r0,						Y+
+	eor		rTemp+7,			r0
+                    	
+	adiw	rY,						32
+	ld		r0,						Y+	;state[15+x]
+	eor		rTemp+0,			r0
+	ld		r0,						Y+
+	eor		rTemp+1,			r0
+	ld		r0,						Y+
+	eor		rTemp+2,			r0
+	ld		r0,						Y+
+	eor		rTemp+3,			r0
+	ld		r0,						Y+
+	eor		rTemp+4,			r0
+	ld		r0,						Y+
+	eor		rTemp+5,			r0
+	ld		r0,						Y+
+	eor		rTemp+6,			r0
+	ld		r0,						Y+
+	eor		rTemp+7,			r0
+                    	
+	adiw	rY,						32
+	ld		r0,						Y+	;state[20+x]
+	eor		rTemp+0,			r0
+	ld		r0,						Y+
+	eor		rTemp+1,			r0
+	ld		r0,						Y+
+	eor		rTemp+2,			r0
+	ld		r0,						Y+
+	eor		rTemp+3,			r0
+	ld		r0,						Y+
+	eor		rTemp+4,			r0
+	ld		r0,						Y+
+	eor		rTemp+5,			r0
+	ld		r0,						Y+
+	eor		rTemp+6,			r0
+	ld		r0,						Y+
+	eor		rTemp+7,			r0
+                    	
+	st		Z+, 					rTemp+0		; C[x] = XOR of the five lanes of column x
+	st		Z+, 					rTemp+1
+	st		Z+, 					rTemp+2
+	st		Z+, 					rTemp+3
+	st		Z+, 					rTemp+4
+	st		Z+, 					rTemp+5
+	st		Z+, 					rTemp+6
+	st		Z+, 					rTemp+7
+	
+	subi	rY,						160				; Y was advanced by 168; net effect is +8 to the next lane of row 0
+	sbc		rY+1,					zero
+
+	subi	TCIPx, 				1
+	breq	KeccakInitialPrepTheta_Done	; breq over an rjmp: the loop start is reached with rjmp rather than a conditional branch
+	rjmp	KeccakInitialPrepTheta_Loop
+KeccakInitialPrepTheta_Done:
+	#undef	TCIPx
+
+	ldi		pRound,				lo8(KeccakF_RoundConstants)	; pRound -> constants of the first round
+	ldi		pRound+1,			hi8(KeccakF_RoundConstants)
+Keccak_RoundLoop:
+
+	//	Theta
+	#define	TCplus			rX
+	#define	TCminus			rZ
+	#define	TCcoordX		rTempTer
+	#define	TCcoordY		rTempTer+1
+
+	movw	TCminus,			rpState
+	sbiw	TCminus,			1*8				// Z -> C[4], the last of the 5 lanes stored below the state
+	movw	TCplus,				rpState
+	sbiw	TCplus,				4*8				// X -> C[1]
+	movw	rY,						rpState
+
+	ldi		TCcoordX,			0x16			// bit pattern shifted right once per column; drives the pointer wrap-around and loop exit below
+KeccakTheta_Loop1:
+	ld		rTemp+0,			X+				// rTemp = C lane at X
+	ld		rTemp+1,			X+	
+	ld		rTemp+2,			X+	
+	ld		rTemp+3,			X+	
+	ld		rTemp+4,			X+	
+	ld		rTemp+5,			X+	
+	ld		rTemp+6,			X+	
+	ld		rTemp+7,			X+	
+
+	lsl		rTemp+0								// rotate rTemp left by one bit ...
+	rol		rTemp+1
+	rol		rTemp+2
+	rol		rTemp+3
+	rol		rTemp+4
+	rol		rTemp+5
+	rol		rTemp+6
+	rol		rTemp+7
+	adc		rTemp+0, 			zero			// ... re-inserting the bit shifted out of byte 7 into bit 0
+
+	ld		r0,						Z+				// rTemp ^= C lane at Z
+	eor		rTemp+0,			r0
+	ld		r0,						Z+	
+	eor		rTemp+1,			r0
+	ld		r0,						Z+	
+	eor		rTemp+2,			r0
+	ld		r0,						Z+	
+	eor		rTemp+3,			r0
+	ld		r0,						Z+	
+	eor		rTemp+4,			r0
+	ld		r0,						Z+	
+	eor		rTemp+5,			r0
+	ld		r0,						Z+	
+	eor		rTemp+6,			r0
+	ld		r0,						Z+	
+	eor		rTemp+7,			r0
+
+	ldi		TCcoordY,			5				// apply rTemp to the lane of this column in each of the 5 rows
+KeccakTheta_Loop2:
+	ld		r0,						Y
+	eor		r0, 					rTemp+0
+	st		Y+, 					r0
+	ld		r0,						Y
+	eor		r0, 					rTemp+1
+	st		Y+, 					r0
+	ld		r0,						Y
+	eor		r0, 					rTemp+2
+	st		Y+, 					r0
+	ld		r0,						Y
+	eor		r0, 					rTemp+3
+	st		Y+, 					r0
+	ld		r0,						Y
+	eor		r0, 					rTemp+4
+	st		Y+, 					r0
+	ld		r0,						Y
+	eor		r0, 					rTemp+5
+	st		Y+, 					r0
+	ld		r0,						Y
+	eor		r0, 					rTemp+6
+	st		Y+, 					r0
+	ld		r0,						Y
+	eor		r0, 					rTemp+7
+	st		Y+, 					r0
+	adiw	rY, 					32				// 8 bytes done + 32 skipped = same column in the next row
+
+	dec		TCcoordY
+	brne	KeccakTheta_Loop2
+
+	subi	rY, 					200-8			// back to row 0, next column
+	sbc		rY+1, 				zero
+
+	lsr		TCcoordX							// carry = next pattern bit, Z flag = pattern exhausted
+	brcc	1f										// carry clear: one of the C pointers has run past C[4] and must wrap
+	breq	KeccakTheta_End				// carry set and nothing left: all 5 columns done
+	rjmp	KeccakTheta_Loop1
+1:	
+	cpi		TCcoordX, 		0x0B			// 0x16 >> 1: true only after the first column
+	brne	2f
+	sbiw	TCminus, 			40				// Z ran past C[4]; wrap it back to C[0]
+	rjmp	KeccakTheta_Loop1
+2:
+	sbiw	TCplus,				40				// X ran past C[4]; wrap it back to C[0]
+	rjmp	KeccakTheta_Loop1
+
+KeccakTheta_End:	
+	#undef	TCplus
+	#undef	TCminus
+	#undef	TCcoordX
+	#undef	TCcoordY
+
+
+	//	Rho Pi
+	#define	RPpConst		rTempTer		//	2 regs
+	#define	RPindex			rTempTer+2
+	#define	RPpBitRot		rX
+	#define	RPpByteRot	pRound
+
+	sbiw	rY, 					32				// Theta leaves Y at rpState+40; step back to lane 1 (rpState+8)
+
+	ld		rTemp+0,			Y+				// rTemp = lane 1, the first lane to be moved
+	ld		rTemp+1,			Y+	
+	ld		rTemp+2,			Y+	
+	ld		rTemp+3,			Y+	
+	ld		rTemp+4,			Y+	
+	ld		rTemp+5,			Y+	
+	ld		rTemp+6,			Y+	
+	ld		rTemp+7,			Y+	
+	
+	push	pRound								// the pRound registers double as RPpByteRot in this section
+	push	pRound+1
+	ldi		RPpConst,			lo8(KeccakF_RhoPiConstants)
+	ldi		RPpConst+1,		hi8(KeccakF_RhoPiConstants)
+	ldi		RPpBitRot,		pm_lo8(bit_rot_jmp_table)		// base of the bit-rotate jump table, used with ijmp
+	ldi		RPpBitRot+1,	pm_hi8(bit_rot_jmp_table)
+	ldi		RPpByteRot, 	pm_lo8(rotate64_0byte_left)	// base of the byte-rotate routines, used with ijmp
+	ldi		RPpByteRot+1, pm_hi8(rotate64_0byte_left)
+
+KeccakRhoPi_Loop:
+	;	get rotation codes and state index
+	movw	rZ, 					RPpConst
+	lpm		r0, 					Z+		;bits
+	lpm		rTempBis,			Z+		;bytes
+	lpm		RPindex,			Z+
+	movw	RPpConst,			rZ				; remember the table position; Z is reused for the jumps below
+
+	;	do bit rotation
+	movw	rZ,						RPpBitRot
+	add		rZ,						r0				; entry number r0 of bit_rot_jmp_table (one rjmp per entry)
+	adc		rZ+1,					zero
+	ijmp													; the selected routine ends at KeccakRhoPi_RhoBitRotateDone
+
+KeccakRhoPi_RhoBitRotateDone:
+	movw	rY,						rpState
+	add		rY,						RPindex		; Y -> destination lane
+	adc		rY+1, 				zero
+	
+	movw	rZ, 					RPpByteRot
+	add		rZ,						rTempBis	; byte code is already multiplied by 9, the length of one byte-rotate routine
+	adc		rZ+1,					zero
+	ijmp													; the selected routine ends at KeccakRhoPi_PiStore
+
+KeccakRhoPi_PiStore:
+	sbiw	rY, 					8				; back to the start of the destination lane
+	st		Y+,						rTemp+0		; store the rotated lane there
+	st		Y+,						rTemp+1
+	st		Y+,						rTemp+2
+	st		Y+,						rTemp+3
+	st		Y+,						rTemp+4
+	st		Y+,						rTemp+5
+	st		Y+,						rTemp+6
+	st		Y+,						rTemp+7
+
+	movw	rTemp+0,			rTempBis+0	; the lane just read becomes the next lane to move
+	movw	rTemp+2,			rTempBis+2
+	movw	rTemp+4,			rTempBis+4
+	movw	rTemp+6,			rTempBis+6
+KeccakRhoPi_RhoDone:
+	subi	RPindex, 			8				; the last table entry has offset 1*8, which ends the loop
+	brne	KeccakRhoPi_Loop
+	pop		pRound+1							; restore the round constant pointer
+	pop		pRound
+
+	#undef	RPpConst		
+	#undef	RPindex			
+	#undef	RPpBitRot		
+	#undef	RPpByteRot	
+
+
+	//	Chi Iota prepare Theta
+	#define	CIPTa0			rTemp
+	#define	CIPTa1			rTemp+1
+	#define	CIPTa2			rTemp+2
+	#define	CIPTa3			rTemp+3
+	#define	CIPTa4			rTemp+4
+	#define	CIPTc0			rTempBis
+	#define	CIPTc1			rTempBis+1
+	#define	CIPTc2			rTempBis+2
+	#define	CIPTc3			rTempBis+3
+	#define	CIPTc4			rTempBis+4
+	#define	CIPTz				rTempBis+6
+	#define	CIPTy				rTempBis+7
+
+	movw	rY,						rpState			; Y walks the state one byte position at a time
+	movw	rX,			 			rpState			; 5 * C
+	sbiw	rX,						40
+	movw	rZ, 					pRound			; Z -> constant bytes of this round
+
+	ldi		CIPTz,				8				; 8 byte positions per lane
+KeccakChiIotaPrepareTheta_zLoop:
+	mov		CIPTc0,				zero
+	mov		CIPTc1,				zero
+	movw	CIPTc2,				CIPTc0		; clears c2 and c3 by copying the zeroed c0:c1 pair
+	mov		CIPTc4,				zero
+
+	ldi		CIPTy,				5				; 5 rows
+KeccakChiIotaPrepareTheta_yLoop:
+	ld		CIPTa0,				Y				; same byte position of the 5 lanes in this row
+	ldd		CIPTa1,				Y+8
+	ldd		CIPTa2,				Y+16
+	ldd		CIPTa3,				Y+24
+	ldd		CIPTa4,				Y+32
+	
+	;*p			= t = a0 ^ ((~a1) & a2); c0 ^= t;
+	mov		r0, 					CIPTa1
+	com		r0      			
+	and		r0,						CIPTa2
+	eor		r0,						CIPTa0
+	eor		CIPTc0,				r0
+	st		Y,  					r0
+	
+	;*(p+8)	= t = a1 ^ ((~a2) & a3); c1 ^= t;
+	mov		r0, 					CIPTa2
+	com		r0      			
+	and		r0,						CIPTa3
+	eor		r0,						CIPTa1
+	eor		CIPTc1,				r0
+	std		Y+8, 					r0
+
+	;*(p+16) = a2 ^= ((~a3) & a4); c2 ^= a2;
+	mov		r0, 					CIPTa3
+	com		r0      			
+	and		r0,						CIPTa4
+	eor		r0,						CIPTa2
+	eor		CIPTc2,				r0
+	std		Y+16,					r0
+
+	;*(p+24) = a3 ^= ((~a4) & a0); c3 ^= a3;
+	mov		r0, 					CIPTa4
+	com		r0      			
+	and		r0,						CIPTa0
+	eor		r0,						CIPTa3
+	eor		CIPTc3,				r0
+	std		Y+24,					r0
+
+	;*(p+32) = a4 ^= ((~a0) & a1); c4 ^= a4;
+	com		CIPTa0									; a0 is not needed afterwards, so it is complemented in place
+	and		CIPTa0,				CIPTa1
+	eor		CIPTa0,				CIPTa4
+	eor		CIPTc4,				CIPTa0
+	std		Y+32,					CIPTa0
+	
+	adiw	rY,						40				; next row
+	dec		CIPTy
+	brne	KeccakChiIotaPrepareTheta_yLoop
+
+	subi	rY,						200				; back to row 0
+	sbc		rY+1,					zero	
+
+	lpm		r0, 					Z+		;Round Constant
+	ld		CIPTa0,				Y				; Iota: XOR the constant byte into the first lane
+	eor		CIPTa0,				r0
+	st		Y+,						CIPTa0		; Y moves on to the next byte position
+                			
+	movw	pRound,				rZ				; park the constant pointer so Z can address the C lanes
+	movw	rZ,						rX
+	eor		CIPTc0,				r0				; include the Iota change in the column parity
+	st		Z+,						CIPTc0	
+	std		Z+7,					CIPTc1		; Z was already incremented, so +7 is 8 bytes after the c0 byte
+	std		Z+15,					CIPTc2
+	std		Z+23,					CIPTc3
+	std		Z+31,					CIPTc4
+	movw	rX,						rZ				; X -> next byte position in C
+	movw	rZ,						pRound
+
+	dec		CIPTz
+	brne	KeccakChiIotaPrepareTheta_zLoop
+
+	#undef	CIPTa0
+	#undef	CIPTa1
+	#undef	CIPTa2
+	#undef	CIPTa3
+	#undef	CIPTa4
+	#undef	CIPTc0
+	#undef	CIPTc1
+	#undef	CIPTc2
+	#undef	CIPTc3
+	#undef	CIPTc4
+	#undef	CIPTz
+	#undef	CIPTy
+
+
+	;Check for terminator
+	lpm		r0,						Z				; first byte after the constants just used
+	inc		r0											; 0xFF + 1 wraps to 0 and sets the Z flag
+	breq	Keccak_Done
+	rjmp	Keccak_RoundLoop
+Keccak_Done:
+	ret
+
+
+bit_rot_jmp_table:							; indexed by the bit-rotation code (0-7) read from KeccakF_RhoPiConstants
+	rjmp	KeccakRhoPi_RhoBitRotateDone
+	rjmp	rotate64_1bit_left
+	rjmp	rotate64_2bit_left
+	rjmp	rotate64_3bit_left
+	rjmp	rotate64_4bit_left
+	rjmp	rotate64_3bit_right
+	rjmp	rotate64_2bit_right
+	rjmp	rotate64_1bit_right
+
+rotate64_4bit_left:							; each left routine rotates rTemp by one bit and falls through to the next
+	lsl rTemp
+	rol rTemp+1
+	rol rTemp+2
+	rol rTemp+3
+	rol rTemp+4
+	rol rTemp+5
+	rol rTemp+6
+	rol rTemp+7
+	adc rTemp, r1									; r1 is the zero register: adds only the carried-out top bit
+rotate64_3bit_left:
+	lsl rTemp
+	rol rTemp+1
+	rol rTemp+2
+	rol rTemp+3
+	rol rTemp+4
+	rol rTemp+5
+	rol rTemp+6
+	rol rTemp+7
+	adc rTemp, r1
+rotate64_2bit_left:
+	lsl rTemp
+	rol rTemp+1
+	rol rTemp+2
+	rol rTemp+3
+	rol rTemp+4
+	rol rTemp+5
+	rol rTemp+6
+	rol rTemp+7
+	adc rTemp, r1
+rotate64_1bit_left:
+	lsl rTemp
+	rol rTemp+1
+	rol rTemp+2
+	rol rTemp+3
+	rol rTemp+4
+	rol rTemp+5
+	rol rTemp+6
+	rol rTemp+7
+	adc rTemp, r1
+	rjmp	KeccakRhoPi_RhoBitRotateDone
+
+rotate64_3bit_right:						; same fall-through scheme for right rotations
+	bst rTemp, 0									; remember bit 0, it must become bit 63
+	ror rTemp+7
+	ror rTemp+6
+	ror rTemp+5
+	ror rTemp+4
+	ror rTemp+3
+	ror rTemp+2
+	ror rTemp+1
+	ror rTemp
+	bld rTemp+7, 7								; overwrite the bit the first ror pulled in from the stale carry
+rotate64_2bit_right:
+	bst rTemp, 0
+	ror rTemp+7
+	ror rTemp+6
+	ror rTemp+5
+	ror rTemp+4
+	ror rTemp+3
+	ror rTemp+2
+	ror rTemp+1
+	ror rTemp
+	bld rTemp+7, 7
+rotate64_1bit_right:
+	bst rTemp, 0
+	ror rTemp+7
+	ror rTemp+6
+	ror rTemp+5
+	ror rTemp+4
+	ror rTemp+3
+	ror rTemp+2
+	ror rTemp+1
+	ror rTemp
+	bld rTemp+7, 7
+	rjmp	KeccakRhoPi_RhoBitRotateDone
+
+/*
+**	Each byte rotate routine must be 9 instructions long (ROT_BYTE multiplies the byte count by 9 to index them).
+*/
+rotate64_0byte_left:						; load the lane at Y into rTempBis; routine N puts source byte 0 into rTempBis+N
+	ld		rTempBis+0,	Y+	
+	ld		rTempBis+1,	Y+	
+	ld		rTempBis+2,	Y+	
+	ld		rTempBis+3,	Y+	
+	ld		rTempBis+4,	Y+	
+	ld		rTempBis+5,	Y+	
+	ld		rTempBis+6,	Y+	
+	ld		rTempBis+7,	Y+	
+	rjmp	KeccakRhoPi_PiStore
+
+rotate64_1byte_left:
+	ld		rTempBis+1,	Y+	
+	ld		rTempBis+2,	Y+	
+	ld		rTempBis+3,	Y+	
+	ld		rTempBis+4,	Y+	
+	ld		rTempBis+5,	Y+	
+	ld		rTempBis+6,	Y+	
+	ld		rTempBis+7,	Y+	
+	ld		rTempBis+0,	Y+	
+	rjmp	KeccakRhoPi_PiStore
+
+rotate64_2byte_left:
+	ld		rTempBis+2,	Y+	
+	ld		rTempBis+3,	Y+	
+	ld		rTempBis+4,	Y+	
+	ld		rTempBis+5,	Y+	
+	ld		rTempBis+6,	Y+	
+	ld		rTempBis+7,	Y+	
+	ld		rTempBis+0,	Y+	
+	ld		rTempBis+1,	Y+	
+	rjmp	KeccakRhoPi_PiStore
+
+rotate64_3byte_left:
+	ld		rTempBis+3,	Y+	
+	ld		rTempBis+4,	Y+	
+	ld		rTempBis+5,	Y+	
+	ld		rTempBis+6,	Y+	
+	ld		rTempBis+7,	Y+	
+	ld		rTempBis+0,	Y+	
+	ld		rTempBis+1,	Y+	
+	ld		rTempBis+2,	Y+	
+	rjmp	KeccakRhoPi_PiStore
+
+rotate64_4byte_left:
+	ld		rTempBis+4,	Y+	
+	ld		rTempBis+5,	Y+	
+	ld		rTempBis+6,	Y+	
+	ld		rTempBis+7,	Y+	
+	ld		rTempBis+0,	Y+	
+	ld		rTempBis+1,	Y+	
+	ld		rTempBis+2,	Y+	
+	ld		rTempBis+3,	Y+	
+	rjmp	KeccakRhoPi_PiStore
+
+rotate64_5byte_left:
+	ld		rTempBis+5,	Y+	
+	ld		rTempBis+6,	Y+	
+	ld		rTempBis+7,	Y+	
+	ld		rTempBis+0,	Y+	
+	ld		rTempBis+1,	Y+	
+	ld		rTempBis+2,	Y+	
+	ld		rTempBis+3,	Y+	
+	ld		rTempBis+4,	Y+	
+	rjmp	KeccakRhoPi_PiStore
+
+rotate64_6byte_left:
+	ld		rTempBis+6,	Y+	
+	ld		rTempBis+7,	Y+	
+	ld		rTempBis+0,	Y+	
+	ld		rTempBis+1,	Y+	
+	ld		rTempBis+2,	Y+	
+	ld		rTempBis+3,	Y+	
+	ld		rTempBis+4,	Y+	
+	ld		rTempBis+5,	Y+	
+	rjmp	KeccakRhoPi_PiStore
+
+rotate64_7byte_left:
+	ld		rTempBis+7,	Y+	
+	ld		rTempBis+0,	Y+	
+	ld		rTempBis+1,	Y+	
+	ld		rTempBis+2,	Y+	
+	ld		rTempBis+3,	Y+	
+	ld		rTempBis+4,	Y+	
+	ld		rTempBis+5,	Y+	
+	ld		rTempBis+6,	Y+	
+	rjmp	KeccakRhoPi_PiStore
+
+	#undef	rTemp			
+	#undef	rTempBis	
+	#undef	rTempTer
+	#undef	pRound		
+
+	#undef	rpState		
+	#undef	zero			
+	#undef	rX				
+	#undef	rY				
+	#undef	rZ				
diff --git a/c_src/KeccakF-1600-inplace-armgcc-ARMv7A-NEON.s b/c_src/KeccakF-1600-inplace-armgcc-ARMv7A-NEON.s
new file mode 100755
index 0000000..539e8ea
--- /dev/null
+++ b/c_src/KeccakF-1600-inplace-armgcc-ARMv7A-NEON.s
@@ -0,0 +1,446 @@
+@ The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+@ Michaël Peeters and Gilles Van Assche. For more information, feedback or
+@ questions, please refer to our website: http://keccak.noekeon.org/
+@
+@ Implementation by Ronny Van Keer, hereby denoted as "the implementer".
+@
+@ To the extent possible under law, the implementer has waived all copyright
+@ and related or neighboring rights to the source code in this file.
+@ http://creativecommons.org/publicdomain/zero/1.0/
+
+@ This file was created from a .asm file
+@  using the ads2gas.pl script.
+.equ DO1STROUNDING, 0
+
+  @ PRESERVE8
+.text
+
+@//  --- byte offsets, relative to sp, of the five "a" lanes kept in the stack scratch area (not offsets into the caller's state array)
+.equ Aba, 0*8    @ copy of state lane 0  (stored by KeccakF_armv7a_neon)
+.equ Aga, 1*8    @ copy of state lane 5
+.equ Aka, 2*8    @ copy of state lane 10
+.equ Ama, 3*8    @ copy of state lane 15
+.equ Asa, 4*8    @ copy of state lane 20
+
+@// --- macros
+
+.macro    KeccakThetaRhoPiChiIota argA1, argA2, argA3, argA4, argA5
+    @ First plane of a round, including Iota. argA1: sp-relative offset of the "a" lane; argA2..argA5: d registers holding the e, i, o, u lanes.
+    @Prepare Theta
+    @Ca = Aba^Aga^Aka^Ama^Asa@   expected in d5 on entry, copied to d0
+    @Ce = Abe^Age^Ake^Ame^Ase@   d12..d16 -> d1
+    @Ci = Abi^Agi^Aki^Ami^Asi@   d17..d21 -> d2
+    @Co = Abo^Ago^Ako^Amo^Aso@   d22..d26 -> d3
+    @Cu = Abu^Agu^Aku^Amu^Asu@   d27..d31 -> d4
+    @De = Ca^ROL64(Ci, 1)@       -> d8
+    @Di = Ce^ROL64(Co, 1)@       -> d9
+    @Do = Ci^ROL64(Cu, 1)@       -> d10
+    @Du = Co^ROL64(Ca, 1)@       -> d11
+    @Da = Cu^ROL64(Ce, 1)@       -> d7
+
+    veor.64 q4, q6, q7          @ d8 = d12^d14, d9 = d13^d15
+    veor.64 q5, q9, q10         @ d10 = d18^d20, d11 = d19^d21
+    veor.64 d8,  d8,   d9
+    veor.64 d10,  d10,   d11
+    veor.64 d1,  d8,   d16      @ d1 = Ce
+    veor.64 d2,  d10,   d17     @ d2 = Ci
+
+    veor.64 q4, q11, q12        @ d8 = d22^d24, d9 = d23^d25
+    veor.64 q5, q14, q15        @ d10 = d28^d30, d11 = d29^d31
+    veor.64 d8,  d8,   d9
+    veor.64 d10,  d10,   d11
+    veor.64 d3,  d8,   d26      @ d3 = Co
+
+    vadd.u64 q4, q1, q1         @ x+x = x<<1; the vsri by 63 below inserts x>>63, giving ROL64(x, 1) for Ci and Co
+    veor.64 d4,  d10,   d27     @ d4 = Cu
+    vmov.64  d0, d5             @ d0 = Ca
+    vsri.64 q4, q1, #63
+
+    vadd.u64 q5, q2, q2         @ same rotate-by-1 idiom for Cu (d4) and Ca (d5)
+    veor.64 q4, q4, q0          @ d8 = De, d9 = Di
+    vsri.64 q5, q2, #63
+    vadd.u64 d7, d1, d1
+    veor.64 \argA2, \argA2, d8
+    veor.64 q5, q5, q1          @ d10 = Do, d11 = Du
+
+    vsri.64 d7, d1, #63         @ d7 = ROL64(Ce, 1); d1 is free for reuse after this
+    vshl.u64 d1, \argA2, #44
+    veor.64 \argA3, \argA3, d9
+    veor.64 d7, d7, d4          @ d7 = Da
+
+    @Ba = argA1^Da@                       (d0)
+    @Be = ROL64((argA2^De), 44)@          (d1)
+    @Bi = ROL64((argA3^Di), 43)@          (d2)
+    @Bo = ROL64((argA4^Do), 21)@          (d3)
+    @Bu = ROL64((argA5^Du), 14)@          (d4)
+    @argA2 =   Be ^((~Bi)& Bo )@
+    @argA3 =   Bi ^((~Bo)& Bu )@
+    @argA4 =   Bo ^((~Bu)& Ba )@
+    @argA5 =   Bu ^((~Ba)& Be )@ 
+    @argA1 =   Ba ^((~Be)& Bi )@ argA1 ^= KeccakF1600RoundConstants[i+round]@
+    vsri.64 d1, \argA2, #64-44
+    vshl.u64 d2, \argA3, #43
+    vldr.64 d0, [sp, #\argA1]
+    veor.64 \argA4, \argA4, d10
+    vsri.64 d2, \argA3, #64-43
+    vshl.u64 d3, \argA4, #21
+    veor.64 \argA5, \argA5, d11
+    veor.64 d0, d0, d7
+    vsri.64 d3, \argA4, #64-21
+    vbic.64   d5, d2, d1
+    vshl.u64 d4, \argA5, #14
+    vbic.64   \argA2, d3, d2
+    vld1.64   d6, [r3]!         @ fetch this round's constant and advance r3 to the next one
+    veor.64   d5, d0
+    vsri.64 d4, \argA5, #64-14
+    veor.64   d5, d6            @ d5 = new "a" lane with the round constant applied; it also restarts the running Ca
+    vbic.64   \argA5, d1, d0
+    vbic.64   \argA3, d4, d3
+    vbic.64   \argA4, d0, d4
+    veor.64   \argA2, d1
+    vstr.64   d5, [sp, #\argA1] @ write the new "a" lane back to the stack scratch area
+    veor.64   \argA3, d2    
+    veor.64   \argA4, d3
+    veor.64   \argA5, d4
+
+    .endm
+
+.macro    KeccakThetaRhoPiChi1   argA1, argA2, argA3, argA4, argA5
+    @ Theta/Rho/Pi/Chi for one plane, no Iota. Uses Da in d7 and De, Di, Do, Du in d8..d11; argA1 is an sp-relative lane offset, argA2..argA5 are d registers.
+    @d2 = ROL64((argA1^Da), 3)@    (Bi)
+    @d3 = ROL64((argA2^De), 45)@   (Bo)
+    @d4 = ROL64((argA3^Di), 61)@   (Bu)
+    @d0 = ROL64((argA4^Do), 28)@   (Ba)
+    @d1 = ROL64((argA5^Du), 20)@   (Be)
+    @argA1 =   Ba ^((~Be)&  Bi )@ Ca ^= argA1@
+    @argA2 =   Be ^((~Bi)&  Bo )@
+    @argA3 =   Bi ^((~Bo)&  Bu )@
+    @argA4 =   Bo ^((~Bu)&  Ba )@
+    @argA5 =   Bu ^((~Ba)&  Be )@
+
+    veor.64 \argA2, \argA2, d8
+    veor.64 \argA3, \argA3, d9
+    vshl.u64  d3, \argA2, #45
+    vldr.64 d6, [sp, #\argA1]   @ the "a" lane of this plane lives on the stack
+    vshl.u64  d4, \argA3, #61
+    veor.64 \argA4, \argA4, d10
+    vsri.64  d3, \argA2, #64-45
+    veor.64 \argA5, \argA5, d11
+    vsri.64  d4, \argA3, #64-61
+    vshl.u64  d0, \argA4, #28
+    veor.64 d6, d6, d7
+    vshl.u64  d1, \argA5, #20
+    vbic.64   \argA3, d4, d3
+    vsri.64  d0, \argA4, #64-28
+    vbic.64   \argA4, d0, d4
+    vshl.u64  d2, d6, #3
+    vsri.64  d1, \argA5, #64-20
+    veor.64   \argA4, d3
+    vsri.64  d2, d6, #64-3
+    vbic.64   \argA5, d1, d0
+    vbic.64   d6, d2, d1
+    vbic.64   \argA2, d3, d2
+    veor.64   d6, d0            @ d6 = new "a" lane
+    veor.64   \argA2, d1
+    vstr.64   d6, [sp, #\argA1]
+    veor.64   \argA3, d2
+    veor.64  d5, d6             @ fold the new "a" lane into the running Ca kept in d5
+    veor.64   \argA5, d4
+
+    .endm
+
+.macro    KeccakThetaRhoPiChi2 argA1, argA2, argA3, argA4, argA5
+    @ Theta/Rho/Pi/Chi for one plane, no Iota. Uses Da in d7 and De, Di, Do, Du in d8..d11; argA1 is an sp-relative lane offset, argA2..argA5 are d registers.
+    @d4 = ROL64((argA1^Da), 18)@   (Bu)
+    @d0 = ROL64((argA2^De), 1)@    (Ba)
+    @d1 = ROL64((argA3^Di), 6)@    (Be)
+    @d2 = ROL64((argA4^Do), 25)@   (Bi)
+    @d3 = ROL64((argA5^Du), 8)@    (Bo)
+    @argA1 =   Ba ^((~Be)&  Bi )@ Ca ^= argA1@
+    @argA2 =   Be ^((~Bi)&  Bo )@
+    @argA3 =   Bi ^((~Bo)&  Bu )@
+    @argA4 =   Bo ^((~Bu)&  Ba )@
+    @argA5 =   Bu ^((~Ba)&  Be )@
+
+    veor.64 \argA3, \argA3, d9
+    veor.64 \argA4, \argA4, d10
+    vshl.u64  d1, \argA3, #6
+    vldr.64 d6, [sp, #\argA1]   @ the "a" lane of this plane lives on the stack
+    vshl.u64  d2, \argA4, #25
+    veor.64 \argA5, \argA5, d11
+    vsri.64  d1, \argA3, #64-6
+    veor.64 \argA2, \argA2, d8
+    vsri.64  d2, \argA4, #64-25
+    vext.8  d3, \argA5, \argA5, #7   @ byte-wise extract: rotate left by 8 bits
+    veor.64 d6, d6, d7
+    vbic.64  \argA3, d2, d1
+    vadd.u64  d0, \argA2, \argA2     @ x+x = x<<1; the vsri below completes the rotate left by 1
+    vbic.64   \argA4, d3, d2
+    vsri.64  d0, \argA2, #64-1
+    vshl.u64  d4, d6, #18
+    veor.64  \argA2, d1, \argA4
+    veor.64  \argA3, d0              @ argA3 temporarily holds the new "a" lane
+    vsri.64  d4, d6, #64-18
+    vstr.64   \argA3, [sp, #\argA1]
+    veor.64  d5, \argA3              @ fold the new "a" lane into the running Ca kept in d5
+    vbic.64   \argA5, d1, d0
+    vbic.64   \argA3, d4, d3
+    vbic.64   \argA4, d0, d4
+    veor.64   \argA3, d2
+    veor.64   \argA4, d3
+    veor.64   \argA5, d4
+
+    .endm
+
+.macro    KeccakThetaRhoPiChi3 argA1, argA2, argA3, argA4, argA5
+    @ Theta/Rho/Pi/Chi for one plane, no Iota. Uses Da in d7 and De, Di, Do, Du in d8..d11; argA1 is an sp-relative lane offset, argA2..argA5 are d registers.
+    @d1 = ROL64((argA1^Da), 36)@   (Be)
+    @d2 = ROL64((argA2^De), 10)@   (Bi)
+    @d3 = ROL64((argA3^Di), 15)@   (Bo)
+    @d4 = ROL64((argA4^Do), 56)@   (Bu)
+    @d0 = ROL64((argA5^Du), 27)@   (Ba)
+    @argA1 =   Ba ^((~Be)&  Bi )@ Ca ^= argA1@
+    @argA2 =   Be ^((~Bi)&  Bo )@
+    @argA3 =   Bi ^((~Bo)&  Bu )@
+    @argA4 =   Bo ^((~Bu)&  Ba )@
+    @argA5 =   Bu ^((~Ba)&  Be )@
+
+    veor.64 \argA2, \argA2, d8
+    veor.64 \argA3, \argA3, d9
+    vshl.u64  d2, \argA2, #10
+    vldr.64 d6, [sp, #\argA1]   @ the "a" lane of this plane lives on the stack
+    vshl.u64  d3, \argA3, #15
+    veor.64 \argA4, \argA4, d10
+    vsri.64  d2, \argA2, #64-10
+    vsri.64  d3, \argA3, #64-15
+    veor.64 \argA5, \argA5, d11
+    vext.8  d4, \argA4, \argA4, #1   @ byte-wise extract: rotate right by 8 bits, i.e. left by 56
+    vbic.64   \argA2, d3, d2
+    vshl.u64  d0, \argA5, #27
+    veor.64 d6, d6, d7
+    vbic.64   \argA3, d4, d3
+    vsri.64  d0, \argA5, #64-27
+    vshl.u64  d1, d6, #36
+    veor.64   \argA3, d2
+    vbic.64   \argA4, d0, d4
+    vsri.64  d1, d6, #64-36
+    
+    veor.64   \argA4, d3
+    vbic.64   d6, d2, d1
+    vbic.64   \argA5, d1, d0
+    veor.64   d6, d0            @ d6 = new "a" lane
+    veor.64   \argA2, d1
+    vstr.64   d6, [sp, #\argA1]
+    veor.64  d5, d6             @ fold the new "a" lane into the running Ca kept in d5
+    veor.64   \argA5, d4
+
+    .endm
+
+.macro    KeccakThetaRhoPiChi4 argA1, argA2, argA3, argA4, argA5
+    @ Theta/Rho/Pi/Chi for one plane, no Iota. Uses Da in d7 and De, Di, Do, Du in d8..d11; argA1 is an sp-relative lane offset, argA2..argA5 are d registers.
+    @d3 = ROL64((argA1^Da), 41)@   (Bo)
+    @d4 = ROL64((argA2^De), 2)@    (Bu)
+    @d0 = ROL64((argA3^Di), 62)@   (Ba)
+    @d1 = ROL64((argA4^Do), 55)@   (Be)
+    @d2 = ROL64((argA5^Du), 39)@   (Bi)
+    @argA1 =   Ba ^((~Be)&  Bi )@ Ca ^= argA1@
+    @argA2 =   Be ^((~Bi)&  Bo )@
+    @argA3 =   Bi ^((~Bo)&  Bu )@
+    @argA4 =   Bo ^((~Bu)&  Ba )@
+    @argA5 =   Bu ^((~Ba)&  Be )@
+
+    veor.64 \argA2, \argA2, d8
+    veor.64 \argA3, \argA3, d9
+    vshl.u64  d4, \argA2, #2
+    veor.64 \argA5, \argA5, d11
+    vshl.u64  d0, \argA3, #62
+    vldr.64 d6, [sp, #\argA1]   @ the "a" lane of this plane lives on the stack
+    vsri.64  d4, \argA2, #64-2
+    veor.64 \argA4, \argA4, d10
+    vsri.64  d0, \argA3, #64-62
+
+    vshl.u64  d1, \argA4, #55
+    veor.64 d6, d6, d7
+    vshl.u64  d2, \argA5, #39
+    vsri.64  d1, \argA4, #64-55
+    vbic.64  \argA4, d0, d4
+    vsri.64  d2, \argA5, #64-39
+    vbic.64  \argA2, d1, d0     @ argA2 serves as a scratch register until its final value is formed below
+    vshl.u64  d3, d6, #41
+    veor.64  \argA5, d4, \argA2
+    vbic.64  \argA2, d2, d1
+    vsri.64  d3, d6, #64-41
+    veor.64  d6, d0, \argA2     @ d6 = new "a" lane
+    
+    vbic.64 \argA2, d3, d2
+    vbic.64 \argA3, d4, d3
+    veor.64 \argA2, d1
+    vstr.64 d6, [sp, #\argA1]
+    veor.64 d5, d6              @ fold the new "a" lane into the running Ca kept in d5
+    veor.64 \argA3, d2
+    veor.64 \argA4, d3
+
+    .endm
+
+@// --- constants
+
+
+  .align 8
+    .ltorg
+KeccakF1600RoundConstantsWithTerminator:    @ 24 round constants, one 64-bit word per round, followed by an all-ones end marker
+    .quad  0x0000000000000001
+    .quad  0x0000000000008082
+    .quad  0x800000000000808a
+    .quad  0x8000000080008000
+    .quad  0x000000000000808b
+    .quad  0x0000000080000001
+    .quad  0x8000000080008081
+    .quad  0x8000000000008009
+    .quad  0x000000000000008a
+    .quad  0x0000000000000088
+    .quad  0x0000000080008009
+    .quad  0x000000008000000a
+    .quad  0x000000008000808b
+    .quad  0x800000000000008b
+    .quad  0x8000000000008089
+    .quad  0x8000000000008003
+    .quad  0x8000000000008002
+    .quad  0x8000000000000080
+    .quad  0x000000000000800a
+    .quad  0x800000008000000a
+    .quad  0x8000000080008081
+    .quad  0x8000000000008080
+    .quad  0x0000000080000001
+    .quad  0x8000000080008008
+    .quad   0xFFFFFFFFFFFFFFFF  @//terminator: roundLoop stops when the low word it loads from here is 0xFFFFFFFF
+
+  .align 8
+
+@// --- code 
+
+@not callable from C!
+.global   KeccakF_armv7a_neon_asm
+KeccakF_armv7a_neon_asm:  @ runs all rounds on the register/stack layout prepared by KeccakF_armv7a_neon; uses r0 and r3
+    @ On entry: the five "a" lanes are at [sp, #Aba..#Asa], the other 20 lanes are in d12..d31, and d5 holds Ca.
+      adr  r3, KeccakF1600RoundConstantsWithTerminator
+roundLoop:
+    @ Each pass performs four rounds (five plane macros per round); the register arguments change per round because lanes are updated in place.
+    KeccakThetaRhoPiChiIota  Aba, d13, d19, d25, d31
+    KeccakThetaRhoPiChi1    Aka, d15, d21, d22, d28
+    KeccakThetaRhoPiChi2    Asa, d12, d18, d24, d30
+    KeccakThetaRhoPiChi3    Aga, d14, d20, d26, d27
+    KeccakThetaRhoPiChi4    Ama, d16, d17, d23, d29
+
+    KeccakThetaRhoPiChiIota  Aba, d15, d18, d26, d29
+    KeccakThetaRhoPiChi1    Asa, d14, d17, d25, d28
+    KeccakThetaRhoPiChi2    Ama, d13, d21, d24, d27
+    KeccakThetaRhoPiChi3    Aka, d12, d20, d23, d31
+    KeccakThetaRhoPiChi4    Aga, d16, d19, d22, d30
+
+    KeccakThetaRhoPiChiIota Aba, d14, d21, d23, d30
+    KeccakThetaRhoPiChi1    Ama, d12, d19, d26, d28
+    KeccakThetaRhoPiChi2    Aga, d15, d17, d24, d31
+    KeccakThetaRhoPiChi3    Asa, d13, d20, d22, d29
+    KeccakThetaRhoPiChi4    Aka, d16, d18, d25, d27
+
+    KeccakThetaRhoPiChiIota Aba, d12, d17, d22, d27
+    KeccakThetaRhoPiChi1    Aga, d13, d18, d23, d28
+    KeccakThetaRhoPiChi2    Aka, d14, d19, d24, d29
+    ldr    r0, [r3]             @ low 32 bits of the next table entry (r3 was advanced by the Iota macro)
+    KeccakThetaRhoPiChi3    Ama, d15, d20, d25, d30
+    cmp    r0, #0xFFFFFFFF      @ terminator reached? the flags stay valid across the NEON-only macro that follows
+    KeccakThetaRhoPiChi4    Asa, d16, d21, d26, d31
+
+    bne    roundLoop
+    bx    lr
+
+  @
+  .align 8
+
+@//void KeccakF_armv7a_neon( tKeccakLane * state )  callable from C; state pointer arrives in r0
+.global   KeccakF_armv7a_neon
+KeccakF_armv7a_neon:  @
+    @ Loads the 25 lanes, runs KeccakF_armv7a_neon_asm and stores the lanes back. Uses r1 and r2 as scratch.
+    vpush  {q4-q7}              @ preserve d8..d15, which the round macros overwrite
+    sub    sp,sp, #5*8          @ scratch area for the five "a" lanes (offsets Aba..Asa)
+
+    vldr.64  d0,  [r0, #0*8]    @ lanes 0, 5, 10, 15, 20 go to d0..d4 and from there to the stack
+    vldr.64  d12, [r0, #1*8]
+    vldr.64  d17, [r0, #2*8]
+    vldr.64  d22, [r0, #3*8]
+    vldr.64  d27, [r0, #4*8]
+
+    vldr.64  d1,  [r0, #5*8]
+    vldr.64  d13, [r0, #6*8]
+    vldr.64  d18, [r0, #7*8]
+    vldr.64  d23, [r0, #8*8]
+    vldr.64  d28, [r0, #9*8]
+
+    vldr.64  d2,  [r0, #10*8]
+    vldr.64  d14, [r0, #11*8]
+    vldr.64  d19, [r0, #12*8]
+    vldr.64  d24, [r0, #13*8]
+    vldr.64  d29, [r0, #14*8]
+
+    vldr.64  d3,  [r0, #15*8]
+    vldr.64  d15, [r0, #16*8]
+    vldr.64  d20, [r0, #17*8]
+    vldr.64  d25, [r0, #18*8]
+    vldr.64  d30, [r0, #19*8]
+
+    vldr.64  d4,  [r0, #20*8]
+    vldr.64  d16, [r0, #21*8]
+    vldr.64  d21, [r0, #22*8]
+    vldr.64  d26, [r0, #23*8]
+    vldr.64  d31, [r0, #24*8]
+
+    vstr.64  d0, [sp, #Aba]
+    mov      r2, lr             @ keep the return address in r2, because bl below overwrites lr
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1          @ d0 ^= d2, d1 ^= d3 (each lane was stored, or is stored below from another register)
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    mov      r1, r0             @ keep the state pointer in r1, because the round loop overwrites r0
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4         @ d5 = XOR of the five "a" lanes (Ca), as the first round expects
+
+    bl      KeccakF_armv7a_neon_asm
+
+    vpop.64  { d0- d4 }         @ reload the five "a" lanes and release the scratch area
+
+    vstr.64  d0,  [r1, #0*8]
+    vstr.64  d12, [r1, #1*8]
+    vstr.64  d17, [r1, #2*8]
+    vstr.64  d22, [r1, #3*8]
+    vstr.64  d27, [r1, #4*8]
+
+    vstr.64  d1,  [r1, #5*8]
+    vstr.64  d13, [r1, #6*8]
+    vstr.64  d18, [r1, #7*8]
+    vstr.64  d23, [r1, #8*8]
+    vstr.64  d28, [r1, #9*8]
+
+    vstr.64  d2,  [r1, #10*8]
+    vstr.64  d14, [r1, #11*8]
+    vstr.64  d19, [r1, #12*8]
+    vstr.64  d24, [r1, #13*8]
+    vstr.64  d29, [r1, #14*8]
+
+    vstr.64  d3,  [r1, #15*8]
+    vstr.64  d15, [r1, #16*8]
+    vstr.64  d20, [r1, #17*8]
+    vstr.64  d25, [r1, #18*8]
+    vstr.64  d30, [r1, #19*8]
+
+    vstr.64  d4,  [r1, #20*8]
+    vstr.64  d16, [r1, #21*8]
+    vstr.64  d21, [r1, #22*8]
+    vstr.64  d26, [r1, #23*8]
+    vstr.64  d31, [r1, #24*8]
+
+    vpop  {q4-q7}
+    bx    r2                    @ return through the address saved from lr
+
+  @
+
diff --git a/c_src/KeccakF-1600-int-set.h b/c_src/KeccakF-1600-int-set.h
new file mode 100755
index 0000000..0ed1d80
--- /dev/null
+++ b/c_src/KeccakF-1600-int-set.h
@@ -0,0 +1,6 @@
+#define ProvideFast576   /* enables the declaration of KeccakAbsorb576bits in KeccakF-1600-interface.h */
+#define ProvideFast832   /* enables KeccakAbsorb832bits */
+#define ProvideFast1024  /* enables KeccakAbsorb1024bits and KeccakExtract1024bits */
+#define ProvideFast1088  /* enables KeccakAbsorb1088bits */
+#define ProvideFast1152  /* enables KeccakAbsorb1152bits */
+#define ProvideFast1344  /* enables KeccakAbsorb1344bits */
diff --git a/c_src/KeccakF-1600-interface.h b/c_src/KeccakF-1600-interface.h
new file mode 100755
index 0000000..22185a4
--- /dev/null
+++ b/c_src/KeccakF-1600-interface.h
@@ -0,0 +1,46 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakPermutationInterface_h_
+#define _KeccakPermutationInterface_h_
+
+#include "KeccakF-1600-int-set.h"
+/* state: buffer holding the permutation state (the opt32 implementation clears 200 bytes of it); data: input or output bytes; laneCount: number of 64-bit lanes. */
+void KeccakInitialize( void );                      /* one-time global set-up */
+void KeccakInitializeState(unsigned char *state);   /* put state into its initial (empty) configuration */
+void KeccakPermutation(unsigned char *state);       /* apply the permutation to state in place */
+#ifdef ProvideFast576
+void KeccakAbsorb576bits(unsigned char *state, const unsigned char *data);   /* fixed-size variant: 576 bits = 9 lanes */
+#endif
+#ifdef ProvideFast832
+void KeccakAbsorb832bits(unsigned char *state, const unsigned char *data);   /* 832 bits = 13 lanes */
+#endif
+#ifdef ProvideFast1024
+void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data);  /* 1024 bits = 16 lanes */
+#endif
+#ifdef ProvideFast1088
+void KeccakAbsorb1088bits(unsigned char *state, const unsigned char *data);  /* 1088 bits = 17 lanes */
+#endif
+#ifdef ProvideFast1152
+void KeccakAbsorb1152bits(unsigned char *state, const unsigned char *data);  /* 1152 bits = 18 lanes */
+#endif
+#ifdef ProvideFast1344
+void KeccakAbsorb1344bits(unsigned char *state, const unsigned char *data);  /* 1344 bits = 21 lanes */
+#endif
+void KeccakAbsorb(unsigned char *state, const unsigned char *data, unsigned int laneCount);   /* generic variant: laneCount lanes of data */
+#ifdef ProvideFast1024
+void KeccakExtract1024bits(const unsigned char *state, unsigned char *data); /* copy out the first 16 lanes */
+#endif
+void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount);  /* copy out the first laneCount lanes */
+
+#endif
diff --git a/c_src/KeccakF-1600-opt32-settings.h b/c_src/KeccakF-1600-opt32-settings.h
new file mode 100755
index 0000000..b135918
--- /dev/null
+++ b/c_src/KeccakF-1600-opt32-settings.h
@@ -0,0 +1,4 @@
+#define Unrolling 2 /* KeccakF-1600-opt32.c rejects any other value when UseSchedule is 3 */
+//#define UseBebigokimisa
+//#define UseInterleaveTables
+#define UseSchedule 3 /* KeccakF-1600-opt32.c rejects UseBebigokimisa together with this schedule */
diff --git a/c_src/KeccakF-1600-opt32.c b/c_src/KeccakF-1600-opt32.c
new file mode 100755
index 0000000..aded3a9
--- /dev/null
+++ b/c_src/KeccakF-1600-opt32.c
@@ -0,0 +1,524 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include "brg_endian.h"
+#include "KeccakF-1600-opt32-settings.h"
+#include "KeccakF-1600-interface.h"
+
+typedef unsigned char UINT8;
+typedef unsigned short UINT16;             // NOTE(review): used as a 16-bit type -- confirm for the target
+typedef unsigned int UINT32;               // NOTE(review): used as a 32-bit type -- confirm that int is 32 bits on the target
+typedef unsigned long long int UINT64;
+
+#ifdef UseInterleaveTables
+int interleaveTablesBuilt = 0;      // becomes 1 once both tables below have been filled
+UINT16 interleaveTable[65536];      // 16-bit value -> even-position bits gathered in bits 0..7, odd-position bits in bits 8..15
+UINT16 deinterleaveTable[65536];    // inverse mapping of interleaveTable
+// Fills interleaveTable and deinterleaveTable; calls after the first one do nothing.
+void buildInterleaveTables()
+{
+    UINT32 i, j;
+    UINT16 x;
+
+    if (!interleaveTablesBuilt) {
+        for(i=0; i<65536; i++) {
+            x = 0;
+            for(j=0; j<16; j++) {
+                if (i & (1 << j))
+                    x |= (1 << (j/2 + 8*(j%2)));    // bit j moves to bit j/2 when j is even, to bit 8 + j/2 when j is odd
+            }
+            interleaveTable[i] = x;
+            deinterleaveTable[x] = (UINT16)i;       // record the reverse direction at the same time
+        }
+        interleaveTablesBuilt = 1;
+    }
+}
+
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+// Both macros handle the j-th byte pair (j = 0..3) of a lane and use the locals i0..i3 / d0..d3 of the function that expands them.
+#define xor2bytesIntoInterleavedWords(even, odd, source, j) \
+    i##j = interleaveTable[((const UINT16*)source)[j]]; /* NOTE(review): reads source through a UINT16 pointer -- confirm callers pass suitably aligned data */ \
+    ((UINT8*)even)[j] ^= i##j & 0xFF; \
+    ((UINT8*)odd)[j] ^= i##j >> 8;
+// Inverse direction: byte j of even and of odd are looked up together and written out as two bytes of dest.
+#define setInterleavedWordsInto2bytes(dest, even, odd, j) \
+    d##j = deinterleaveTable[((even >> (j*8)) & 0xFF) ^ (((odd >> (j*8)) & 0xFF) << 8)]; \
+    ((UINT16*)dest)[j] = d##j;
+
+#else // (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
+// Same two macros without pointer casts: bytes are combined and split with explicit shifts.
+#define xor2bytesIntoInterleavedWords(even, odd, source, j) \
+    i##j = interleaveTable[source[2*j] ^ ((UINT16)source[2*j+1] << 8)]; \
+    *even ^= (i##j & 0xFF) << (j*8); \
+    *odd ^= ((i##j >> 8) & 0xFF) << (j*8);
+
+#define setInterleavedWordsInto2bytes(dest, even, odd, j) \
+    d##j = deinterleaveTable[((even >> (j*8)) & 0xFF) ^ (((odd >> (j*8)) & 0xFF) << 8)]; \
+    dest[2*j] = d##j & 0xFF; \
+    dest[2*j+1] = d##j >> 8;
+
+#endif // Endianness
+
+void xor8bytesIntoInterleavedWords(UINT32 *even, UINT32 *odd, const UINT8* source) // XORs the 8 bytes at source into *even / *odd, two bytes per table lookup
+{
+    UINT16 i0, i1, i2, i3; // temporaries named by the i##j token pasting inside the macro
+
+    xor2bytesIntoInterleavedWords(even, odd, source, 0)
+    xor2bytesIntoInterleavedWords(even, odd, source, 1)
+    xor2bytesIntoInterleavedWords(even, odd, source, 2)
+    xor2bytesIntoInterleavedWords(even, odd, source, 3)
+}
+
+#define xorLanesIntoState(laneCount, state, input) \
+    { /* XOR laneCount lanes of 8 input bytes each into state; lane i uses words 2*i (even) and 2*i+1 (odd) */ \
+        int i; \
+        for(i=0; i<(laneCount); i++) \
+            xor8bytesIntoInterleavedWords(state+i*2, state+i*2+1, input+i*8); \
+    }
+
+void setInterleavedWordsInto8bytes(UINT8* dest, UINT32 even, UINT32 odd) // writes the lane held as (even, odd) to dest[0..7], two bytes per table lookup
+{
+    UINT16 d0, d1, d2, d3; // temporaries named by the d##j token pasting inside the macro
+
+    setInterleavedWordsInto2bytes(dest, even, odd, 0)
+    setInterleavedWordsInto2bytes(dest, even, odd, 1)
+    setInterleavedWordsInto2bytes(dest, even, odd, 2)
+    setInterleavedWordsInto2bytes(dest, even, odd, 3)
+}
+
+#define extractLanes(laneCount, state, data) \
+    { /* de-interleave the first laneCount lanes of state into data, 8 bytes per lane */ \
+        int i; \
+        for(i=0; i<(laneCount); i++) \
+            setInterleavedWordsInto8bytes(data+i*8, ((UINT32*)state)[i*2], ((UINT32*)state)[i*2+1]); \
+    }
+
+#else // No interleaving tables
+
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+
+// Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
+#define xorInterleavedLE(rateInLanes, state, input) \
+	{ /* XOR rateInLanes 64-bit lanes from input into the interleaved state (state: UINT32 words, two per lane) */ \
+		const UINT8 * pI = (const UINT8 *)(input); /* byte pointer: input needs no particular alignment */ \
+		UINT32 * pS = state; \
+		UINT32 t, x0, x1; \
+	    int i; \
+	    for (i = (rateInLanes)-1; i >= 0; --i) \
+		{ \
+			memcpy(&x0, pI, sizeof(x0)); pI += sizeof(x0); /* low half of the lane, read in native (little-endian) order */ \
+			t = (x0 ^ (x0 >>  1)) & 0x22222222UL;  x0 = x0 ^ t ^ (t <<  1); \
+			t = (x0 ^ (x0 >>  2)) & 0x0C0C0C0CUL;  x0 = x0 ^ t ^ (t <<  2); \
+			t = (x0 ^ (x0 >>  4)) & 0x00F000F0UL;  x0 = x0 ^ t ^ (t <<  4); \
+			t = (x0 ^ (x0 >>  8)) & 0x0000FF00UL;  x0 = x0 ^ t ^ (t <<  8); \
+			memcpy(&x1, pI, sizeof(x1)); pI += sizeof(x1); /* high half of the lane */ \
+			t = (x1 ^ (x1 >>  1)) & 0x22222222UL;  x1 = x1 ^ t ^ (t <<  1); \
+			t = (x1 ^ (x1 >>  2)) & 0x0C0C0C0CUL;  x1 = x1 ^ t ^ (t <<  2); \
+			t = (x1 ^ (x1 >>  4)) & 0x00F000F0UL;  x1 = x1 ^ t ^ (t <<  4); \
+			t = (x1 ^ (x1 >>  8)) & 0x0000FF00UL;  x1 = x1 ^ t ^ (t <<  8); \
+			*(pS++) ^= (UINT16)x0 | (x1 << 16); /* first state word: low 16 bits of x0 and of x1 after the swaps */ \
+			*(pS++) ^= (x0 >> 16) | (x1 & 0xFFFF0000); /* second state word: high 16 bits of x0 and of x1 */ \
+		} \
+	}
+
+#define xorLanesIntoState(laneCount, state, input) \
+    xorInterleavedLE(laneCount, state, input) /* little-endian hosts without tables: forward to the word-wise version */
+
+#else // (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
+
+// Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
+UINT64 toInterleaving(UINT64 x)
+{
+   // Five swap steps with shifts 1, 2, 4, 8 and 16; swapMask[k] is the mask that goes with shift 2^k.
+   static const UINT64 swapMask[5] = { 0x2222222222222222ULL, 0x0C0C0C0C0C0C0C0CULL, 0x00F000F000F000F0ULL, 0x0000FF000000FF00ULL, 0x00000000FFFF0000ULL };
+   unsigned int step;
+   for (step = 0; step < 5; step++) {
+       const unsigned int shift = 1u << step;
+       const UINT64 delta = (x ^ (x >> shift)) & swapMask[step];   // bits that differ between the two groups being exchanged
+       x ^= delta ^ (delta << shift);                              // flip them on both sides, which swaps the groups
+   }
+   return x;
+}
+
+void xor8bytesIntoInterleavedWords(UINT32* evenAndOdd, const UINT8* source)
+{
+    // XORs the 8 bytes at source, taken as a little-endian 64-bit lane, into the
+    // interleaved word pair evenAndOdd[0] / evenAndOdd[1].
+    UINT64 lane = 0;
+    UINT64 interleaved;
+    int byteIndex;
+
+    // Most significant byte first, so that source[0] ends up in bits 0..7.
+    for(byteIndex = 7; byteIndex >= 0; byteIndex--)
+        lane = (lane << 8) | source[byteIndex];
+
+    interleaved = toInterleaving(lane);
+    evenAndOdd[0] ^= (UINT32)interleaved;
+    evenAndOdd[1] ^= (UINT32)(interleaved >> 32);
+}
+
+#define xorLanesIntoState(laneCount, state, input) \
+    { /* big-endian hosts: XOR laneCount lanes of 8 input bytes each into state, two UINT32 words per lane */ \
+        int i; \
+        for(i=0; i<(laneCount); i++) \
+            xor8bytesIntoInterleavedWords(state+i*2, input+i*8); \
+    }
+
+#endif // Endianness
+
+// Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
+UINT64 fromInterleaving(UINT64 x)
+{
+   // Same five swap steps as toInterleaving, applied in the opposite order (shifts 16, 8, 4, 2, 1); swapMask[k] goes with shift 2^k.
+   static const UINT64 swapMask[5] = { 0x2222222222222222ULL, 0x0C0C0C0C0C0C0C0CULL, 0x00F000F000F000F0ULL, 0x0000FF000000FF00ULL, 0x00000000FFFF0000ULL };
+   unsigned int step;
+   for (step = 5; step-- > 0; ) {
+       const unsigned int shift = 1u << step;
+       const UINT64 delta = (x ^ (x >> shift)) & swapMask[step];   // bits that differ between the two groups being exchanged
+       x ^= delta ^ (delta << shift);                              // flip them on both sides, which swaps the groups
+   }
+   return x;
+}
+
+void setInterleavedWordsInto8bytes(UINT8* dest, UINT32* evenAndOdd)
+{
+    // Writes the lane stored as evenAndOdd[0] (low word) and evenAndOdd[1] (high word) to dest[0..7] in little-endian byte order; dest needs no particular alignment.
+    UINT64 evenAndOddWord = (UINT64)evenAndOdd[0] ^ ((UINT64)evenAndOdd[1] << 32);
+    UINT64 destWord = fromInterleaving(evenAndOddWord);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    memcpy(dest, &destWord, sizeof(destWord)); // native order is already little-endian; memcpy avoids a misaligned 64-bit store
+#else // (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
+    dest[0] = destWord & 0xFF;
+    dest[1] = (destWord >> 8) & 0xFF;
+    dest[2] = (destWord >> 16) & 0xFF;
+    dest[3] = (destWord >> 24) & 0xFF;
+    dest[4] = (destWord >> 32) & 0xFF;
+    dest[5] = (destWord >> 40) & 0xFF;
+    dest[6] = (destWord >> 48) & 0xFF;
+    dest[7] = (destWord >> 56) & 0xFF;
+#endif // Endianness
+}
+
+#define extractLanes(laneCount, state, data) \
+    { /* de-interleave the first laneCount lanes of state into data, 8 bytes per lane */ \
+        int i; \
+        for(i=0; i<(laneCount); i++) \
+            setInterleavedWordsInto8bytes(data+i*8, (UINT32*)state+i*2); \
+    }
+
+#endif // With or without interleaving tables
+
+#if defined(_MSC_VER) // ROL32(a, offset): rotate the 32-bit value a left by offset bits
+#define ROL32(a, offset) _rotl(a, offset)
+#elif (defined (__arm__) && defined(__ARMCC_VERSION))
+#define ROL32(a, offset) __ror(a, 32-(offset))
+#else // portable fallback, valid for 0 < offset < 32; a is parenthesised so that compound expressions can be passed safely
+#define ROL32(a, offset) ((((UINT32)(a)) << (offset)) ^ (((UINT32)(a)) >> (32-(offset))))
+#endif
+
+#include "KeccakF-1600-unrolling.macros"
+#include "KeccakF-1600-32.macros"
+
+#if (UseSchedule == 3)
+
+#ifdef UseBebigokimisa
+#error "No lane complementing with schedule 3."
+#endif
+
+#if (Unrolling != 2)
+#error "Only unrolling 2 is supported by schedule 3."
+#endif
+
+void KeccakPermutationOnWords(UINT32 *state) // schedule 3 variant: the body consists of the 'rounds' macro only
+{
+    rounds // macro from the included .macros files (not visible here); presumably expands to the permutation rounds acting on state -- confirm
+}
+
+void KeccakPermutationOnWordsAfterXoring(UINT32 *state, const UINT8 *input, unsigned int laneCount) // schedule 3 variant
+{
+    xorLanesIntoState(laneCount, state, input) // XOR laneCount lanes (8 bytes each) of input into state
+    rounds // then the 'rounds' macro from the included .macros files
+}
+
+#ifdef ProvideFast576 // fixed-size schedule 3 variants: same as KeccakPermutationOnWordsAfterXoring with a constant lane count
+void KeccakPermutationOnWordsAfterXoring576bits(UINT32 *state, const UINT8 *input)
+{
+    xorLanesIntoState(9, state, input) // 576 bits / 64 = 9 lanes
+    rounds
+}
+#endif
+
+#ifdef ProvideFast832
+void KeccakPermutationOnWordsAfterXoring832bits(UINT32 *state, const UINT8 *input)
+{
+    xorLanesIntoState(13, state, input) // 832 bits / 64 = 13 lanes
+    rounds
+}
+#endif
+
+#ifdef ProvideFast1024
+void KeccakPermutationOnWordsAfterXoring1024bits(UINT32 *state, const UINT8 *input)
+{
+    xorLanesIntoState(16, state, input) // 1024 bits / 64 = 16 lanes
+    rounds
+}
+#endif
+
+#ifdef ProvideFast1088
+void KeccakPermutationOnWordsAfterXoring1088bits(UINT32 *state, const UINT8 *input)
+{
+    xorLanesIntoState(17, state, input) // 1088 bits / 64 = 17 lanes
+    rounds
+}
+#endif
+
+#ifdef ProvideFast1152
+void KeccakPermutationOnWordsAfterXoring1152bits(UINT32 *state, const UINT8 *input)
+{
+    xorLanesIntoState(18, state, input) // 1152 bits / 64 = 18 lanes
+    rounds
+}
+#endif
+
+#ifdef ProvideFast1344
+void KeccakPermutationOnWordsAfterXoring1344bits(UINT32 *state, const UINT8 *input)
+{
+    xorLanesIntoState(21, state, input) // 1344 bits / 64 = 21 lanes
+    rounds
+}
+#endif
+
+#else // (Schedule != 3)
+
+void KeccakPermutationOnWords(UINT32 *state) // variant for schedules other than 3
+{
+    declareABCDE // macro from the included .macros files (not visible here); presumably declares the working variables -- confirm
+#if (Unrolling != 24)
+    unsigned int i; // declared only when the rounds are not fully unrolled
+#endif
+
+    copyFromState(A, state) // macro from the included .macros files
+    rounds // macro from the included .macros files
+}
+
+void KeccakPermutationOnWordsAfterXoring(UINT32 *state, const UINT8 *input, unsigned int laneCount) // variant for schedules other than 3
+{
+    declareABCDE
+    unsigned int i;
+
+    xorLanesIntoState(laneCount, state, input) // XOR laneCount lanes (8 bytes each) of input into state first
+    copyFromState(A, state) // then the same macro sequence as KeccakPermutationOnWords
+    rounds
+}
+
+#ifdef ProvideFast576 // fixed-size variants for schedules other than 3: same as KeccakPermutationOnWordsAfterXoring with a constant lane count
+void KeccakPermutationOnWordsAfterXoring576bits(UINT32 *state, const UINT8 *input)
+{
+    declareABCDE
+    unsigned int i;
+
+    xorLanesIntoState(9, state, input) // 576 bits / 64 = 9 lanes
+    copyFromState(A, state)
+    rounds
+}
+#endif
+
+#ifdef ProvideFast832
+void KeccakPermutationOnWordsAfterXoring832bits(UINT32 *state, const UINT8 *input)
+{
+    declareABCDE
+    unsigned int i;
+
+    xorLanesIntoState(13, state, input) // 832 bits / 64 = 13 lanes
+    copyFromState(A, state)
+    rounds
+}
+#endif
+
+#ifdef ProvideFast1024
+void KeccakPermutationOnWordsAfterXoring1024bits(UINT32 *state, const UINT8 *input)
+{
+    declareABCDE
+    unsigned int i;
+
+    xorLanesIntoState(16, state, input) // 1024 bits / 64 = 16 lanes
+    copyFromState(A, state)
+    rounds
+}
+#endif
+
+#ifdef ProvideFast1088
+void KeccakPermutationOnWordsAfterXoring1088bits(UINT32 *state, const UINT8 *input)
+{
+    declareABCDE
+    unsigned int i;
+
+    xorLanesIntoState(17, state, input) // 1088 bits / 64 = 17 lanes
+    copyFromState(A, state)
+    rounds
+}
+#endif
+
+#ifdef ProvideFast1152
+void KeccakPermutationOnWordsAfterXoring1152bits(UINT32 *state, const UINT8 *input)
+{
+    declareABCDE
+    unsigned int i;
+
+    xorLanesIntoState(18, state, input) // 1152 bits / 64 = 18 lanes
+    copyFromState(A, state)
+    rounds
+}
+#endif
+
+#ifdef ProvideFast1344
+void KeccakPermutationOnWordsAfterXoring1344bits(UINT32 *state, const UINT8 *input)
+{
+    declareABCDE
+    unsigned int i;
+
+    xorLanesIntoState(21, state, input) // 1344 bits / 64 = 21 lanes
+    copyFromState(A, state)
+    rounds
+}
+#endif
+
+#endif
+
+void KeccakInitialize(void) // one-time global set-up; matches the (void) prototype in KeccakF-1600-interface.h
+{
+#ifdef UseInterleaveTables
+    buildInterleaveTables(); // the only work done here: fill the lookup tables when they are enabled
+#endif
+}
+
+void KeccakInitializeState(unsigned char *state) // clears the state; state must point to at least 200 writable bytes
+{
+    memset(state, 0, 200); // 200 bytes = 25 lanes of 8 bytes
+#ifdef UseBebigokimisa
+    ((UINT32*)state)[ 2] = ~(UINT32)0; // with UseBebigokimisa, lanes 1, 2, 8, 12, 17 and 20 start as all-ones
+    ((UINT32*)state)[ 3] = ~(UINT32)0; // (each lane occupies two consecutive UINT32 words)
+    ((UINT32*)state)[ 4] = ~(UINT32)0;
+    ((UINT32*)state)[ 5] = ~(UINT32)0;
+    ((UINT32*)state)[16] = ~(UINT32)0;
+    ((UINT32*)state)[17] = ~(UINT32)0;
+    ((UINT32*)state)[24] = ~(UINT32)0;
+    ((UINT32*)state)[25] = ~(UINT32)0;
+    ((UINT32*)state)[34] = ~(UINT32)0;
+    ((UINT32*)state)[35] = ~(UINT32)0;
+    ((UINT32*)state)[40] = ~(UINT32)0;
+    ((UINT32*)state)[41] = ~(UINT32)0;
+#endif
+}
+
+void KeccakPermutation(unsigned char *state) // byte-pointer entry point declared in KeccakF-1600-interface.h
+{
+    // We assume the state is always stored as interleaved 32-bit words
+    KeccakPermutationOnWords((UINT32*)state); // NOTE(review): the cast requires state to be aligned for UINT32 access -- confirm callers guarantee it
+}
+
+#ifdef ProvideFast576 // byte-pointer wrappers: each forwards to the matching KeccakPermutationOnWordsAfterXoring*bits function
+void KeccakAbsorb576bits(unsigned char *state, const unsigned char *data)
+{
+    KeccakPermutationOnWordsAfterXoring576bits((UINT32*)state, data);
+}
+#endif
+
+#ifdef ProvideFast832
+void KeccakAbsorb832bits(unsigned char *state, const unsigned char *data)
+{
+    KeccakPermutationOnWordsAfterXoring832bits((UINT32*)state, data);
+}
+#endif
+
+#ifdef ProvideFast1024
+void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data)
+{
+    KeccakPermutationOnWordsAfterXoring1024bits((UINT32*)state, data);
+}
+#endif
+
+#ifdef ProvideFast1088
+void KeccakAbsorb1088bits(unsigned char *state, const unsigned char *data)
+{
+    KeccakPermutationOnWordsAfterXoring1088bits((UINT32*)state, data);
+}
+#endif
+
+#ifdef ProvideFast1152
+void KeccakAbsorb1152bits(unsigned char *state, const unsigned char *data)
+{
+    KeccakPermutationOnWordsAfterXoring1152bits((UINT32*)state, data);
+}
+#endif
+
+#ifdef ProvideFast1344
+void KeccakAbsorb1344bits(unsigned char *state, const unsigned char *data)
+{
+    KeccakPermutationOnWordsAfterXoring1344bits((UINT32*)state, data);
+}
+#endif
+
+void KeccakAbsorb(unsigned char *state, const unsigned char *data, unsigned int laneCount) // generic wrapper: laneCount lanes of 8 bytes are taken from data
+{
+    KeccakPermutationOnWordsAfterXoring((UINT32*)state, data, laneCount);
+}
+
+#ifdef ProvideFast1024
+void KeccakExtract1024bits(const unsigned char *state, unsigned char *data) // writes the first 16 lanes (128 bytes) of state to data
+{
+    extractLanes(16, state, data)
+#ifdef UseBebigokimisa
+    ((UINT32*)data)[ 2] = ~((UINT32*)data)[ 2]; // invert lanes 1, 2, 8 and 12 again: the same lanes that KeccakInitializeState sets to all-ones
+    ((UINT32*)data)[ 3] = ~((UINT32*)data)[ 3];
+    ((UINT32*)data)[ 4] = ~((UINT32*)data)[ 4];
+    ((UINT32*)data)[ 5] = ~((UINT32*)data)[ 5];
+    ((UINT32*)data)[16] = ~((UINT32*)data)[16];
+    ((UINT32*)data)[17] = ~((UINT32*)data)[17];
+    ((UINT32*)data)[24] = ~((UINT32*)data)[24];
+    ((UINT32*)data)[25] = ~((UINT32*)data)[25];
+#endif
+}
+#endif
+
+void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount) // writes the first laneCount lanes (8 bytes each) of state to data
+{
+    extractLanes(laneCount, state, data)
+#ifdef UseBebigokimisa
+    if (laneCount > 1) { // each nested test inverts one more of lanes 1, 2, 8, 12, 17, 20, but only if that lane was extracted
+        ((UINT32*)data)[ 2] = ~((UINT32*)data)[ 2];
+        ((UINT32*)data)[ 3] = ~((UINT32*)data)[ 3];
+        if (laneCount > 2) {
+            ((UINT32*)data)[ 4] = ~((UINT32*)data)[ 4];
+            ((UINT32*)data)[ 5] = ~((UINT32*)data)[ 5];
+            if (laneCount > 8) {
+                ((UINT32*)data)[16] = ~((UINT32*)data)[16];
+                ((UINT32*)data)[17] = ~((UINT32*)data)[17];
+                if (laneCount > 12) {
+                    ((UINT32*)data)[24] = ~((UINT32*)data)[24];
+                    ((UINT32*)data)[25] = ~((UINT32*)data)[25];
+                    if (laneCount > 17) {
+                        ((UINT32*)data)[34] = ~((UINT32*)data)[34];
+                        ((UINT32*)data)[35] = ~((UINT32*)data)[35];
+                        if (laneCount > 20) {
+                            ((UINT32*)data)[40] = ~((UINT32*)data)[40];
+                            ((UINT32*)data)[41] = ~((UINT32*)data)[41];
+                        }
+                    }
+                }
+            }
+        }
+    }
+#endif
+}
diff --git a/c_src/KeccakF-1600-opt64-settings.h b/c_src/KeccakF-1600-opt64-settings.h
new file mode 100755
index 0000000..8f16ada
--- /dev/null
+++ b/c_src/KeccakF-1600-opt64-settings.h
@@ -0,0 +1,7 @@
+#define Unrolling 24 /* NOTE(review): presumably the number of rounds unrolled per loop pass (24 = all rounds) -- confirm against the .macros files */
+#define UseBebigokimisa /* NOTE(review): the "lane complementing" option, going by the error text in KeccakF-1600-opt32.c -- confirm */
+//#define UseSSE
+//#define UseOnlySIMD64
+//#define UseMMX
+//#define UseSHLD
+//#define UseXOP
diff --git a/c_src/KeccakF-1600-opt64.c b/c_src/KeccakF-1600-opt64.c
new file mode 100755
index 0000000..9349f03
--- /dev/null
+++ b/c_src/KeccakF-1600-opt64.c
@@ -0,0 +1,504 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include "brg_endian.h"
+#include "KeccakF-1600-opt64-settings.h"
+#include "KeccakF-1600-interface.h"
+
+typedef unsigned char UINT8; /* byte type used by the byte/word conversion helpers */
+typedef unsigned long long int UINT64; /* type of one lane in this file */
+
+#if defined(__GNUC__) /* ALIGN requests 32-byte alignment where the compiler has a known syntax for it */
+#define ALIGN __attribute__ ((aligned(32)))
+#elif defined(_MSC_VER)
+#define ALIGN __declspec(align(32))
+#else
+#define ALIGN /* unknown compiler: no alignment request */
+#endif
+
+#if defined(UseSSE) /* SSE variant: both V64 and V128 are __m128i registers */
+    #include <x86intrin.h>
+    typedef __m128i V64;
+    typedef __m128i V128;
+    typedef union { /* lets one 128-bit value also be addressed as two UINT64 words */
+        V128 v128;
+        UINT64 v64[2];
+    } V6464;
+
+    #define ANDnu64(a, b)       _mm_andnot_si128(a, b)
+    #define LOAD64(a)           _mm_loadl_epi64((const V64 *)&(a))
+    #define CONST64(a)          _mm_loadl_epi64((const V64 *)&(a))
+    #define ROL64(a, o)         _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o))) /* rotate left: (a << o) | (a >> (64-o)) */
+    #define STORE64(a, b)       _mm_storel_epi64((V64 *)&(a), b)
+    #define XOR64(a, b)         _mm_xor_si128(a, b)
+    #define XOReq64(a, b)       a = _mm_xor_si128(a, b)
+    #define SHUFFLEBYTES128(a, b)   _mm_shuffle_epi8(a, b)
+
+    #define ANDnu128(a, b)      _mm_andnot_si128(a, b)
+    #define LOAD6464(a, b)      _mm_set_epi64((__m64)(a), (__m64)(b))
+    #define CONST128(a)         _mm_load_si128((const V128 *)&(a))
+    #define LOAD128(a)          _mm_load_si128((const V128 *)&(a))
+    #define LOAD128u(a)         _mm_loadu_si128((const V128 *)&(a))
+    #define ROL64in128(a, o)    _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o))) /* same rotate, applied to both 64-bit halves */
+    #define STORE128(a, b)      _mm_store_si128((V128 *)&(a), b)
+    #define XOR128(a, b)        _mm_xor_si128(a, b)
+    #define XOReq128(a, b)      a = _mm_xor_si128(a, b)
+    #define GET64LOLO(a, b)     _mm_unpacklo_epi64(a, b)
+    #define GET64HIHI(a, b)     _mm_unpackhi_epi64(a, b)
+    #define COPY64HI2LO(a)      _mm_shuffle_epi32(a, 0xEE)
+    #define COPY64LO2HI(a)      _mm_shuffle_epi32(a, 0x44)
+    #define ZERO128()           _mm_setzero_si128()
+
+    #ifdef UseOnlySIMD64
+    #include "KeccakF-1600-simd64.macros"
+    #else
+ALIGN const UINT64 rho8_56[2] = {0x0605040302010007, 0x080F0E0D0C0B0A09}; /* presumably a byte-shuffle control rotating the low lane by 8 and the high lane by 56 bits - TODO confirm in simd128.macros */
+    #include "KeccakF-1600-simd128.macros"
+    #endif
+
+    #ifdef UseBebigokimisa
+    #error "UseBebigokimisa cannot be used in combination with UseSSE"
+    #endif
+#elif defined(UseXOP) /* XOP variant: rotations use _mm_roti_epi64 / _mm_rot_epi64 */
+    #include <x86intrin.h>
+    typedef __m128i V64;
+    typedef __m128i V128;
+    /* 64-bit load/store/xor helpers */
+    #define LOAD64(a)           _mm_loadl_epi64((const V64 *)&(a))
+    #define CONST64(a)          _mm_loadl_epi64((const V64 *)&(a))
+    #define STORE64(a, b)       _mm_storel_epi64((V64 *)&(a), b)
+    #define XOR64(a, b)         _mm_xor_si128(a, b)
+    #define XOReq64(a, b)       a = _mm_xor_si128(a, b)
+
+    #define ANDnu128(a, b)      _mm_andnot_si128(a, b)
+    #define LOAD6464(a, b)      _mm_set_epi64((__m64)(a), (__m64)(b))
+    #define CONST128(a)         _mm_load_si128((const V128 *)&(a))
+    #define LOAD128(a)          _mm_load_si128((const V128 *)&(a))
+    #define LOAD128u(a)         _mm_loadu_si128((const V128 *)&(a))
+    #define STORE128(a, b)      _mm_store_si128((V128 *)&(a), b)
+    #define XOR128(a, b)        _mm_xor_si128(a, b)
+    #define XOReq128(a, b)      a = _mm_xor_si128(a, b)
+    #define ZERO128()           _mm_setzero_si128()
+
+    #define SWAP64(a)           _mm_shuffle_epi32(a, 0x4E)
+    #define GET64LOLO(a, b)     _mm_unpacklo_epi64(a, b)
+    #define GET64HIHI(a, b)     _mm_unpackhi_epi64(a, b)
+    #define GET64LOHI(a, b)     ((__m128i)_mm_blend_pd((__m128d)a, (__m128d)b, 2))
+    #define GET64HILO(a, b)     SWAP64(GET64LOHI(b, a))
+    #define COPY64HI2LO(a)      _mm_shuffle_epi32(a, 0xEE)
+    #define COPY64LO2HI(a)      _mm_shuffle_epi32(a, 0x44)
+    /* ROL6464same rotates by one immediate; ROL6464 pastes r1 and r2 into the name of a rot_r1_r2 table below */
+    #define ROL6464same(a, o)   _mm_roti_epi64(a, o)
+    #define ROL6464(a, r1, r2)  _mm_rot_epi64(a, CONST128( rot_##r1##_##r2 ))
+ALIGN const UINT64 rot_0_20[2]  = { 0, 20}; /* one table per (r1, r2) pair that ROL6464 can name */
+ALIGN const UINT64 rot_44_3[2]  = {44,  3};
+ALIGN const UINT64 rot_43_45[2] = {43, 45};
+ALIGN const UINT64 rot_21_61[2] = {21, 61};
+ALIGN const UINT64 rot_14_28[2] = {14, 28};
+ALIGN const UINT64 rot_1_36[2]  = { 1, 36};
+ALIGN const UINT64 rot_6_10[2]  = { 6, 10};
+ALIGN const UINT64 rot_25_15[2] = {25, 15};
+ALIGN const UINT64 rot_8_56[2]  = { 8, 56};
+ALIGN const UINT64 rot_18_27[2] = {18, 27};
+ALIGN const UINT64 rot_62_55[2] = {62, 55};
+ALIGN const UINT64 rot_39_41[2] = {39, 41};
+
+#if defined(UseSimulatedXOP)
+    // For debugging purposes, when XOP is not available: the XOP rotates are rebuilt from SSE2 shifts
+    #undef ROL6464
+    #undef ROL6464same
+    #define ROL6464same(a, o)   _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o)))
+    V128 ROL6464(V128 a, int r0, int r1) /* returns a with its low 64-bit half rotated left by r0 and its high half by r1 */
+    {
+        V128 a0 = ROL6464same(a, r0); /* ROL64 is not defined in the UseXOP branch, so use ROL6464same from just above */
+        V128 a1 = COPY64HI2LO(ROL6464same(a, r1)); /* high half rotated by r1, copied into the low half */
+        return GET64LOLO(a0, a1); /* low half of a0 followed by low half of a1 */
+    }
+#endif
+    
+    #include "KeccakF-1600-xop.macros"
+
+    #ifdef UseBebigokimisa
+    #error "UseBebigokimisa cannot be used in combination with UseXOP"
+    #endif
+#elif defined(UseMMX) /* MMX variant: V64 is __m64 */
+    #include <mmintrin.h>
+    typedef __m64 V64;
+    #define ANDnu64(a, b)       _mm_andnot_si64(a, b)
+
+    #if (defined(_MSC_VER) || defined (__INTEL_COMPILER)) /* these compilers go through a pointer cast instead of a value cast */
+        #define LOAD64(a)       *(V64*)&(a)
+        #define CONST64(a)      *(V64*)&(a)
+        #define STORE64(a, b)   *(V64*)&(a) = b
+    #else
+        #define LOAD64(a)       (V64)a
+        #define CONST64(a)      (V64)a
+        #define STORE64(a, b)   a = (UINT64)b
+    #endif
+    #define ROL64(a, o)         _mm_or_si64(_mm_slli_si64(a, o), _mm_srli_si64(a, 64-(o))) /* rotate left: (a << o) | (a >> (64-o)) */
+    #define XOR64(a, b)         _mm_xor_si64(a, b)
+    #define XOReq64(a, b)       a = _mm_xor_si64(a, b)
+
+    #include "KeccakF-1600-simd64.macros"
+
+    #ifdef UseBebigokimisa
+    #error "UseBebigokimisa cannot be used in combination with UseMMX"
+    #endif
+#else
+    #if defined(_MSC_VER) /* ROL64(a, n): rotate the 64-bit value a left by n bits; n must be in 1..63 for the shift-based form */
+    #define ROL64(a, offset) _rotl64(a, offset)
+    #elif defined(UseSHLD) /* GCC statement-expression around the x86 shld instruction; N must be a compile-time constant ("i" constraint) */
+      #define ROL64(x,N) ({ \
+        register UINT64 __out; \
+        register UINT64 __in = (x); \
+        __asm__ ("shld %2,%0,%0" : "=r"(__out) : "0"(__in), "i"(N)); \
+        __out; \
+      })
+    #else /* portable form; every parameter use is parenthesized so expression arguments such as a^b or r+1 expand correctly */
+    #define ROL64(a, offset) ((((UINT64)(a)) << (offset)) ^ (((UINT64)(a)) >> (64-(offset))))
+    #endif
+
+    #include "KeccakF-1600-64.macros"
+#endif
+
+#include "KeccakF-1600-unrolling.macros"
+
+void KeccakPermutationOnWords(UINT64 *state) /* runs the macro-generated rounds on the lane array state */
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i; /* presumably the round counter used by the rounds macro when it is not fully unrolled */
+#endif
+    /* declareABCDE, copyFromState and rounds are macros from the .macros files included above. */
+    copyFromState(A, state)
+    rounds
+#if defined(UseMMX)
+    _mm_empty(); /* MMX build only: empty the MMX state after working with __m64 values */
+#endif
+}
+
+void KeccakPermutationOnWordsAfterXoring(UINT64 *state, const UINT64 *input, unsigned int laneCount) /* XORs laneCount lanes of input into state, then runs the rounds */
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i; /* presumably the round counter used by the rounds macro when it is not fully unrolled */
+#endif
+	unsigned int j;
+    /* Only the first laneCount lanes of state are modified before the rounds run. */
+    for(j=0; j<laneCount; j++)
+        state[j] ^= input[j];	
+    copyFromState(A, state)
+    rounds
+#if defined(UseMMX)
+    _mm_empty(); /* MMX build only: empty the MMX state after working with __m64 values */
+#endif
+}
+
+#ifdef ProvideFast576
+void KeccakPermutationOnWordsAfterXoring576bits(UINT64 *state, const UINT64 *input) /* fixed-size variant for 576 bits (9 lanes) of input */
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i;
+#endif
+    /* No explicit XOR loop here: presumably copyFromStateAndXor576bits folds the XOR into the state load - TODO confirm in the .macros file. */
+    copyFromStateAndXor576bits(A, state, input)
+    rounds
+#if defined(UseMMX)
+    _mm_empty();
+#endif
+}
+#endif
+
+#ifdef ProvideFast832
+void KeccakPermutationOnWordsAfterXoring832bits(UINT64 *state, const UINT64 *input) /* fixed-size variant for 832 bits (13 lanes) of input */
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i;
+#endif
+    /* No explicit XOR loop here: presumably copyFromStateAndXor832bits folds the XOR into the state load - TODO confirm in the .macros file. */
+    copyFromStateAndXor832bits(A, state, input)
+    rounds
+#if defined(UseMMX)
+    _mm_empty();
+#endif
+}
+#endif
+
+#ifdef ProvideFast1024
+void KeccakPermutationOnWordsAfterXoring1024bits(UINT64 *state, const UINT64 *input) /* fixed-size variant for 1024 bits (16 lanes) of input */
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i;
+#endif
+    /* No explicit XOR loop here: presumably copyFromStateAndXor1024bits folds the XOR into the state load - TODO confirm in the .macros file. */
+    copyFromStateAndXor1024bits(A, state, input)
+    rounds
+#if defined(UseMMX)
+    _mm_empty();
+#endif
+}
+#endif
+
+#ifdef ProvideFast1088
+void KeccakPermutationOnWordsAfterXoring1088bits(UINT64 *state, const UINT64 *input) /* fixed-size variant for 1088 bits (17 lanes) of input */
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i;
+#endif
+    /* No explicit XOR loop here: presumably copyFromStateAndXor1088bits folds the XOR into the state load - TODO confirm in the .macros file. */
+    copyFromStateAndXor1088bits(A, state, input)
+    rounds
+#if defined(UseMMX)
+    _mm_empty();
+#endif
+}
+#endif
+
+#ifdef ProvideFast1152
+void KeccakPermutationOnWordsAfterXoring1152bits(UINT64 *state, const UINT64 *input) /* fixed-size variant for 1152 bits (18 lanes) of input */
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i;
+#endif
+    /* No explicit XOR loop here: presumably copyFromStateAndXor1152bits folds the XOR into the state load - TODO confirm in the .macros file. */
+    copyFromStateAndXor1152bits(A, state, input)
+    rounds
+#if defined(UseMMX)
+    _mm_empty();
+#endif
+}
+#endif
+
+#ifdef ProvideFast1344
+void KeccakPermutationOnWordsAfterXoring1344bits(UINT64 *state, const UINT64 *input) /* fixed-size variant for 1344 bits (21 lanes) of input */
+{
+    declareABCDE
+#if (Unrolling != 24)
+    unsigned int i;
+#endif
+    /* No explicit XOR loop here: presumably copyFromStateAndXor1344bits folds the XOR into the state load - TODO confirm in the .macros file. */
+    copyFromStateAndXor1344bits(A, state, input)
+    rounds
+#if defined(UseMMX)
+    _mm_empty();
+#endif
+}
+#endif
+
+void KeccakInitialize() /* intentionally empty: this implementation has no tables to compute at start-up */
+{
+}
+
+void KeccakInitializeState(unsigned char *state)
+{
+    memset(state, 0, 200); /* clear the whole 200-byte state */
+#ifdef UseBebigokimisa
+    {
+        static const unsigned int complementedLanes[6] = {1, 2, 8, 12, 17, 20};
+        unsigned int k;
+        for(k=0; k<6; k++) /* with UseBebigokimisa these lanes start as all ones instead of zero */
+            ((UINT64*)state)[complementedLanes[k]] = ~(UINT64)0;
+    }
+#endif
+}
+
+void KeccakPermutation(unsigned char *state) /* byte-pointer entry point for the permutation */
+{
+    // We assume the state is always stored as words, so the byte pointer is reinterpreted as UINT64 lanes without any conversion
+    KeccakPermutationOnWords((UINT64*)state);
+}
+
+void fromBytesToWord(UINT64 *word, const UINT8 *bytes)
+{
+    unsigned int shift;
+    /* Assemble one 64-bit word from 8 bytes, least significant byte first. */
+    *word = 0;
+    for(shift=0; shift<64; shift+=8)
+        *word |= (UINT64)(bytes[shift/8]) << shift;
+}
+
+#ifdef ProvideFast576
+void KeccakAbsorb576bits(unsigned char *state, const unsigned char *data)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    KeccakPermutationOnWordsAfterXoring576bits((UINT64*)state, (const UINT64*)data); /* bytes are used directly as lanes */
+#else
+    UINT64 lanes[576/64];
+    unsigned int lane;
+    /* Not little-endian: rebuild the 9 lanes from their bytes before absorbing. */
+    for(lane=0; lane<576/64; lane++)
+        fromBytesToWord(&lanes[lane], &data[8*lane]);
+    KeccakPermutationOnWordsAfterXoring576bits((UINT64*)state, lanes);
+#endif
+}
+#endif
+
+#ifdef ProvideFast832
+void KeccakAbsorb832bits(unsigned char *state, const unsigned char *data)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    KeccakPermutationOnWordsAfterXoring832bits((UINT64*)state, (const UINT64*)data); /* bytes are used directly as lanes */
+#else
+    UINT64 lanes[832/64];
+    unsigned int lane;
+    /* Not little-endian: rebuild the 13 lanes from their bytes before absorbing. */
+    for(lane=0; lane<832/64; lane++)
+        fromBytesToWord(&lanes[lane], &data[8*lane]);
+    KeccakPermutationOnWordsAfterXoring832bits((UINT64*)state, lanes);
+#endif
+}
+#endif
+
+#ifdef ProvideFast1024
+void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    KeccakPermutationOnWordsAfterXoring1024bits((UINT64*)state, (const UINT64*)data); /* bytes are used directly as lanes */
+#else
+    UINT64 lanes[1024/64];
+    unsigned int lane;
+    /* Not little-endian: rebuild the 16 lanes from their bytes before absorbing. */
+    for(lane=0; lane<1024/64; lane++)
+        fromBytesToWord(&lanes[lane], &data[8*lane]);
+    KeccakPermutationOnWordsAfterXoring1024bits((UINT64*)state, lanes);
+#endif
+}
+#endif
+
+#ifdef ProvideFast1088
+void KeccakAbsorb1088bits(unsigned char *state, const unsigned char *data)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    KeccakPermutationOnWordsAfterXoring1088bits((UINT64*)state, (const UINT64*)data); /* bytes are used directly as lanes */
+#else
+    UINT64 lanes[1088/64];
+    unsigned int lane;
+    /* Not little-endian: rebuild the 17 lanes from their bytes before absorbing. */
+    for(lane=0; lane<1088/64; lane++)
+        fromBytesToWord(&lanes[lane], &data[8*lane]);
+    KeccakPermutationOnWordsAfterXoring1088bits((UINT64*)state, lanes);
+#endif
+}
+#endif
+
+#ifdef ProvideFast1152
+void KeccakAbsorb1152bits(unsigned char *state, const unsigned char *data)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    KeccakPermutationOnWordsAfterXoring1152bits((UINT64*)state, (const UINT64*)data); /* bytes are used directly as lanes */
+#else
+    UINT64 lanes[1152/64];
+    unsigned int lane;
+    /* Not little-endian: rebuild the 18 lanes from their bytes before absorbing. */
+    for(lane=0; lane<1152/64; lane++)
+        fromBytesToWord(&lanes[lane], &data[8*lane]);
+    KeccakPermutationOnWordsAfterXoring1152bits((UINT64*)state, lanes);
+#endif
+}
+#endif
+
+#ifdef ProvideFast1344
+void KeccakAbsorb1344bits(unsigned char *state, const unsigned char *data)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    KeccakPermutationOnWordsAfterXoring1344bits((UINT64*)state, (const UINT64*)data); /* bytes are used directly as lanes */
+#else
+    UINT64 lanes[1344/64];
+    unsigned int lane;
+    /* Not little-endian: rebuild the 21 lanes from their bytes before absorbing. */
+    for(lane=0; lane<1344/64; lane++)
+        fromBytesToWord(&lanes[lane], &data[8*lane]);
+    KeccakPermutationOnWordsAfterXoring1344bits((UINT64*)state, lanes);
+#endif
+}
+#endif
+
+void KeccakAbsorb(unsigned char *state, const unsigned char *data, unsigned int laneCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    KeccakPermutationOnWordsAfterXoring((UINT64*)state, (const UINT64*)data, laneCount); /* bytes are used directly as lanes */
+#else
+    UINT64 lanes[25]; /* fixed capacity of 25 lanes; laneCount is not checked against it */
+    unsigned int lane;
+    /* Not little-endian: rebuild laneCount lanes from their bytes before absorbing. */
+    for(lane=0; lane<laneCount; lane++)
+        fromBytesToWord(&lanes[lane], &data[8*lane]);
+    KeccakPermutationOnWordsAfterXoring((UINT64*)state, lanes, laneCount);
+#endif
+}
+
+void fromWordToBytes(UINT8 *bytes, const UINT64 word)
+{
+    unsigned int shift;
+    /* Write the word as 8 bytes, least significant byte first. */
+    for(shift=0; shift<64; shift+=8)
+        bytes[shift/8] = (word >> shift) & 0xFF;
+}
+
+#ifdef ProvideFast1024
+void KeccakExtract1024bits(const unsigned char *state, unsigned char *data)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    memcpy(data, state, 1024/8); /* the 16 lanes can be copied as raw bytes */
+#else
+    unsigned int lane;
+    for(lane=0; lane<1024/64; lane++) /* write each of the 16 lanes least significant byte first */
+        fromWordToBytes(&data[8*lane], ((const UINT64*)state)[lane]);
+#endif
+#ifdef UseBebigokimisa
+    {
+        UINT64 *out = (UINT64*)data; /* lanes 1, 2, 8 and 12 are complemented in the output */
+        out[ 1] = ~out[ 1]; out[ 2] = ~out[ 2];
+        out[ 8] = ~out[ 8]; out[12] = ~out[12];
+    }
+#endif
+}
+#endif
+
+void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount)
+{
+    /*
+     * Copies the first laneCount lanes (8 bytes each) of state into data.
+     * On little-endian builds this is a plain memcpy; otherwise every lane
+     * is written least significant byte first through fromWordToBytes.
+     */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    memcpy(data, state, 8*laneCount);
+#else
+    unsigned int lane;
+    for(lane=0; lane<laneCount; lane++)
+        fromWordToBytes(&data[8*lane], ((const UINT64*)state)[lane]);
+#endif
+#ifdef UseBebigokimisa
+    /*
+     * With UseBebigokimisa, lanes 1, 2, 8, 12, 17 and 20 are complemented
+     * in the output, but only those that were actually extracted.
+     * The table is in ascending order, so the walk can stop at the first
+     * lane index that is not below laneCount: every later entry is out of
+     * range as well.
+     */
+    {
+        static const unsigned int complementedLanes[6] = {1, 2, 8, 12, 17, 20};
+        UINT64 *out = (UINT64*)data;
+        unsigned int k;
+        for(k=0; k<6 && complementedLanes[k] < laneCount; k++)
+            out[complementedLanes[k]] = ~out[complementedLanes[k]];
+    }
+#endif
+}
diff --git a/c_src/KeccakF-1600-reference.c b/c_src/KeccakF-1600-reference.c
new file mode 100755
index 0000000..628f710
--- /dev/null
+++ b/c_src/KeccakF-1600-reference.c
@@ -0,0 +1,300 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include "brg_endian.h"
+#include "displayIntermediateValues.h"
+#include "KeccakNISTInterface.h"
+#include "KeccakF-1600-interface.h"
+
+typedef unsigned char UINT8;
+typedef unsigned long long int UINT64; /* one lane per UINT64 in this implementation */
+
+#define nrRounds 24
+UINT64 KeccakRoundConstants[nrRounds]; /* filled by KeccakInitializeRoundConstants(), read by iota() */
+#define nrLanes 25
+unsigned int KeccakRhoOffsets[nrLanes]; /* filled by KeccakInitializeRhoOffsets(), read by rho() */
+
+void KeccakPermutationOnWords(UINT64 *state); /* forward declarations: these are used before their definitions below */
+void theta(UINT64 *A);
+void rho(UINT64 *A);
+void pi(UINT64 *A);
+void chi(UINT64 *A);
+void iota(UINT64 *A, unsigned int indexRound);
+
+void fromBytesToWords(UINT64 *stateAsWords, const unsigned char *state)
+{
+    unsigned int lane, shift;
+    /* Each lane is built from 8 consecutive bytes, least significant byte first. */
+    for(lane=0; lane<(KeccakPermutationSize/64); lane++) {
+        stateAsWords[lane] = 0;
+        for(shift=0; shift<64; shift+=8)
+            stateAsWords[lane] |= (UINT64)(state[lane*(64/8) + shift/8]) << shift;
+    }
+}
+
+void fromWordsToBytes(unsigned char *state, const UINT64 *stateAsWords)
+{
+    unsigned int lane, shift;
+    /* Write every lane as 8 consecutive bytes, least significant byte first. */
+    for(lane=0; lane<(KeccakPermutationSize/64); lane++)
+        for(shift=0; shift<64; shift+=8)
+            state[lane*(64/8) + shift/8] = (stateAsWords[lane] >> shift) & 0xFF;
+}
+
+void KeccakPermutation(unsigned char *state) /* applies the permutation to a state given as bytes, in place */
+{
+#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
+    UINT64 stateAsWords[KeccakPermutationSize/64]; /* scratch lanes, needed only when bytes must be converted */
+#endif
+    /* The displayStateAsBytes calls report the state before and after; they are declared outside this file. */
+    displayStateAsBytes(1, "Input of permutation", state);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    KeccakPermutationOnWords((UINT64*)state); /* little-endian: the byte array is used directly as lanes */
+#else
+    fromBytesToWords(stateAsWords, state);
+    KeccakPermutationOnWords(stateAsWords);
+    fromWordsToBytes(state, stateAsWords);
+#endif
+    displayStateAsBytes(1, "State after permutation", state);
+}
+
+void KeccakPermutationAfterXor(unsigned char *state, const unsigned char *data, unsigned int dataLengthInBytes)
+{
+    unsigned int pos;
+    /* XOR the data into the leading bytes of the state, then permute. */
+    for(pos=0; pos!=dataLengthInBytes; pos++)
+        state[pos] ^= data[pos];
+    KeccakPermutation(state);
+}
+
+void KeccakPermutationOnWords(UINT64 *state)
+{
+    unsigned int roundIndex = 0;
+    /* Applies nrRounds rounds to the lanes in place; the display* calls report each intermediate state. */
+    displayStateAs64bitWords(3, "Same, with lanes as 64-bit words", state);
+    /* Step order inside one round: theta, rho, pi, chi, iota. */
+    while(roundIndex < nrRounds) {
+        displayRoundNumber(3, roundIndex);
+        theta(state);
+        displayStateAs64bitWords(3, "After theta", state);
+
+        rho(state);
+        displayStateAs64bitWords(3, "After rho", state);
+
+        pi(state);
+        displayStateAs64bitWords(3, "After pi", state);
+
+        chi(state);
+        displayStateAs64bitWords(3, "After chi", state);
+
+        iota(state, roundIndex);
+        displayStateAs64bitWords(3, "After iota", state);
+        roundIndex++;
+    }
+}
+
+#define index(x, y) (((x)%5)+5*((y)%5)) /* lane (x, y) -> array position, both coordinates reduced mod 5 */
+#define ROL64(a, offset) (((offset) != 0) ? ((((UINT64)(a)) << (offset)) ^ (((UINT64)(a)) >> (64-(offset)))) : (a)) /* rotate left; offset 0 is special-cased to avoid a 64-bit shift; parameters parenthesized so expression arguments expand correctly */
+
+void theta(UINT64 *A)
+{
+    unsigned int col, row;
+    UINT64 parity[5], delta[5];
+    /* Column parities first, then one mask per column, then XOR that mask into every lane of the column. */
+    for(col=0; col<5; col++) {
+        parity[col] = A[index(col, 0)];
+        for(row=1; row<5; row++)
+            parity[col] ^= A[index(col, row)];
+    }
+    for(col=0; col<5; col++)
+        delta[col] = ROL64(parity[(col+1)%5], 1) ^ parity[(col+4)%5];
+    for(row=0; row<5; row++)
+        for(col=0; col<5; col++)
+            A[index(col, row)] ^= delta[col];
+}
+
+void rho(UINT64 *A)
+{
+    unsigned int lane;
+    /* For x and y below 5, index(x, y) is x+5*y, so one pass over positions 0..24 rotates every lane by its own offset. */
+    for(lane=0; lane<nrLanes; lane++)
+        A[lane] = ROL64(A[lane], KeccakRhoOffsets[lane]);
+}
+
+void pi(UINT64 *A)
+{
+    unsigned int x, y;
+    UINT64 snapshot[25];
+    /* Move lane (x, y) to (y, 2x+3y); read from a copy so moved lanes are not picked up again. */
+    memcpy(snapshot, A, sizeof(snapshot));
+    for(y=0; y<5; y++)
+        for(x=0; x<5; x++)
+            A[index(y, 2*x+3*y)] = snapshot[index(x, y)];
+}
+
+void chi(UINT64 *A)
+{
+    unsigned int x, y;
+    UINT64 row[5];
+    /* Every new lane is computed from the old values of its row, so the row is copied before it is overwritten. */
+    for(y=0; y<5; y++) {
+        for(x=0; x<5; x++)
+            row[x] = A[index(x, y)];
+        for(x=0; x<5; x++)
+            A[index(x, y)] = row[x] ^ ((~row[(x+1)%5]) & row[(x+2)%5]);
+    }
+}
+
+void iota(UINT64 *A, unsigned int indexRound) /* XORs the constant of round indexRound into lane (0, 0) */
+{
+    A[index(0, 0)] ^= KeccakRoundConstants[indexRound]; /* indexRound is not range-checked; the table has nrRounds entries */
+}
+
+int LFSR86540(UINT8 *LFSR)
+{
+    const UINT8 before = *LFSR;
+    UINT8 after = (UINT8)(before << 1);
+    /* Advance the register one step and return the bit that was in position 0 before the step. */
+    if ((before & 0x80) != 0)
+        after ^= 0x71; // Primitive polynomial over GF(2): x^8+x^6+x^5+x^4+1
+    *LFSR = after;
+    return (before & 0x01) != 0;
+}
+
+void KeccakInitializeRoundConstants()
+{
+    UINT8 lfsr = 0x01;
+    unsigned int roundIndex, j;
+    /* For every round, bits 2^j-1 (j = 0..6) of the constant are set from seven successive LFSR outputs. */
+    for(roundIndex=0; roundIndex<nrRounds; roundIndex++) {
+        UINT64 constant = 0;
+        for(j=0; j<7; j++) {
+            if (LFSR86540(&lfsr))
+                constant ^= (UINT64)1 << ((1<<j)-1);
+        }
+        KeccakRoundConstants[roundIndex] = constant;
+    }
+}
+
+void KeccakInitializeRhoOffsets()
+{
+    unsigned int x = 1, y = 0, t;
+    /*
+     * Lane (0, 0) gets offset 0. Starting at (1, 0), each of the 24 steps
+     * stores (t+1)(t+2)/2 mod 64 and then moves (x, y) to (y, (2x+3y) mod 5).
+     */
+    KeccakRhoOffsets[index(0, 0)] = 0;
+    for(t=0; t<24; t++) {
+        const unsigned int nextY = (2*x+3*y) % 5;
+        KeccakRhoOffsets[index(x, y)] = ((t+1)*(t+2)/2) % 64;
+        x = y; /* y is always below 5 here, so no reduction is needed */
+        y = nextY;
+    }
+}
+
+void KeccakInitialize() /* fills the two global tables; theta..iota read them, so call this before permuting */
+{
+    KeccakInitializeRoundConstants();
+    KeccakInitializeRhoOffsets();
+}
+
+void displayRoundConstants(FILE *f)
+{
+    unsigned int i;
+    /* Prints every round constant to f as 16 hex digits, high 32 bits first. i is unsigned, so it is printed with %u. */
+    for(i=0; i<nrRounds; i++) {
+        fprintf(f, "RC[%02u][0][0] = ", i);
+        fprintf(f, "%08X", (unsigned int)(KeccakRoundConstants[i] >> 32));
+        fprintf(f, "%08X", (unsigned int)(KeccakRoundConstants[i] & 0xFFFFFFFFULL));
+        fprintf(f, "\n");
+    }
+    fprintf(f, "\n");
+}
+
+void displayRhoOffsets(FILE *f)
+{
+    unsigned int x, y;
+    /* Prints the 25 rotation offsets to f, row by row. All printed values are unsigned, so they use %u. */
+    for(y=0; y<5; y++) for(x=0; x<5; x++) {
+        fprintf(f, "RhoOffset[%u][%u] = ", x, y);
+        fprintf(f, "%2u", KeccakRhoOffsets[index(x, y)]);
+        fprintf(f, "\n");
+    }
+    fprintf(f, "\n");
+}
+
+void KeccakInitializeState(unsigned char *state) /* zeroes the whole state */
+{
+    memset(state, 0, KeccakPermutationSizeInBytes); /* size constant is defined outside this file */
+}
+
+#ifdef ProvideFast576
+void KeccakAbsorb576bits(unsigned char *state, const unsigned char *data) /* XOR 576 bits of data into state, then permute */
+{
+    KeccakPermutationAfterXor(state, data, 72); /* 576 bits = 72 bytes */
+}
+#endif
+
+#ifdef ProvideFast832
+void KeccakAbsorb832bits(unsigned char *state, const unsigned char *data) /* XOR 832 bits of data into state, then permute */
+{
+    KeccakPermutationAfterXor(state, data, 104); /* 832 bits = 104 bytes */
+}
+#endif
+
+#ifdef ProvideFast1024
+void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data) /* XOR 1024 bits of data into state, then permute */
+{
+    KeccakPermutationAfterXor(state, data, 128); /* 1024 bits = 128 bytes */
+}
+#endif
+
+#ifdef ProvideFast1088
+void KeccakAbsorb1088bits(unsigned char *state, const unsigned char *data) /* XOR 1088 bits of data into state, then permute */
+{
+    KeccakPermutationAfterXor(state, data, 136); /* 1088 bits = 136 bytes */
+}
+#endif
+
+#ifdef ProvideFast1152
+void KeccakAbsorb1152bits(unsigned char *state, const unsigned char *data) /* XOR 1152 bits of data into state, then permute */
+{
+    KeccakPermutationAfterXor(state, data, 144); /* 1152 bits = 144 bytes */
+}
+#endif
+
+#ifdef ProvideFast1344
+void KeccakAbsorb1344bits(unsigned char *state, const unsigned char *data) /* XOR 1344 bits of data into state, then permute */
+{
+    KeccakPermutationAfterXor(state, data, 168); /* 1344 bits = 168 bytes */
+}
+#endif
+
+void KeccakAbsorb(unsigned char *state, const unsigned char *data, unsigned int laneCount) /* general form: laneCount lanes of 8 bytes each */
+{
+    KeccakPermutationAfterXor(state, data, laneCount*8);
+}
+
+#ifdef ProvideFast1024
+void KeccakExtract1024bits(const unsigned char *state, unsigned char *data) /* copies the first 1024 bits of state to data */
+{
+    memcpy(data, state, 128); /* 1024 bits = 128 bytes */
+}
+#endif
+
+void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount) /* copies the first laneCount lanes of state to data */
+{
+    memcpy(data, state, laneCount*8);
+}
diff --git a/c_src/KeccakF-1600-reference.h b/c_src/KeccakF-1600-reference.h
new file mode 100755
index 0000000..698bab8
--- /dev/null
+++ b/c_src/KeccakF-1600-reference.h
@@ -0,0 +1,20 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakPermutationReference_h_
+#define _KeccakPermutationReference_h_
+/* This header uses FILE without including anything itself: include <stdio.h> before it. */
+void displayRoundConstants(FILE *f); /* prints the round-constant table to f */
+void displayRhoOffsets(FILE *f); /* prints the rotation-offset table to f */
+
+#endif
diff --git a/c_src/KeccakF-1600-reference32BI.c b/c_src/KeccakF-1600-reference32BI.c
new file mode 100755
index 0000000..1ec4c23
--- /dev/null
+++ b/c_src/KeccakF-1600-reference32BI.c
@@ -0,0 +1,371 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include "brg_endian.h"
+#include "displayIntermediateValues.h"
+#include "KeccakNISTInterface.h"
+#include "KeccakF-1600-interface.h"
+
+typedef unsigned char UINT8;
+typedef unsigned int UINT32; /* assumes unsigned int is 32 bits wide */
+
+#define nrRounds 24
+UINT32 KeccakRoundConstants[nrRounds][2]; /* [round][0] and [round][1]: the two halves written by toBitInterleaving() in KeccakInitializeRoundConstants() */
+#define nrLanes 25
+unsigned int KeccakRhoOffsets[nrLanes]; /* filled by KeccakInitializeRhoOffsets(), indexed 5*y+x */
+
+void KeccakPermutationOnWords(UINT32 *state); /* forward declarations: these are used before their definitions below */
+void theta(UINT32 *A);
+void rho(UINT32 *A);
+void pi(UINT32 *A);
+void chi(UINT32 *A);
+void iota(UINT32 *A, unsigned int indexRound);
+
+void toBitInterleaving(UINT32 low, UINT32 high, UINT32 *even, UINT32 *odd)
+{
+    /*
+     * Splits the 64-bit value high:low into its even-numbered bits (packed
+     * into *even) and its odd-numbered bits (packed into *odd).
+     */
+    unsigned int pos;
+
+    *even = 0;
+    *odd = 0;
+    for(pos=0; pos<64; pos++) {
+        const unsigned int bit = ((pos < 32) ? (low >> pos) : (high >> (pos-32))) & 1;
+        if ((pos & 1) != 0)
+            *odd |= bit << (pos/2); /* (pos-1)/2 equals pos/2 for odd pos */
+        else
+            *even |= bit << (pos/2);
+    }
+}
+
+void fromBitInterleaving(UINT32 even, UINT32 odd, UINT32 *low, UINT32 *high)
+{
+    /*
+     * Rebuilds the 64-bit value *high:*low from its two halves: bit k of
+     * even becomes bit 2k, bit k of odd becomes bit 2k+1.
+     */
+    unsigned int pos;
+
+    *low = 0;
+    *high = 0;
+    for(pos=0; pos<64; pos++) {
+        const unsigned int bit = ((((pos & 1) == 0) ? even : odd) >> (pos/2)) & 1;
+        if (pos >= 32)
+            *high |= bit << (pos-32);
+        else
+            *low |= bit << pos;
+    }
+}
+
+void fromBytesToWords(UINT32 *stateAsWords, const unsigned char *state)
+{
+    /*
+     * Every 8-byte lane is read least significant byte first as two 32-bit
+     * halves, which are then stored bit-interleaved in words 2*lane and 2*lane+1.
+     */
+    unsigned int lane, k;
+
+    for(lane=0; lane<(KeccakPermutationSize/64); lane++) {
+        const unsigned char *src = state + lane*(64/8);
+        UINT32 low = 0, high = 0;
+        for(k=0; k<(32/8); k++) {
+            low |= (UINT32)(src[k]) << (8*k);
+            high |= (UINT32)(src[k+(32/8)]) << (8*k);
+        }
+        toBitInterleaving(low, high, &stateAsWords[2*lane+0], &stateAsWords[2*lane+1]);
+    }
+}
+
+void fromWordsToBytes(unsigned char *state, const UINT32 *stateAsWords)
+{
+    unsigned int lane, k;
+    UINT32 low, high;
+    for(lane=0; lane<(KeccakPermutationSize/64); lane++) { /* undo the interleaving, then write 8 bytes, least significant first */
+        unsigned char *dst = state + lane*(64/8);
+        fromBitInterleaving(stateAsWords[2*lane+0], stateAsWords[2*lane+1], &low, &high);
+        for(k=0; k<(32/8); k++) {
+            dst[k] = (low >> (8*k)) & 0xFF;
+            dst[k+(32/8)] = (high >> (8*k)) & 0xFF;
+        }
+    }
+}
+
+void KeccakPermutation(unsigned char *state) /* applies the permutation to a state given as bytes, in place */
+{
+    UINT32 stateAsWords[KeccakPermutationSize/32]; /* two 32-bit words per 64-bit lane */
+    /* Bytes are always converted to the interleaved word form and back; the display calls report the state before and after. */
+    displayStateAsBytes(1, "Input of permutation", state);
+    fromBytesToWords(stateAsWords, state);
+    KeccakPermutationOnWords(stateAsWords);
+    fromWordsToBytes(state, stateAsWords);
+    displayStateAsBytes(1, "State after permutation", state);
+}
+
+void KeccakPermutationAfterXor(unsigned char *state, const unsigned char *data, unsigned int dataLengthInBytes)
+{
+    unsigned int pos;
+    /* XOR the data into the leading bytes of the state, then permute. */
+    for(pos=0; pos!=dataLengthInBytes; pos++)
+        state[pos] ^= data[pos];
+    KeccakPermutation(state);
+}
+
+void KeccakPermutationOnWords(UINT32 *state)
+{
+    unsigned int roundIndex = 0;
+    /* Applies nrRounds rounds to the interleaved words in place; the display* calls report each intermediate state. */
+    displayStateAs32bitWords(3, "Same, with lanes as pairs of 32-bit words (bit interleaving)", state);
+    /* Step order inside one round: theta, rho, pi, chi, iota. */
+    while(roundIndex < nrRounds) {
+        displayRoundNumber(3, roundIndex);
+        theta(state);
+        displayStateAs32bitWords(3, "After theta", state);
+
+        rho(state);
+        displayStateAs32bitWords(3, "After rho", state);
+
+        pi(state);
+        displayStateAs32bitWords(3, "After pi", state);
+
+        chi(state);
+        displayStateAs32bitWords(3, "After chi", state);
+
+        iota(state, roundIndex);
+        displayStateAs32bitWords(3, "After iota", state);
+        roundIndex++;
+    }
+}
+
+#define index(x, y,z) ((((x)%5)+5*((y)%5))*2 + (z)) /* lane (x, y), half z (0 or 1) -> position in the 50-word array */
+#define ROL32(a, offset) (((offset) != 0) ? ((((UINT32)(a)) << (offset)) ^ (((UINT32)(a)) >> (32-(offset)))) : (a)) /* rotate left; offset 0 is special-cased to avoid a 32-bit shift; parameters parenthesized so expression arguments expand correctly */
+
+void ROL64(UINT32 inEven, UINT32 inOdd, UINT32 *outEven, UINT32 *outOdd, unsigned int offset)
+{
+    const unsigned int half = offset/2, halfUp = (offset+1)/2; /* for odd offsets: (offset-1)/2 and (offset+1)/2 */
+    if ((offset & 1) != 0) { /* odd rotation: the two halves swap roles */
+        *outEven = ROL32(inOdd, halfUp);
+        *outOdd = ROL32(inEven, half);
+    } else { /* even rotation: both halves rotate by offset/2 */
+        *outEven = ROL32(inEven, half);
+        *outOdd = ROL32(inOdd, half);
+    }
+}
+
+void theta(UINT32 *A)
+{
+    unsigned int col, row, half;
+    UINT32 parity[5][2], delta[5][2];
+    /* Column parities first, then one mask per column, then XOR that mask into every lane of the column. */
+    for(col=0; col<5; col++)
+        for(half=0; half<2; half++) {
+            parity[col][half] = A[index(col, 0, half)];
+            for(row=1; row<5; row++)
+                parity[col][half] ^= A[index(col, row, half)];
+        }
+    for(col=0; col<5; col++) {
+        const unsigned int next = (col+1)%5, prev = (col+4)%5;
+        ROL64(parity[next][0], parity[next][1], &(delta[col][0]), &(delta[col][1]), 1);
+        delta[col][0] ^= parity[prev][0];
+        delta[col][1] ^= parity[prev][1];
+    }
+    for(row=0; row<5; row++)
+        for(col=0; col<5; col++)
+            for(half=0; half<2; half++)
+                A[index(col, row, half)] ^= delta[col][half];
+}
+
+void rho(UINT32 *A)
+{
+    unsigned int lane;
+    /* For x and y below 5, index(x, y, z) is 2*(x+5*y)+z, so lane 5*y+x owns words 2*lane and 2*lane+1. */
+    for(lane=0; lane<nrLanes; lane++)
+        ROL64(A[2*lane], A[2*lane+1], &(A[2*lane]), &(A[2*lane+1]), KeccakRhoOffsets[lane]);
+}
+
+void pi(UINT32 *A)
+{
+    unsigned int x, y, z;
+    UINT32 snapshot[50];
+    memcpy(snapshot, A, sizeof(snapshot)); /* read from a copy so moved lanes are not picked up again */
+    for(y=0; y<5; y++)
+        for(x=0; x<5; x++)
+            for(z=0; z<2; z++)
+                A[index(y, 2*x+3*y, z)] = snapshot[index(x, y, z)];
+}
+
+void chi(UINT32 *A)
+{
+    unsigned int x, y, z;
+    UINT32 row[5][2];
+    /* Every new word is computed from the old values of its row, so the row is copied before it is overwritten. */
+    for(y=0; y<5; y++) {
+        for(x=0; x<5; x++)
+            for(z=0; z<2; z++)
+                row[x][z] = A[index(x, y, z)];
+        for(x=0; x<5; x++)
+            for(z=0; z<2; z++)
+                A[index(x, y, z)] = row[x][z] ^ ((~row[(x+1)%5][z]) & row[(x+2)%5][z]);
+    }
+}
+
+void iota(UINT32 *A, unsigned int indexRound) /* XORs the constant of round indexRound into lane (0, 0) */
+{
+    A[index(0, 0, 0)] ^= KeccakRoundConstants[indexRound][0]; /* first half of the interleaved constant */
+    A[index(0, 0, 1)] ^= KeccakRoundConstants[indexRound][1]; /* second half; indexRound is not range-checked */
+}
+
+int LFSR86540(UINT8 *LFSR)
+{
+    const UINT8 before = *LFSR;
+    UINT8 after = (UINT8)(before << 1);
+    /* Advance the register one step and return the bit that was in position 0 before the step. */
+    if ((before & 0x80) != 0)
+        after ^= 0x71; // Primitive polynomial over GF(2): x^8+x^6+x^5+x^4+1
+    *LFSR = after;
+    return (before & 0x01) != 0;
+}
+
+void KeccakInitializeRoundConstants()
+{
+    UINT8 lfsr = 0x01;
+    unsigned int roundIndex, j;
+    /*
+     * For every round, bits 2^j-1 (j = 0..6) of a 64-bit constant are set
+     * from seven successive LFSR outputs. The constant is kept as two 32-bit
+     * halves and stored bit-interleaved.
+     */
+    for(roundIndex=0; roundIndex<nrRounds; roundIndex++) {
+        UINT32 halves[2] = {0, 0}; /* [0] holds bits 0..31, [1] holds bits 32..63 */
+        for(j=0; j<7; j++) {
+            const unsigned int bitPosition = (1<<j)-1; //2^j-1
+            if (!LFSR86540(&lfsr))
+                continue;
+            halves[bitPosition/32] ^= (UINT32)1 << (bitPosition%32);
+        }
+        toBitInterleaving(halves[0], halves[1], &(KeccakRoundConstants[roundIndex][0]), &(KeccakRoundConstants[roundIndex][1]));
+    }
+}
+
+void KeccakInitializeRhoOffsets()
+{
+    unsigned int x = 1, y = 0, t;
+    /*
+     * Lane (0, 0) gets offset 0. Starting at (1, 0), each of the 24 steps
+     * stores (t+1)(t+2)/2 mod 64 and then moves (x, y) to (y, (2x+3y) mod 5).
+     */
+    KeccakRhoOffsets[0] = 0;
+    for(t=0; t<24; t++) {
+        const unsigned int nextY = (2*x+3*y) % 5;
+        KeccakRhoOffsets[5*y+x] = ((t+1)*(t+2)/2) % 64;
+        x = y; /* y is always below 5 here, so no reduction is needed */
+        y = nextY;
+    }
+}
+
+void KeccakInitialize() /* fills the two global tables; rho and iota read them, so call this before permuting */
+{
+    KeccakInitializeRoundConstants();
+    KeccakInitializeRhoOffsets();
+}
+
+void displayRoundConstants(FILE *f)
+{
+    unsigned int i;
+    /* Prints both stored halves of every round constant to f. i is unsigned, so it is printed with %u. */
+    for(i=0; i<nrRounds; i++) {
+        fprintf(f, "RC[%02u][0][0] = ", i);
+        fprintf(f, "%08X:%08X", (unsigned int)(KeccakRoundConstants[i][0]), (unsigned int)(KeccakRoundConstants[i][1]));
+        fprintf(f, "\n");
+    }
+    fprintf(f, "\n");
+}
+
+void displayRhoOffsets(FILE *f)
+{
+    unsigned int x, y;
+    /* Prints the 25 rotation offsets to f, row by row. All printed values are unsigned, so they use %u. */
+    for(y=0; y<5; y++) for(x=0; x<5; x++) {
+        fprintf(f, "RhoOffset[%u][%u] = ", x, y);
+        fprintf(f, "%2u", KeccakRhoOffsets[5*y+x]);
+        fprintf(f, "\n");
+    }
+    fprintf(f, "\n");
+}
+
+void KeccakInitializeState(unsigned char *state) /* zeroes the whole state */
+{
+    memset(state, 0, KeccakPermutationSizeInBytes); /* size constant is defined outside this file */
+}
+
+#ifdef ProvideFast576
+void KeccakAbsorb576bits(unsigned char *state, const unsigned char *data) /* XOR 576 bits of data into state, then permute */
+{
+    KeccakPermutationAfterXor(state, data, 72); /* 576 bits = 72 bytes */
+}
+#endif
+
+#ifdef ProvideFast832
+void KeccakAbsorb832bits(unsigned char *state, const unsigned char *data) /* XOR 832 bits of data into state, then permute */
+{
+    KeccakPermutationAfterXor(state, data, 104); /* 832 bits = 104 bytes */
+}
+#endif
+
+#ifdef ProvideFast1024
+void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data) /* XOR 1024 bits of data into state, then permute */
+{
+    KeccakPermutationAfterXor(state, data, 128); /* 1024 bits = 128 bytes */
+}
+#endif
+
+#ifdef ProvideFast1088
+void KeccakAbsorb1088bits(unsigned char *state, const unsigned char *data) /* XOR 1088 bits of data into state, then permute */
+{
+    KeccakPermutationAfterXor(state, data, 136); /* 1088 bits = 136 bytes */
+}
+#endif
+
+#ifdef ProvideFast1152
+void KeccakAbsorb1152bits(unsigned char *state, const unsigned char *data) /* XOR 1152 bits of data into state, then permute */
+{
+    KeccakPermutationAfterXor(state, data, 144); /* 1152 bits = 144 bytes */
+}
+#endif
+
+#ifdef ProvideFast1344
+void KeccakAbsorb1344bits(unsigned char *state, const unsigned char *data) /* XOR 1344 bits of data into state, then permute */
+{
+    KeccakPermutationAfterXor(state, data, 168); /* 1344 bits = 168 bytes */
+}
+#endif
+
+void KeccakAbsorb(unsigned char *state, const unsigned char *data, unsigned int laneCount) /* general form: laneCount lanes of 8 bytes each */
+{
+    KeccakPermutationAfterXor(state, data, laneCount*8);
+}
+
+#ifdef ProvideFast1024
+void KeccakExtract1024bits(const unsigned char *state, unsigned char *data) /* copies the first 1024 bits of state to data */
+{
+    memcpy(data, state, 128); /* 1024 bits = 128 bytes */
+}
+#endif
+
+void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount) /* copies the first laneCount lanes of state to data */
+{
+    memcpy(data, state, laneCount*8);
+}
diff --git a/c_src/KeccakF-1600-simd128.macros b/c_src/KeccakF-1600-simd128.macros
new file mode 100755
index 0000000..6301622
--- /dev/null
+++ b/c_src/KeccakF-1600-simd128.macros
@@ -0,0 +1,651 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#define declareABCDE /* Declares every local variable used by the round and state-copy macros of this file. */ \
+    V6464 Abage, Abegi, Abigo, Abogu, Abuga; /* A*: lane pairs, accessed through .v64[0], .v64[1] and .v128 */ \
+    V6464 Akame, Akemi, Akimo, Akomu, Akuma; \
+    V6464 Abae, Abio, Agae, Agio, Akae, Akio, Amae, Amio, Asae, Asio; /* staging pairs filled by the copyFromState* macros */ \
+    V64 Aba, Abe, Abi, Abo, Abu; /* A*: the 25 individual 64-bit lanes */ \
+    V64 Aga, Age, Agi, Ago, Agu; \
+    V64 Aka, Ake, Aki, Ako, Aku; \
+    V64 Ama, Ame, Ami, Amo, Amu; \
+    V64 Asa, Ase, Asi, Aso, Asu; \
+    V128 Bbage, Bbegi, Bbigo, Bbogu, Bbuga; /* B*: rotated lanes packed in pairs for the 128-bit XOR/ANDnu step */ \
+    V128 Bkame, Bkemi, Bkimo, Bkomu, Bkuma; \
+    V64 Bba, Bbe, Bbi, Bbo, Bbu; /* B*: rotated lanes, one per A lane */ \
+    V64 Bga, Bge, Bgi, Bgo, Bgu; \
+    V64 Bka, Bke, Bki, Bko, Bku; \
+    V64 Bma, Bme, Bmi, Bmo, Bmu; \
+    V64 Bsa, Bse, Bsi, Bso, Bsu; \
+    V128 Cae, Cei, Cio, Cou, Cua, Dei, Dou; /* C*: running XOR of lanes sharing a final letter; D*: computed from C* by computeD */ \
+    V64 Ca, Ce, Ci, Co, Cu; \
+    V64 Da, De, Di, Do, Du; /* D<letter> is XORed into every lane whose name ends in that letter */ \
+    V6464 Ebage, Ebegi, Ebigo, Ebogu, Ebuga; /* E*: second variable set, laid out like A* */ \
+    V6464 Ekame, Ekemi, Ekimo, Ekomu, Ekuma; \
+    V64 Eba, Ebe, Ebi, Ebo, Ebu; \
+    V64 Ega, Ege, Egi, Ego, Egu; \
+    V64 Eka, Eke, Eki, Eko, Eku; \
+    V64 Ema, Eme, Emi, Emo, Emu; \
+    V64 Esa, Ese, Esi, Eso, Esu; \
+    V128 Zero; /* assigned ZERO128() inside the round macro */
+
+#define prepareTheta /* expands to nothing in this variant: the copyFromState* macros of this file already accumulate Cae, Cio and Cu */
+
+#define computeD /* Derive Da, De, Di, Do, Du from the column accumulators Cae, Cio and Cu. */ \
+    Cua = GET64LOLO(Cu, Cae); /* pair built from Cu and Cae, needed to form Dou */ \
+    Dei = XOR128(Cae, ROL64in128(Cio, 1)); /* De and Di computed together in one 128-bit value */ \
+    Dou = XOR128(Cio, ROL64in128(Cua, 1)); /* Do and Du computed together in one 128-bit value */ \
+    Da = XOR64(Cu, ROL64in128(COPY64HI2LO(Cae), 1)); \
+    De = Dei; \
+    Di = COPY64HI2LO(Dei); /* Di is taken from the other half of Dei */ \
+    Do = Dou; \
+    Du = COPY64HI2LO(Dou); /* Du is taken from the other half of Dou */
+
+// --- Theta Rho Pi Chi Iota Prepare-theta
+// --- 64-bit lanes mapped to 64-bit and 128-bit words
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) /* One round: reads the A##-prefixed variables, writes the E##-prefixed ones; i selects KeccakF1600RoundConstants[i]. */ \
+    computeD \
+    /* First group: XOR the D values into ten A lanes, rotate them into Bbage..Bbuga, then combine into E##bage..E##buga. */ \
+    A##ba = LOAD64(A##bage.v64[0]); /* ba is unpacked from the bage pair */ \
+    XOReq64(A##ba, Da); \
+    Bba = A##ba; \
+    XOReq64(A##gu, Du); \
+    Bge = ROL64(A##gu, 20); \
+    Bbage = GET64LOLO(Bba, Bge); \
+    A##ge = LOAD64(A##bage.v64[1]); /* ge is unpacked from the bage pair */ \
+    XOReq64(A##ge, De); \
+    Bbe = ROL64(A##ge, 44); \
+    A##ka = LOAD64(A##kame.v64[0]); /* ka is unpacked from the kame pair */ \
+    XOReq64(A##ka, Da); \
+    Bgi = ROL64(A##ka, 3); \
+    Bbegi = GET64LOLO(Bbe, Bgi); \
+    XOReq64(A##ki, Di); \
+    Bbi = ROL64(A##ki, 43); \
+    A##me = LOAD64(A##kame.v64[1]); /* me is unpacked from the kame pair */ \
+    XOReq64(A##me, De); \
+    Bgo = ROL64(A##me, 45); \
+    Bbigo = GET64LOLO(Bbi, Bgo); \
+    E##bage.v128 = XOR128(Bbage, ANDnu128(Bbegi, Bbigo)); \
+    XOReq128(E##bage.v128, CONST64(KeccakF1600RoundConstants[i])); /* the round constant is XORed in here */ \
+    Cae = E##bage.v128; /* start accumulating the C values over the new E lanes, for the next computeD */ \
+    XOReq64(A##mo, Do); \
+    Bbo = ROL64(A##mo, 21); \
+    XOReq64(A##si, Di); \
+    Bgu = ROL64(A##si, 61); \
+    Bbogu = GET64LOLO(Bbo, Bgu); \
+    E##begi.v128 = XOR128(Bbegi, ANDnu128(Bbigo, Bbogu)); \
+    Cei = E##begi.v128; \
+    XOReq64(A##su, Du); \
+    Bbu = ROL64(A##su, 14); \
+    XOReq64(A##bo, Do); \
+    Bga = ROL64(A##bo, 28); \
+    Bbuga = GET64LOLO(Bbu, Bga); \
+    E##bigo.v128 = XOR128(Bbigo, ANDnu128(Bbogu, Bbuga)); \
+    E##bi = E##bigo.v128; /* bi and go are also kept as single lanes */ \
+    E##go = GET64HIHI(E##bigo.v128, E##bigo.v128); \
+    Cio = E##bigo.v128; \
+    E##bogu.v128 = XOR128(Bbogu, ANDnu128(Bbuga, Bbage)); \
+    E##bo = E##bogu.v128; \
+    E##gu = GET64HIHI(E##bogu.v128, E##bogu.v128); \
+    Cou = E##bogu.v128; \
+    E##buga.v128 = XOR128(Bbuga, ANDnu128(Bbage, Bbegi)); \
+    E##bu = E##buga.v128; \
+    E##ga = GET64HIHI(E##buga.v128, E##buga.v128); \
+    Cua = E##buga.v128; \
+    /* Second group: same pattern, producing Bkame..Bkuma and E##kame..E##kuma. */ \
+    A##be = LOAD64(A##begi.v64[0]); /* be is unpacked from the begi pair */ \
+    XOReq64(A##be, De); \
+    Bka = ROL64(A##be, 1); \
+    XOReq64(A##ga, Da); \
+    Bme = ROL64(A##ga, 36); \
+    Bkame = GET64LOLO(Bka, Bme); \
+    A##gi = LOAD64(A##begi.v64[1]); /* gi is unpacked from the begi pair */ \
+    XOReq64(A##gi, Di); \
+    Bke = ROL64(A##gi, 6); \
+    A##ke = LOAD64(A##kemi.v64[0]); /* ke is unpacked from the kemi pair */ \
+    XOReq64(A##ke, De); \
+    Bmi = ROL64(A##ke, 10); \
+    Bkemi = GET64LOLO(Bke, Bmi); \
+    XOReq64(A##ko, Do); \
+    Bki = ROL64(A##ko, 25); \
+    A##mi = LOAD64(A##kemi.v64[1]); /* mi is unpacked from the kemi pair */ \
+    XOReq64(A##mi, Di); \
+    Bmo = ROL64(A##mi, 15); \
+    Bkimo = GET64LOLO(Bki, Bmo); \
+    E##kame.v128 = XOR128(Bkame, ANDnu128(Bkemi, Bkimo)); \
+    XOReq128(Cae, E##kame.v128); \
+    Bkomu = GET64LOLO(XOR64(A##mu, Du), XOR64(A##so, Do)); /* mu and so are handled as a pair (A##mu and A##so are not updated) */ \
+    Bkomu = SHUFFLEBYTES128(Bkomu, CONST128(rho8_56)); /* presumably both rotations done by one byte shuffle; rho8_56 is defined elsewhere */ \
+    E##kemi.v128 = XOR128(Bkemi, ANDnu128(Bkimo, Bkomu)); \
+    XOReq128(Cei, E##kemi.v128); \
+    XOReq64(A##sa, Da); \
+    Bku = ROL64(A##sa, 18); \
+    XOReq64(A##bu, Du); \
+    Bma = ROL64(A##bu, 27); \
+    Bkuma = GET64LOLO(Bku, Bma); \
+    E##kimo.v128 = XOR128(Bkimo, ANDnu128(Bkomu, Bkuma)); \
+    E##ki = E##kimo.v128; \
+    E##mo = GET64HIHI(E##kimo.v128, E##kimo.v128); \
+    XOReq128(Cio, E##kimo.v128); \
+    E##komu.v128 = XOR128(Bkomu, ANDnu128(Bkuma, Bkame)); \
+    E##ko = E##komu.v128; \
+    E##mu = GET64HIHI(E##komu.v128, E##komu.v128); \
+    XOReq128(Cou, E##komu.v128); \
+    E##kuma.v128 = XOR128(Bkuma, ANDnu128(Bkame, Bkemi)); \
+    E##ku = E##kuma.v128; \
+    E##ma = GET64HIHI(E##kuma.v128, E##kuma.v128); \
+    XOReq128(Cua, E##kuma.v128); \
+    /* Third group: the five s-lanes, processed one 64-bit lane at a time. */ \
+    XOReq64(A##bi, Di); \
+    Bsa = ROL64(A##bi, 62); \
+    XOReq64(A##go, Do); \
+    Bse = ROL64(A##go, 55); \
+    XOReq64(A##ku, Du); \
+    Bsi = ROL64(A##ku, 39); \
+    E##sa = XOR64(Bsa, ANDnu64(Bse, Bsi)); \
+    Ca = E##sa; \
+    XOReq64(A##ma, Da); \
+    Bso = ROL64(A##ma, 41); \
+    E##se = XOR64(Bse, ANDnu64(Bsi, Bso)); \
+    Ce = E##se; \
+    XOReq128(Cae, GET64LOLO(Ca, Ce)); \
+    XOReq64(A##se, De); \
+    Bsu = ROL64(A##se, 2); \
+    E##si = XOR64(Bsi, ANDnu64(Bso, Bsu)); \
+    Ci = E##si; \
+    E##so = XOR64(Bso, ANDnu64(Bsu, Bsa)); \
+    Co = E##so; \
+    XOReq128(Cio, GET64LOLO(Ci, Co)); \
+    E##su = XOR64(Bsu, ANDnu64(Bsa, Bse)); \
+    Cu = E##su; \
+    /* Finally fold the mixed accumulators Cei, Cou and Cua into Cae, Cio and Cu, the three values computeD reads. */ \
+    Zero = ZERO128(); \
+    XOReq128(Cae, GET64HIHI(Cua, Zero)); \
+    XOReq128(Cae, GET64LOLO(Zero, Cei)); \
+    XOReq128(Cio, GET64HIHI(Cei, Zero)); \
+    XOReq128(Cio, GET64LOLO(Zero, Cou)); \
+    XOReq128(Cua, GET64HIHI(Cou, Zero)); \
+    XOReq64(Cu, Cua); \
+
+// --- Theta Rho Pi Chi Iota
+// --- 64-bit lanes mapped to 64-bit and 128-bit words
+#define thetaRhoPiChiIota(i, A, E) thetaRhoPiChiIotaPrepareTheta(i, A, E) /* plain alias: this variant has no separate form without prepare-theta */
+
+const UINT64 KeccakF1600RoundConstants[24] = { /* 24 constants; the round macro XORs entry i into E##bage via CONST64() */
+    0x0000000000000001ULL,
+    0x0000000000008082ULL,
+    0x800000000000808aULL,
+    0x8000000080008000ULL,
+    0x000000000000808bULL,
+    0x0000000080000001ULL,
+    0x8000000080008081ULL,
+    0x8000000000008009ULL,
+    0x000000000000008aULL,
+    0x0000000000000088ULL,
+    0x0000000080008009ULL,
+    0x000000008000000aULL,
+    0x000000008000808bULL,
+    0x800000000000008bULL,
+    0x8000000000008089ULL,
+    0x8000000000008003ULL,
+    0x8000000000008002ULL,
+    0x8000000000000080ULL,
+    0x000000000000800aULL,
+    0x800000008000000aULL,
+    0x8000000080008081ULL,
+    0x8000000000008080ULL,
+    0x0000000080000001ULL,
+    0x8000000080008008ULL };
+
+#define copyFromStateAndXor576bits(X, state, input) /* Load state into the X##-prefixed variables, XORing in input lanes 0..8 (576 bits), and accumulate Cae, Cio, Cu. */ \
+    X##bae.v128 = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae.v128; \
+    X##be = GET64HIHI(X##bae.v128, X##bae.v128); \
+    Cae = X##bae.v128; /* Cae, Cio and Cu start from the b-lanes; later lanes are XORed in */ \
+    X##bio.v128 = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio.v128; \
+    X##bo = GET64HIHI(X##bio.v128, X##bio.v128); \
+    Cio = X##bio.v128; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cu = X##bu; \
+    X##gae.v128 = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae.v128; \
+    X##ge = GET64HIHI(X##gae.v128, X##gae.v128); \
+    X##bage.v128 = GET64LOLO(X##ba, X##ge); /* ba and ge are packed into the pair the round macro reads */ \
+    XOReq128(Cae, X##gae.v128); \
+    X##gio.v128 = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); /* last input lanes used (7 and 8, assuming a two-lane load) */ \
+    X##gi = X##gio.v128; \
+    X##begi.v128 = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio.v128, X##gio.v128); \
+    XOReq128(Cio, X##gio.v128); \
+    X##gu = LOAD64(state[ 9]); /* from here on lanes come from state alone */ \
+    XOReq64(Cu, X##gu); \
+    X##kae.v128 = LOAD128(state[10]); \
+    X##ka = X##kae.v128; \
+    X##ke = GET64HIHI(X##kae.v128, X##kae.v128); \
+    XOReq128(Cae, X##kae.v128); \
+    X##kio.v128 = LOAD128(state[12]); \
+    X##ki = X##kio.v128; \
+    X##ko = GET64HIHI(X##kio.v128, X##kio.v128); \
+    XOReq128(Cio, X##kio.v128); \
+    X##ku = LOAD64(state[14]); \
+    XOReq64(Cu, X##ku); \
+    X##mae.v128 = LOAD128u(state[15]); \
+    X##ma = X##mae.v128; \
+    X##me = GET64HIHI(X##mae.v128, X##mae.v128); \
+    X##kame.v128 = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, X##mae.v128); \
+    X##mio.v128 = LOAD128u(state[17]); \
+    X##mi = X##mio.v128; \
+    X##kemi.v128 = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio.v128, X##mio.v128); \
+    XOReq128(Cio, X##mio.v128); \
+    X##mu = LOAD64(state[19]); \
+    XOReq64(Cu, X##mu); \
+    X##sae.v128 = LOAD128(state[20]); \
+    X##sa = X##sae.v128; \
+    X##se = GET64HIHI(X##sae.v128, X##sae.v128); \
+    XOReq128(Cae, X##sae.v128); \
+    X##sio.v128 = LOAD128(state[22]); \
+    X##si = X##sio.v128; \
+    X##so = GET64HIHI(X##sio.v128, X##sio.v128); \
+    XOReq128(Cio, X##sio.v128); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cu, X##su); \
+
+#define copyFromStateAndXor832bits(X, state, input) /* Load state into the X##-prefixed variables, XORing in input lanes 0..12 (832 bits), and accumulate Cae, Cio, Cu. */ \
+    X##bae.v128 = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae.v128; \
+    X##be = GET64HIHI(X##bae.v128, X##bae.v128); \
+    Cae = X##bae.v128; /* Cae, Cio and Cu start from the b-lanes; later lanes are XORed in */ \
+    X##bio.v128 = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio.v128; \
+    X##bo = GET64HIHI(X##bio.v128, X##bio.v128); \
+    Cio = X##bio.v128; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cu = X##bu; \
+    X##gae.v128 = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae.v128; \
+    X##ge = GET64HIHI(X##gae.v128, X##gae.v128); \
+    X##bage.v128 = GET64LOLO(X##ba, X##ge); /* ba and ge are packed into the pair the round macro reads */ \
+    XOReq128(Cae, X##gae.v128); \
+    X##gio.v128 = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio.v128; \
+    X##begi.v128 = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio.v128, X##gio.v128); \
+    XOReq128(Cio, X##gio.v128); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    XOReq64(Cu, X##gu); \
+    X##kae.v128 = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae.v128; \
+    X##ke = GET64HIHI(X##kae.v128, X##kae.v128); \
+    XOReq128(Cae, X##kae.v128); \
+    X##kio.v128 = XOR128(LOAD128(state[12]), LOAD64(input[12])); /* input contributes lane 12 only (LOAD64): the last lane of the block */ \
+    X##ki = X##kio.v128; \
+    X##ko = GET64HIHI(X##kio.v128, X##kio.v128); \
+    XOReq128(Cio, X##kio.v128); \
+    X##ku = LOAD64(state[14]); /* from here on lanes come from state alone */ \
+    XOReq64(Cu, X##ku); \
+    X##mae.v128 = LOAD128u(state[15]); \
+    X##ma = X##mae.v128; \
+    X##me = GET64HIHI(X##mae.v128, X##mae.v128); \
+    X##kame.v128 = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, X##mae.v128); \
+    X##mio.v128 = LOAD128u(state[17]); \
+    X##mi = X##mio.v128; \
+    X##kemi.v128 = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio.v128, X##mio.v128); \
+    XOReq128(Cio, X##mio.v128); \
+    X##mu = LOAD64(state[19]); \
+    XOReq64(Cu, X##mu); \
+    X##sae.v128 = LOAD128(state[20]); \
+    X##sa = X##sae.v128; \
+    X##se = GET64HIHI(X##sae.v128, X##sae.v128); \
+    XOReq128(Cae, X##sae.v128); \
+    X##sio.v128 = LOAD128(state[22]); \
+    X##si = X##sio.v128; \
+    X##so = GET64HIHI(X##sio.v128, X##sio.v128); \
+    XOReq128(Cio, X##sio.v128); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cu, X##su); \
+
+#define copyFromStateAndXor1024bits(X, state, input) /* Load state into the X##-prefixed variables, XORing in input lanes 0..15 (1024 bits), and accumulate Cae, Cio, Cu. */ \
+    X##bae.v128 = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae.v128; \
+    X##be = GET64HIHI(X##bae.v128, X##bae.v128); \
+    Cae = X##bae.v128; /* Cae, Cio and Cu start from the b-lanes; later lanes are XORed in */ \
+    X##bio.v128 = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio.v128; \
+    X##bo = GET64HIHI(X##bio.v128, X##bio.v128); \
+    Cio = X##bio.v128; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cu = X##bu; \
+    X##gae.v128 = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae.v128; \
+    X##ge = GET64HIHI(X##gae.v128, X##gae.v128); \
+    X##bage.v128 = GET64LOLO(X##ba, X##ge); /* ba and ge are packed into the pair the round macro reads */ \
+    XOReq128(Cae, X##gae.v128); \
+    X##gio.v128 = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio.v128; \
+    X##begi.v128 = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio.v128, X##gio.v128); \
+    XOReq128(Cio, X##gio.v128); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    XOReq64(Cu, X##gu); \
+    X##kae.v128 = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae.v128; \
+    X##ke = GET64HIHI(X##kae.v128, X##kae.v128); \
+    XOReq128(Cae, X##kae.v128); \
+    X##kio.v128 = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
+    X##ki = X##kio.v128; \
+    X##ko = GET64HIHI(X##kio.v128, X##kio.v128); \
+    XOReq128(Cio, X##kio.v128); \
+    X##ku = XOR64(LOAD64(state[14]), LOAD64(input[14])); \
+    XOReq64(Cu, X##ku); \
+    X##mae.v128 = XOR128(LOAD128u(state[15]), LOAD64(input[15])); /* input contributes lane 15 only (LOAD64): the last lane of the block */ \
+    X##ma = X##mae.v128; \
+    X##me = GET64HIHI(X##mae.v128, X##mae.v128); \
+    X##kame.v128 = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, X##mae.v128); \
+    X##mio.v128 = LOAD128u(state[17]); /* from here on lanes come from state alone */ \
+    X##mi = X##mio.v128; \
+    X##kemi.v128 = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio.v128, X##mio.v128); \
+    XOReq128(Cio, X##mio.v128); \
+    X##mu = LOAD64(state[19]); \
+    XOReq64(Cu, X##mu); \
+    X##sae.v128 = LOAD128(state[20]); \
+    X##sa = X##sae.v128; \
+    X##se = GET64HIHI(X##sae.v128, X##sae.v128); \
+    XOReq128(Cae, X##sae.v128); \
+    X##sio.v128 = LOAD128(state[22]); \
+    X##si = X##sio.v128; \
+    X##so = GET64HIHI(X##sio.v128, X##sio.v128); \
+    XOReq128(Cio, X##sio.v128); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cu, X##su); \
+
+#define copyFromStateAndXor1088bits(X, state, input) /* Load state into the X##-prefixed variables, XORing in input lanes 0..16 (1088 bits), and accumulate Cae, Cio, Cu. */ \
+    X##bae.v128 = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae.v128; \
+    X##be = GET64HIHI(X##bae.v128, X##bae.v128); \
+    Cae = X##bae.v128; /* Cae, Cio and Cu start from the b-lanes; later lanes are XORed in */ \
+    X##bio.v128 = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio.v128; \
+    X##bo = GET64HIHI(X##bio.v128, X##bio.v128); \
+    Cio = X##bio.v128; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cu = X##bu; \
+    X##gae.v128 = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae.v128; \
+    X##ge = GET64HIHI(X##gae.v128, X##gae.v128); \
+    X##bage.v128 = GET64LOLO(X##ba, X##ge); /* ba and ge are packed into the pair the round macro reads */ \
+    XOReq128(Cae, X##gae.v128); \
+    X##gio.v128 = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio.v128; \
+    X##begi.v128 = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio.v128, X##gio.v128); \
+    XOReq128(Cio, X##gio.v128); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    XOReq64(Cu, X##gu); \
+    X##kae.v128 = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae.v128; \
+    X##ke = GET64HIHI(X##kae.v128, X##kae.v128); \
+    XOReq128(Cae, X##kae.v128); \
+    X##kio.v128 = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
+    X##ki = X##kio.v128; \
+    X##ko = GET64HIHI(X##kio.v128, X##kio.v128); \
+    XOReq128(Cio, X##kio.v128); \
+    X##ku = XOR64(LOAD64(state[14]), LOAD64(input[14])); \
+    XOReq64(Cu, X##ku); \
+    X##mae.v128 = XOR128(LOAD128u(state[15]), LOAD128u(input[15])); /* last input lanes used (15 and 16, assuming a two-lane load) */ \
+    X##ma = X##mae.v128; \
+    X##me = GET64HIHI(X##mae.v128, X##mae.v128); \
+    X##kame.v128 = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, X##mae.v128); \
+    X##mio.v128 = LOAD128u(state[17]); /* from here on lanes come from state alone */ \
+    X##mi = X##mio.v128; \
+    X##kemi.v128 = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio.v128, X##mio.v128); \
+    XOReq128(Cio, X##mio.v128); \
+    X##mu = LOAD64(state[19]); \
+    XOReq64(Cu, X##mu); \
+    X##sae.v128 = LOAD128(state[20]); \
+    X##sa = X##sae.v128; \
+    X##se = GET64HIHI(X##sae.v128, X##sae.v128); \
+    XOReq128(Cae, X##sae.v128); \
+    X##sio.v128 = LOAD128(state[22]); \
+    X##si = X##sio.v128; \
+    X##so = GET64HIHI(X##sio.v128, X##sio.v128); \
+    XOReq128(Cio, X##sio.v128); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cu, X##su); \
+
+#define copyFromStateAndXor1152bits(X, state, input) /* Load state into the X##-prefixed variables, XORing in input lanes 0..17 (1152 bits), and accumulate Cae, Cio, Cu. */ \
+    X##bae.v128 = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae.v128; \
+    X##be = GET64HIHI(X##bae.v128, X##bae.v128); \
+    Cae = X##bae.v128; /* Cae, Cio and Cu start from the b-lanes; later lanes are XORed in */ \
+    X##bio.v128 = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio.v128; \
+    X##bo = GET64HIHI(X##bio.v128, X##bio.v128); \
+    Cio = X##bio.v128; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cu = X##bu; \
+    X##gae.v128 = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae.v128; \
+    X##ge = GET64HIHI(X##gae.v128, X##gae.v128); \
+    X##bage.v128 = GET64LOLO(X##ba, X##ge); /* ba and ge are packed into the pair the round macro reads */ \
+    XOReq128(Cae, X##gae.v128); \
+    X##gio.v128 = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio.v128; \
+    X##begi.v128 = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio.v128, X##gio.v128); \
+    XOReq128(Cio, X##gio.v128); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    XOReq64(Cu, X##gu); \
+    X##kae.v128 = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae.v128; \
+    X##ke = GET64HIHI(X##kae.v128, X##kae.v128); \
+    XOReq128(Cae, X##kae.v128); \
+    X##kio.v128 = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
+    X##ki = X##kio.v128; \
+    X##ko = GET64HIHI(X##kio.v128, X##kio.v128); \
+    XOReq128(Cio, X##kio.v128); \
+    X##ku = XOR64(LOAD64(state[14]), LOAD64(input[14])); \
+    XOReq64(Cu, X##ku); \
+    X##mae.v128 = XOR128(LOAD128u(state[15]), LOAD128u(input[15])); \
+    X##ma = X##mae.v128; \
+    X##me = GET64HIHI(X##mae.v128, X##mae.v128); \
+    X##kame.v128 = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, X##mae.v128); \
+    X##mio.v128 = XOR128(LOAD128u(state[17]), LOAD64(input[17])); /* input contributes lane 17 only (LOAD64): the last lane of the block */ \
+    X##mi = X##mio.v128; \
+    X##kemi.v128 = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio.v128, X##mio.v128); \
+    XOReq128(Cio, X##mio.v128); \
+    X##mu = LOAD64(state[19]); /* from here on lanes come from state alone */ \
+    XOReq64(Cu, X##mu); \
+    X##sae.v128 = LOAD128(state[20]); \
+    X##sa = X##sae.v128; \
+    X##se = GET64HIHI(X##sae.v128, X##sae.v128); \
+    XOReq128(Cae, X##sae.v128); \
+    X##sio.v128 = LOAD128(state[22]); \
+    X##si = X##sio.v128; \
+    X##so = GET64HIHI(X##sio.v128, X##sio.v128); \
+    XOReq128(Cio, X##sio.v128); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cu, X##su); \
+
+#define copyFromStateAndXor1344bits(X, state, input) /* Load state into the X##-prefixed variables, XORing in input lanes 0..20 (1344 bits), and accumulate Cae, Cio, Cu. */ \
+    X##bae.v128 = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae.v128; \
+    X##be = GET64HIHI(X##bae.v128, X##bae.v128); \
+    Cae = X##bae.v128; /* Cae, Cio and Cu start from the b-lanes; later lanes are XORed in */ \
+    X##bio.v128 = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio.v128; \
+    X##bo = GET64HIHI(X##bio.v128, X##bio.v128); \
+    Cio = X##bio.v128; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cu = X##bu; \
+    X##gae.v128 = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae.v128; \
+    X##ge = GET64HIHI(X##gae.v128, X##gae.v128); \
+    X##bage.v128 = GET64LOLO(X##ba, X##ge); /* ba and ge are packed into the pair the round macro reads */ \
+    XOReq128(Cae, X##gae.v128); \
+    X##gio.v128 = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio.v128; \
+    X##begi.v128 = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio.v128, X##gio.v128); \
+    XOReq128(Cio, X##gio.v128); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    XOReq64(Cu, X##gu); \
+    X##kae.v128 = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae.v128; \
+    X##ke = GET64HIHI(X##kae.v128, X##kae.v128); \
+    XOReq128(Cae, X##kae.v128); \
+    X##kio.v128 = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
+    X##ki = X##kio.v128; \
+    X##ko = GET64HIHI(X##kio.v128, X##kio.v128); \
+    XOReq128(Cio, X##kio.v128); \
+    X##ku = XOR64(LOAD64(state[14]), LOAD64(input[14])); \
+    XOReq64(Cu, X##ku); \
+    X##mae.v128 = XOR128(LOAD128u(state[15]), LOAD128u(input[15])); \
+    X##ma = X##mae.v128; \
+    X##me = GET64HIHI(X##mae.v128, X##mae.v128); \
+    X##kame.v128 = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, X##mae.v128); \
+    X##mio.v128 = XOR128(LOAD128u(state[17]), LOAD128u(input[17])); \
+    X##mi = X##mio.v128; \
+    X##kemi.v128 = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio.v128, X##mio.v128); \
+    XOReq128(Cio, X##mio.v128); \
+    X##mu = XOR64(LOAD64(state[19]), LOAD64(input[19])); \
+    XOReq64(Cu, X##mu); \
+    X##sae.v128 = XOR128(LOAD128(state[20]), LOAD64(input[20])); /* input contributes lane 20 only (LOAD64): the last lane of the block */ \
+    X##sa = X##sae.v128; \
+    X##se = GET64HIHI(X##sae.v128, X##sae.v128); \
+    XOReq128(Cae, X##sae.v128); \
+    X##sio.v128 = LOAD128(state[22]); /* from here on lanes come from state alone */ \
+    X##si = X##sio.v128; \
+    X##so = GET64HIHI(X##sio.v128, X##sio.v128); \
+    XOReq128(Cio, X##sio.v128); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cu, X##su); \
+
+#define copyFromState(X, state) /* Load all 25 lanes of state into the X##-prefixed variables (no input XORed in) and accumulate Cae, Cio, Cu. */ \
+    X##bae.v128 = LOAD128(state[ 0]); \
+    X##ba = X##bae.v128; \
+    X##be = GET64HIHI(X##bae.v128, X##bae.v128); \
+    Cae = X##bae.v128; /* Cae, Cio and Cu start from the b-lanes; later lanes are XORed in */ \
+    X##bio.v128 = LOAD128(state[ 2]); \
+    X##bi = X##bio.v128; \
+    X##bo = GET64HIHI(X##bio.v128, X##bio.v128); \
+    Cio = X##bio.v128; \
+    X##bu = LOAD64(state[ 4]); \
+    Cu = X##bu; \
+    X##gae.v128 = LOAD128u(state[ 5]); \
+    X##ga = X##gae.v128; \
+    X##ge = GET64HIHI(X##gae.v128, X##gae.v128); \
+    X##bage.v128 = GET64LOLO(X##ba, X##ge); /* ba and ge are packed into the pair the round macro reads */ \
+    XOReq128(Cae, X##gae.v128); \
+    X##gio.v128 = LOAD128u(state[ 7]); \
+    X##gi = X##gio.v128; \
+    X##begi.v128 = GET64LOLO(X##be, X##gi); /* likewise be and gi */ \
+    X##go = GET64HIHI(X##gio.v128, X##gio.v128); \
+    XOReq128(Cio, X##gio.v128); \
+    X##gu = LOAD64(state[ 9]); \
+    XOReq64(Cu, X##gu); \
+    X##kae.v128 = LOAD128(state[10]); \
+    X##ka = X##kae.v128; \
+    X##ke = GET64HIHI(X##kae.v128, X##kae.v128); \
+    XOReq128(Cae, X##kae.v128); \
+    X##kio.v128 = LOAD128(state[12]); \
+    X##ki = X##kio.v128; \
+    X##ko = GET64HIHI(X##kio.v128, X##kio.v128); \
+    XOReq128(Cio, X##kio.v128); \
+    X##ku = LOAD64(state[14]); \
+    XOReq64(Cu, X##ku); \
+    X##mae.v128 = LOAD128u(state[15]); \
+    X##ma = X##mae.v128; \
+    X##me = GET64HIHI(X##mae.v128, X##mae.v128); \
+    X##kame.v128 = GET64LOLO(X##ka, X##me); /* likewise ka and me */ \
+    XOReq128(Cae, X##mae.v128); \
+    X##mio.v128 = LOAD128u(state[17]); \
+    X##mi = X##mio.v128; \
+    X##kemi.v128 = GET64LOLO(X##ke, X##mi); /* likewise ke and mi */ \
+    X##mo = GET64HIHI(X##mio.v128, X##mio.v128); \
+    XOReq128(Cio, X##mio.v128); \
+    X##mu = LOAD64(state[19]); \
+    XOReq64(Cu, X##mu); \
+    X##sae.v128 = LOAD128(state[20]); \
+    X##sa = X##sae.v128; \
+    X##se = GET64HIHI(X##sae.v128, X##sae.v128); \
+    XOReq128(Cae, X##sae.v128); \
+    X##sio.v128 = LOAD128(state[22]); \
+    X##si = X##sio.v128; \
+    X##so = GET64HIHI(X##sio.v128, X##sio.v128); \
+    XOReq128(Cio, X##sio.v128); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cu, X##su); \
+
+#define copyToState(state, X) /* Write the 25 lanes held in the X##-prefixed variables back to state[0..24]. */ \
+    state[ 0] = X##bage.v64[0]; /* ba, be, ge, gi, ka, ke, me, mi are read from the packed pairs: the round macro only updates those */ \
+    state[ 1] = X##begi.v64[0]; /* X## (not a literal A) so the macro honours its parameter for any variable set */ \
+    STORE64(state[ 2], X##bi); \
+    STORE64(state[ 3], X##bo); \
+    STORE64(state[ 4], X##bu); \
+    STORE64(state[ 5], X##ga); \
+    state[ 6] = X##bage.v64[1]; \
+    state[ 7] = X##begi.v64[1]; \
+    STORE64(state[ 8], X##go); \
+    STORE64(state[ 9], X##gu); \
+    state[10] = X##kame.v64[0]; \
+    state[11] = X##kemi.v64[0]; \
+    STORE64(state[12], X##ki); \
+    STORE64(state[13], X##ko); \
+    STORE64(state[14], X##ku); \
+    STORE64(state[15], X##ma); \
+    state[16] = X##kame.v64[1]; \
+    state[17] = X##kemi.v64[1]; \
+    STORE64(state[18], X##mo); \
+    STORE64(state[19], X##mu); \
+    STORE64(state[20], X##sa); \
+    STORE64(state[21], X##se); \
+    STORE64(state[22], X##si); \
+    STORE64(state[23], X##so); \
+    STORE64(state[24], X##su); \
+
+#define copyStateVariables(X, Y) /* Assign the Y##-prefixed state variables to the X##-prefixed ones: 4 packed pairs plus 17 single lanes. */ \
+    X##bage = Y##bage; /* packed pair carrying lanes ba and ge */ \
+    X##begi = Y##begi; /* packed pair carrying lanes be and gi */ \
+    X##bi = Y##bi; \
+    X##bo = Y##bo; \
+    X##bu = Y##bu; \
+    X##ga = Y##ga; \
+    X##go = Y##go; \
+    X##gu = Y##gu; \
+    X##kame = Y##kame; /* packed pair carrying lanes ka and me */ \
+    X##kemi = Y##kemi; /* packed pair carrying lanes ke and mi */ \
+    X##ki = Y##ki; \
+    X##ko = Y##ko; \
+    X##ku = Y##ku; \
+    X##ma = Y##ma; \
+    X##mo = Y##mo; \
+    X##mu = Y##mu; \
+    X##sa = Y##sa; \
+    X##se = Y##se; \
+    X##si = Y##si; \
+    X##so = Y##so; \
+    X##su = Y##su; \
+
diff --git a/c_src/KeccakF-1600-simd64.macros b/c_src/KeccakF-1600-simd64.macros
new file mode 100755
index 0000000..c067304
--- /dev/null
+++ b/c_src/KeccakF-1600-simd64.macros
@@ -0,0 +1,517 @@
+/*
+Code automatically generated by KeccakTools!
+
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#define declareABCDE /* Declares all V64 locals that the macros in this file read and write. */ \
+    V64 Aba, Abe, Abi, Abo, Abu; /* A: one variable per state lane (25 in total) */ \
+    V64 Aga, Age, Agi, Ago, Agu; \
+    V64 Aka, Ake, Aki, Ako, Aku; \
+    V64 Ama, Ame, Ami, Amo, Amu; \
+    V64 Asa, Ase, Asi, Aso, Asu; \
+    V64 Bba, Bbe, Bbi, Bbo, Bbu; /* B: per-round temporaries holding the rotated lanes that feed the E computation */ \
+    V64 Bga, Bge, Bgi, Bgo, Bgu; \
+    V64 Bka, Bke, Bki, Bko, Bku; \
+    V64 Bma, Bme, Bmi, Bmo, Bmu; \
+    V64 Bsa, Bse, Bsi, Bso, Bsu; \
+    V64 Ca, Ce, Ci, Co, Cu; /* C: XOR of the five lanes sharing a final letter (see prepareTheta) */ \
+    V64 Da, De, Di, Do, Du; /* D: values derived from C that are XORed into the lanes at the start of a round */ \
+    V64 Eba, Ebe, Ebi, Ebo, Ebu; /* E: second set of 25 lanes; the round macros alternate between A and E */ \
+    V64 Ega, Ege, Egi, Ego, Egu; \
+    V64 Eka, Eke, Eki, Eko, Eku; \
+    V64 Ema, Eme, Emi, Emo, Emu; \
+    V64 Esa, Ese, Esi, Eso, Esu; \
+
+#define prepareTheta /* Computes Ca..Cu from the A lanes; always reads A (no prefix parameter). */ \
+    Ca = XOR64(Aba, XOR64(Aga, XOR64(Aka, XOR64(Ama, Asa)))); /* XOR of the five lanes ending in 'a' */ \
+    Ce = XOR64(Abe, XOR64(Age, XOR64(Ake, XOR64(Ame, Ase)))); /* ... ending in 'e' */ \
+    Ci = XOR64(Abi, XOR64(Agi, XOR64(Aki, XOR64(Ami, Asi)))); /* ... ending in 'i' */ \
+    Co = XOR64(Abo, XOR64(Ago, XOR64(Ako, XOR64(Amo, Aso)))); /* ... ending in 'o' */ \
+    Cu = XOR64(Abu, XOR64(Agu, XOR64(Aku, XOR64(Amu, Asu)))); /* ... ending in 'u' */ \
+
+// --- Code for round, with prepare-theta: computes the E lanes from the A lanes for round i and also leaves in Ca..Cu the XOR of the new E lanes, ready for the next round
+// --- 64-bit lanes mapped to 64-bit words; Ca..Cu must be valid on entry, and the A lanes are modified in place (the D values are XORed into them)
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    Da = XOR64(Cu, ROL64(Ce, 1)); /* each D is one C value XORed with another C value rotated left by 1 */ \
+    De = XOR64(Ca, ROL64(Ci, 1)); \
+    Di = XOR64(Ce, ROL64(Co, 1)); \
+    Do = XOR64(Ci, ROL64(Cu, 1)); \
+    Du = XOR64(Co, ROL64(Ca, 1)); \
+    /* E##ba..E##bu: built from A##ba, A##ge, A##ki, A##mo, A##su; Ca..Cu are overwritten with them */ \
+    XOReq64(A##ba, Da); \
+    Bba = A##ba; /* the only lane taken without a rotation */ \
+    XOReq64(A##ge, De); \
+    Bbe = ROL64(A##ge, 44); \
+    XOReq64(A##ki, Di); \
+    Bbi = ROL64(A##ki, 43); \
+    E##ba = XOR64(Bba, ANDnu64(Bbe, Bbi)); \
+    XOReq64(E##ba, CONST64(KeccakF1600RoundConstants[i])); /* the round constant is applied to lane ba only */ \
+    Ca = E##ba; \
+    XOReq64(A##mo, Do); \
+    Bbo = ROL64(A##mo, 21); \
+    E##be = XOR64(Bbe, ANDnu64(Bbi, Bbo)); \
+    Ce = E##be; \
+    XOReq64(A##su, Du); \
+    Bbu = ROL64(A##su, 14); \
+    E##bi = XOR64(Bbi, ANDnu64(Bbo, Bbu)); \
+    Ci = E##bi; \
+    E##bo = XOR64(Bbo, ANDnu64(Bbu, Bba)); \
+    Co = E##bo; \
+    E##bu = XOR64(Bbu, ANDnu64(Bba, Bbe)); \
+    Cu = E##bu; \
+    /* E##ga..E##gu: built from A##bo, A##gu, A##ka, A##me, A##si; each result is XORed into Ca..Cu */ \
+    XOReq64(A##bo, Do); \
+    Bga = ROL64(A##bo, 28); \
+    XOReq64(A##gu, Du); \
+    Bge = ROL64(A##gu, 20); \
+    XOReq64(A##ka, Da); \
+    Bgi = ROL64(A##ka, 3); \
+    E##ga = XOR64(Bga, ANDnu64(Bge, Bgi)); \
+    XOReq64(Ca, E##ga); \
+    XOReq64(A##me, De); \
+    Bgo = ROL64(A##me, 45); \
+    E##ge = XOR64(Bge, ANDnu64(Bgi, Bgo)); \
+    XOReq64(Ce, E##ge); \
+    XOReq64(A##si, Di); \
+    Bgu = ROL64(A##si, 61); \
+    E##gi = XOR64(Bgi, ANDnu64(Bgo, Bgu)); \
+    XOReq64(Ci, E##gi); \
+    E##go = XOR64(Bgo, ANDnu64(Bgu, Bga)); \
+    XOReq64(Co, E##go); \
+    E##gu = XOR64(Bgu, ANDnu64(Bga, Bge)); \
+    XOReq64(Cu, E##gu); \
+    /* E##ka..E##ku: built from A##be, A##gi, A##ko, A##mu, A##sa */ \
+    XOReq64(A##be, De); \
+    Bka = ROL64(A##be, 1); \
+    XOReq64(A##gi, Di); \
+    Bke = ROL64(A##gi, 6); \
+    XOReq64(A##ko, Do); \
+    Bki = ROL64(A##ko, 25); \
+    E##ka = XOR64(Bka, ANDnu64(Bke, Bki)); \
+    XOReq64(Ca, E##ka); \
+    XOReq64(A##mu, Du); \
+    Bko = ROL64(A##mu, 8); \
+    E##ke = XOR64(Bke, ANDnu64(Bki, Bko)); \
+    XOReq64(Ce, E##ke); \
+    XOReq64(A##sa, Da); \
+    Bku = ROL64(A##sa, 18); \
+    E##ki = XOR64(Bki, ANDnu64(Bko, Bku)); \
+    XOReq64(Ci, E##ki); \
+    E##ko = XOR64(Bko, ANDnu64(Bku, Bka)); \
+    XOReq64(Co, E##ko); \
+    E##ku = XOR64(Bku, ANDnu64(Bka, Bke)); \
+    XOReq64(Cu, E##ku); \
+    /* E##ma..E##mu: built from A##bu, A##ga, A##ke, A##mi, A##so */ \
+    XOReq64(A##bu, Du); \
+    Bma = ROL64(A##bu, 27); \
+    XOReq64(A##ga, Da); \
+    Bme = ROL64(A##ga, 36); \
+    XOReq64(A##ke, De); \
+    Bmi = ROL64(A##ke, 10); \
+    E##ma = XOR64(Bma, ANDnu64(Bme, Bmi)); \
+    XOReq64(Ca, E##ma); \
+    XOReq64(A##mi, Di); \
+    Bmo = ROL64(A##mi, 15); \
+    E##me = XOR64(Bme, ANDnu64(Bmi, Bmo)); \
+    XOReq64(Ce, E##me); \
+    XOReq64(A##so, Do); \
+    Bmu = ROL64(A##so, 56); \
+    E##mi = XOR64(Bmi, ANDnu64(Bmo, Bmu)); \
+    XOReq64(Ci, E##mi); \
+    E##mo = XOR64(Bmo, ANDnu64(Bmu, Bma)); \
+    XOReq64(Co, E##mo); \
+    E##mu = XOR64(Bmu, ANDnu64(Bma, Bme)); \
+    XOReq64(Cu, E##mu); \
+    /* E##sa..E##su: built from A##bi, A##go, A##ku, A##ma, A##se */ \
+    XOReq64(A##bi, Di); \
+    Bsa = ROL64(A##bi, 62); \
+    XOReq64(A##go, Do); \
+    Bse = ROL64(A##go, 55); \
+    XOReq64(A##ku, Du); \
+    Bsi = ROL64(A##ku, 39); \
+    E##sa = XOR64(Bsa, ANDnu64(Bse, Bsi)); \
+    XOReq64(Ca, E##sa); \
+    XOReq64(A##ma, Da); \
+    Bso = ROL64(A##ma, 41); \
+    E##se = XOR64(Bse, ANDnu64(Bsi, Bso)); \
+    XOReq64(Ce, E##se); \
+    XOReq64(A##se, De); \
+    Bsu = ROL64(A##se, 2); \
+    E##si = XOR64(Bsi, ANDnu64(Bso, Bsu)); \
+    XOReq64(Ci, E##si); \
+    E##so = XOR64(Bso, ANDnu64(Bsu, Bsa)); \
+    XOReq64(Co, E##so); \
+    E##su = XOR64(Bsu, ANDnu64(Bsa, Bse)); \
+    XOReq64(Cu, E##su); \
+\
+
+// --- Code for round: computes the E lanes from the A lanes for round i; unlike thetaRhoPiChiIotaPrepareTheta it does not update Ca..Cu
+// --- 64-bit lanes mapped to 64-bit words; Ca..Cu must be valid on entry, and the A lanes are modified in place (the D values are XORed into them)
+#define thetaRhoPiChiIota(i, A, E) \
+    Da = XOR64(Cu, ROL64(Ce, 1)); /* each D is one C value XORed with another C value rotated left by 1 */ \
+    De = XOR64(Ca, ROL64(Ci, 1)); \
+    Di = XOR64(Ce, ROL64(Co, 1)); \
+    Do = XOR64(Ci, ROL64(Cu, 1)); \
+    Du = XOR64(Co, ROL64(Ca, 1)); \
+    /* E##ba..E##bu: built from A##ba, A##ge, A##ki, A##mo, A##su */ \
+    XOReq64(A##ba, Da); \
+    Bba = A##ba; /* the only lane taken without a rotation */ \
+    XOReq64(A##ge, De); \
+    Bbe = ROL64(A##ge, 44); \
+    XOReq64(A##ki, Di); \
+    Bbi = ROL64(A##ki, 43); \
+    E##ba = XOR64(Bba, ANDnu64(Bbe, Bbi)); \
+    XOReq64(E##ba, CONST64(KeccakF1600RoundConstants[i])); /* the round constant is applied to lane ba only */ \
+    XOReq64(A##mo, Do); \
+    Bbo = ROL64(A##mo, 21); \
+    E##be = XOR64(Bbe, ANDnu64(Bbi, Bbo)); \
+    XOReq64(A##su, Du); \
+    Bbu = ROL64(A##su, 14); \
+    E##bi = XOR64(Bbi, ANDnu64(Bbo, Bbu)); \
+    E##bo = XOR64(Bbo, ANDnu64(Bbu, Bba)); \
+    E##bu = XOR64(Bbu, ANDnu64(Bba, Bbe)); \
+    /* E##ga..E##gu: built from A##bo, A##gu, A##ka, A##me, A##si */ \
+    XOReq64(A##bo, Do); \
+    Bga = ROL64(A##bo, 28); \
+    XOReq64(A##gu, Du); \
+    Bge = ROL64(A##gu, 20); \
+    XOReq64(A##ka, Da); \
+    Bgi = ROL64(A##ka, 3); \
+    E##ga = XOR64(Bga, ANDnu64(Bge, Bgi)); \
+    XOReq64(A##me, De); \
+    Bgo = ROL64(A##me, 45); \
+    E##ge = XOR64(Bge, ANDnu64(Bgi, Bgo)); \
+    XOReq64(A##si, Di); \
+    Bgu = ROL64(A##si, 61); \
+    E##gi = XOR64(Bgi, ANDnu64(Bgo, Bgu)); \
+    E##go = XOR64(Bgo, ANDnu64(Bgu, Bga)); \
+    E##gu = XOR64(Bgu, ANDnu64(Bga, Bge)); \
+    /* E##ka..E##ku: built from A##be, A##gi, A##ko, A##mu, A##sa */ \
+    XOReq64(A##be, De); \
+    Bka = ROL64(A##be, 1); \
+    XOReq64(A##gi, Di); \
+    Bke = ROL64(A##gi, 6); \
+    XOReq64(A##ko, Do); \
+    Bki = ROL64(A##ko, 25); \
+    E##ka = XOR64(Bka, ANDnu64(Bke, Bki)); \
+    XOReq64(A##mu, Du); \
+    Bko = ROL64(A##mu, 8); \
+    E##ke = XOR64(Bke, ANDnu64(Bki, Bko)); \
+    XOReq64(A##sa, Da); \
+    Bku = ROL64(A##sa, 18); \
+    E##ki = XOR64(Bki, ANDnu64(Bko, Bku)); \
+    E##ko = XOR64(Bko, ANDnu64(Bku, Bka)); \
+    E##ku = XOR64(Bku, ANDnu64(Bka, Bke)); \
+    /* E##ma..E##mu: built from A##bu, A##ga, A##ke, A##mi, A##so */ \
+    XOReq64(A##bu, Du); \
+    Bma = ROL64(A##bu, 27); \
+    XOReq64(A##ga, Da); \
+    Bme = ROL64(A##ga, 36); \
+    XOReq64(A##ke, De); \
+    Bmi = ROL64(A##ke, 10); \
+    E##ma = XOR64(Bma, ANDnu64(Bme, Bmi)); \
+    XOReq64(A##mi, Di); \
+    Bmo = ROL64(A##mi, 15); \
+    E##me = XOR64(Bme, ANDnu64(Bmi, Bmo)); \
+    XOReq64(A##so, Do); \
+    Bmu = ROL64(A##so, 56); \
+    E##mi = XOR64(Bmi, ANDnu64(Bmo, Bmu)); \
+    E##mo = XOR64(Bmo, ANDnu64(Bmu, Bma)); \
+    E##mu = XOR64(Bmu, ANDnu64(Bma, Bme)); \
+    /* E##sa..E##su: built from A##bi, A##go, A##ku, A##ma, A##se */ \
+    XOReq64(A##bi, Di); \
+    Bsa = ROL64(A##bi, 62); \
+    XOReq64(A##go, Do); \
+    Bse = ROL64(A##go, 55); \
+    XOReq64(A##ku, Du); \
+    Bsi = ROL64(A##ku, 39); \
+    E##sa = XOR64(Bsa, ANDnu64(Bse, Bsi)); \
+    XOReq64(A##ma, Da); \
+    Bso = ROL64(A##ma, 41); \
+    E##se = XOR64(Bse, ANDnu64(Bsi, Bso)); \
+    XOReq64(A##se, De); \
+    Bsu = ROL64(A##se, 2); \
+    E##si = XOR64(Bsi, ANDnu64(Bso, Bsu)); \
+    E##so = XOR64(Bso, ANDnu64(Bsu, Bsa)); \
+    E##su = XOR64(Bsu, ANDnu64(Bsa, Bse)); \
+\
+
+const UINT64 KeccakF1600RoundConstants[24] = { /* one 64-bit constant per round; the round macros index it with their round number i */
+    0x0000000000000001ULL, /* round  0 */
+    0x0000000000008082ULL, /* round  1 */
+    0x800000000000808aULL, /* round  2 */
+    0x8000000080008000ULL, /* round  3 */
+    0x000000000000808bULL, /* round  4 */
+    0x0000000080000001ULL, /* round  5 */
+    0x8000000080008081ULL, /* round  6 */
+    0x8000000000008009ULL, /* round  7 */
+    0x000000000000008aULL, /* round  8 */
+    0x0000000000000088ULL, /* round  9 */
+    0x0000000080008009ULL, /* round 10 */
+    0x000000008000000aULL, /* round 11 */
+    0x000000008000808bULL, /* round 12 */
+    0x800000000000008bULL, /* round 13 */
+    0x8000000000008089ULL, /* round 14 */
+    0x8000000000008003ULL, /* round 15 */
+    0x8000000000008002ULL, /* round 16 */
+    0x8000000000000080ULL, /* round 17 */
+    0x000000000000800aULL, /* round 18 */
+    0x800000008000000aULL, /* round 19 */
+    0x8000000080008081ULL, /* round 20 */
+    0x8000000000008080ULL, /* round 21 */
+    0x0000000080000001ULL, /* round 22 */
+    0x8000000080008008ULL }; /* round 23 */
+
+#define copyFromStateAndXor576bits(X, state, input) /* Load state[0..24] into the X-prefixed variables, XORing input[0..8] (9 lanes = 576 bits) into the first nine. */ \
+    X##ba = XOR64(LOAD64(state[ 0]), LOAD64(input[ 0])); \
+    X##be = XOR64(LOAD64(state[ 1]), LOAD64(input[ 1])); \
+    X##bi = XOR64(LOAD64(state[ 2]), LOAD64(input[ 2])); \
+    X##bo = XOR64(LOAD64(state[ 3]), LOAD64(input[ 3])); \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    X##ga = XOR64(LOAD64(state[ 5]), LOAD64(input[ 5])); \
+    X##ge = XOR64(LOAD64(state[ 6]), LOAD64(input[ 6])); \
+    X##gi = XOR64(LOAD64(state[ 7]), LOAD64(input[ 7])); \
+    X##go = XOR64(LOAD64(state[ 8]), LOAD64(input[ 8])); \
+    X##gu = LOAD64(state[ 9]); /* from here on the lanes are loaded from state unchanged; input is not read */ \
+    X##ka = LOAD64(state[10]); \
+    X##ke = LOAD64(state[11]); \
+    X##ki = LOAD64(state[12]); \
+    X##ko = LOAD64(state[13]); \
+    X##ku = LOAD64(state[14]); \
+    X##ma = LOAD64(state[15]); \
+    X##me = LOAD64(state[16]); \
+    X##mi = LOAD64(state[17]); \
+    X##mo = LOAD64(state[18]); \
+    X##mu = LOAD64(state[19]); \
+    X##sa = LOAD64(state[20]); \
+    X##se = LOAD64(state[21]); \
+    X##si = LOAD64(state[22]); \
+    X##so = LOAD64(state[23]); \
+    X##su = LOAD64(state[24]); \
+
+#define copyFromStateAndXor832bits(X, state, input) /* Load state[0..24] into the X-prefixed variables, XORing input[0..12] (13 lanes = 832 bits) into the first thirteen. */ \
+    X##ba = XOR64(LOAD64(state[ 0]), LOAD64(input[ 0])); \
+    X##be = XOR64(LOAD64(state[ 1]), LOAD64(input[ 1])); \
+    X##bi = XOR64(LOAD64(state[ 2]), LOAD64(input[ 2])); \
+    X##bo = XOR64(LOAD64(state[ 3]), LOAD64(input[ 3])); \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    X##ga = XOR64(LOAD64(state[ 5]), LOAD64(input[ 5])); \
+    X##ge = XOR64(LOAD64(state[ 6]), LOAD64(input[ 6])); \
+    X##gi = XOR64(LOAD64(state[ 7]), LOAD64(input[ 7])); \
+    X##go = XOR64(LOAD64(state[ 8]), LOAD64(input[ 8])); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    X##ka = XOR64(LOAD64(state[10]), LOAD64(input[10])); \
+    X##ke = XOR64(LOAD64(state[11]), LOAD64(input[11])); \
+    X##ki = XOR64(LOAD64(state[12]), LOAD64(input[12])); \
+    X##ko = LOAD64(state[13]); /* from here on the lanes are loaded from state unchanged; input is not read */ \
+    X##ku = LOAD64(state[14]); \
+    X##ma = LOAD64(state[15]); \
+    X##me = LOAD64(state[16]); \
+    X##mi = LOAD64(state[17]); \
+    X##mo = LOAD64(state[18]); \
+    X##mu = LOAD64(state[19]); \
+    X##sa = LOAD64(state[20]); \
+    X##se = LOAD64(state[21]); \
+    X##si = LOAD64(state[22]); \
+    X##so = LOAD64(state[23]); \
+    X##su = LOAD64(state[24]); \
+
+#define copyFromStateAndXor1024bits(X, state, input) /* Load state[0..24] into the X-prefixed variables, XORing input[0..15] (16 lanes = 1024 bits) into the first sixteen. */ \
+    X##ba = XOR64(LOAD64(state[ 0]), LOAD64(input[ 0])); \
+    X##be = XOR64(LOAD64(state[ 1]), LOAD64(input[ 1])); \
+    X##bi = XOR64(LOAD64(state[ 2]), LOAD64(input[ 2])); \
+    X##bo = XOR64(LOAD64(state[ 3]), LOAD64(input[ 3])); \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    X##ga = XOR64(LOAD64(state[ 5]), LOAD64(input[ 5])); \
+    X##ge = XOR64(LOAD64(state[ 6]), LOAD64(input[ 6])); \
+    X##gi = XOR64(LOAD64(state[ 7]), LOAD64(input[ 7])); \
+    X##go = XOR64(LOAD64(state[ 8]), LOAD64(input[ 8])); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    X##ka = XOR64(LOAD64(state[10]), LOAD64(input[10])); \
+    X##ke = XOR64(LOAD64(state[11]), LOAD64(input[11])); \
+    X##ki = XOR64(LOAD64(state[12]), LOAD64(input[12])); \
+    X##ko = XOR64(LOAD64(state[13]), LOAD64(input[13])); \
+    X##ku = XOR64(LOAD64(state[14]), LOAD64(input[14])); \
+    X##ma = XOR64(LOAD64(state[15]), LOAD64(input[15])); \
+    X##me = LOAD64(state[16]); /* from here on the lanes are loaded from state unchanged; input is not read */ \
+    X##mi = LOAD64(state[17]); \
+    X##mo = LOAD64(state[18]); \
+    X##mu = LOAD64(state[19]); \
+    X##sa = LOAD64(state[20]); \
+    X##se = LOAD64(state[21]); \
+    X##si = LOAD64(state[22]); \
+    X##so = LOAD64(state[23]); \
+    X##su = LOAD64(state[24]); \
+
+#define copyFromStateAndXor1088bits(X, state, input) /* Load state[0..24] into the X-prefixed variables, XORing input[0..16] (17 lanes = 1088 bits) into the first seventeen. */ \
+    X##ba = XOR64(LOAD64(state[ 0]), LOAD64(input[ 0])); \
+    X##be = XOR64(LOAD64(state[ 1]), LOAD64(input[ 1])); \
+    X##bi = XOR64(LOAD64(state[ 2]), LOAD64(input[ 2])); \
+    X##bo = XOR64(LOAD64(state[ 3]), LOAD64(input[ 3])); \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    X##ga = XOR64(LOAD64(state[ 5]), LOAD64(input[ 5])); \
+    X##ge = XOR64(LOAD64(state[ 6]), LOAD64(input[ 6])); \
+    X##gi = XOR64(LOAD64(state[ 7]), LOAD64(input[ 7])); \
+    X##go = XOR64(LOAD64(state[ 8]), LOAD64(input[ 8])); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    X##ka = XOR64(LOAD64(state[10]), LOAD64(input[10])); \
+    X##ke = XOR64(LOAD64(state[11]), LOAD64(input[11])); \
+    X##ki = XOR64(LOAD64(state[12]), LOAD64(input[12])); \
+    X##ko = XOR64(LOAD64(state[13]), LOAD64(input[13])); \
+    X##ku = XOR64(LOAD64(state[14]), LOAD64(input[14])); \
+    X##ma = XOR64(LOAD64(state[15]), LOAD64(input[15])); \
+    X##me = XOR64(LOAD64(state[16]), LOAD64(input[16])); \
+    X##mi = LOAD64(state[17]); /* from here on the lanes are loaded from state unchanged; input is not read */ \
+    X##mo = LOAD64(state[18]); \
+    X##mu = LOAD64(state[19]); \
+    X##sa = LOAD64(state[20]); \
+    X##se = LOAD64(state[21]); \
+    X##si = LOAD64(state[22]); \
+    X##so = LOAD64(state[23]); \
+    X##su = LOAD64(state[24]); \
+
+#define copyFromStateAndXor1152bits(X, state, input) /* Load state[0..24] into the X-prefixed variables, XORing input[0..17] (18 lanes = 1152 bits) into the first eighteen. */ \
+    X##ba = XOR64(LOAD64(state[ 0]), LOAD64(input[ 0])); \
+    X##be = XOR64(LOAD64(state[ 1]), LOAD64(input[ 1])); \
+    X##bi = XOR64(LOAD64(state[ 2]), LOAD64(input[ 2])); \
+    X##bo = XOR64(LOAD64(state[ 3]), LOAD64(input[ 3])); \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    X##ga = XOR64(LOAD64(state[ 5]), LOAD64(input[ 5])); \
+    X##ge = XOR64(LOAD64(state[ 6]), LOAD64(input[ 6])); \
+    X##gi = XOR64(LOAD64(state[ 7]), LOAD64(input[ 7])); \
+    X##go = XOR64(LOAD64(state[ 8]), LOAD64(input[ 8])); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    X##ka = XOR64(LOAD64(state[10]), LOAD64(input[10])); \
+    X##ke = XOR64(LOAD64(state[11]), LOAD64(input[11])); \
+    X##ki = XOR64(LOAD64(state[12]), LOAD64(input[12])); \
+    X##ko = XOR64(LOAD64(state[13]), LOAD64(input[13])); \
+    X##ku = XOR64(LOAD64(state[14]), LOAD64(input[14])); \
+    X##ma = XOR64(LOAD64(state[15]), LOAD64(input[15])); \
+    X##me = XOR64(LOAD64(state[16]), LOAD64(input[16])); \
+    X##mi = XOR64(LOAD64(state[17]), LOAD64(input[17])); \
+    X##mo = LOAD64(state[18]); /* from here on the lanes are loaded from state unchanged; input is not read */ \
+    X##mu = LOAD64(state[19]); \
+    X##sa = LOAD64(state[20]); \
+    X##se = LOAD64(state[21]); \
+    X##si = LOAD64(state[22]); \
+    X##so = LOAD64(state[23]); \
+    X##su = LOAD64(state[24]); \
+
+#define copyFromStateAndXor1344bits(X, state, input) /* Load state[0..24] into the X-prefixed variables, XORing input[0..20] (21 lanes = 1344 bits) into the first twenty-one. */ \
+    X##ba = XOR64(LOAD64(state[ 0]), LOAD64(input[ 0])); \
+    X##be = XOR64(LOAD64(state[ 1]), LOAD64(input[ 1])); \
+    X##bi = XOR64(LOAD64(state[ 2]), LOAD64(input[ 2])); \
+    X##bo = XOR64(LOAD64(state[ 3]), LOAD64(input[ 3])); \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    X##ga = XOR64(LOAD64(state[ 5]), LOAD64(input[ 5])); \
+    X##ge = XOR64(LOAD64(state[ 6]), LOAD64(input[ 6])); \
+    X##gi = XOR64(LOAD64(state[ 7]), LOAD64(input[ 7])); \
+    X##go = XOR64(LOAD64(state[ 8]), LOAD64(input[ 8])); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    X##ka = XOR64(LOAD64(state[10]), LOAD64(input[10])); \
+    X##ke = XOR64(LOAD64(state[11]), LOAD64(input[11])); \
+    X##ki = XOR64(LOAD64(state[12]), LOAD64(input[12])); \
+    X##ko = XOR64(LOAD64(state[13]), LOAD64(input[13])); \
+    X##ku = XOR64(LOAD64(state[14]), LOAD64(input[14])); \
+    X##ma = XOR64(LOAD64(state[15]), LOAD64(input[15])); \
+    X##me = XOR64(LOAD64(state[16]), LOAD64(input[16])); \
+    X##mi = XOR64(LOAD64(state[17]), LOAD64(input[17])); \
+    X##mo = XOR64(LOAD64(state[18]), LOAD64(input[18])); \
+    X##mu = XOR64(LOAD64(state[19]), LOAD64(input[19])); \
+    X##sa = XOR64(LOAD64(state[20]), LOAD64(input[20])); \
+    X##se = LOAD64(state[21]); /* from here on the lanes are loaded from state unchanged; input is not read */ \
+    X##si = LOAD64(state[22]); \
+    X##so = LOAD64(state[23]); \
+    X##su = LOAD64(state[24]); \
+
+#define copyFromState(X, state) /* Load state[0..24] into the 25 X-prefixed lane variables, with no input XORed in. */ \
+    X##ba = LOAD64(state[ 0]); /* index n maps to the n-th name in the order ba, be, bi, bo, bu, ga, ... , su */ \
+    X##be = LOAD64(state[ 1]); \
+    X##bi = LOAD64(state[ 2]); \
+    X##bo = LOAD64(state[ 3]); \
+    X##bu = LOAD64(state[ 4]); \
+    X##ga = LOAD64(state[ 5]); \
+    X##ge = LOAD64(state[ 6]); \
+    X##gi = LOAD64(state[ 7]); \
+    X##go = LOAD64(state[ 8]); \
+    X##gu = LOAD64(state[ 9]); \
+    X##ka = LOAD64(state[10]); \
+    X##ke = LOAD64(state[11]); \
+    X##ki = LOAD64(state[12]); \
+    X##ko = LOAD64(state[13]); \
+    X##ku = LOAD64(state[14]); \
+    X##ma = LOAD64(state[15]); \
+    X##me = LOAD64(state[16]); \
+    X##mi = LOAD64(state[17]); \
+    X##mo = LOAD64(state[18]); \
+    X##mu = LOAD64(state[19]); \
+    X##sa = LOAD64(state[20]); \
+    X##se = LOAD64(state[21]); \
+    X##si = LOAD64(state[22]); \
+    X##so = LOAD64(state[23]); \
+    X##su = LOAD64(state[24]); \
+
+#define copyToState(state, X) /* Store the 25 X-prefixed lane variables into state[0..24]; the inverse of copyFromState. */ \
+    STORE64(state[ 0], X##ba); /* same index-to-name mapping as copyFromState */ \
+    STORE64(state[ 1], X##be); \
+    STORE64(state[ 2], X##bi); \
+    STORE64(state[ 3], X##bo); \
+    STORE64(state[ 4], X##bu); \
+    STORE64(state[ 5], X##ga); \
+    STORE64(state[ 6], X##ge); \
+    STORE64(state[ 7], X##gi); \
+    STORE64(state[ 8], X##go); \
+    STORE64(state[ 9], X##gu); \
+    STORE64(state[10], X##ka); \
+    STORE64(state[11], X##ke); \
+    STORE64(state[12], X##ki); \
+    STORE64(state[13], X##ko); \
+    STORE64(state[14], X##ku); \
+    STORE64(state[15], X##ma); \
+    STORE64(state[16], X##me); \
+    STORE64(state[17], X##mi); \
+    STORE64(state[18], X##mo); \
+    STORE64(state[19], X##mu); \
+    STORE64(state[20], X##sa); \
+    STORE64(state[21], X##se); \
+    STORE64(state[22], X##si); \
+    STORE64(state[23], X##so); \
+    STORE64(state[24], X##su); \
+
+#define copyStateVariables(X, Y) /* Assign each of the 25 Y-prefixed lane variables to the X-prefixed variable of the same lane. */ \
+    X##ba = Y##ba; \
+    X##be = Y##be; \
+    X##bi = Y##bi; \
+    X##bo = Y##bo; \
+    X##bu = Y##bu; \
+    X##ga = Y##ga; \
+    X##ge = Y##ge; \
+    X##gi = Y##gi; \
+    X##go = Y##go; \
+    X##gu = Y##gu; \
+    X##ka = Y##ka; \
+    X##ke = Y##ke; \
+    X##ki = Y##ki; \
+    X##ko = Y##ko; \
+    X##ku = Y##ku; \
+    X##ma = Y##ma; \
+    X##me = Y##me; \
+    X##mi = Y##mi; \
+    X##mo = Y##mo; \
+    X##mu = Y##mu; \
+    X##sa = Y##sa; \
+    X##se = Y##se; \
+    X##si = Y##si; \
+    X##so = Y##so; \
+    X##su = Y##su; \
+
diff --git a/c_src/KeccakF-1600-unrolling.macros b/c_src/KeccakF-1600-unrolling.macros
new file mode 100755
index 0000000..83c694c
--- /dev/null
+++ b/c_src/KeccakF-1600-unrolling.macros
@@ -0,0 +1,124 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#if (Unrolling == 24) /* all 24 rounds written out; no loop variable is used, and only the final round uses the variant that does not update Ca..Cu */
+#define rounds \
+    prepareTheta \
+    thetaRhoPiChiIotaPrepareTheta( 0, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 1, E, A) \
+    thetaRhoPiChiIotaPrepareTheta( 2, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 3, E, A) \
+    thetaRhoPiChiIotaPrepareTheta( 4, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 5, E, A) \
+    thetaRhoPiChiIotaPrepareTheta( 6, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 7, E, A) \
+    thetaRhoPiChiIotaPrepareTheta( 8, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 9, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(10, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(11, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+    thetaRhoPiChiIota(23, E, A) \
+    copyToState(state, A)
+#elif (Unrolling == 12) /* this and every looped variant below use a variable i that must be declared where `rounds` is expanded */
+#define rounds \
+    prepareTheta \
+    for(i=0; i<24; i+=12) { \
+        thetaRhoPiChiIotaPrepareTheta(i   , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \
+    } \
+    copyToState(state, A)
+#elif (Unrolling == 8) /* 3 loop iterations of 8 rounds; an even count per iteration leaves the result in A */
+#define rounds \
+    prepareTheta \
+    for(i=0; i<24; i+=8) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+6, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+7, E, A) \
+    } \
+    copyToState(state, A)
+#elif (Unrolling == 6) /* 4 loop iterations of 6 rounds */
+#define rounds \
+    prepareTheta \
+    for(i=0; i<24; i+=6) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
+    } \
+    copyToState(state, A)
+#elif (Unrolling == 4) /* 6 loop iterations of 4 rounds */
+#define rounds \
+    prepareTheta \
+    for(i=0; i<24; i+=4) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+    } \
+    copyToState(state, A)
+#elif (Unrolling == 3) /* odd count per iteration: the third round leaves its result in E, so it is copied back to A before the next iteration */
+#define rounds \
+    prepareTheta \
+    for(i=0; i<24; i+=3) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        copyStateVariables(A, E) \
+    } \
+    copyToState(state, A)
+#elif (Unrolling == 2) /* 12 loop iterations of 2 rounds */
+#define rounds \
+    prepareTheta \
+    for(i=0; i<24; i++, i++) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+    } \
+    copyToState(state, A)
+#elif (Unrolling == 1) /* one round per iteration: the result is always in E and is copied back to A every time */
+#define rounds \
+    prepareTheta \
+    for(i=0; i<24; i++) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        copyStateVariables(A, E) \
+    } \
+    copyToState(state, A)
+#else /* any other value of Unrolling is rejected at compile time */
+#error "Unrolling is not correctly specified!"
+#endif
diff --git a/c_src/KeccakF-1600-x86-64-asm.c b/c_src/KeccakF-1600-x86-64-asm.c
new file mode 100755
index 0000000..68fb9bd
--- /dev/null
+++ b/c_src/KeccakF-1600-x86-64-asm.c
@@ -0,0 +1,62 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by Ronny Van Keer,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include "KeccakF-1600-interface.h"
+
+#define	UseBebigokimisa
+
+typedef unsigned char UINT8;
+typedef unsigned long long int UINT64;
+
+void KeccakInitialize(void) /* (void) makes this a real prototype, so a call with arguments is diagnosed */
+{   /* Intentionally empty: this implementation does no work at initialization. */
+}
+
+/*
+ * KeccakExtract - copy the first laneCount 64-bit lanes of the state to data.
+ *
+ *   state     - source buffer; laneCount*8 bytes are read from it
+ *   data      - destination buffer; laneCount*8 bytes are written to it
+ *   laneCount - number of 8-byte lanes to copy
+ *
+ * When UseBebigokimisa is defined, lanes 1, 2, 8, 12, 17 and 20 are bitwise
+ * complemented in the copy, each one only if it lies inside the copied range
+ * (lane index < laneCount).
+ *
+ * The complement is applied one byte at a time: inverting all eight bytes of
+ * a lane equals inverting the 64-bit word, but never reads or writes the
+ * unsigned char buffer through a UINT64 pointer, so data may have any alignment.
+ */
+void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount)
+{
+#ifdef UseBebigokimisa
+    /* Must stay in ascending order: the loop below stops at the first miss. */
+    static const unsigned char complementedLanes[] = { 1, 2, 8, 12, 17, 20 };
+    unsigned int k, b;
+#endif
+
+    memcpy(data, state, laneCount*8);
+
+#ifdef UseBebigokimisa
+    for (k = 0; k < sizeof(complementedLanes)/sizeof(complementedLanes[0]); k++)
+    {
+        unsigned int lane = complementedLanes[k];
+
+        if (laneCount <= lane)
+            break;
+        for (b = 0; b < 8; b++)
+            data[lane*8 + b] = (unsigned char)~data[lane*8 + b];
+    }
+#endif
+}
diff --git a/c_src/KeccakF-1600-x86-64-gas.s b/c_src/KeccakF-1600-x86-64-gas.s
new file mode 100755
index 0000000..289a84e
--- /dev/null
+++ b/c_src/KeccakF-1600-x86-64-gas.s
@@ -0,0 +1,766 @@
+#
+# The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+# Michaël Peeters and Gilles Van Assche. For more information, feedback or
+# questions, please refer to our website: http://keccak.noekeon.org/
+#
+# Implementation by Ronny Van Keer,
+# hereby denoted as "the implementer".
+# 
+# To the extent possible under law, the implementer has waived all copyright
+# and related or neighboring rights to the source code in this file.
+# http://creativecommons.org/publicdomain/zero/1.0/
+#
+
+	.text
+
+
+#// --- defines
+
+.equ UseSIMD, 1		# NOTE(review): not referenced in the part of the file visible here; confirm where it is tested
+
+# Byte offsets of the 25 lanes inside a state block: lane index times 8 bytes.
+.equ _ba,  0*8
+.equ _be,  1*8
+.equ _bi,  2*8
+.equ _bo,  3*8
+.equ _bu,  4*8
+.equ _ga,  5*8
+.equ _ge,  6*8
+.equ _gi,  7*8
+.equ _go,  8*8
+.equ _gu,  9*8
+.equ _ka, 10*8
+.equ _ke, 11*8
+.equ _ki, 12*8
+.equ _ko, 13*8
+.equ _ku, 14*8
+.equ _ma, 15*8
+.equ _me, 16*8
+.equ _mi, 17*8
+.equ _mo, 18*8
+.equ _mu, 19*8
+.equ _sa, 20*8
+.equ _se, 21*8
+.equ _si, 22*8
+.equ _so, 23*8
+.equ _su, 24*8
+
+
+#	arguments (these registers are given second names by the round variables below)
+.equ apState,		%rdi
+.equ apInput,		%rsi
+.equ aNbrWords,		%rdx
+
+#	xor input into state section
+.equ xpState,		%r9
+
+# round vars
+.equ rT1,		%rax
+.equ rpState,		%rdi	# same register as apState
+.equ rpStack,		%rsp	# the stack pointer is used as the base of the second state block
+
+.equ rDa,		%rbx
+.equ rDe,		%rcx
+.equ rDi,		%rdx	# same register as aNbrWords
+.equ rDo,		%r8
+.equ rDu,		%r9	# same register as xpState
+
+.equ rBa,		%r10 
+.equ rBe,		%r11
+.equ rBi,		%r12
+.equ rBo,		%r13
+.equ rBu,		%r14
+
+.equ rCa,		%rsi	# same register as apInput
+.equ rCe,		%rbp
+.equ rCi,		rBi	# rCi is an alias of rBi (%r12): the two values cannot be live at the same time
+.equ rCo,		rBo	# rCo is an alias of rBo (%r13), likewise
+.equ rCu,		%r15
+
+.macro	mKeccakRound	iState, oState, rc, lastRound
+# One round: reads the 25 lanes at \iState, writes the 25 new lanes at \oState, using round constant \rc. Expects rCa, rCe, rCu to hold column XORs and rDi, rDo to hold lanes si, so of \iState.
+		movq		rCe, rDa
+		rolq		rDa			# one-operand rolq rotates left by 1
+# rCi = bi ^ gi ^ ki ^ mi ^ (rDi on entry); meanwhile rDa = rol(rCe,1) ^ rCu
+		movq		_bi(\iState), rCi
+		xorq		_gi(\iState), rDi
+		xorq		rCu, rDa
+		xorq		_ki(\iState), rCi
+		xorq		_mi(\iState), rDi
+		xorq		rDi, rCi
+
+		movq		rCi, rDe
+		rolq		rDe
+# rCo = bo ^ go ^ ko ^ mo ^ (rDo on entry); meanwhile rDe = rol(rCi,1) ^ rCa
+		movq		_bo(\iState), rCo
+		xorq		_go(\iState), rDo
+		xorq		rCa, rDe
+		xorq		_ko(\iState), rCo
+		xorq		_mo(\iState), rDo
+		xorq		rDo, rCo
+
+		movq		rCo, rDi
+		rolq		rDi
+# rDi = rol(rCo,1) ^ rCe, rDo = rol(rCu,1) ^ rCi, rDu = rol(rCa,1) ^ rCo
+		movq		rCu, rDo
+		xorq		rCe, rDi
+		rolq		rDo
+
+		movq		rCa, rDu
+		xorq		rCi, rDo
+		rolq		rDu
+# Output lanes ba..bu, built from input lanes ba, ge, ki, mo, su (loading rBi/rBo overwrites rCi/rCo, which share their registers)
+		movq		_ba(\iState), rBa
+		movq		_ge(\iState), rBe
+		xorq		rCo, rDu
+		movq		_ki(\iState), rBi
+		movq		_mo(\iState), rBo
+		movq		_su(\iState), rBu
+		xorq		rDe, rBe
+		rolq		$44, rBe
+		xorq		rDi, rBi
+		xorq		rDa, rBa
+		rolq		$43, rBi
+
+		movq		rBe, rCa
+		movq		$\rc, rT1
+		orq		rBi, rCa
+		xorq		rBa, rT1
+		xorq		rT1, rCa
+		movq		rCa, _ba(\oState)	# ba = (rBe | rBi) ^ rBa ^ rc; rCa restarts from this value
+
+		xorq		rDu, rBu
+		rolq		$14, rBu
+		movq		rBa, rCu
+		andq		rBe, rCu
+		xorq		rBu, rCu
+		movq		rCu, _bu(\oState)	# bu = (rBa & rBe) ^ rBu; rCu restarts from this value
+
+		xorq		rDo, rBo
+		rolq		$21, rBo
+		movq		rBo, rT1
+		andq		rBu, rT1
+		xorq		rBi, rT1
+		movq		rT1, _bi(\oState)	# bi = (rBo & rBu) ^ rBi
+
+		notq		rBi
+		orq		rBa, rBu
+		orq		rBo, rBi
+		xorq		rBo, rBu
+		xorq		rBe, rBi
+		movq		rBu, _bo(\oState)	# bo = (rBu | rBa) ^ rBo
+		movq		rBi, _be(\oState)	# be = (~rBi | rBo) ^ rBe
+		.if		\lastRound == 0
+		movq		rBi, rCe
+		.endif
+# The ".if \lastRound == 0" blocks in this macro update rCa/rCe/rCu for the following round and are assembled only when \lastRound is 0.
+# Output lanes ga..gu, built from input lanes bo, gu, ka, me, si
+		movq		_gu(\iState), rBe
+		xorq		rDu, rBe
+		movq		_ka(\iState), rBi
+		rolq		$20, rBe
+		xorq		rDa, rBi
+		rolq		$3,  rBi
+		movq		_bo(\iState), rBa
+		movq		rBe, rT1
+		orq		rBi, rT1
+		xorq		rDo, rBa
+		movq		_me(\iState), rBo
+		movq		_si(\iState), rBu
+		rolq		$28, rBa
+		xorq		rBa, rT1
+		movq		rT1, _ga(\oState)
+		.if		\lastRound == 0
+		xor 		rT1, rCa
+		.endif
+
+		xorq		rDe, rBo
+		rolq		$45, rBo
+		movq		rBi, rT1
+		andq		rBo, rT1
+		xorq		rBe, rT1
+		movq		rT1, _ge(\oState)
+		.if		\lastRound == 0
+		xorq		rT1, rCe
+		.endif
+
+		xorq		rDi, rBu
+		rolq		$61, rBu
+		movq		rBu, rT1
+		orq		rBa, rT1
+		xorq		rBo, rT1
+		movq		rT1, _go(\oState)
+
+		andq		rBe, rBa
+		xorq		rBu, rBa
+		movq		rBa, _gu(\oState)
+		notq		rBu
+		.if		\lastRound == 0
+		xorq		rBa, rCu
+		.endif
+
+		orq		rBu, rBo
+		xorq		rBi, rBo
+		movq		rBo, _gi(\oState)
+
+# Output lanes ka..ku, built from input lanes be, gi, ko, mu, sa
+		movq		_be(\iState), rBa
+		movq		_gi(\iState), rBe
+		movq		_ko(\iState), rBi
+		movq		_mu(\iState), rBo
+		movq		_sa(\iState), rBu
+		xorq		rDi, rBe
+		rolq		$6,  rBe
+		xorq		rDo, rBi
+		rolq		$25, rBi
+		movq		rBe, rT1
+		orq		rBi, rT1
+		xorq		rDe, rBa
+		rolq		$1,  rBa
+		xorq		rBa, rT1
+		movq		rT1, _ka(\oState)
+		.if		\lastRound == 0
+		xor 		rT1, rCa
+		.endif
+
+		xorq		rDu, rBo
+		rolq		$8,  rBo
+		movq		rBi, rT1
+		andq		rBo, rT1
+		xorq		rBe, rT1
+		movq		rT1, _ke(\oState)
+		.if		\lastRound == 0
+		xorq		rT1, rCe
+		.endif
+
+		xorq		rDa, rBu
+		rolq		$18, rBu
+		notq		rBo
+		movq		rBo, rT1
+		andq		rBu, rT1
+		xorq		rBi, rT1
+		movq		rT1, _ki(\oState)
+
+		movq		rBu, rT1
+		orq		rBa, rT1
+		xorq		rBo, rT1
+		movq		rT1, _ko(\oState)
+
+		andq		rBe, rBa
+		xorq		rBu, rBa
+		movq		rBa, _ku(\oState)
+		.if		\lastRound == 0
+		xorq		rBa, rCu
+		.endif
+# Output lanes ma..mu, built from input lanes bu, ga, ke, mi, so
+		movq		_ga(\iState), rBe
+		xorq		rDa, rBe
+		movq		_ke(\iState), rBi
+		rolq		$36, rBe
+		xorq		rDe, rBi
+		movq		_bu(\iState), rBa
+		rolq		$10, rBi
+		movq		rBe, rT1
+		movq		_mi(\iState), rBo
+		andq		rBi, rT1
+		xorq		rDu, rBa
+		movq		_so(\iState), rBu
+		rolq		$27, rBa
+		xorq		rBa, rT1
+		movq		rT1, _ma(\oState)
+		.if		\lastRound == 0
+		xor 		rT1, rCa
+		.endif
+
+		xorq		rDi, rBo
+		rolq		$15, rBo
+		movq		rBi, rT1
+		orq		rBo, rT1
+		xorq		rBe, rT1
+		movq		rT1, _me(\oState)
+		.if		\lastRound == 0
+		xorq		rT1, rCe
+		.endif
+
+		xorq		rDo, rBu
+		rolq		$56, rBu
+		notq		rBo
+		movq		rBo, rT1
+		orq		rBu, rT1
+		xorq		rBi, rT1
+		movq		rT1, _mi(\oState)
+
+		orq		rBa, rBe
+		xorq		rBu, rBe
+		movq		rBe, _mu(\oState)
+
+		andq		rBa, rBu
+		xorq		rBo, rBu
+		movq		rBu, _mo(\oState)
+		.if		\lastRound == 0
+		xorq		rBe, rCu
+		.endif
+
+# Output lanes sa..su, built from input lanes bi, go, ku, ma, se; the results are formed in rDa, rDe, rDi, rDo, rDu, whose old values are consumed along the way
+		movq		_bi(\iState), rBa
+		movq		_go(\iState), rBe
+		movq		_ku(\iState), rBi
+		xorq		rDi, rBa
+		movq		_ma(\iState), rBo
+		rolq		$62, rBa
+		xorq		rDo, rBe
+		movq		_se(\iState), rBu
+		rolq		$55, rBe
+
+		xorq		rDu, rBi
+		movq		rBa, rDu
+		xorq		rDe, rBu
+		rolq		$2,  rBu
+		andq		rBe, rDu
+		xorq		rBu, rDu
+		movq		rDu, _su(\oState)
+
+		rolq		$39, rBi
+		.if		\lastRound == 0
+		xorq		rDu, rCu
+		.endif
+		notq		rBe
+		xorq		rDa, rBo
+		movq		rBe, rDa
+		andq		rBi, rDa
+		xorq		rBa, rDa
+		movq		rDa, _sa(\oState)
+		.if		\lastRound == 0
+		xor 		rDa, rCa
+		.endif
+
+		rolq		$41, rBo
+		movq		rBi, rDe
+		orq		rBo, rDe
+		xorq		rBe, rDe
+		movq		rDe, _se(\oState)
+		.if		\lastRound == 0
+		xorq		rDe, rCe
+		.endif
+
+		movq		rBo, rDi
+		movq		rBu, rDo
+		andq		rBu, rDi
+		orq		rBa, rDo
+		xorq		rBi, rDi
+		xorq		rBo, rDo
+		movq		rDi, _si(\oState)	# rDi still holds the new si when the macro ends
+		movq		rDo, _so(\oState)	# rDo still holds the new so when the macro ends
+
+		.endm
+
+.macro	mKeccakPermutation	
+		# Apply 24 rounds to the 25-lane state at rpState; 25*8 bytes of stack serve as the second buffer.
+		subq		$8*25, %rsp
+		# Seed rCa/rCe/rCu with the XOR of the a/e/u lanes of the five planes (b, g, k, m, s).
+		movq		_ba(rpState), rCa             
+		movq		_be(rpState), rCe
+		movq		_bu(rpState), rCu
+
+		xorq		_ga(rpState), rCa             
+		xorq		_ge(rpState), rCe
+		xorq		_gu(rpState), rCu             
+
+		xorq		_ka(rpState), rCa             
+		xorq		_ke(rpState), rCe
+		xorq		_ku(rpState), rCu             
+
+		xorq		_ma(rpState), rCa             
+		xorq		_me(rpState), rCe
+		xorq		_mu(rpState), rCu             
+
+		xorq		_sa(rpState), rCa
+		xorq		_se(rpState), rCe
+		movq		_si(rpState), rDi
+		movq		_so(rpState), rDo
+		xorq		_su(rpState), rCu             
+		# rDi/rDo receive only the raw si/so lanes here, not a five-plane XOR.
+		# Each call reads one buffer and writes the other (state, stack, state, ...); last argument is the lastRound flag.
+		mKeccakRound	rpState, rpStack, 0x0000000000000001, 0
+		mKeccakRound	rpStack, rpState, 0x0000000000008082, 0
+		mKeccakRound	rpState, rpStack, 0x800000000000808a, 0
+		mKeccakRound	rpStack, rpState, 0x8000000080008000, 0
+		mKeccakRound	rpState, rpStack, 0x000000000000808b, 0
+		mKeccakRound	rpStack, rpState, 0x0000000080000001, 0
+
+		mKeccakRound	rpState, rpStack, 0x8000000080008081, 0
+		mKeccakRound	rpStack, rpState, 0x8000000000008009, 0
+		mKeccakRound	rpState, rpStack, 0x000000000000008a, 0
+		mKeccakRound	rpStack, rpState, 0x0000000000000088, 0
+		mKeccakRound	rpState, rpStack, 0x0000000080008009, 0
+		mKeccakRound	rpStack, rpState, 0x000000008000000a, 0
+
+		mKeccakRound	rpState, rpStack, 0x000000008000808b, 0
+		mKeccakRound	rpStack, rpState, 0x800000000000008b, 0
+		mKeccakRound	rpState, rpStack, 0x8000000000008089, 0
+		mKeccakRound	rpStack, rpState, 0x8000000000008003, 0
+		mKeccakRound	rpState, rpStack, 0x8000000000008002, 0
+		mKeccakRound	rpStack, rpState, 0x8000000000000080, 0
+
+		mKeccakRound	rpState, rpStack, 0x000000000000800a, 0
+		mKeccakRound	rpStack, rpState, 0x800000008000000a, 0
+		mKeccakRound	rpState, rpStack, 0x8000000080008081, 0
+		mKeccakRound	rpStack, rpState, 0x8000000000008080, 0
+		mKeccakRound	rpState, rpStack, 0x0000000080000001, 0
+		mKeccakRound	rpStack, rpState, 0x8000000080008008, 1
+		# The 24th call above writes to rpState with lastRound=1, so the result lands in the original state buffer.
+		addq		$8*25, %rsp
+
+		.endm
+
+.macro	mPushRegs	
+	# Save rbx, rbp and r12-r15; every entry point pairs this with mPopRegs around mKeccakPermutation.
+	pushq		%rbx
+	pushq		%rbp
+	pushq		%r12
+	pushq		%r13
+	pushq		%r14
+	pushq		%r15
+
+	.endm
+
+
+.macro	mPopRegs	
+	# Restore the six registers saved by mPushRegs, in the reverse of the push order.
+	popq		%r15
+	popq		%r14
+	popq		%r13
+	popq		%r12
+	popq		%rbp
+	popq		%rbx
+
+	.endm
+
+
+.macro	mXorState128	input, state, offset
+	.if 		UseSIMD == 0	# XOR the 16 bytes at offset(input) into offset(state); this scalar path clobbers rax and rcx
+	movq		\offset(\input), %rax
+	movq		\offset+8(\input), %rcx
+	xorq		%rax, \offset(\state)
+	xorq		%rcx, \offset+8(\state)
+	.else	# SSE2 path: clobbers xmm0
+	movdqu		\offset(\input), %xmm0	# unaligned load, so the input needs no particular alignment
+	pxor		\offset(\state), %xmm0	# NOTE(review): non-VEX pxor with a memory operand needs 16-byte alignment of the state - confirm callers guarantee it
+	movdqu		%xmm0, \offset(\state)
+	.endif
+	.endm
+
+.macro	mXorState256	input, state, offset
+	.if 		UseSIMD == 0	# XOR the 32 bytes at offset(input) into offset(state); this scalar path clobbers rax, rcx, r8 and r10
+	movq		\offset(\input), %rax
+	movq		\offset+8(\input), %r10
+	movq		\offset+16(\input), %rcx
+	movq		\offset+24(\input), %r8
+	xorq		%rax, \offset(\state)
+	xorq		%r10, \offset+8(\state)
+	xorq		%rcx, \offset+16(\state)
+	xorq		%r8,  \offset+24(\state)
+	.else	# SSE2 path: clobbers xmm0 and xmm1
+	movdqu		\offset(\input), %xmm0	# unaligned loads, so the input needs no particular alignment
+	pxor		\offset(\state), %xmm0	# NOTE(review): non-VEX pxor with a memory operand needs 16-byte alignment of the state - confirm callers guarantee it
+	movdqu		\offset+16(\input), %xmm1
+	pxor		\offset+16(\state), %xmm1
+	movdqu		%xmm0, \offset(\state)
+	movdqu		%xmm1, \offset+16(\state)
+	.endif
+	.endm
+
+.macro	mXorState512	input, state, offset
+	.if 		UseSIMD == 0	# XOR the 64 bytes at offset(input) into offset(state); the scalar path is two 256-bit halves
+	mXorState256	\input, \state, \offset
+	mXorState256	\input, \state, \offset+32
+	.else	# SSE2 path: clobbers xmm0-xmm3; loads, XORs and stores are interleaved
+	movdqu		\offset(\input), %xmm0	# unaligned loads, so the input needs no particular alignment
+	movdqu		\offset+16(\input), %xmm1
+	pxor		\offset(\state), %xmm0	# NOTE(review): non-VEX pxor with a memory operand needs 16-byte alignment of the state - confirm callers guarantee it
+	movdqu		\offset+32(\input), %xmm2
+	pxor		\offset+16(\state), %xmm1
+	movdqu		%xmm0, \offset(\state)
+	movdqu		\offset+48(\input), %xmm3
+	pxor		\offset+32(\state), %xmm2
+	movdqu		%xmm1, \offset+16(\state)
+	pxor		\offset+48(\state), %xmm3
+	movdqu		%xmm2, \offset+32(\state)
+	movdqu		%xmm3, \offset+48(\state)
+	.endif
+	.endm
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakPermutation
+	.type	KeccakPermutation, %function
+KeccakPermutation:
+	# Permute the state in place: save the registers the round code uses, run mKeccakPermutation, restore them.
+	mPushRegs
+	mKeccakPermutation
+	mPopRegs
+	ret
+	.size	KeccakPermutation, .-KeccakPermutation	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakAbsorb576bits
+	.type	KeccakAbsorb576bits, %function
+KeccakAbsorb576bits:
+	# XOR 576 bits (72 bytes = 9 lanes) from apInput into the first lanes at apState, then run the permutation.
+	mXorState512	apInput, apState, 0
+	movq		64(apInput), %rax	# ninth lane, handled with a scalar load/xor
+	xorq		%rax, 64(apState)
+	mPushRegs
+	mKeccakPermutation
+	mPopRegs
+	ret
+	.size	KeccakAbsorb576bits, .-KeccakAbsorb576bits	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakAbsorb832bits
+	.type	KeccakAbsorb832bits, %function
+KeccakAbsorb832bits:
+	# XOR 832 bits (104 bytes = 13 lanes) from apInput into the first lanes at apState, then run the permutation.
+	mXorState512	apInput, apState, 0
+	mXorState256	apInput, apState, 64
+	movq		96(apInput), %rax	# thirteenth lane, handled with a scalar load/xor
+	xorq		%rax, 96(apState)
+	mPushRegs
+	mKeccakPermutation
+	mPopRegs
+	ret
+	.size	KeccakAbsorb832bits, .-KeccakAbsorb832bits	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakAbsorb1024bits
+	.type	KeccakAbsorb1024bits, %function
+KeccakAbsorb1024bits:
+	# XOR 1024 bits (128 bytes = 16 lanes) from apInput into the first lanes at apState, then run the permutation.
+	mXorState512	apInput, apState, 0
+	mXorState512	apInput, apState, 64
+	mPushRegs
+	mKeccakPermutation
+	mPopRegs
+	ret
+	.size	KeccakAbsorb1024bits, .-KeccakAbsorb1024bits	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakAbsorb1088bits
+	.type	KeccakAbsorb1088bits, %function
+KeccakAbsorb1088bits:
+	# XOR 1088 bits (136 bytes = 17 lanes) from apInput into the first lanes at apState, then run the permutation.
+	mXorState512	apInput, apState, 0
+	mXorState512	apInput, apState, 64
+	movq		128(apInput), %rax	# seventeenth lane, handled with a scalar load/xor
+	xorq		%rax, 128(apState)
+	mPushRegs
+	mKeccakPermutation
+	mPopRegs
+	ret
+	.size	KeccakAbsorb1088bits, .-KeccakAbsorb1088bits	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakAbsorb1152bits
+	.type	KeccakAbsorb1152bits, %function
+KeccakAbsorb1152bits:
+	# XOR 1152 bits (144 bytes = 18 lanes) from apInput into the first lanes at apState, then run the permutation.
+	mXorState512	apInput, apState, 0
+	mXorState512	apInput, apState, 64
+	mXorState128	apInput, apState, 128
+	mPushRegs
+	mKeccakPermutation
+	mPopRegs
+	ret
+	.size	KeccakAbsorb1152bits, .-KeccakAbsorb1152bits	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakAbsorb1344bits
+	.type	KeccakAbsorb1344bits, %function
+KeccakAbsorb1344bits:
+	# XOR 1344 bits (168 bytes = 21 lanes) from apInput into the first lanes at apState, then run the permutation.
+	mXorState512	apInput, apState, 0
+	mXorState512	apInput, apState, 64
+	mXorState256	apInput, apState, 128
+	movq		160(apInput), %rax	# twenty-first lane, handled with a scalar load/xor
+	xorq		%rax, 160(apState)
+	mPushRegs
+	mKeccakPermutation
+	mPopRegs
+	ret
+	.size	KeccakAbsorb1344bits, .-KeccakAbsorb1344bits	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakAbsorb
+	.type	KeccakAbsorb, %function
+KeccakAbsorb:
+	# XOR aNbrWords 64-bit lanes from apInput into the front of the state, then permute; only bits 0-4 of the count are tested.
+	movq		apState, xpState	# walk a copy of the state pointer so apState is left untouched
+	# NOTE(review): there is no bound check; a count above 25 would XOR past the 25-lane state - confirm callers.
+	test		$16, aNbrWords	# bit 4 set: take 16 lanes (128 bytes)
+	jz		xorInputToState8
+	mXorState512	apInput, xpState, 0
+	mXorState512	apInput, xpState, 64
+	addq		$128, apInput
+	addq		$128, xpState
+
+xorInputToState8:
+	test		$8, aNbrWords	# bit 3 set: take 8 lanes (64 bytes)
+	jz		xorInputToState4
+	mXorState512	apInput, xpState, 0
+	addq		$64, apInput
+	addq		$64, xpState
+
+xorInputToState4:
+	test		$4, aNbrWords	# bit 2 set: take 4 lanes (32 bytes)
+	jz		xorInputToState2
+	mXorState256	apInput, xpState, 0
+	addq		$32, apInput
+	addq		$32, xpState
+
+xorInputToState2:
+	test		$2, aNbrWords	# bit 1 set: take 2 lanes (16 bytes)
+	jz		xorInputToState1
+	mXorState128	apInput, xpState, 0
+	addq		$16, apInput
+	addq		$16, xpState
+
+xorInputToState1:
+	test		$1, aNbrWords	# bit 0 set: take one final lane (8 bytes)
+	jz		xorInputToStateDone
+	movq		(apInput), %rax
+	xorq		%rax, (xpState)
+
+xorInputToStateDone:
+
+	mPushRegs
+	mKeccakPermutation
+	mPopRegs
+	ret
+	.size	KeccakAbsorb, .-KeccakAbsorb	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakInitializeState
+	.type	KeccakInitializeState, %function
+KeccakInitializeState:
+	xorq		%rax, %rax	# rax = 0
+	xorq		%rcx, %rcx
+	notq		%rcx	# rcx = all ones
+	# Fill the 25 lanes at apState with zero, except lanes 1, 2, 8, 12, 17 and 20, which are set to all ones.
+	.if 		UseSIMD == 0
+	movq		%rax,   0*8(apState)
+	movq		%rcx,   1*8(apState)
+	movq		%rcx,   2*8(apState)
+	movq		%rax,   3*8(apState)
+	movq		%rax,   4*8(apState)
+	movq		%rax,   5*8(apState)
+	movq		%rax,   6*8(apState)
+	movq		%rax,   7*8(apState)
+	movq		%rcx,   8*8(apState)
+	movq		%rax,   9*8(apState)
+	movq		%rax,  10*8(apState)
+	movq		%rax,  11*8(apState)
+	movq		%rcx,  12*8(apState)
+	movq		%rax,  13*8(apState)
+	movq		%rax,  14*8(apState)
+	movq		%rax,  15*8(apState)
+	movq		%rax,  16*8(apState)
+	movq		%rcx,  17*8(apState)
+	movq		%rax,  18*8(apState)
+	movq		%rax,  19*8(apState)
+	movq		%rcx,  20*8(apState)
+	movq		%rax,  21*8(apState)
+	movq		%rax,  22*8(apState)
+	movq		%rax,  23*8(apState)
+	movq		%rax,  24*8(apState)
+	.else
+	pxor		%xmm0, %xmm0
+	# Same pattern as the scalar branch; each movdqu zeroes two adjacent lanes at once.
+	movq		%rax,   0*8(apState)
+	movq		%rcx,   1*8(apState)
+	movq		%rcx,   2*8(apState)
+	movq		%rax,   3*8(apState)
+	movdqu		%xmm0,  4*8(apState)
+	movdqu		%xmm0,  6*8(apState)
+	movq		%rcx,   8*8(apState)
+	movq		%rax,   9*8(apState)
+	movdqu		%xmm0, 10*8(apState)
+	movq		%rcx,  12*8(apState)
+	movq		%rax,  13*8(apState)
+	movdqu		%xmm0, 14*8(apState)
+	movq		%rax,  16*8(apState)
+	movq		%rcx,  17*8(apState)
+	movdqu		%xmm0, 18*8(apState)
+	movq		%rcx,  20*8(apState)
+	movq		%rax,  21*8(apState)
+	movdqu		%xmm0, 22*8(apState)
+	movq		%rax,  24*8(apState)
+	.endif
+	ret
+	.size	KeccakInitializeState, .-KeccakInitializeState	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakExtract1024bits
+	.type	KeccakExtract1024bits, %function
+KeccakExtract1024bits:
+	# Copy lanes 0-15 (128 bytes) from apState to the buffer at rsi, inverting lanes 1, 2, 8 and 12 on the way out.
+	movq		0*8(apState), %rax
+	movq		1*8(apState), %rcx
+	movq		2*8(apState), %rdx
+	movq		3*8(apState), %r8
+	notq		%rcx	# lanes 1 and 2 are among those KeccakInitializeState sets to all ones
+	notq		%rdx
+	movq		%rax, 0*8(%rsi)
+	movq		%rcx, 1*8(%rsi)
+	movq		%rdx, 2*8(%rsi)
+	movq		%r8,  3*8(%rsi)
+
+	movq		4*8(apState), %rax
+	movq		5*8(apState), %rcx
+	movq		6*8(apState), %rdx
+	movq		7*8(apState), %r8
+	movq		%rax, 4*8(%rsi)
+	movq		%rcx, 5*8(%rsi)
+	movq		%rdx, 6*8(%rsi)
+	movq		%r8,  7*8(%rsi)
+
+	movq		 8*8(apState), %rax
+	movq		 9*8(apState), %rcx
+	movq		10*8(apState), %rdx
+	movq		11*8(apState), %r8
+	notq		%rax	# lane 8 is inverted
+	movq		%rax,  8*8(%rsi)
+	movq		%rcx,  9*8(%rsi)
+	movq		%rdx, 10*8(%rsi)
+	movq		%r8,  11*8(%rsi)
+
+	movq		12*8(apState), %rax
+	movq		13*8(apState), %rcx
+	movq		14*8(apState), %rdx
+	movq		15*8(apState), %r8
+	notq		%rax	# lane 12 is inverted
+	movq		%rax, 12*8(%rsi)
+	movq		%rcx, 13*8(%rsi)
+	movq		%rdx, 14*8(%rsi)
+	movq		%r8,  15*8(%rsi)
+	ret
+	.size	KeccakExtract1024bits, .-KeccakExtract1024bits	# must follow the body: the size is the distance from the label to this point
+
diff --git a/c_src/KeccakF-1600-x86-64-shld-gas.s b/c_src/KeccakF-1600-x86-64-shld-gas.s
new file mode 100755
index 0000000..bc84762
--- /dev/null
+++ b/c_src/KeccakF-1600-x86-64-shld-gas.s
@@ -0,0 +1,766 @@
+#
+# The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+# Michaël Peeters and Gilles Van Assche. For more information, feedback or
+# questions, please refer to our website: http://keccak.noekeon.org/
+#
+# Implementation by Ronny Van Keer,
+# hereby denoted as "the implementer".
+# 
+# To the extent possible under law, the implementer has waived all copyright
+# and related or neighboring rights to the source code in this file.
+# http://creativecommons.org/publicdomain/zero/1.0/
+#
+
+	.text
+
+
+#// --- defines
+
+.equ UseSIMD, 1	# 1 selects the movdqu/pxor paths in mXorState128/256/512 and KeccakInitializeState; 0 selects the scalar paths
+
+	# Byte offsets of the 25 64-bit lanes in the state: planes b, g, k, m, s, each with lanes a, e, i, o, u.
+.equ _ba,  0*8
+.equ _be,  1*8
+.equ _bi,  2*8
+.equ _bo,  3*8
+.equ _bu,  4*8
+.equ _ga,  5*8
+.equ _ge,  6*8
+.equ _gi,  7*8
+.equ _go,  8*8
+.equ _gu,  9*8
+.equ _ka, 10*8
+.equ _ke, 11*8
+.equ _ki, 12*8
+.equ _ko, 13*8
+.equ _ku, 14*8
+.equ _ma, 15*8
+.equ _me, 16*8
+.equ _mi, 17*8
+.equ _mo, 18*8
+.equ _mu, 19*8
+.equ _sa, 20*8
+.equ _se, 21*8
+.equ _si, 22*8
+.equ _so, 23*8
+.equ _su, 24*8
+
+	# Register names below overlap on purpose: the absorb arguments are no longer needed once the rounds start.
+#	arguments
+.equ apState,		%rdi
+.equ apInput,		%rsi
+.equ aNbrWords,		%rdx
+
+#	xor input into state section
+.equ xpState,		%r9
+
+# round vars
+.equ rT1,		%rax
+.equ rpState,		%rdi	# same register as apState
+.equ rpStack,		%rsp
+
+.equ rDa,		%rbx
+.equ rDe,		%rcx
+.equ rDi,		%rdx	# same register as aNbrWords
+.equ rDo,		%r8
+.equ rDu,		%r9	# same register as xpState
+
+.equ rBa,		%r10 
+.equ rBe,		%r11
+.equ rBi,		%r12
+.equ rBo,		%r13
+.equ rBu,		%r14
+
+.equ rCa,		%rsi	# same register as apInput
+.equ rCe,		%rbp
+.equ rCi,		rBi	# Ci and Co share registers with Bi and Bo
+.equ rCo,		rBo
+.equ rCu,		%r15
+
+.macro	mKeccakRound	iState, oState, rc, lastRound
+		# One round: reads the 25 lanes of iState, writes the 25 lanes of oState and xors rc into lane ba; needs rCa/rCe/rCu = column parities and rDi/rDo = si/so lanes on entry.
+		movq		rCe, rDa	# Da = rol(Ce, 1) ^ Cu (the xor with Cu follows a few lines below)
+		shld		$1, rDa, rDa	# shld with the same register as both operands acts as a rotate left
+		# Ci = bi ^ gi ^ ki ^ mi ^ si (rDi enters holding the si lane of the input)
+		movq		_bi(\iState), rCi
+		xorq		_gi(\iState), rDi
+		xorq		_ki(\iState), rCi
+		xorq		rCu, rDa
+		xorq		_mi(\iState), rDi
+		xorq		rDi, rCi
+		# De = rol(Ci, 1) ^ Ca
+		movq		rCi, rDe
+		shld		$1, rDe, rDe
+		# Co = bo ^ go ^ ko ^ mo ^ so (rDo enters holding the so lane of the input)
+		movq		_bo(\iState), rCo
+		xorq		_go(\iState), rDo
+		xorq		_ko(\iState), rCo
+		xorq		rCa, rDe
+		xorq		_mo(\iState), rDo
+		xorq		rDo, rCo
+		# Di = rol(Co, 1) ^ Ce
+		movq		rCo, rDi
+		shld		$1, rDi, rDi
+		# Do = rol(Cu, 1) ^ Ci
+		movq		rCu, rDo
+		xorq		rCe, rDi
+		shld		$1, rDo, rDo
+		# Du = rol(Ca, 1) ^ Co
+		movq		rCa, rDu
+		xorq		rCi, rDo
+		shld		$1, rDu, rDu
+		# Output plane b, built from input lanes ba, ge, ki, mo, su (each xored with its D word, then rotated; ba is not rotated).
+		movq		_ba(\iState), rBa
+		movq		_ge(\iState), rBe
+		xorq		rCo, rDu
+		movq		_ki(\iState), rBi
+		movq		_mo(\iState), rBo
+		movq		_su(\iState), rBu
+		xorq		rDe, rBe
+		shld		$44, rBe, rBe
+		xorq		rDi, rBi
+		xorq		rDa, rBa
+		shld		$43, rBi, rBi
+
+		movq		rBe, rCa
+		movq		$\rc, rT1	# the round constant goes into output lane ba only
+		orq		rBi, rCa
+		xorq		rBa, rT1
+		xorq		rT1, rCa
+		movq		rCa, _ba(\oState)
+
+		xorq		rDu, rBu
+		shld		$14, rBu, rBu
+		movq		rBa, rCu
+		andq		rBe, rCu
+		xorq		rBu, rCu
+		movq		rCu, _bu(\oState)
+
+		xorq		rDo, rBo
+		shld		$21, rBo, rBo
+		movq		rBo, rT1
+		andq		rBu, rT1
+		xorq		rBi, rT1
+		movq		rT1, _bi(\oState)
+
+		notq		rBi
+		orq		rBa, rBu
+		orq		rBo, rBi
+		xorq		rBo, rBu
+		xorq		rBe, rBi
+		movq		rBu, _bo(\oState)
+		movq		rBi, _be(\oState)
+		.if		\lastRound == 0	# Ce is restarted from the new be lane only when another round follows
+		movq		rBi, rCe
+		.endif
+
+		# Output plane g, built from input lanes bo, gu, ka, me, si; new lanes ga, ge, gu are xored into Ca, Ce, Cu.
+		movq		_gu(\iState), rBe
+		xorq		rDu, rBe
+		movq		_ka(\iState), rBi
+		shld		$20, rBe, rBe
+		xorq		rDa, rBi
+		shld		$3,  rBi, rBi
+		movq		_bo(\iState), rBa
+		movq		rBe, rT1
+		orq		rBi, rT1
+		xorq		rDo, rBa
+		movq		_me(\iState), rBo
+		movq		_si(\iState), rBu
+		shld		$28, rBa, rBa
+		xorq		rBa, rT1
+		movq		rT1, _ga(\oState)
+		.if		\lastRound == 0
+		xor 		rT1, rCa
+		.endif
+
+		xorq		rDe, rBo
+		shld		$45, rBo, rBo
+		movq		rBi, rT1
+		andq		rBo, rT1
+		xorq		rBe, rT1
+		movq		rT1, _ge(\oState)
+		.if		\lastRound == 0
+		xorq		rT1, rCe
+		.endif
+
+		xorq		rDi, rBu
+		shld		$61, rBu, rBu
+		movq		rBu, rT1
+		orq		rBa, rT1
+		xorq		rBo, rT1
+		movq		rT1, _go(\oState)
+
+		andq		rBe, rBa
+		xorq		rBu, rBa
+		movq		rBa, _gu(\oState)
+		notq		rBu
+		.if		\lastRound == 0
+		xorq		rBa, rCu
+		.endif
+
+		orq		rBu, rBo
+		xorq		rBi, rBo
+		movq		rBo, _gi(\oState)
+
+		# Output plane k, built from input lanes be, gi, ko, mu, sa.
+		movq		_be(\iState), rBa
+		movq		_gi(\iState), rBe
+		movq		_ko(\iState), rBi
+		movq		_mu(\iState), rBo
+		movq		_sa(\iState), rBu
+		xorq		rDi, rBe
+		shld		$6,  rBe, rBe
+		xorq		rDo, rBi
+		shld		$25, rBi, rBi
+		movq		rBe, rT1
+		orq		rBi, rT1
+		xorq		rDe, rBa
+		shld		$1,  rBa, rBa
+		xorq		rBa, rT1
+		movq		rT1, _ka(\oState)
+		.if		\lastRound == 0
+		xor 		rT1, rCa
+		.endif
+
+		xorq		rDu, rBo
+		shld		$8,  rBo, rBo
+		movq		rBi, rT1
+		andq		rBo, rT1
+		xorq		rBe, rT1
+		movq		rT1, _ke(\oState)
+		.if		\lastRound == 0
+		xorq		rT1, rCe
+		.endif
+
+		xorq		rDa, rBu
+		shld		$18, rBu, rBu
+		notq		rBo
+		movq		rBo, rT1
+		andq		rBu, rT1
+		xorq		rBi, rT1
+		movq		rT1, _ki(\oState)
+
+		movq		rBu, rT1
+		orq		rBa, rT1
+		xorq		rBo, rT1
+		movq		rT1, _ko(\oState)
+
+		andq		rBe, rBa
+		xorq		rBu, rBa
+		movq		rBa, _ku(\oState)
+		.if		\lastRound == 0
+		xorq		rBa, rCu
+		.endif
+		# Output plane m, built from input lanes bu, ga, ke, mi, so.
+		movq		_ga(\iState), rBe
+		xorq		rDa, rBe
+		movq		_ke(\iState), rBi
+		shld		$36, rBe, rBe
+		xorq		rDe, rBi
+		movq		_bu(\iState), rBa
+		shld		$10, rBi, rBi
+		movq		rBe, rT1
+		movq		_mi(\iState), rBo
+		andq		rBi, rT1
+		xorq		rDu, rBa
+		movq		_so(\iState), rBu
+		shld		$27, rBa, rBa
+		xorq		rBa, rT1
+		movq		rT1, _ma(\oState)
+		.if		\lastRound == 0
+		xor 		rT1, rCa
+		.endif
+
+		xorq		rDi, rBo
+		shld		$15, rBo, rBo
+		movq		rBi, rT1
+		orq		rBo, rT1
+		xorq		rBe, rT1
+		movq		rT1, _me(\oState)
+		.if		\lastRound == 0
+		xorq		rT1, rCe
+		.endif
+
+		xorq		rDo, rBu
+		shld		$56, rBu, rBu
+		notq		rBo
+		movq		rBo, rT1
+		orq		rBu, rT1
+		xorq		rBi, rT1
+		movq		rT1, _mi(\oState)
+
+		orq		rBa, rBe
+		xorq		rBu, rBe
+		movq		rBe, _mu(\oState)
+
+		andq		rBa, rBu
+		xorq		rBo, rBu
+		movq		rBu, _mo(\oState)
+		.if		\lastRound == 0
+		xorq		rBe, rCu
+		.endif
+
+		# Output plane s, built from input lanes bi, go, ku, ma, se; each D register is reused as a temporary after its last use.
+		movq		_bi(\iState), rBa
+		movq		_go(\iState), rBe
+		movq		_ku(\iState), rBi
+		xorq		rDi, rBa
+		movq		_ma(\iState), rBo
+		shld		$62, rBa, rBa
+		xorq		rDo, rBe
+		movq		_se(\iState), rBu
+		shld		$55, rBe, rBe
+
+		xorq		rDu, rBi
+		movq		rBa, rDu
+		xorq		rDe, rBu
+		shld		$2,  rBu, rBu
+		andq		rBe, rDu
+		xorq		rBu, rDu
+		movq		rDu, _su(\oState)
+
+		shld		$39, rBi, rBi
+		.if		\lastRound == 0
+		xorq		rDu, rCu
+		.endif
+		notq		rBe
+		xorq		rDa, rBo
+		movq		rBe, rDa
+		andq		rBi, rDa
+		xorq		rBa, rDa
+		movq		rDa, _sa(\oState)
+		.if		\lastRound == 0
+		xor 		rDa, rCa
+		.endif
+
+		shld		$41, rBo, rBo
+		movq		rBi, rDe
+		orq		rBo, rDe
+		xorq		rBe, rDe
+		movq		rDe, _se(\oState)
+		.if		\lastRound == 0
+		xorq		rDe, rCe
+		.endif
+		# rDi and rDo leave holding the new si and so lanes, which the next round folds into Ci and Co.
+		movq		rBo, rDi
+		movq		rBu, rDo
+		andq		rBu, rDi
+		orq		rBa, rDo
+		xorq		rBi, rDi
+		xorq		rBo, rDo
+		movq		rDi, _si(\oState)
+		movq		rDo, _so(\oState)
+
+		.endm
+
+.macro	mKeccakPermutation	
+		# Apply 24 rounds to the 25-lane state at rpState; 25*8 bytes of stack serve as the second buffer.
+		subq		$8*25, %rsp
+		# Seed rCa/rCe/rCu with the XOR of the a/e/u lanes of the five planes (b, g, k, m, s).
+		movq		_ba(rpState), rCa             
+		movq		_be(rpState), rCe
+		movq		_bu(rpState), rCu
+
+		xorq		_ga(rpState), rCa             
+		xorq		_ge(rpState), rCe
+		xorq		_gu(rpState), rCu             
+
+		xorq		_ka(rpState), rCa             
+		xorq		_ke(rpState), rCe
+		xorq		_ku(rpState), rCu             
+
+		xorq		_ma(rpState), rCa             
+		xorq		_me(rpState), rCe
+		xorq		_mu(rpState), rCu             
+
+		xorq		_sa(rpState), rCa
+		xorq		_se(rpState), rCe
+		movq		_si(rpState), rDi
+		movq		_so(rpState), rDo
+		xorq		_su(rpState), rCu             
+		# rDi/rDo receive only the raw si/so lanes: mKeccakRound folds them into Ci and Co itself.
+		# Arguments: input buffer, output buffer, round constant, lastRound flag; the two buffers swap roles every round.
+		mKeccakRound	rpState, rpStack, 0x0000000000000001, 0
+		mKeccakRound	rpStack, rpState, 0x0000000000008082, 0
+		mKeccakRound	rpState, rpStack, 0x800000000000808a, 0
+		mKeccakRound	rpStack, rpState, 0x8000000080008000, 0
+		mKeccakRound	rpState, rpStack, 0x000000000000808b, 0
+		mKeccakRound	rpStack, rpState, 0x0000000080000001, 0
+
+		mKeccakRound	rpState, rpStack, 0x8000000080008081, 0
+		mKeccakRound	rpStack, rpState, 0x8000000000008009, 0
+		mKeccakRound	rpState, rpStack, 0x000000000000008a, 0
+		mKeccakRound	rpStack, rpState, 0x0000000000000088, 0
+		mKeccakRound	rpState, rpStack, 0x0000000080008009, 0
+		mKeccakRound	rpStack, rpState, 0x000000008000000a, 0
+
+		mKeccakRound	rpState, rpStack, 0x000000008000808b, 0
+		mKeccakRound	rpStack, rpState, 0x800000000000008b, 0
+		mKeccakRound	rpState, rpStack, 0x8000000000008089, 0
+		mKeccakRound	rpStack, rpState, 0x8000000000008003, 0
+		mKeccakRound	rpState, rpStack, 0x8000000000008002, 0
+		mKeccakRound	rpStack, rpState, 0x8000000000000080, 0
+
+		mKeccakRound	rpState, rpStack, 0x000000000000800a, 0
+		mKeccakRound	rpStack, rpState, 0x800000008000000a, 0
+		mKeccakRound	rpState, rpStack, 0x8000000080008081, 0
+		mKeccakRound	rpStack, rpState, 0x8000000000008080, 0
+		mKeccakRound	rpState, rpStack, 0x0000000080000001, 0
+		mKeccakRound	rpStack, rpState, 0x8000000080008008, 1
+		# The 24th call above writes to rpState with lastRound=1, so the result lands in the original state buffer.
+		addq		$8*25, %rsp
+
+		.endm
+
+.macro	mPushRegs	
+	# Save rbx, rbp and r12-r15, which the round code uses as rDa, rCe, rBi, rBo, rBu and rCu; mPopRegs undoes this.
+	pushq		%rbx
+	pushq		%rbp
+	pushq		%r12
+	pushq		%r13
+	pushq		%r14
+	pushq		%r15
+
+	.endm
+
+
+.macro	mPopRegs	
+	# Restore the six registers saved by mPushRegs, in the reverse of the push order.
+	popq		%r15
+	popq		%r14
+	popq		%r13
+	popq		%r12
+	popq		%rbp
+	popq		%rbx
+
+	.endm
+
+
+.macro	mXorState128	input, state, offset
+	.if 		UseSIMD == 0	# XOR the 16 bytes at offset(input) into offset(state); this scalar path clobbers rax and rcx
+	movq		\offset(\input), %rax
+	movq		\offset+8(\input), %rcx
+	xorq		%rax, \offset(\state)
+	xorq		%rcx, \offset+8(\state)
+	.else	# SSE2 path: clobbers xmm0
+	movdqu		\offset(\input), %xmm0	# unaligned load, so the input needs no particular alignment
+	pxor		\offset(\state), %xmm0	# NOTE(review): non-VEX pxor with a memory operand needs 16-byte alignment of the state - confirm callers guarantee it
+	movdqu		%xmm0, \offset(\state)
+	.endif
+	.endm
+
+.macro	mXorState256	input, state, offset
+	.if 		UseSIMD == 0	# XOR the 32 bytes at offset(input) into offset(state); this scalar path clobbers rax, rcx, r8 and r10
+	movq		\offset(\input), %rax
+	movq		\offset+8(\input), %r10
+	movq		\offset+16(\input), %rcx
+	movq		\offset+24(\input), %r8
+	xorq		%rax, \offset(\state)
+	xorq		%r10, \offset+8(\state)
+	xorq		%rcx, \offset+16(\state)
+	xorq		%r8,  \offset+24(\state)
+	.else	# SSE2 path: clobbers xmm0 and xmm1
+	movdqu		\offset(\input), %xmm0	# unaligned loads, so the input needs no particular alignment
+	pxor		\offset(\state), %xmm0	# NOTE(review): non-VEX pxor with a memory operand needs 16-byte alignment of the state - confirm callers guarantee it
+	movdqu		\offset+16(\input), %xmm1
+	pxor		\offset+16(\state), %xmm1
+	movdqu		%xmm0, \offset(\state)
+	movdqu		%xmm1, \offset+16(\state)
+	.endif
+	.endm
+
+.macro	mXorState512	input, state, offset
+	.if 		UseSIMD == 0	# XOR the 64 bytes at offset(input) into offset(state); the scalar path is two 256-bit halves
+	mXorState256	\input, \state, \offset
+	mXorState256	\input, \state, \offset+32
+	.else	# SSE2 path: clobbers xmm0-xmm3; loads, XORs and stores are interleaved
+	movdqu		\offset(\input), %xmm0	# unaligned loads, so the input needs no particular alignment
+	movdqu		\offset+16(\input), %xmm1
+	pxor		\offset(\state), %xmm0	# NOTE(review): non-VEX pxor with a memory operand needs 16-byte alignment of the state - confirm callers guarantee it
+	movdqu		\offset+32(\input), %xmm2
+	pxor		\offset+16(\state), %xmm1
+	movdqu		%xmm0, \offset(\state)
+	movdqu		\offset+48(\input), %xmm3
+	pxor		\offset+32(\state), %xmm2
+	movdqu		%xmm1, \offset+16(\state)
+	pxor		\offset+48(\state), %xmm3
+	movdqu		%xmm2, \offset+32(\state)
+	movdqu		%xmm3, \offset+48(\state)
+	.endif
+	.endm
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakPermutation
+	.type	KeccakPermutation, %function
+KeccakPermutation:
+	# Permute the state at rdi in place: save the registers the round code uses, run all 24 rounds, restore them.
+	mPushRegs
+	mKeccakPermutation
+	mPopRegs
+	ret
+	.size	KeccakPermutation, .-KeccakPermutation	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakAbsorb576bits
+	.type	KeccakAbsorb576bits, %function
+KeccakAbsorb576bits:
+	# XOR 576 bits (72 bytes = 9 lanes) from apInput (rsi) into the first lanes at apState (rdi), then permute the state.
+	mXorState512	apInput, apState, 0
+	movq		64(apInput), %rax	# ninth lane, handled with a scalar load/xor
+	xorq		%rax, 64(apState)
+	mPushRegs
+	mKeccakPermutation
+	mPopRegs
+	ret
+	.size	KeccakAbsorb576bits, .-KeccakAbsorb576bits	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakAbsorb832bits
+	.type	KeccakAbsorb832bits, %function
+KeccakAbsorb832bits:
+	# XOR 832 bits (104 bytes = 13 lanes) from apInput (rsi) into the first lanes at apState (rdi), then permute the state.
+	mXorState512	apInput, apState, 0
+	mXorState256	apInput, apState, 64
+	movq		96(apInput), %rax	# thirteenth lane, handled with a scalar load/xor
+	xorq		%rax, 96(apState)
+	mPushRegs
+	mKeccakPermutation
+	mPopRegs
+	ret
+	.size	KeccakAbsorb832bits, .-KeccakAbsorb832bits	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakAbsorb1024bits
+	.type	KeccakAbsorb1024bits, %function
+KeccakAbsorb1024bits:
+	# XOR 1024 bits (128 bytes = 16 lanes) from apInput (rsi) into the first lanes at apState (rdi), then permute the state.
+	mXorState512	apInput, apState, 0
+	mXorState512	apInput, apState, 64
+	mPushRegs
+	mKeccakPermutation
+	mPopRegs
+	ret
+	.size	KeccakAbsorb1024bits, .-KeccakAbsorb1024bits	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakAbsorb1088bits
+	.type	KeccakAbsorb1088bits, %function
+KeccakAbsorb1088bits:
+	# XOR 1088 bits (136 bytes = 17 lanes) from apInput (rsi) into the first lanes at apState (rdi), then permute the state.
+	mXorState512	apInput, apState, 0
+	mXorState512	apInput, apState, 64
+	movq		128(apInput), %rax	# seventeenth lane, handled with a scalar load/xor
+	xorq		%rax, 128(apState)
+	mPushRegs
+	mKeccakPermutation
+	mPopRegs
+	ret
+	.size	KeccakAbsorb1088bits, .-KeccakAbsorb1088bits	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakAbsorb1152bits
+	.type	KeccakAbsorb1152bits, %function
+KeccakAbsorb1152bits:
+	# XOR 1152 bits (144 bytes = 18 lanes) from apInput (rsi) into the first lanes at apState (rdi), then permute the state.
+	mXorState512	apInput, apState, 0
+	mXorState512	apInput, apState, 64
+	mXorState128	apInput, apState, 128
+	mPushRegs
+	mKeccakPermutation
+	mPopRegs
+	ret
+	.size	KeccakAbsorb1152bits, .-KeccakAbsorb1152bits	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakAbsorb1344bits
+	.type	KeccakAbsorb1344bits, %function
+KeccakAbsorb1344bits:
+	# XOR 1344 bits (168 bytes = 21 lanes) from apInput (rsi) into the first lanes at apState (rdi), then permute the state.
+	mXorState512	apInput, apState, 0
+	mXorState512	apInput, apState, 64
+	mXorState256	apInput, apState, 128
+	movq		160(apInput), %rax	# twenty-first lane, handled with a scalar load/xor
+	xorq		%rax, 160(apState)
+	mPushRegs
+	mKeccakPermutation
+	mPopRegs
+	ret
+	.size	KeccakAbsorb1344bits, .-KeccakAbsorb1344bits	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakAbsorb
+	.type	KeccakAbsorb, %function
+KeccakAbsorb:
+	# XOR aNbrWords (rdx) 64-bit lanes from apInput (rsi) into the front of the state at apState (rdi), then permute; only bits 0-4 of the count are tested.
+	movq		apState, xpState	# walk a copy (r9) so rdi still points at the state for the permutation
+	# NOTE(review): there is no bound check; a count above 25 would XOR past the 25-lane state - confirm callers.
+	test		$16, aNbrWords	# bit 4 set: take 16 lanes (128 bytes)
+	jz		xorInputToState8
+	mXorState512	apInput, xpState, 0
+	mXorState512	apInput, xpState, 64
+	addq		$128, apInput
+	addq		$128, xpState
+
+xorInputToState8:
+	test		$8, aNbrWords	# bit 3 set: take 8 lanes (64 bytes)
+	jz		xorInputToState4
+	mXorState512	apInput, xpState, 0
+	addq		$64, apInput
+	addq		$64, xpState
+
+xorInputToState4:
+	test		$4, aNbrWords	# bit 2 set: take 4 lanes (32 bytes)
+	jz		xorInputToState2
+	mXorState256	apInput, xpState, 0
+	addq		$32, apInput
+	addq		$32, xpState
+
+xorInputToState2:
+	test		$2, aNbrWords	# bit 1 set: take 2 lanes (16 bytes)
+	jz		xorInputToState1
+	mXorState128	apInput, xpState, 0
+	addq		$16, apInput
+	addq		$16, xpState
+
+xorInputToState1:
+	test		$1, aNbrWords	# bit 0 set: take one final lane (8 bytes)
+	jz		xorInputToStateDone
+	movq		(apInput), %rax
+	xorq		%rax, (xpState)
+
+xorInputToStateDone:
+
+	mPushRegs
+	mKeccakPermutation
+	mPopRegs
+	ret
+	.size	KeccakAbsorb, .-KeccakAbsorb	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakInitializeState
+	.type	KeccakInitializeState, %function
+KeccakInitializeState:
+	xorq		%rax, %rax	# rax = 0
+	xorq		%rcx, %rcx
+	notq		%rcx	# rcx = all ones
+	# Fill the 25 lanes at apState (rdi) with zero, except lanes 1, 2, 8, 12, 17 and 20, which are set to all ones.
+	.if 		UseSIMD == 0
+	movq		%rax,   0*8(apState)
+	movq		%rcx,   1*8(apState)
+	movq		%rcx,   2*8(apState)
+	movq		%rax,   3*8(apState)
+	movq		%rax,   4*8(apState)
+	movq		%rax,   5*8(apState)
+	movq		%rax,   6*8(apState)
+	movq		%rax,   7*8(apState)
+	movq		%rcx,   8*8(apState)
+	movq		%rax,   9*8(apState)
+	movq		%rax,  10*8(apState)
+	movq		%rax,  11*8(apState)
+	movq		%rcx,  12*8(apState)
+	movq		%rax,  13*8(apState)
+	movq		%rax,  14*8(apState)
+	movq		%rax,  15*8(apState)
+	movq		%rax,  16*8(apState)
+	movq		%rcx,  17*8(apState)
+	movq		%rax,  18*8(apState)
+	movq		%rax,  19*8(apState)
+	movq		%rcx,  20*8(apState)
+	movq		%rax,  21*8(apState)
+	movq		%rax,  22*8(apState)
+	movq		%rax,  23*8(apState)
+	movq		%rax,  24*8(apState)
+	.else
+	pxor		%xmm0, %xmm0
+	# Same pattern as the scalar branch; each movdqu zeroes two adjacent lanes at once.
+	movq		%rax,   0*8(apState)
+	movq		%rcx,   1*8(apState)
+	movq		%rcx,   2*8(apState)
+	movq		%rax,   3*8(apState)
+	movdqu		%xmm0,  4*8(apState)
+	movdqu		%xmm0,  6*8(apState)
+	movq		%rcx,   8*8(apState)
+	movq		%rax,   9*8(apState)
+	movdqu		%xmm0, 10*8(apState)
+	movq		%rcx,  12*8(apState)
+	movq		%rax,  13*8(apState)
+	movdqu		%xmm0, 14*8(apState)
+	movq		%rax,  16*8(apState)
+	movq		%rcx,  17*8(apState)
+	movdqu		%xmm0, 18*8(apState)
+	movq		%rcx,  20*8(apState)
+	movq		%rax,  21*8(apState)
+	movdqu		%xmm0, 22*8(apState)
+	movq		%rax,  24*8(apState)
+	.endif
+	ret
+	.size	KeccakInitializeState, .-KeccakInitializeState	# must follow the body: the size is the distance from the label to this point
+
+# -------------------------------------------------------------------------
+
+	.align	2
+	.global	KeccakExtract1024bits
+	.type	KeccakExtract1024bits, %function
+KeccakExtract1024bits:
+	# Copy lanes 0-15 (128 bytes) from apState (rdi) to the buffer at rsi, inverting lanes 1, 2, 8 and 12 on the way out.
+	movq		0*8(apState), %rax
+	movq		1*8(apState), %rcx
+	movq		2*8(apState), %rdx
+	movq		3*8(apState), %r8
+	notq		%rcx	# lanes 1 and 2 are among those KeccakInitializeState sets to all ones
+	notq		%rdx
+	movq		%rax, 0*8(%rsi)
+	movq		%rcx, 1*8(%rsi)
+	movq		%rdx, 2*8(%rsi)
+	movq		%r8,  3*8(%rsi)
+
+	movq		4*8(apState), %rax
+	movq		5*8(apState), %rcx
+	movq		6*8(apState), %rdx
+	movq		7*8(apState), %r8
+	movq		%rax, 4*8(%rsi)
+	movq		%rcx, 5*8(%rsi)
+	movq		%rdx, 6*8(%rsi)
+	movq		%r8,  7*8(%rsi)
+
+	movq		 8*8(apState), %rax
+	movq		 9*8(apState), %rcx
+	movq		10*8(apState), %rdx
+	movq		11*8(apState), %r8
+	notq		%rax	# lane 8 is inverted
+	movq		%rax,  8*8(%rsi)
+	movq		%rcx,  9*8(%rsi)
+	movq		%rdx, 10*8(%rsi)
+	movq		%r8,  11*8(%rsi)
+
+	movq		12*8(apState), %rax
+	movq		13*8(apState), %rcx
+	movq		14*8(apState), %rdx
+	movq		15*8(apState), %r8
+	notq		%rax	# lane 12 is inverted
+	movq		%rax, 12*8(%rsi)
+	movq		%rcx, 13*8(%rsi)
+	movq		%rdx, 14*8(%rsi)
+	movq		%r8,  15*8(%rsi)
+	ret
+	.size	KeccakExtract1024bits, .-KeccakExtract1024bits	# must follow the body: the size is the distance from the label to this point
+
diff --git a/c_src/KeccakF-1600-xop.macros b/c_src/KeccakF-1600-xop.macros
new file mode 100755
index 0000000..e5d6514
--- /dev/null
+++ b/c_src/KeccakF-1600-xop.macros
@@ -0,0 +1,573 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#define declareABCDE /* locals named by this file's load, round and store macros; none is initialized here */ \
+    V128 Abage, Abegi, Abigo, Abogu, Abuga; /* A: state lanes packed in pairs (filled by copyFromState* when X is A) */ \
+    V128 Akame, Akemi, Akimo, Akomu, Akuma; \
+    V128 Abae, Abio, Agae, Agio, Akae, Akio, Amae, Amio; /* A: pairs exactly as loaded from memory, before re-pairing */ \
+    V64 Aba, Abe, Abi, Abo, Abu; /* A: single lanes split out of the loaded pairs */ \
+    V64 Aga, Age, Agi, Ago, Agu; \
+    V64 Aka, Ake, Aki, Ako, Aku; \
+    V64 Ama, Ame, Ami, Amo, Amu; \
+    V128 Asase, Asiso; \
+    V64 Asu; \
+    V128 Bbage, Bbegi, Bbigo, Bbogu, Bbuga; /* B: rotated words inside one round */ \
+    V128 Bkame, Bkemi, Bkimo, Bkomu, Bkuma; \
+    V128 Bsase, Bsesi, Bsiso, Bsosu, Bsusa; \
+    V128 Cae, Cei, Cio, Cou, Cua; /* C: XOR accumulators; computeD reads Cae, Cio and Cua */ \
+    V128 Dau, Dea, Die, Doi, Duo; /* D: words produced by computeD and XORed into the A words by the round */ \
+    V128 Dua, Dae, Dei, Dio, Dou; \
+    V128 Ebage, Ebegi, Ebigo, Ebogu, Ebuga; /* E: round output, same packing as A (when E is passed as the output prefix) */ \
+    V128 Ekame, Ekemi, Ekimo, Ekomu, Ekuma; \
+    V128 Esase, Esiso; \
+    V64 Esu; \
+    V128 Zero;
+
+#define prepareTheta /* expands to nothing: Cae, Cio and Cua are already accumulated by copyFromState* and by each round macro */
+
+#define computeD /* derives the D words from the accumulators Cae, Cio, Cua; overwrites Cua and Cei on the way */ \
+    Cua = GET64LOLO(Cua, Cae); \
+    Dei = XOR128(Cae, ROL6464same(Cio, 1)); \
+    Dou = XOR128(Cio, ROL6464same(Cua, 1)); \
+    Cei = GET64HILO(Cae, Cio); \
+    Dae = XOR128(Cua, ROL6464same(Cei, 1)); \
+    Dau = GET64LOHI(Dae, Dou); /* from here on: re-pairings of Dae, Dei and Dou only */ \
+    Dea = SWAP64(Dae); \
+    Die = SWAP64(Dei); \
+    Doi = GET64LOLO(Dou, Die); \
+    Duo = SWAP64(Dou);
+
+// --- Theta Rho Pi Chi Iota Prepare-theta: one round from the A##* words to the E##* words; i indexes KeccakF1600RoundConstants
+// --- 64-bit lanes mapped to 64-bit and 128-bit words; on exit Cae, Cio and Cua hold the E##* sums that the next computeD reads
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    computeD \
+    Zero = ZERO128(); /* assigned up front: the E##su line reads Zero, and declareABCDE leaves it uninitialized */ \
+    Bbage = XOR128(GET64LOHI(A##bage, A##bogu), Dau); \
+    Bbage = ROL6464(Bbage, 0, 20); \
+    Bbegi = XOR128(GET64HILO(A##bage, A##kame), Dea); \
+    Bbegi = ROL6464(Bbegi, 44, 3); \
+    Bbigo = XOR128(GET64LOHI(A##kimo, A##kame), Die); \
+    Bbigo = ROL6464(Bbigo, 43, 45); \
+    E##bage = XOR128(Bbage, ANDnu128(Bbegi, Bbigo)); \
+    XOReq128(E##bage, CONST64(KeccakF1600RoundConstants[i])); \
+    Cae = E##bage; \
+    Bbogu = XOR128(GET64HILO(A##kimo, A##siso), Doi); \
+    Bbogu = ROL6464(Bbogu, 21, 61); \
+    E##begi = XOR128(Bbegi, ANDnu128(Bbigo, Bbogu)); \
+    Cei = E##begi; \
+    Bbuga = XOR128(GET64LOLO(A##su, A##bogu), Duo); \
+    Bbuga = ROL6464(Bbuga, 14, 28); \
+    E##bigo = XOR128(Bbigo, ANDnu128(Bbogu, Bbuga)); \
+    Cio = E##bigo; \
+    E##bogu = XOR128(Bbogu, ANDnu128(Bbuga, Bbage)); \
+    Cou = E##bogu; \
+    E##buga = XOR128(Bbuga, ANDnu128(Bbage, Bbegi)); \
+    Cua = E##buga; \
+    /* words E##k*: same pattern, each result XORed into the accumulator started above */ \
+    Bkame = XOR128(GET64LOHI(A##begi, A##buga), Dea); \
+    Bkame = ROL6464(Bkame, 1, 36); \
+    Bkemi = XOR128(GET64HILO(A##begi, A##kemi), Die); \
+    Bkemi = ROL6464(Bkemi, 6, 10); \
+    Bkimo = XOR128(GET64LOHI(A##komu, A##kemi), Doi); \
+    Bkimo = ROL6464(Bkimo, 25, 15); \
+    E##kame = XOR128(Bkame, ANDnu128(Bkemi, Bkimo)); \
+    XOReq128(Cae, E##kame); \
+    Bkomu = XOR128(GET64HIHI(A##komu, A##siso), Duo); \
+    Bkomu = ROL6464(Bkomu, 8, 56); \
+    E##kemi = XOR128(Bkemi, ANDnu128(Bkimo, Bkomu)); \
+    XOReq128(Cei, E##kemi); \
+    Bkuma = XOR128(GET64LOLO(A##sase, A##buga), Dau); \
+    Bkuma = ROL6464(Bkuma, 18, 27); \
+    E##kimo = XOR128(Bkimo, ANDnu128(Bkomu, Bkuma)); \
+    XOReq128(Cio, E##kimo); \
+    E##komu = XOR128(Bkomu, ANDnu128(Bkuma, Bkame)); \
+    XOReq128(Cou, E##komu); \
+    E##kuma = XOR128(Bkuma, ANDnu128(Bkame, Bkemi)); \
+    XOReq128(Cua, E##kuma); \
+    /* words E##sase, E##siso and E##su */ \
+    Bsase = XOR128(A##bigo, SWAP64(Doi)); \
+    Bsase = ROL6464(Bsase, 62, 55); \
+    Bsiso = XOR128(A##kuma, SWAP64(Dau)); \
+    Bsiso = ROL6464(Bsiso, 39, 41); \
+    Bsusa = XOR64(COPY64HI2LO(A##sase), Dei); \
+    Bsusa = ROL6464same(Bsusa, 2); \
+    Bsusa = GET64LOLO(Bsusa, Bsase); \
+    Bsesi = GET64HILO(Bsase, Bsiso); \
+    Bsosu = GET64HILO(Bsiso, Bsusa); \
+    E##sase = XOR128(Bsase, ANDnu128(Bsesi, Bsiso)); \
+    XOReq128(Cae, E##sase); \
+    E##siso = XOR128(Bsiso, ANDnu128(Bsosu, Bsusa)); \
+    XOReq128(Cio, E##siso); \
+    E##su = GET64LOLO(XOR128(Bsusa, ANDnu128(Bsase, Bsesi)), Zero); \
+    XOReq128(Cua, E##su); \
+\
+    /* fold Cei and Cou into Cae, Cio and Cua, the only three accumulators computeD reads */ \
+    XOReq128(Cae, GET64HIHI(Cua, Zero)); \
+    XOReq128(Cae, GET64LOLO(Zero, Cei)); \
+    XOReq128(Cio, GET64HIHI(Cei, Zero)); \
+    XOReq128(Cio, GET64LOLO(Zero, Cou)); \
+    XOReq128(Cua, GET64HIHI(Cou, Zero)); \
+
+// --- Theta Rho Pi Chi Iota
+// --- 64-bit lanes mapped to 64-bit and 128-bit words; plain alias of thetaRhoPiChiIotaPrepareTheta
+#define thetaRhoPiChiIota(i, A, E) thetaRhoPiChiIotaPrepareTheta(i, A, E)
+
+const UINT64 KeccakF1600RoundConstants[24] = { /* entry i is XORed into E##bage by thetaRhoPiChiIotaPrepareTheta(i, A, E) */
+    0x0000000000000001ULL,
+    0x0000000000008082ULL,
+    0x800000000000808aULL,
+    0x8000000080008000ULL,
+    0x000000000000808bULL,
+    0x0000000080000001ULL,
+    0x8000000080008081ULL,
+    0x8000000000008009ULL,
+    0x000000000000008aULL,
+    0x0000000000000088ULL,
+    0x0000000080008009ULL,
+    0x000000008000000aULL,
+    0x000000008000808bULL,
+    0x800000000000008bULL,
+    0x8000000000008089ULL,
+    0x8000000000008003ULL,
+    0x8000000000008002ULL,
+    0x8000000000000080ULL,
+    0x000000000000800aULL,
+    0x800000008000000aULL,
+    0x8000000080008081ULL,
+    0x8000000000008080ULL,
+    0x0000000080000001ULL,
+    0x8000000080008008ULL };
+
+#define copyFromStateAndXor576bits(X, state, input) /* X##* = the 25 state lanes, with input lanes 0..8 (576 bits) XORed into lanes 0..8 */ \
+    X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae; \
+    X##be = GET64HIHI(X##bae, X##bae); \
+    Cae = X##bae; /* Cae, Cio and Cua collect the XOR of everything loaded; computeD reads them later */ \
+    X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio; \
+    X##bo = GET64HIHI(X##bio, X##bio); \
+    Cio = X##bio; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cua = X##bu; \
+    X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae; \
+    X##buga = GET64LOLO(X##bu, X##ga); /* re-pair lanes into the packed words the round macro works on */ \
+    X##ge = GET64HIHI(X##gae, X##gae); \
+    X##bage = GET64LOLO(X##ba, X##ge); \
+    XOReq128(Cae, X##gae); \
+    X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); /* lanes 7..8: the last input lanes at this rate */ \
+    X##gi = X##gio; \
+    X##begi = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio, X##gio); \
+    X##bigo = GET64LOLO(X##bi, X##go); \
+    XOReq128(Cio, X##gio); \
+    X##gu = LOAD64(state[ 9]); /* from lane 9 on, only the state is read */ \
+    X##bogu = GET64LOLO(X##bo, X##gu); \
+    XOReq64(Cua, X##gu); \
+    X##kae = LOAD128(state[10]); \
+    X##ka = X##kae; \
+    X##ke = GET64HIHI(X##kae, X##kae); \
+    XOReq128(Cae, X##kae); \
+    X##kio = LOAD128(state[12]); \
+    X##ki = X##kio; \
+    X##ko = GET64HIHI(X##kio, X##kio); \
+    XOReq128(Cio, X##kio); \
+    X##kuma = LOAD128(state[14]); \
+    XOReq64(Cua, X##kuma); \
+    X##me = LOAD64(state[16]); \
+    X##kame = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \
+    X##mio = LOAD128u(state[17]); \
+    X##mi = X##mio; \
+    X##kemi = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio, X##mio); \
+    X##kimo = GET64LOLO(X##ki, X##mo); \
+    XOReq128(Cio, X##mio); \
+    X##mu = LOAD64(state[19]); \
+    X##komu = GET64LOLO(X##ko, X##mu); \
+    XOReq64(Cua, X##mu); \
+    X##sase = LOAD128(state[20]); \
+    XOReq128(Cae, X##sase); \
+    X##siso = LOAD128(state[22]); \
+    XOReq128(Cio, X##siso); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cua, X##su); \
+
+#define copyFromStateAndXor832bits(X, state, input) /* X##* = the 25 state lanes, with input lanes 0..12 (832 bits) XORed into lanes 0..12 */ \
+    X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae; \
+    X##be = GET64HIHI(X##bae, X##bae); \
+    Cae = X##bae; /* Cae, Cio and Cua collect the XOR of everything loaded; computeD reads them later */ \
+    X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio; \
+    X##bo = GET64HIHI(X##bio, X##bio); \
+    Cio = X##bio; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cua = X##bu; \
+    X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae; \
+    X##buga = GET64LOLO(X##bu, X##ga); /* re-pair lanes into the packed words the round macro works on */ \
+    X##ge = GET64HIHI(X##gae, X##gae); \
+    X##bage = GET64LOLO(X##ba, X##ge); \
+    XOReq128(Cae, X##gae); \
+    X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio; \
+    X##begi = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio, X##gio); \
+    X##bigo = GET64LOLO(X##bi, X##go); \
+    XOReq128(Cio, X##gio); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    X##bogu = GET64LOLO(X##bo, X##gu); \
+    XOReq64(Cua, X##gu); \
+    X##kae = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae; \
+    X##ke = GET64HIHI(X##kae, X##kae); \
+    XOReq128(Cae, X##kae); \
+    X##kio = XOR128(LOAD128(state[12]), LOAD64(input[12])); /* last input lane; presumably LOAD64 leaves the upper half zero so lane 13 is untouched -- confirm */ \
+    X##ki = X##kio; \
+    X##ko = GET64HIHI(X##kio, X##kio); \
+    XOReq128(Cio, X##kio); \
+    X##kuma = LOAD128(state[14]); /* from lane 14 on, only the state is read */ \
+    XOReq64(Cua, X##kuma); \
+    X##me = LOAD64(state[16]); \
+    X##kame = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \
+    X##mio = LOAD128u(state[17]); \
+    X##mi = X##mio; \
+    X##kemi = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio, X##mio); \
+    X##kimo = GET64LOLO(X##ki, X##mo); \
+    XOReq128(Cio, X##mio); \
+    X##mu = LOAD64(state[19]); \
+    X##komu = GET64LOLO(X##ko, X##mu); \
+    XOReq64(Cua, X##mu); \
+    X##sase = LOAD128(state[20]); \
+    XOReq128(Cae, X##sase); \
+    X##siso = LOAD128(state[22]); \
+    XOReq128(Cio, X##siso); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cua, X##su); \
+
+#define copyFromStateAndXor1024bits(X, state, input) /* X##* = the 25 state lanes, with input lanes 0..15 (1024 bits) XORed into lanes 0..15 */ \
+    X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae; \
+    X##be = GET64HIHI(X##bae, X##bae); \
+    Cae = X##bae; /* Cae, Cio and Cua collect the XOR of everything loaded; computeD reads them later */ \
+    X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio; \
+    X##bo = GET64HIHI(X##bio, X##bio); \
+    Cio = X##bio; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cua = X##bu; \
+    X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae; \
+    X##buga = GET64LOLO(X##bu, X##ga); /* re-pair lanes into the packed words the round macro works on */ \
+    X##ge = GET64HIHI(X##gae, X##gae); \
+    X##bage = GET64LOLO(X##ba, X##ge); \
+    XOReq128(Cae, X##gae); \
+    X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio; \
+    X##begi = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio, X##gio); \
+    X##bigo = GET64LOLO(X##bi, X##go); \
+    XOReq128(Cio, X##gio); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    X##bogu = GET64LOLO(X##bo, X##gu); \
+    XOReq64(Cua, X##gu); \
+    X##kae = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae; \
+    X##ke = GET64HIHI(X##kae, X##kae); \
+    XOReq128(Cae, X##kae); \
+    X##kio = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
+    X##ki = X##kio; \
+    X##ko = GET64HIHI(X##kio, X##kio); \
+    XOReq128(Cio, X##kio); \
+    X##kuma = XOR128(LOAD128(state[14]), LOAD128u(input[14])); /* last input lanes; unaligned load, like every other 128-bit read of input */ \
+    XOReq64(Cua, X##kuma); \
+    X##me = LOAD64(state[16]); /* from lane 16 on, only the state is read */ \
+    X##kame = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \
+    X##mio = LOAD128u(state[17]); \
+    X##mi = X##mio; \
+    X##kemi = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio, X##mio); \
+    X##kimo = GET64LOLO(X##ki, X##mo); \
+    XOReq128(Cio, X##mio); \
+    X##mu = LOAD64(state[19]); \
+    X##komu = GET64LOLO(X##ko, X##mu); \
+    XOReq64(Cua, X##mu); \
+    X##sase = LOAD128(state[20]); \
+    XOReq128(Cae, X##sase); \
+    X##siso = LOAD128(state[22]); \
+    XOReq128(Cio, X##siso); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cua, X##su); \
+
+#define copyFromStateAndXor1088bits(X, state, input) /* X##* = the 25 state lanes, with input lanes 0..16 (1088 bits) XORed into lanes 0..16 */ \
+    X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae; \
+    X##be = GET64HIHI(X##bae, X##bae); \
+    Cae = X##bae; /* Cae, Cio and Cua collect the XOR of everything loaded; computeD reads them later */ \
+    X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio; \
+    X##bo = GET64HIHI(X##bio, X##bio); \
+    Cio = X##bio; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cua = X##bu; \
+    X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae; \
+    X##buga = GET64LOLO(X##bu, X##ga); /* re-pair lanes into the packed words the round macro works on */ \
+    X##ge = GET64HIHI(X##gae, X##gae); \
+    X##bage = GET64LOLO(X##ba, X##ge); \
+    XOReq128(Cae, X##gae); \
+    X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio; \
+    X##begi = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio, X##gio); \
+    X##bigo = GET64LOLO(X##bi, X##go); \
+    XOReq128(Cio, X##gio); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    X##bogu = GET64LOLO(X##bo, X##gu); \
+    XOReq64(Cua, X##gu); \
+    X##kae = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae; \
+    X##ke = GET64HIHI(X##kae, X##kae); \
+    XOReq128(Cae, X##kae); \
+    X##kio = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
+    X##ki = X##kio; \
+    X##ko = GET64HIHI(X##kio, X##kio); \
+    XOReq128(Cio, X##kio); \
+    X##kuma = XOR128(LOAD128(state[14]), LOAD128u(input[14])); /* unaligned load, like every other 128-bit read of input */ \
+    XOReq64(Cua, X##kuma); \
+    X##me = XOR64(LOAD64(state[16]), LOAD64(input[16])); /* lane 16: the last input lane at this rate */ \
+    X##kame = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \
+    X##mio = LOAD128u(state[17]); /* from lane 17 on, only the state is read */ \
+    X##mi = X##mio; \
+    X##kemi = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio, X##mio); \
+    X##kimo = GET64LOLO(X##ki, X##mo); \
+    XOReq128(Cio, X##mio); \
+    X##mu = LOAD64(state[19]); \
+    X##komu = GET64LOLO(X##ko, X##mu); \
+    XOReq64(Cua, X##mu); \
+    X##sase = LOAD128(state[20]); \
+    XOReq128(Cae, X##sase); \
+    X##siso = LOAD128(state[22]); \
+    XOReq128(Cio, X##siso); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cua, X##su); \
+
+#define copyFromStateAndXor1152bits(X, state, input) /* X##* = the 25 state lanes, with input lanes 0..17 (1152 bits) XORed into lanes 0..17 */ \
+    X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae; \
+    X##be = GET64HIHI(X##bae, X##bae); \
+    Cae = X##bae; /* Cae, Cio and Cua collect the XOR of everything loaded; computeD reads them later */ \
+    X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio; \
+    X##bo = GET64HIHI(X##bio, X##bio); \
+    Cio = X##bio; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cua = X##bu; \
+    X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae; \
+    X##buga = GET64LOLO(X##bu, X##ga); /* re-pair lanes into the packed words the round macro works on */ \
+    X##ge = GET64HIHI(X##gae, X##gae); \
+    X##bage = GET64LOLO(X##ba, X##ge); \
+    XOReq128(Cae, X##gae); \
+    X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio; \
+    X##begi = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio, X##gio); \
+    X##bigo = GET64LOLO(X##bi, X##go); \
+    XOReq128(Cio, X##gio); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    X##bogu = GET64LOLO(X##bo, X##gu); \
+    XOReq64(Cua, X##gu); \
+    X##kae = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae; \
+    X##ke = GET64HIHI(X##kae, X##kae); \
+    XOReq128(Cae, X##kae); \
+    X##kio = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
+    X##ki = X##kio; \
+    X##ko = GET64HIHI(X##kio, X##kio); \
+    XOReq128(Cio, X##kio); \
+    X##kuma = XOR128(LOAD128(state[14]), LOAD128u(input[14])); /* unaligned load, like every other 128-bit read of input */ \
+    XOReq64(Cua, X##kuma); \
+    X##me = XOR64(LOAD64(state[16]), LOAD64(input[16])); \
+    X##kame = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \
+    X##mio = XOR128(LOAD128u(state[17]), LOAD64(input[17])); /* last input lane; presumably LOAD64 leaves the upper half zero so lane 18 is untouched -- confirm */ \
+    X##mi = X##mio; \
+    X##kemi = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio, X##mio); \
+    X##kimo = GET64LOLO(X##ki, X##mo); \
+    XOReq128(Cio, X##mio); \
+    X##mu = LOAD64(state[19]); /* from lane 19 on, only the state is read */ \
+    X##komu = GET64LOLO(X##ko, X##mu); \
+    XOReq64(Cua, X##mu); \
+    X##sase = LOAD128(state[20]); \
+    XOReq128(Cae, X##sase); \
+    X##siso = LOAD128(state[22]); \
+    XOReq128(Cio, X##siso); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cua, X##su); \
+
+#define copyFromStateAndXor1344bits(X, state, input) /* X##* = the 25 state lanes, with input lanes 0..20 (1344 bits) XORed into lanes 0..20 */ \
+    X##bae = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
+    X##ba = X##bae; \
+    X##be = GET64HIHI(X##bae, X##bae); \
+    Cae = X##bae; /* Cae, Cio and Cua collect the XOR of everything loaded; computeD reads them later */ \
+    X##bio = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
+    X##bi = X##bio; \
+    X##bo = GET64HIHI(X##bio, X##bio); \
+    Cio = X##bio; \
+    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
+    Cua = X##bu; \
+    X##gae = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
+    X##ga = X##gae; \
+    X##buga = GET64LOLO(X##bu, X##ga); /* re-pair lanes into the packed words the round macro works on */ \
+    X##ge = GET64HIHI(X##gae, X##gae); \
+    X##bage = GET64LOLO(X##ba, X##ge); \
+    XOReq128(Cae, X##gae); \
+    X##gio = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
+    X##gi = X##gio; \
+    X##begi = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio, X##gio); \
+    X##bigo = GET64LOLO(X##bi, X##go); \
+    XOReq128(Cio, X##gio); \
+    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
+    X##bogu = GET64LOLO(X##bo, X##gu); \
+    XOReq64(Cua, X##gu); \
+    X##kae = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
+    X##ka = X##kae; \
+    X##ke = GET64HIHI(X##kae, X##kae); \
+    XOReq128(Cae, X##kae); \
+    X##kio = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
+    X##ki = X##kio; \
+    X##ko = GET64HIHI(X##kio, X##kio); \
+    XOReq128(Cio, X##kio); \
+    X##kuma = XOR128(LOAD128(state[14]), LOAD128u(input[14])); /* unaligned load, like every other 128-bit read of input */ \
+    XOReq64(Cua, X##kuma); \
+    X##me = XOR64(LOAD64(state[16]), LOAD64(input[16])); \
+    X##kame = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \
+    X##mio = XOR128(LOAD128u(state[17]), LOAD128u(input[17])); \
+    X##mi = X##mio; \
+    X##kemi = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio, X##mio); \
+    X##kimo = GET64LOLO(X##ki, X##mo); \
+    XOReq128(Cio, X##mio); \
+    X##mu = XOR64(LOAD64(state[19]), LOAD64(input[19])); \
+    X##komu = GET64LOLO(X##ko, X##mu); \
+    XOReq64(Cua, X##mu); \
+    X##sase = XOR128(LOAD128(state[20]), LOAD64(input[20])); /* last input lane; presumably LOAD64 leaves the upper half zero so lane 21 is untouched -- confirm */ \
+    XOReq128(Cae, X##sase); \
+    X##siso = LOAD128(state[22]); /* from lane 22 on, only the state is read */ \
+    XOReq128(Cio, X##siso); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cua, X##su); \
+
+#define copyFromState(X, state) /* X##* = the 25 state lanes, no input mixed in; also starts the Cae, Cio, Cua accumulators */ \
+    X##bae = LOAD128(state[ 0]); \
+    X##ba = X##bae; \
+    X##be = GET64HIHI(X##bae, X##bae); \
+    Cae = X##bae; /* Cae, Cio and Cua collect the XOR of everything loaded; computeD reads them later */ \
+    X##bio = LOAD128(state[ 2]); \
+    X##bi = X##bio; \
+    X##bo = GET64HIHI(X##bio, X##bio); \
+    Cio = X##bio; \
+    X##bu = LOAD64(state[ 4]); \
+    Cua = X##bu; \
+    X##gae = LOAD128u(state[ 5]); /* odd lane index: read with the unaligned variant */ \
+    X##ga = X##gae; \
+    X##buga = GET64LOLO(X##bu, X##ga); /* re-pair lanes into the packed words the round macro works on */ \
+    X##ge = GET64HIHI(X##gae, X##gae); \
+    X##bage = GET64LOLO(X##ba, X##ge); \
+    XOReq128(Cae, X##gae); \
+    X##gio = LOAD128u(state[ 7]); \
+    X##gi = X##gio; \
+    X##begi = GET64LOLO(X##be, X##gi); \
+    X##go = GET64HIHI(X##gio, X##gio); \
+    X##bigo = GET64LOLO(X##bi, X##go); \
+    XOReq128(Cio, X##gio); \
+    X##gu = LOAD64(state[ 9]); \
+    X##bogu = GET64LOLO(X##bo, X##gu); \
+    XOReq64(Cua, X##gu); \
+    X##kae = LOAD128(state[10]); \
+    X##ka = X##kae; \
+    X##ke = GET64HIHI(X##kae, X##kae); \
+    XOReq128(Cae, X##kae); \
+    X##kio = LOAD128(state[12]); \
+    X##ki = X##kio; \
+    X##ko = GET64HIHI(X##kio, X##kio); \
+    XOReq128(Cio, X##kio); \
+    X##kuma = LOAD128(state[14]); \
+    XOReq64(Cua, X##kuma); \
+    X##me = LOAD64(state[16]); \
+    X##kame = GET64LOLO(X##ka, X##me); \
+    XOReq128(Cae, GET64HIHI(X##kuma, X##kame)); \
+    X##mio = LOAD128u(state[17]); \
+    X##mi = X##mio; \
+    X##kemi = GET64LOLO(X##ke, X##mi); \
+    X##mo = GET64HIHI(X##mio, X##mio); \
+    X##kimo = GET64LOLO(X##ki, X##mo); \
+    XOReq128(Cio, X##mio); \
+    X##mu = LOAD64(state[19]); \
+    X##komu = GET64LOLO(X##ko, X##mu); \
+    XOReq64(Cua, X##mu); \
+    X##sase = LOAD128(state[20]); \
+    XOReq128(Cae, X##sase); \
+    X##siso = LOAD128(state[22]); \
+    XOReq128(Cio, X##siso); \
+    X##su = LOAD64(state[24]); \
+    XOReq64(Cua, X##su); \
+
+#define copyToState(state, X) /* writes the packed X##* words back to state[0..24] */ \
+    STORE64(state[ 0], X##bage); /* presumably stores the low lane of each pair (ba, be, bi, bo) -- confirm STORE64 */ \
+    STORE64(state[ 1], X##begi); \
+    STORE64(state[ 2], X##bigo); \
+    STORE64(state[ 3], X##bogu); \
+    STORE128(state[ 4], X##buga); /* one 128-bit store covers state[4] and state[5] */ \
+    STORE64(state[ 6], COPY64HI2LO(X##bage)); /* the other lane of the same four pairs goes to state[6..9] */ \
+    STORE64(state[ 7], COPY64HI2LO(X##begi)); \
+    STORE64(state[ 8], COPY64HI2LO(X##bigo)); \
+    STORE64(state[ 9], COPY64HI2LO(X##bogu)); \
+    STORE64(state[10], X##kame); /* same scheme for the k/m words: state[10..13], state[14..15], state[16..19] */ \
+    STORE64(state[11], X##kemi); \
+    STORE64(state[12], X##kimo); \
+    STORE64(state[13], X##komu); \
+    STORE128(state[14], X##kuma); \
+    STORE64(state[16], COPY64HI2LO(X##kame)); \
+    STORE64(state[17], COPY64HI2LO(X##kemi)); \
+    STORE64(state[18], COPY64HI2LO(X##kimo)); \
+    STORE64(state[19], COPY64HI2LO(X##komu)); \
+    STORE128(state[20], X##sase); \
+    STORE128(state[22], X##siso); \
+    STORE64(state[24], X##su); \
+
+#define copyStateVariables(X, Y) /* X##w = Y##w for each of the 13 packed state words (12 pairs plus su) */ \
+    X##bage = Y##bage; \
+    X##begi = Y##begi; \
+    X##bigo = Y##bigo; \
+    X##bogu = Y##bogu; \
+    X##buga = Y##buga; \
+    X##kame = Y##kame; \
+    X##kemi = Y##kemi; \
+    X##kimo = Y##kimo; \
+    X##komu = Y##komu; \
+    X##kuma = Y##kuma; \
+    X##sase = Y##sase; \
+    X##siso = Y##siso; \
+    X##su = Y##su; \
+
diff --git a/c_src/KeccakNISTInterface.c b/c_src/KeccakNISTInterface.c
new file mode 100755
index 0000000..5d92c74
--- /dev/null
+++ b/c_src/KeccakNISTInterface.c
@@ -0,0 +1,81 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include "KeccakNISTInterface.h"
+#include "KeccakF-1600-interface.h"
+
+HashReturn Init(hashState *state, int hashbitlen)
+{
+    unsigned int capacity;
+
+    // Pick the sponge capacity for the requested digest size; the rate is
+    // whatever remains of the 1600 bits.
+    switch (hashbitlen) {
+        case 0: // Default parameters, arbitrary length output
+            capacity = 576;
+            break;
+        case 224:
+        case 256:
+        case 384:
+        case 512: // Fixed-length output: capacity is twice the digest size
+            capacity = 2 * (unsigned int)hashbitlen;
+            break;
+        default:
+            return BAD_HASHLEN;
+    }
+
+    InitSponge((spongeState*)state, 1600 - capacity, capacity);
+    state->fixedOutputLength = hashbitlen;
+    return SUCCESS;
+}
+
+HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen)
+{
+    const unsigned int trailingBits = (unsigned int)(databitlen % 8);
+    unsigned char lastByte;
+    HashReturn ret;
+
+    // Whole bytes go to the sponge exactly as supplied.
+    ret = Absorb((spongeState*)state, data, databitlen - trailingBits);
+    if ((trailingBits == 0) || (ret != SUCCESS))
+        return ret;
+
+    // The final bits arrive in the most significant bits of the last byte;
+    // align them to the least significant bits before absorbing them.
+    lastByte = data[databitlen/8] >> (8 - trailingBits);
+    return Absorb((spongeState*)state, &lastByte, trailingBits);
+}
+
+HashReturn Final(hashState *state, BitSequence *hashval)
+{
+    return Squeeze((spongeState*)state, hashval, state->fixedOutputLength); // length recorded by Init()
+}
+
+HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
+{
+    hashState sponge;
+    HashReturn status;
+
+    // Only the four fixed output lengths are available through this API;
+    // the arbitrary-length mode (0) that Init() accepts is rejected here.
+    if ((hashbitlen != 224) && (hashbitlen != 256) && (hashbitlen != 384) && (hashbitlen != 512))
+        return BAD_HASHLEN;
+
+    // Run the three stages in order, stopping at the first failure.
+    status = Init(&sponge, hashbitlen);
+    if (status == SUCCESS)
+        status = Update(&sponge, data, databitlen);
+    return (status == SUCCESS) ? Final(&sponge, hashval) : status;
+}
+
diff --git a/c_src/KeccakNISTInterface.h b/c_src/KeccakNISTInterface.h
new file mode 100755
index 0000000..c6987d4
--- /dev/null
+++ b/c_src/KeccakNISTInterface.h
@@ -0,0 +1,70 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakNISTInterface_h_
+#define _KeccakNISTInterface_h_
+
+#include "KeccakSponge.h"
+
+typedef unsigned char BitSequence; /* one byte of message or digest data */
+typedef unsigned long long DataLength; /* a length counted in bits */
+typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2 } HashReturn; /* status returned by Init, Update, Final and Hash */
+
+typedef spongeState hashState; /* the hash state is the sponge state itself */
+
+/**
+  * Function to initialize the state of the Keccak[r, c] sponge function.
+  * The rate r and capacity c are determined from @a hashbitlen: c = 2 * hashbitlen and r = 1600 - c, or r = 1024 and c = 576 when hashbitlen is 0.
+  * @param  state       Pointer to the state of the sponge function to be initialized.
+  * @param  hashbitlen  The desired number of output bits,
+  *                     or 0 for Keccak[] with default parameters
+  *                     and arbitrarily-long output.
+  * @pre    The value of hashbitlen must be one of 0, 224, 256, 384 and 512.
+  * @return SUCCESS if successful, BAD_HASHLEN if the value of hashbitlen is incorrect.
+  */
+HashReturn Init(hashState *state, int hashbitlen);
+/**
+  * Function to give input data for the sponge function to absorb.
+  * @param  state       Pointer to the state of the sponge function initialized by Init().
+  * @param  data        Pointer to the input data.
+  *                     When @a databitlen is not a multiple of 8, the last bits of data must be
+  *                     in the most significant bits of the last byte.
+  * @param  databitlen  The number of input bits provided in the input data.
+  * @pre    In the previous call to Update(), databitlen was a multiple of 8.
+  * @return SUCCESS if successful, FAIL otherwise.
+  */
+HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen);
+/**
+  * Function to squeeze output data from the sponge function.
+  * If @a hashbitlen was not 0 in the call to Init(), the number of output bits is equal to @a hashbitlen.
+  * If @a hashbitlen was 0 in the call to Init(), Final() requests 0 bits and the output bits must be extracted using the Squeeze() function.
+  * @param  state       Pointer to the state of the sponge function initialized by Init().
+  * @param  hashval     Pointer to the buffer where to store the output data.
+  * @return SUCCESS if successful, FAIL otherwise.
+  */
+HashReturn Final(hashState *state, BitSequence *hashval);
+/**
+  * Function to compute a hash using the Keccak[r, c] sponge function.
+  * The rate r and capacity c values are determined from @a hashbitlen.
+  * @param  hashbitlen  The desired number of output bits.
+  * @param  data        Pointer to the input data.
+  *                     When @a databitlen is not a multiple of 8, the last bits of data must be
+  *                     in the most significant bits of the last byte.
+  * @param  databitlen  The number of input bits provided in the input data.
+  * @param  hashval     Pointer to the buffer where to store the output data.
+  * @pre    The value of hashbitlen must be one of 224, 256, 384 and 512.
+  * @return SUCCESS if successful, BAD_HASHLEN if the value of hashbitlen is incorrect.
+  */
+HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
+
+#endif
diff --git a/c_src/KeccakSponge.c b/c_src/KeccakSponge.c
new file mode 100755
index 0000000..5939ba4
--- /dev/null
+++ b/c_src/KeccakSponge.c
@@ -0,0 +1,266 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include "KeccakSponge.h"
+#include "KeccakF-1600-interface.h"
+#ifdef KeccakReference
+#include "displayIntermediateValues.h"
+#endif
+
+/*
+ * Validate the (rate, capacity) pair and reset *state to an empty sponge in
+ * the absorbing phase.
+ *
+ * The pair is accepted when rate + capacity == 1600 and rate is a non-zero
+ * multiple of 64 that is smaller than 1600.
+ *
+ * Returns 0 on success. Returns 1 when the parameters are rejected, in which
+ * case *state is not touched.
+ */
+int InitSponge(spongeState *state, unsigned int rate, unsigned int capacity)
+{
+    const int widthOk = (rate + capacity == 1600);
+    const int rateOk = (rate != 0) && (rate < 1600) && ((rate % 64) == 0);
+
+    if (!(widthOk && rateOk))
+        return 1;
+
+    KeccakInitialize();
+    KeccakInitializeState(state->state);
+    memset(state->dataQueue, 0, KeccakMaximumRateInBytes);
+
+    /* Sponge parameters. */
+    state->rate = rate;
+    state->capacity = capacity;
+    state->fixedOutputLength = 0;
+
+    /* Nothing queued yet, nothing squeezed yet. */
+    state->bitsInQueue = 0;
+    state->bitsAvailableForSqueezing = 0;
+    state->squeezing = 0;
+
+    return 0;
+}
+
+/*
+ * Absorb the block currently held in state->dataQueue into state->state and
+ * mark the queue as empty.
+ *
+ * The caller must have filled the queue completely (bitsInQueue == rate);
+ * this is not checked here.
+ *
+ * Each ProvideFastNNN macro, when defined at build time, adds a branch that
+ * uses a rate-specific absorb routine for that exact rate. Every branch ends
+ * in a dangling "else", so the generic KeccakAbsorb() call at the bottom is
+ * the fallback for any rate that has no fast routine compiled in.
+ */
+void AbsorbQueue(spongeState *state)
+{
+    // state->bitsInQueue is assumed to be equal to state->rate
+    #ifdef KeccakReference
+    displayBytes(1, "Block to be absorbed", state->dataQueue, state->rate/8);
+    #endif
+#ifdef ProvideFast576
+    if (state->rate == 576)
+        KeccakAbsorb576bits(state->state, state->dataQueue);
+    else 
+#endif
+#ifdef ProvideFast832
+    if (state->rate == 832)
+        KeccakAbsorb832bits(state->state, state->dataQueue);
+    else 
+#endif
+#ifdef ProvideFast1024
+    if (state->rate == 1024)
+        KeccakAbsorb1024bits(state->state, state->dataQueue);
+    else 
+#endif
+#ifdef ProvideFast1088
+    if (state->rate == 1088)
+        KeccakAbsorb1088bits(state->state, state->dataQueue);
+    else
+#endif
+#ifdef ProvideFast1152
+    if (state->rate == 1152)
+        KeccakAbsorb1152bits(state->state, state->dataQueue);
+    else 
+#endif
+#ifdef ProvideFast1344
+    if (state->rate == 1344)
+        KeccakAbsorb1344bits(state->state, state->dataQueue);
+    else 
+#endif
+        /* Generic path: the length argument is the rate expressed in 64-bit units. */
+        KeccakAbsorb(state->state, state->dataQueue, state->rate/64);
+    state->bitsInQueue = 0;
+}
+
+/*
+ * Feed databitlen bits starting at data into the sponge.
+ *
+ * Returns 1 without consuming anything when the queue already ends in a
+ * partial byte (a previous call supplied a bit count that was not a multiple
+ * of 8) or when the sponge has already switched to squeezing. Returns 0
+ * otherwise.
+ *
+ * Two paths are used inside the loop:
+ *  - When the queue is empty and at least one full rate-sized block remains,
+ *    whole blocks are absorbed straight from the caller's buffer, without
+ *    being copied into dataQueue.
+ *  - Otherwise, whole bytes are copied into dataQueue (never beyond rate
+ *    bits), the queue is flushed through AbsorbQueue() once it is full, and
+ *    any trailing bits that do not make up a full byte are stored masked to
+ *    the low bits of the next queue byte.
+ */
+int Absorb(spongeState *state, const unsigned char *data, unsigned long long databitlen)
+{
+    unsigned long long i, j, wholeBlocks;
+    unsigned int partialBlock, partialByte;
+    const unsigned char *curData;
+
+    if ((state->bitsInQueue % 8) != 0)
+        return 1; // Only the last call may contain a partial byte
+    if (state->squeezing)
+        return 1; // Too late for additional input
+
+    /* i counts the input bits consumed so far. */
+    i = 0;
+    while(i < databitlen) {
+        /* The databitlen >= rate test guards the unsigned subtraction that follows it. */
+        if ((state->bitsInQueue == 0) && (databitlen >= state->rate) && (i <= (databitlen-state->rate))) {
+            wholeBlocks = (databitlen-i)/state->rate;
+            curData = data+i/8;
+#ifdef ProvideFast576
+            if (state->rate == 576) {
+                for(j=0; j<wholeBlocks; j++, curData+=576/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb576bits(state->state, curData);
+                }
+            }
+            else
+#endif
+#ifdef ProvideFast832
+            if (state->rate == 832) {
+                for(j=0; j<wholeBlocks; j++, curData+=832/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb832bits(state->state, curData);
+                }
+            }
+            else
+#endif
+#ifdef ProvideFast1024
+            if (state->rate == 1024) {
+                for(j=0; j<wholeBlocks; j++, curData+=1024/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb1024bits(state->state, curData);
+                }
+            }
+            else
+#endif
+#ifdef ProvideFast1088
+            if (state->rate == 1088) {
+                for(j=0; j<wholeBlocks; j++, curData+=1088/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb1088bits(state->state, curData);
+                }
+            }
+            else
+#endif
+#ifdef ProvideFast1152
+            if (state->rate == 1152) {
+                for(j=0; j<wholeBlocks; j++, curData+=1152/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb1152bits(state->state, curData);
+                }
+            }
+            else
+#endif
+#ifdef ProvideFast1344
+            if (state->rate == 1344) {
+                for(j=0; j<wholeBlocks; j++, curData+=1344/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb1344bits(state->state, curData);
+                }
+            }
+            else
+#endif
+            {
+                /* Generic fallback for rates without a compiled-in fast routine. */
+                for(j=0; j<wholeBlocks; j++, curData+=state->rate/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb(state->state, curData, state->rate/64);
+                }
+            }
+            i += wholeBlocks*state->rate;
+        }
+        else {
+            /* Take what is left of the input, capped at the free room in the queue. */
+            partialBlock = (unsigned int)(databitlen - i);
+            if (partialBlock+state->bitsInQueue > state->rate)
+                partialBlock = state->rate-state->bitsInQueue;
+            /* Split into whole bytes (copied with memcpy) and 0..7 leftover bits. */
+            partialByte = partialBlock % 8;
+            partialBlock -= partialByte;
+            memcpy(state->dataQueue+state->bitsInQueue/8, data+i/8, partialBlock/8);
+            state->bitsInQueue += partialBlock;
+            i += partialBlock;
+            if (state->bitsInQueue == state->rate)
+                AbsorbQueue(state);
+            if (partialByte > 0) {
+                /* Keep only the low partialByte bits of the last input byte. */
+                unsigned char mask = (1 << partialByte)-1;
+                state->dataQueue[state->bitsInQueue/8] = data[i/8] & mask;
+                state->bitsInQueue += partialByte;
+                i += partialByte;
+            }
+        }
+    }
+    return 0;
+}
+
+/*
+ * Pad the queued input, absorb the final block(s) and move the sponge to the
+ * squeezing phase.
+ *
+ * Padding sets one bit immediately after the queued data and one bit at the
+ * last position of the block (bit index rate-1), with zeros in between.
+ * When only a single bit of room is left in the queue, the first padding bit
+ * fills the block, that block is absorbed, and the closing bit goes into an
+ * additional all-zero block.
+ *
+ * No other suffix bits are appended here; if the caller needs any, it has to
+ * absorb them before this function runs.
+ *
+ * On return, dataQueue holds the first rate bits of output,
+ * bitsAvailableForSqueezing reflects that, and squeezing is set to 1.
+ */
+void PadAndSwitchToSqueezingPhase(spongeState *state)
+{
+    // Note: the bits are numbered from 0=LSB to 7=MSB
+    if (state->bitsInQueue + 1 == state->rate) {
+        /* Exactly one free bit: it takes the first padding bit and the block is flushed. */
+        state->dataQueue[state->bitsInQueue/8 ] |= 1 << (state->bitsInQueue % 8);
+        AbsorbQueue(state);
+        memset(state->dataQueue, 0, state->rate/8);
+    }
+    else {
+        /* Zero every byte after the queued data, then set the first padding bit. */
+        memset(state->dataQueue + (state->bitsInQueue+7)/8, 0, state->rate/8 - (state->bitsInQueue+7)/8);
+        state->dataQueue[state->bitsInQueue/8 ] |= 1 << (state->bitsInQueue % 8);
+    }
+    /* Closing padding bit at the very end of the block. */
+    state->dataQueue[(state->rate-1)/8] |= 1 << ((state->rate-1) % 8);
+    AbsorbQueue(state);
+
+    #ifdef KeccakReference
+    displayText(1, "--- Switching to squeezing phase ---");
+    #endif
+#ifdef ProvideFast1024
+    if (state->rate == 1024) {
+        KeccakExtract1024bits(state->state, state->dataQueue);
+        state->bitsAvailableForSqueezing = 1024;
+    }
+    else
+#endif
+    {
+        KeccakExtract(state->state, state->dataQueue, state->rate/64);
+        state->bitsAvailableForSqueezing = state->rate;
+    }
+    #ifdef KeccakReference
+    displayBytes(1, "Block available for squeezing", state->dataQueue, state->bitsAvailableForSqueezing/8);
+    #endif
+    state->squeezing = 1;
+}
+
+/*
+ * Copy outputLength bits of sponge output into output.
+ *
+ * If the sponge is still absorbing, it is first padded and switched to the
+ * squeezing phase. Note that this switch happens before outputLength is
+ * validated, so a rejected call still ends the absorbing phase.
+ *
+ * Returns 1 when outputLength is not a multiple of 8, 0 otherwise.
+ *
+ * Output is served from dataQueue; whenever the queue is exhausted the
+ * permutation is applied and the next rate bits are extracted into it.
+ * Unused output bits stay in the queue for the next call.
+ */
+int Squeeze(spongeState *state, unsigned char *output, unsigned long long outputLength)
+{
+    unsigned long long i;
+    unsigned int partialBlock;
+
+    if (!state->squeezing)
+        PadAndSwitchToSqueezingPhase(state);
+    if ((outputLength % 8) != 0)
+        return 1; // Only multiple of 8 bits are allowed, truncation can be done at user level
+
+    /* i counts the output bits produced so far. */
+    i = 0;
+    while(i < outputLength) {
+        if (state->bitsAvailableForSqueezing == 0) {
+            /* Queue drained: permute and refill it with the next block of output. */
+            KeccakPermutation(state->state);
+#ifdef ProvideFast1024
+            if (state->rate == 1024) {
+                KeccakExtract1024bits(state->state, state->dataQueue);
+                state->bitsAvailableForSqueezing = 1024;
+            }
+            else
+#endif
+            {
+                KeccakExtract(state->state, state->dataQueue, state->rate/64);
+                state->bitsAvailableForSqueezing = state->rate;
+            }
+            #ifdef KeccakReference
+            displayBytes(1, "Block available for squeezing", state->dataQueue, state->bitsAvailableForSqueezing/8);
+            #endif
+        }
+        /* Hand out as much as is available, capped at what is still requested. */
+        partialBlock = state->bitsAvailableForSqueezing;
+        if ((unsigned long long)partialBlock > outputLength - i)
+            partialBlock = (unsigned int)(outputLength - i);
+        /* Unread output sits at the tail of the queue: offset = rate - bitsAvailableForSqueezing. */
+        memcpy(output+i/8, state->dataQueue+(state->rate-state->bitsAvailableForSqueezing)/8, partialBlock/8);
+        state->bitsAvailableForSqueezing -= partialBlock;
+        i += partialBlock;
+    }
+    return 0;
+}
diff --git a/c_src/KeccakSponge.h b/c_src/KeccakSponge.h
new file mode 100755
index 0000000..df3d797
--- /dev/null
+++ b/c_src/KeccakSponge.h
@@ -0,0 +1,76 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakSponge_h_
+#define _KeccakSponge_h_
+
+#define KeccakPermutationSize 1600
+#define KeccakPermutationSizeInBytes (KeccakPermutationSize/8)
+#define KeccakMaximumRate 1536
+#define KeccakMaximumRateInBytes (KeccakMaximumRate/8)
+
+#if defined(__GNUC__)
+#define ALIGN __attribute__ ((aligned(32)))
+#elif defined(_MSC_VER)
+#define ALIGN __declspec(align(32))
+#else
+#define ALIGN
+#endif
+
+/*
+ * Complete state of one sponge instance. It is filled in by InitSponge() and
+ * then updated by Absorb() and Squeeze().
+ */
+ALIGN typedef struct spongeStateStruct {
+    /* Permutation state, KeccakPermutationSizeInBytes bytes. */
+    ALIGN unsigned char state[KeccakPermutationSizeInBytes];
+    /* Input bytes waiting to be absorbed; after the switch to squeezing, the current output block. */
+    ALIGN unsigned char dataQueue[KeccakMaximumRateInBytes];
+    /* Rate in bits. */
+    unsigned int rate;
+    /* Capacity in bits (rate + capacity == 1600 is enforced by InitSponge()). */
+    unsigned int capacity;
+    /* Number of input bits currently held in dataQueue. */
+    unsigned int bitsInQueue;
+    /* Set to 0 by InitSponge(); presumably managed by the hash-level interface - confirm there. */
+    unsigned int fixedOutputLength;
+    /* 0 while absorbing, 1 once the sponge has switched to squeezing. */
+    int squeezing;
+    /* Number of not-yet-returned output bits left in dataQueue. */
+    unsigned int bitsAvailableForSqueezing;
+} spongeState;
+
+/**
+  * Function to initialize the state of the Keccak[r, c] sponge function.
+  * The sponge function is set to the absorbing phase.
+  * @param  state       Pointer to the state of the sponge function to be initialized.
+  * @param  rate        The value of the rate r.
+  * @param  capacity    The value of the capacity c.
+  * @pre    One must have r+c=1600 and the rate a multiple of 64 bits in this implementation.
+  * @return Zero if successful, 1 otherwise.
+  */
+int InitSponge(spongeState *state, unsigned int rate, unsigned int capacity);
+/**
+  * Function to give input data for the sponge function to absorb.
+  * @param  state       Pointer to the state of the sponge function initialized by InitSponge().
+  * @param  data        Pointer to the input data. 
+  *                     When @a databitLen is not a multiple of 8, the last bits of data must be
+  *                     in the least significant bits of the last byte.
+  * @param  databitLen  The number of input bits provided in the input data.
+  * @pre    In the previous call to Absorb(), databitLen was a multiple of 8.
+  * @pre    The sponge function must be in the absorbing phase,
+  *         i.e., Squeeze() must not have been called before.
+  * @return Zero if successful, 1 otherwise.
+  */
+int Absorb(spongeState *state, const unsigned char *data, unsigned long long databitlen);
+/**
+  * Function to squeeze output data from the sponge function.
+  * If the sponge function was in the absorbing phase, this function 
+  * switches it to the squeezing phase.
+  * @param  state       Pointer to the state of the sponge function initialized by InitSponge().
+  * @param  output      Pointer to the buffer where to store the output data.
+  * @param  outputLength    The number of output bits desired.
+  *                     It must be a multiple of 8.
+  * @return Zero if successful, 1 otherwise.
+  */
+int Squeeze(spongeState *state, unsigned char *output, unsigned long long outputLength);
+
+#endif
diff --git a/c_src/brg_endian.h b/c_src/brg_endian.h
new file mode 100755
index 0000000..7226eb3
--- /dev/null
+++ b/c_src/brg_endian.h
@@ -0,0 +1,142 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The redistribution and use of this software (with or without changes)
+ is allowed without the payment of fees or royalties provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+ Changes for ARM 9/9/2010
+*/
+
+#ifndef _BRG_ENDIAN_H
+#define _BRG_ENDIAN_H
+
+#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
+
+#if 0
+/* Include files where endian defines and byteswap functions may reside */
+#if defined( __sun )
+#  include <sys/isa_defs.h>
+#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
+#  include <sys/endian.h>
+#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
+      defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
+#  include <machine/endian.h>
+#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+#  if !defined( __MINGW32__ ) && !defined( _AIX )
+#    include <endian.h>
+#    if !defined( __BEOS__ )
+#      include <byteswap.h>
+#    endif
+#  endif
+#endif
+#endif
+
+/* Now attempt to set the define for platform byte order using any  */
+/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which  */
+/* seem to encompass most endian symbol definitions                 */
+
+#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
+#  if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
+#  if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( _BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( _LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
+#  if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
+#  if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/*  if the platform byte order could not be determined, then try to */
+/*  set this define using common machine defines                    */
+#if !defined(PLATFORM_BYTE_ORDER)
+
+#if   defined( __alpha__ ) || defined( __alpha ) || defined( i386 )       || \
+      defined( __i386__ )  || defined( _M_I86 )  || defined( _M_IX86 )    || \
+      defined( __OS2__ )   || defined( sun386 )  || defined( __TURBOC__ ) || \
+      defined( vax )       || defined( vms )     || defined( VMS )        || \
+      defined( __VMS )     || defined( _M_X64 )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+
+#elif defined( AMIGA )   || defined( applec )    || defined( __AS400__ )  || \
+      defined( _CRAY )   || defined( __hppa )    || defined( __hp9000 )   || \
+      defined( ibm370 )  || defined( mc68000 )   || defined( m68k )       || \
+      defined( __MRC__ ) || defined( __MVS__ )   || defined( __MWERKS__ ) || \
+      defined( sparc )   || defined( __sparc)    || defined( SYMANTEC_C ) || \
+      defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM )   || \
+      defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+
+#elif defined(__arm__)
+# ifdef __BIG_ENDIAN
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# else
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif
+#elif 1     /* **** EDIT HERE IF NECESSARY **** */
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0     /* **** EDIT HERE IF NECESSARY **** */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+#  error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order
+#endif
+
+#endif
+
+#endif
diff --git a/c_src/displayIntermediateValues.c b/c_src/displayIntermediateValues.c
new file mode 100755
index 0000000..f3bf9e2
--- /dev/null
+++ b/c_src/displayIntermediateValues.c
@@ -0,0 +1,117 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <stdio.h>
+#include "displayIntermediateValues.h"
+#include "KeccakNISTInterface.h"
+
+FILE *intermediateValueFile = 0;
+int displayLevel = 0;
+
+/*
+ * Select the stream that the display*() helpers in this file write to.
+ * Passing a null pointer disables all output, since every helper checks the
+ * stream before printing.
+ */
+void displaySetIntermediateValueFile(FILE *f)
+{
+    intermediateValueFile = f;
+}
+
+/*
+ * Set the verbosity threshold. A display*() call prints only when the level
+ * it is given is less than or equal to this value.
+ */
+void displaySetLevel(int level)
+{
+    displayLevel = level;
+}
+
+/*
+ * Print a caption followed by size bytes as two-digit uppercase hex values,
+ * each followed by a space, then an empty line.
+ * Does nothing when no output stream is set or level exceeds the threshold.
+ */
+void displayBytes(int level, const char *text, const unsigned char *bytes, unsigned int size)
+{
+    unsigned int pos = 0;
+
+    if (!intermediateValueFile || level > displayLevel)
+        return;
+
+    fprintf(intermediateValueFile, "%s:\n", text);
+    while (pos < size) {
+        fprintf(intermediateValueFile, "%02X ", bytes[pos]);
+        pos++;
+    }
+    fprintf(intermediateValueFile, "\n");
+    fprintf(intermediateValueFile, "\n");
+}
+
+/*
+ * Print a caption followed by the first size bits of data as "0 "/"1 "
+ * tokens, then an empty line.
+ * With MSBfirst non-zero the bits of each byte are listed from bit 7 down to
+ * bit 0; otherwise from bit 0 up to bit 7.
+ * Does nothing when no output stream is set or level exceeds the threshold.
+ */
+void displayBits(int level, const char *text, const unsigned char *data, unsigned int size, int MSBfirst)
+{
+    unsigned int bitIndex;
+
+    if (!intermediateValueFile || level > displayLevel)
+        return;
+
+    fprintf(intermediateValueFile, "%s:\n", text);
+    for (bitIndex = 0; bitIndex < size; bitIndex++) {
+        const unsigned int byteOffset = bitIndex / 8;
+        const unsigned int shift = bitIndex % 8;
+        const int bit = MSBfirst
+            ? (((data[byteOffset] << shift) & 0x80) != 0)
+            : (((data[byteOffset] >> shift) & 0x01) != 0);
+
+        fprintf(intermediateValueFile, "%d ", bit);
+    }
+    fprintf(intermediateValueFile, "\n");
+    fprintf(intermediateValueFile, "\n");
+}
+
+/*
+ * Print the whole permutation state (KeccakPermutationSizeInBytes bytes) as
+ * hex bytes by delegating to displayBytes().
+ */
+void displayStateAsBytes(int level, const char *text, const unsigned char *state)
+{
+    displayBytes(level, text, state, KeccakPermutationSizeInBytes);
+}
+
+/*
+ * Print a caption followed by the state as KeccakPermutationSize/64 pairs of
+ * 32-bit words, formatted "XXXXXXXX:XXXXXXXX", five pairs per line.
+ * Does nothing when no output stream is set or level exceeds the threshold.
+ */
+void displayStateAs32bitWords(int level, const char *text, const unsigned int *state)
+{
+    const unsigned int pairCount = KeccakPermutationSize/64;
+    unsigned int pair;
+
+    if (!intermediateValueFile || level > displayLevel)
+        return;
+
+    fprintf(intermediateValueFile, "%s:\n", text);
+    for (pair = 0; pair < pairCount; pair++) {
+        const unsigned int first = (unsigned int)state[2*pair+0];
+        const unsigned int second = (unsigned int)state[2*pair+1];
+
+        fprintf(intermediateValueFile, "%08X:%08X", first, second);
+        /* End the line after every fifth pair, otherwise separate with a space. */
+        if ((pair % 5) != 4)
+            fprintf(intermediateValueFile, " ");
+        else
+            fprintf(intermediateValueFile, "\n");
+    }
+}
+
+/*
+ * Print a caption followed by the state as KeccakPermutationSize/64 64-bit
+ * words, each written as 16 uppercase hex digits (high half first), five
+ * words per line.
+ * Does nothing when no output stream is set or level exceeds the threshold.
+ */
+void displayStateAs64bitWords(int level, const char *text, const unsigned long long int *state)
+{
+    const unsigned int wordCount = KeccakPermutationSize/64;
+    unsigned int word;
+
+    if (!intermediateValueFile || level > displayLevel)
+        return;
+
+    fprintf(intermediateValueFile, "%s:\n", text);
+    for (word = 0; word < wordCount; word++) {
+        const unsigned int high = (unsigned int)(state[word] >> 32);
+        const unsigned int low = (unsigned int)(state[word] & 0xFFFFFFFFULL);
+
+        fprintf(intermediateValueFile, "%08X", high);
+        fprintf(intermediateValueFile, "%08X", low);
+        /* End the line after every fifth word, otherwise separate with a space. */
+        if ((word % 5) != 4)
+            fprintf(intermediateValueFile, " ");
+        else
+            fprintf(intermediateValueFile, "\n");
+    }
+}
+
+/*
+ * Print a "--- Round N ---" banner surrounded by empty lines.
+ * Does nothing when no output stream is set or level exceeds the threshold.
+ */
+void displayRoundNumber(int level, unsigned int i)
+{
+    if ((intermediateValueFile) && (level <= displayLevel)) {
+        fprintf(intermediateValueFile, "\n");
+        /* i is unsigned, so it must be printed with %u rather than %d. */
+        fprintf(intermediateValueFile, "--- Round %u ---\n", i);
+        fprintf(intermediateValueFile, "\n");
+    }
+}
+
+/*
+ * Print a free-form message followed by an empty line.
+ * Does nothing when no output stream is set or level exceeds the threshold.
+ */
+void displayText(int level, const char *text)
+{
+    if ((intermediateValueFile) && (level <= displayLevel)) {
+        /*
+         * text is data, not a format string: print it through "%s" so that
+         * any '%' it contains is written literally instead of being
+         * interpreted as a conversion specification.
+         */
+        fprintf(intermediateValueFile, "%s", text);
+        fprintf(intermediateValueFile, "\n");
+        fprintf(intermediateValueFile, "\n");
+    }
+}
diff --git a/c_src/displayIntermediateValues.h b/c_src/displayIntermediateValues.h
new file mode 100755
index 0000000..1d6c6c8
--- /dev/null
+++ b/c_src/displayIntermediateValues.h
@@ -0,0 +1,29 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _displayIntermediateValues_h_
+#define _displayIntermediateValues_h_
+
+#include <stdio.h>
+
+void displaySetIntermediateValueFile(FILE *f);
+void displaySetLevel(int level);
+void displayBytes(int level, const char *text, const unsigned char *bytes, unsigned int size);
+void displayBits(int level, const char *text, const unsigned char *data, unsigned int size, int MSBfirst);
+void displayStateAsBytes(int level, const char *text, const unsigned char *state);
+void displayStateAs32bitWords(int level, const char *text, const unsigned int *state);
+void displayStateAs64bitWords(int level, const char *text, const unsigned long long int *state);
+void displayRoundNumber(int level, unsigned int i);
+void displayText(int level, const char *text);
+
+#endif
diff --git a/c_src/sha3_nif.c b/c_src/sha3_nif.c
new file mode 100644
index 0000000..d485250
--- /dev/null
+++ b/c_src/sha3_nif.c
@@ -0,0 +1,144 @@
+#include "erl_nif.h"
+#include "KeccakNISTInterface.h"
+
+typedef struct nif_hash_context     nif_hash_context;
+
+/*
+ * Payload of the "sha3_resource" NIF resource: one in-progress hash
+ * computation, handed to Erlang as an opaque context term.
+ */
+struct nif_hash_context {
+  /* Digest length in bits (224, 256, 384 or 512), as validated by nif_hash_init(). */
+  int bitlen;
+  /* Hash state, embedded by value. */
+  hashState state;
+};
+
+static void sha3_resource_cleanup(ErlNifEnv* env, void* arg);
+static ERL_NIF_TERM nif_hash_init(ErlNifEnv* env, int argc,
+    const ERL_NIF_TERM argv[]);
+static ERL_NIF_TERM nif_hash_update(ErlNifEnv* env, int argc,
+    const ERL_NIF_TERM argv[]);
+static ERL_NIF_TERM nif_hash_final(ErlNifEnv* env, int argc,
+    const ERL_NIF_TERM argv[]);
+static ERL_NIF_TERM nif_hash(ErlNifEnv* env, int argc,
+    const ERL_NIF_TERM argv[]);
+
+/*
+ * Table of exported NIFs as {Erlang function name, arity, C implementation}.
+ * Names and arities must match the stub functions exported by the sha3
+ * Erlang module.
+ */
+static ErlNifFunc nif_funcs[] =
+{
+  {"hash_init", 1, nif_hash_init},
+  {"hash_update", 2, nif_hash_update},
+  {"hash_final", 1, nif_hash_final},
+  {"hash", 2, nif_hash}
+};
+
+static ErlNifResourceType *sha3_resource_type;
+
+/*
+ * Destructor registered for the "sha3_resource" resource type in on_load().
+ * Intentionally empty: nif_hash_context embeds its fields by value, so
+ * presumably nothing beyond the resource memory itself needs releasing.
+ */
+static void
+sha3_resource_cleanup(ErlNifEnv* env, void* arg)
+{
+  /* do nothing */
+}
+
+/*
+ * sha3:hash_init(BitLen) -> Context
+ *
+ * Creates a fresh hashing context for the given digest length.
+ * Raises badarg when BitLen is not an integer or is not one of 224, 256,
+ * 384 and 512.
+ */
+static ERL_NIF_TERM
+nif_hash_init(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+{
+  ERL_NIF_TERM ctxt_term;
+  nif_hash_context *ctxt;
+  int bitlen;
+
+  /*
+   * A literal 0 is not a valid ERL_NIF_TERM; returning it hands the VM a
+   * garbage term. Invalid arguments are reported as a badarg exception.
+   */
+  if (!enif_get_int(env, argv[0], &bitlen))
+    return enif_make_badarg(env);
+
+  if (bitlen != 224 && bitlen != 256 && bitlen != 384 && bitlen != 512)
+    return enif_make_badarg(env);
+
+  ctxt = enif_alloc_resource(sha3_resource_type, sizeof(nif_hash_context));
+  ctxt->bitlen = bitlen;
+  Init(&ctxt->state, bitlen);
+  ctxt_term = enif_make_resource(env, ctxt);
+  /* The term keeps its own reference; drop ours so the GC controls the lifetime. */
+  enif_release_resource(ctxt);
+
+  return ctxt_term;
+}
+
+/*
+ * sha3:hash_update(Context, Binary) -> NewContext
+ *
+ * Absorbs Binary into a copy of Context and returns the copy as a new
+ * context term. The context passed in is never modified, so it can be
+ * reused from Erlang.
+ * Raises badarg when Context is not a context created by this module, when
+ * Binary is not a binary, or when the underlying Update() call fails.
+ */
+static ERL_NIF_TERM
+nif_hash_update(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+{
+  ERL_NIF_TERM ctxt_term;
+  ErlNifBinary src_bin;
+  nif_hash_context *ctxt, *next;
+
+  /* A literal 0 is not a valid ERL_NIF_TERM; report bad input as badarg. */
+  if (!enif_get_resource(env, argv[0], sha3_resource_type, (void **)&ctxt) ||
+      !enif_inspect_binary(env, argv[1], &src_bin))
+    return enif_make_badarg(env);
+
+  /* Work on a copy (bitlen and state) so the caller's context stays untouched. */
+  next = enif_alloc_resource(sha3_resource_type, sizeof(nif_hash_context));
+  *next = *ctxt;
+
+  if (Update(&next->state, src_bin.data, src_bin.size * 8) != SUCCESS) {
+    enif_release_resource(next);
+    return enif_make_badarg(env);
+  }
+
+  ctxt_term = enif_make_resource(env, next);
+  /* The term keeps its own reference; drop ours so the GC controls the lifetime. */
+  enif_release_resource(next);
+
+  return ctxt_term;
+}
+
+/*
+ * sha3:hash_final(Context) -> Digest
+ *
+ * Finalises a copy of the context's state and returns the digest as a
+ * binary of bitlen/8 bytes. The context itself is left untouched.
+ * Raises badarg when Context is not a context created by this module, when
+ * the digest binary cannot be allocated, or when Final() fails. badarg is
+ * used for all three because it is the only exception the original code
+ * base's NIF API usage makes available.
+ */
+static ERL_NIF_TERM
+nif_hash_final(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+{
+  ErlNifBinary digest_bin;
+  nif_hash_context *ctxt;
+  hashState state;
+
+  /* A literal 0 is not a valid ERL_NIF_TERM; report bad input as badarg. */
+  if (!enif_get_resource(env, argv[0], sha3_resource_type, (void **)&ctxt))
+    return enif_make_badarg(env);
+
+  /* Without this check, a failed allocation leaves digest_bin.data unusable. */
+  if (!enif_alloc_binary(ctxt->bitlen / 8, &digest_bin))
+    return enif_make_badarg(env);
+
+  /* Finalise a stack copy so the resource can still be updated or finalised again. */
+  state = ctxt->state;
+  if (Final(&state, digest_bin.data) != SUCCESS) {
+    enif_release_binary(&digest_bin);
+    return enif_make_badarg(env);
+  }
+
+  /*
+   * enif_make_binary() transfers ownership of the data to the new term, so
+   * the binary must not be released afterwards.
+   */
+  return enif_make_binary(env, &digest_bin);
+}
+
+/*
+ * sha3:hash(BitLen, Binary) -> Digest
+ *
+ * One-shot hash of Binary; returns a binary of BitLen/8 bytes.
+ * Raises badarg when BitLen is not an integer or is not one of 224, 256,
+ * 384 and 512, when Binary is not a binary, when the digest binary cannot
+ * be allocated, or when Hash() fails.
+ */
+static ERL_NIF_TERM
+nif_hash(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+{
+  ErlNifBinary src_bin, digest_bin;
+  int bitlen;
+
+  /* A literal 0 is not a valid ERL_NIF_TERM; report bad input as badarg. */
+  if (!enif_get_int(env, argv[0], &bitlen) ||
+      !enif_inspect_binary(env, argv[1], &src_bin))
+    return enif_make_badarg(env);
+
+  if (bitlen != 224 && bitlen != 256 && bitlen != 384 && bitlen != 512)
+    return enif_make_badarg(env);
+
+  /* Without this check, a failed allocation leaves digest_bin.data unusable. */
+  if (!enif_alloc_binary(bitlen / 8, &digest_bin))
+    return enif_make_badarg(env);
+
+  if (Hash(bitlen, src_bin.data, src_bin.size * 8, digest_bin.data) != SUCCESS) {
+    enif_release_binary(&digest_bin);
+    return enif_make_badarg(env);
+  }
+
+  /*
+   * enif_make_binary() transfers ownership of the data to the new term, so
+   * the binary must not be released afterwards.
+   */
+  return enif_make_binary(env, &digest_bin);
+}
+
+/*
+ * NIF load callback: opens (or takes over) the "sha3_resource" resource type
+ * used for hashing contexts and stores it in sha3_resource_type.
+ * Returns 0 on success and -1 when the resource type cannot be opened.
+ */
+static int
+on_load(ErlNifEnv* env, void** priv_data, ERL_NIF_TERM load_info)
+{
+  sha3_resource_type = enif_open_resource_type(env, NULL, "sha3_resource",
+      &sha3_resource_cleanup, ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER, NULL);
+
+  return (sha3_resource_type != NULL) ? 0 : -1;
+}
+
+/*
+ * NIF unload callback, passed as the last argument of ERL_NIF_INIT.
+ * Nothing is done here: on_load() stores no private data that would need
+ * releasing.
+ */
+static void
+on_unload(ErlNifEnv* env, void* priv_data)
+{
+  /* do nothing */
+}
+
+ERL_NIF_INIT(sha3, nif_funcs, &on_load, NULL, NULL, &on_unload);
+
diff --git a/doc/overview.edoc b/doc/overview.edoc
new file mode 100644
index 0000000..41de035
--- /dev/null
+++ b/doc/overview.edoc
@@ -0,0 +1,8 @@
+@title SHA-3 for Erlang
+@author SUZUKI Tetsuya <tetsuya.suzuki@gmail.com>
+@copyright 2012- SUZUKI Tetsuya
+@version 0.1.0
+@reference <a href="http://en.wikipedia.org/wiki/SHA-3">Wikipedia: SHA-3</a>
+@reference <a href="http://csrc.nist.gov/groups/ST/hash/sha-3/index.html">NIST: Cryptographic Hash Algorithm Competition</a>
+@reference <a href="http://keccak.noekeon.org/">The Keccak sponge function family</a>
+
diff --git a/rebar b/rebar
new file mode 100755
index 0000000..e364a2b
Binary files /dev/null and b/rebar differ
diff --git a/rebar.config b/rebar.config
new file mode 100644
index 0000000..16377e2
--- /dev/null
+++ b/rebar.config
@@ -0,0 +1,33 @@
+{erl_opts, [{i, "src"},
+            warnings_as_errors,
+            {w, all},
+            warn_export_all]}.
+
+{clean_files, [".eunit",
+               "ebin/*.beam"]}.
+
+{port_env, [{"CFLAGS", "$CFLAGS -O2 -finline-functions -fomit-frame-pointer -fno-strict-aliasing -Wmissing-prototypes -Wall -std=c99"}]}.
+
+{port_specs, [
+    % TODO: support optimization
+    % {"i386", "priv/sha3_nif.so", ["c_src/sha3_nif.c",
+    %                       "c_src/KeccakNISTInterface.c",
+    %                       "c_src/KeccakSponge.c",
+    %                       "c_src/KeccakF-1600-opt32.c",
+    %                       "c_src/displayIntermediateValues.c"]},
+    % {"x86_64", "priv/sha3_nif.so", ["c_src/sha3_nif.c",
+    %                       "c_src/KeccakNISTInterface.c",
+    %                       "c_src/KeccakSponge.c",
+    %                       "c_src/KeccakF-1600-opt64.c",
+    %                       "c_src/displayIntermediateValues.c"]},
+    {"priv/sha3_nif.so", ["c_src/sha3_nif.c",
+                          "c_src/KeccakNISTInterface.c",
+                          "c_src/KeccakSponge.c",
+                          "c_src/KeccakF-1600-reference.c",
+                          "c_src/displayIntermediateValues.c"]}
+]}.
+
+{eunit_opts, [{report,{eunit_surefire,[{dir,"."}]}}]}.
+
+{xref_checks, [fail_on_warning, undefined_function_calls]}.
+
diff --git a/src/sha3.app.src b/src/sha3.app.src
new file mode 100644
index 0000000..aee6773
--- /dev/null
+++ b/src/sha3.app.src
@@ -0,0 +1,12 @@
+{application, sha3,
+ [
+  {description, ""},
+  {vsn, "0.1.0"},
+  {registered, []},
+  {applications, [
+                  kernel,
+                  stdlib
+                 ]},
+  {modules, [sha3]},
+  {env, []}
+ ]}.
diff --git a/src/sha3.erl b/src/sha3.erl
new file mode 100644
index 0000000..1b15118
--- /dev/null
+++ b/src/sha3.erl
@@ -0,0 +1,41 @@
+-module(sha3).
+
+-export([hash_init/1, hash_update/2, hash_final/1, hash/2]).
+
+-on_load(init/0).
+
+-type bitlen() :: 224 | 256 | 384 | 512.
+-type context() :: binary().
+-type digest() :: <<_:224>> | <<_:256>> | <<_:384>> | <<_:512>>.
+
+%% Body used by every NIF stub below. If a stub is ever executed, the native
+%% library was not loaded, so raise a nif_error that records the module and
+%% the source line of the stub.
+-define(nif_stub, nif_stub_error(?LINE)).
+nif_stub_error(Line) ->
+    erlang:nif_error({nif_not_loaded,module,?MODULE,line,Line}).
+
+%% Runs via -on_load: loads the sha3_nif shared library from the
+%% application's priv directory and returns the result of erlang:load_nif/2.
+init() ->
+    NifPath = filename:join(priv_dir(code:priv_dir(?MODULE)), sha3_nif),
+    erlang:load_nif(NifPath, 0).
+
+%% Resolves the priv directory from the result of code:priv_dir/1. When the
+%% application name is unknown to the code server, falls back to the "priv"
+%% directory next to the directory that holds this module's beam file.
+priv_dir({error, bad_name}) ->
+    BeamDir = filename:dirname(code:which(?MODULE)),
+    filename:join(filename:dirname(BeamDir), "priv");
+priv_dir(Dir) ->
+    Dir.
+
+%% @doc Creates a new hashing context for a digest of `BitLen' bits.
+%% `BitLen' must be 224, 256, 384 or 512.
+%% Implemented in the NIF; this Erlang body only runs when the native
+%% library is not loaded.
+-spec hash_init(bitlen()) -> context().
+hash_init(_BitLen) ->
+    ?nif_stub.
+
+%% @doc Feeds `Binary' into the hash and returns a new context.
+%% The context passed in is not modified, so it can be reused to hash
+%% alternative continuations of the same prefix.
+%% Implemented in the NIF; this Erlang body only runs when the native
+%% library is not loaded.
+-spec hash_update(context(), binary()) -> context().
+hash_update(_Context, _Binary) ->
+    ?nif_stub.
+
+%% @doc Returns the digest for the data fed into `Context' so far.
+%% The digest size is the bit length given to hash_init/1. The context is
+%% not consumed by this call.
+%% Implemented in the NIF; this Erlang body only runs when the native
+%% library is not loaded.
+-spec hash_final(context()) -> digest().
+hash_final(_Context) ->
+    ?nif_stub.
+
+%% @doc Computes the `BitLen'-bit digest of `Binary' in one call.
+%% `BitLen' must be 224, 256, 384 or 512.
+%% Implemented in the NIF; this Erlang body only runs when the native
+%% library is not loaded.
+-spec hash(bitlen(), binary()) -> digest().
+hash(_BitLen, _Binary) ->
+    ?nif_stub.
+
diff --git a/test/sha3_tests.erl b/test/sha3_tests.erl
new file mode 100644
index 0000000..c8b262f
--- /dev/null
+++ b/test/sha3_tests.erl
@@ -0,0 +1,48 @@
+-module(sha3_tests).
+
+-include_lib("eunit/include/eunit.hrl").
+
+%% 16-byte input message shared by the tests below.
+simple_data() ->
+    <<16#00112233445566778899AABBCCDDEEFF:128>>.
+
+%% Expected 224-bit digest of simple_data().
+simple_digest() ->
+    <<16#038907E89C919CD8F90A7FBC5A88FF9278108DAEF3EBCDA0CEB383E1:224>>.
+
+%% One-shot hashing of the sample message yields the expected 224-bit digest.
+simple_test() ->
+    ?assertEqual(simple_digest(), sha3:hash(224, simple_data())).
+
+%% The incremental API (init, update, final) yields the same digest as the
+%% one-shot call for the sample message.
+update_test() ->
+    Fresh = sha3:hash_init(224),
+    Fed = sha3:hash_update(Fresh, simple_data()),
+    ?assertEqual(simple_digest(), sha3:hash_final(Fed)).
+
+%% hash_update/2 must not modify the context it is given: updating the same
+%% initial context twice has to produce two independent contexts that both
+%% finalise to the digest of a single copy of the sample message.
+update_context_test() ->
+    Context1 = sha3:hash_init(224),
+    Context2 = sha3:hash_update(Context1, simple_data()),
+    Context3 = sha3:hash_update(Context1, simple_data()),
+    Digest1 = sha3:hash_final(Context2),
+    Digest2 = sha3:hash_final(Context3),
+    Expected = simple_digest(),
+    ?assertEqual(Expected, Digest1),
+    ?assertEqual(Expected, Digest2).
+
+%% Known-answer check for the 224-bit digest of the 16-byte sample message.
+hash_224_test() ->
+    ?assertEqual(<<16#038907E89C919CD8F90A7FBC5A88FF9278108DAEF3EBCDA0CEB383E1:224>>,
+        sha3:hash(224, <<16#00112233445566778899AABBCCDDEEFF:128>>)).
+
+%% Known-answer check for the 256-bit digest of the 16-byte sample message.
+hash_256_test() ->
+    ?assertEqual(<<16#22BCE46032802AF0ABFACF3768F7BE04A34F5F01DF60F44FFD52D3CA937350C0:256>>,
+        sha3:hash(256, <<16#00112233445566778899AABBCCDDEEFF:128>>)).
+
+%% Known-answer check for the 384-bit digest of the 16-byte sample message.
+hash_384_test() ->
+    ?assertEqual(<<16#25FAC1ADECBE1B254976FE32C2FE78829B23D7D84316141ECD208D6806A9DB4352A014ADA4106BA0D210DDA0FD18E150:384>>,
+        sha3:hash(384, <<16#00112233445566778899AABBCCDDEEFF:128>>)).
+
+%% Known-answer check for the 512-bit digest of the 16-byte sample message.
+hash_512_test() ->
+    ?assertEqual(<<16#94EE7851163C39C3489373AA0BF885D95925EAD7484C586D2E0D01D9C8069D3C30E2EEA2DC63A91B517FE53E43A31D764A2154A2DA92876366B138ABC4406805:512>>,
+        sha3:hash(512, <<16#00112233445566778899AABBCCDDEEFF:128>>)).
+