aboutsummaryrefslogtreecommitdiff
path: root/crypto
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2015-12-02 14:27:23 +0100
committerAndy Polyakov <appro@openssl.org>2015-12-10 13:11:26 +0100
commitbd30091c9725bdad1c82bce10839f33ceaa5623b (patch)
treebaa02efae3c6c4119246f268e9551449891649e4 /crypto
parent2fb5535e64c395f01151315474fd10574677e3d6 (diff)
downloadopenssl-bd30091c9725bdad1c82bce10839f33ceaa5623b.zip
openssl-bd30091c9725bdad1c82bce10839f33ceaa5623b.tar.gz
openssl-bd30091c9725bdad1c82bce10839f33ceaa5623b.tar.bz2
x86[_64] assembly pack: add optimized AES-NI OCB subroutines.
Reviewed-by: Richard Levitte <levitte@openssl.org>
Diffstat (limited to 'crypto')
-rw-r--r--crypto/aes/asm/aesni-x86.pl887
-rw-r--r--crypto/aes/asm/aesni-x86_64.pl1013
-rw-r--r--crypto/evp/e_aes.c23
-rw-r--r--crypto/modes/modes_lcl.h1
-rw-r--r--crypto/modes/ocb128.c173
5 files changed, 2007 insertions, 90 deletions
diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index 9b2e37a..536f035 100644
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -43,16 +43,20 @@
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
+# November 2015
+#
+# Add aesni_ocb_[en|de]crypt.
+
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
-# CBC en-/decrypt CTR XTS ECB
+# CBC en-/decrypt CTR XTS ECB OCB
# Westmere 3.77/1.37 1.37 1.52 1.27
-# * Bridge 5.07/0.98 0.99 1.09 0.91
-# Haswell 4.44/0.80 0.97 1.03 0.72
-# Silvermont 5.77/3.56 3.67 4.03 3.46
-# Bulldozer 5.80/0.98 1.05 1.24 0.93
+# * Bridge 5.07/0.98 0.99 1.09 0.91 1.10
+# Haswell 4.44/0.80 0.97 1.03 0.72 0.76
+# Silvermont 5.77/3.56 3.67 4.03 3.46 4.03
+# Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
@@ -1831,6 +1835,877 @@ if ($PREFIX eq "aesni") {
&mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
&function_end("aesni_xts_decrypt");
}
+
+######################################################################
+# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
+# const AES_KEY *key, unsigned int start_block_num,
+# unsigned char offset_i[16], const unsigned char L_[][16],
+# unsigned char checksum[16]);
+#
+{
+# offsets within stack frame
+my $checksum = 16*6;
+my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));
+
+# reassigned registers
+my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);
+# $l_, $blocks, $inp, $key are permanently allocated in registers;
+# remaining non-volatile ones are offloaded to stack, which even
+# stay invariant after written to stack.
+
+&function_begin("aesni_ocb_encrypt");
+ &mov ($rounds,&wparam(5)); # &offset_i
+ &mov ($rounds_,&wparam(7)); # &checksum
+
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i
+ &mov ($block,&wparam(4)); # start_block_num
+ &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum
+ &mov ($l_,&wparam(6)); # L_
+
+ &mov ($rounds,"esp");
+ &sub ("esp",$esp_off+4); # alloca
+ &and ("esp",-16); # align stack
+
+ &sub ($out,$inp);
+ &shl ($len,4);
+ &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6
+ &mov (&DWP($out_off,"esp"),$out);
+ &mov (&DWP($end_off,"esp"),$len);
+ &mov (&DWP($esp_off,"esp"),$rounds);
+
+ &mov ($rounds,&DWP(240,$key));
+
+ &test ($block,1);
+ &jnz (&label("odd"));
+
+ &bsf ($i3,$block);
+ &add ($block,1);
+ &shl ($i3,4);
+ &movdqu ($inout5,&QWP(0,$l_,$i3));
+ &mov ($i3,$key); # put aside key
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &lea ($inp,&DWP(16,$inp));
+
+ &pxor ($inout5,$rndkey0); # ^ last offset_i
+ &pxor ($rndkey1,$inout0); # checksum
+ &pxor ($inout0,$inout5); # ^ offset_i
+
+ &movdqa ($inout4,$rndkey1);
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+ &xorps ($inout0,$inout5); # ^ offset_i
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &movdqa ($rndkey1,$inout4); # pass the checksum
+
+ &movups (&QWP(-16,$out,$inp),$inout0); # store output
+
+ &mov ($rounds,&DWP(240,$i3));
+ &mov ($key,$i3); # restore key
+ &mov ($len,&DWP($end_off,"esp"));
+
+&set_label("odd");
+ &shl ($rounds,4);
+ &mov ($out,16);
+ &sub ($out,$rounds); # twisted rounds
+ &mov (&DWP($key_off,"esp"),$key);
+ &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule
+ &mov (&DWP($rounds_off,"esp"),$out);
+
+ &cmp ($inp,$len);
+ &ja (&label("short"));
+ &jmp (&label("grandloop"));
+
+&set_label("grandloop",32);
+ &lea ($i1,&DWP(1,$block));
+ &lea ($i3,&DWP(3,$block));
+ &lea ($i5,&DWP(5,$block));
+ &add ($block,6);
+ &bsf ($i1,$i1);
+ &bsf ($i3,$i3);
+ &bsf ($i5,$i5);
+ &shl ($i1,4);
+ &shl ($i3,4);
+ &shl ($i5,4);
+ &movdqu ($inout0,&QWP(0,$l_));
+ &movdqu ($inout1,&QWP(0,$l_,$i1));
+ &mov ($rounds,&DWP($rounds_off,"esp"));
+ &movdqa ($inout2,$inout0);
+ &movdqu ($inout3,&QWP(0,$l_,$i3));
+ &movdqa ($inout4,$inout0);
+ &movdqu ($inout5,&QWP(0,$l_,$i5));
+
+ &pxor ($inout0,$rndkey0); # ^ last offset_i
+ &pxor ($inout1,$inout0);
+ &movdqa (&QWP(16*0,"esp"),$inout0);
+ &pxor ($inout2,$inout1);
+ &movdqa (&QWP(16*1,"esp"),$inout1);
+ &pxor ($inout3,$inout2);
+ &movdqa (&QWP(16*2,"esp"),$inout2);
+ &pxor ($inout4,$inout3);
+ &movdqa (&QWP(16*3,"esp"),$inout3);
+ &pxor ($inout5,$inout4);
+ &movdqa (&QWP(16*4,"esp"),$inout4);
+ &movdqa (&QWP(16*5,"esp"),$inout5);
+
+ &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &movdqu ($inout5,&QWP(16*5,$inp));
+ &lea ($inp,&DWP(16*6,$inp));
+
+ &pxor ($rndkey1,$inout0); # checksum
+ &pxor ($inout0,$rndkey0); # ^ roundkey[0]
+ &pxor ($rndkey1,$inout1);
+ &pxor ($inout1,$rndkey0);
+ &pxor ($rndkey1,$inout2);
+ &pxor ($inout2,$rndkey0);
+ &pxor ($rndkey1,$inout3);
+ &pxor ($inout3,$rndkey0);
+ &pxor ($rndkey1,$inout4);
+ &pxor ($inout4,$rndkey0);
+ &pxor ($rndkey1,$inout5);
+ &pxor ($inout5,$rndkey0);
+ &movdqa (&QWP($checksum,"esp"),$rndkey1);
+
+ &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &pxor ($inout5,&QWP(16*5,"esp"));
+
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($inout1,$rndkey1);
+ &aesenc ($inout2,$rndkey1);
+ &aesenc ($inout3,$rndkey1);
+ &aesenc ($inout4,$rndkey1);
+ &aesenc ($inout5,$rndkey1);
+
+ &mov ($out,&DWP($out_off,"esp"));
+ &mov ($len,&DWP($end_off,"esp"));
+ &call ("_aesni_encrypt6_enter");
+
+ &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &pxor ($inout5,$rndkey0);
+ &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+
+ &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output
+ &movdqu (&QWP(-16*5,$out,$inp),$inout1);
+ &movdqu (&QWP(-16*4,$out,$inp),$inout2);
+ &movdqu (&QWP(-16*3,$out,$inp),$inout3);
+ &movdqu (&QWP(-16*2,$out,$inp),$inout4);
+ &movdqu (&QWP(-16*1,$out,$inp),$inout5);
+ &cmp ($inp,$len); # done yet?
+ &jb (&label("grandloop"));
+
+&set_label("short");
+ &add ($len,16*6);
+ &sub ($len,$inp);
+ &jz (&label("done"));
+
+ &cmp ($len,16*2);
+ &jb (&label("one"));
+ &je (&label("two"));
+
+ &cmp ($len,16*4);
+ &jb (&label("three"));
+ &je (&label("four"));
+
+ &lea ($i1,&DWP(1,$block));
+ &lea ($i3,&DWP(3,$block));
+ &bsf ($i1,$i1);
+ &bsf ($i3,$i3);
+ &shl ($i1,4);
+ &shl ($i3,4);
+ &movdqu ($inout0,&QWP(0,$l_));
+ &movdqu ($inout1,&QWP(0,$l_,$i1));
+ &mov ($rounds,&DWP($rounds_off,"esp"));
+ &movdqa ($inout2,$inout0);
+ &movdqu ($inout3,&QWP(0,$l_,$i3));
+ &movdqa ($inout4,$inout0);
+
+ &pxor ($inout0,$rndkey0); # ^ last offset_i
+ &pxor ($inout1,$inout0);
+ &movdqa (&QWP(16*0,"esp"),$inout0);
+ &pxor ($inout2,$inout1);
+ &movdqa (&QWP(16*1,"esp"),$inout1);
+ &pxor ($inout3,$inout2);
+ &movdqa (&QWP(16*2,"esp"),$inout2);
+ &pxor ($inout4,$inout3);
+ &movdqa (&QWP(16*3,"esp"),$inout3);
+ &pxor ($inout5,$inout4);
+ &movdqa (&QWP(16*4,"esp"),$inout4);
+
+ &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout5,$inout5);
+
+ &pxor ($rndkey1,$inout0); # checksum
+ &pxor ($inout0,$rndkey0); # ^ roundkey[0]
+ &pxor ($rndkey1,$inout1);
+ &pxor ($inout1,$rndkey0);
+ &pxor ($rndkey1,$inout2);
+ &pxor ($inout2,$rndkey0);
+ &pxor ($rndkey1,$inout3);
+ &pxor ($inout3,$rndkey0);
+ &pxor ($rndkey1,$inout4);
+ &pxor ($inout4,$rndkey0);
+ &movdqa (&QWP($checksum,"esp"),$rndkey1);
+
+ &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,&QWP(16*4,"esp"));
+
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($inout1,$rndkey1);
+ &aesenc ($inout2,$rndkey1);
+ &aesenc ($inout3,$rndkey1);
+ &aesenc ($inout4,$rndkey1);
+ &aesenc ($inout5,$rndkey1);
+
+ &mov ($out,&DWP($out_off,"esp"));
+ &call ("_aesni_encrypt6_enter");
+
+ &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,$rndkey0);
+ &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+
+ &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output
+ &movdqu (&QWP(16*1,$out,$inp),$inout1);
+ &movdqu (&QWP(16*2,$out,$inp),$inout2);
+ &movdqu (&QWP(16*3,$out,$inp),$inout3);
+ &movdqu (&QWP(16*4,$out,$inp),$inout4);
+
+ &jmp (&label("done"));
+
+&set_label("one",16);
+ &movdqu ($inout5,&QWP(0,$l_));
+ &mov ($key,&DWP($key_off,"esp")); # restore key
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &mov ($rounds,&DWP(240,$key));
+
+ &pxor ($inout5,$rndkey0); # ^ last offset_i
+ &pxor ($rndkey1,$inout0); # checksum
+ &pxor ($inout0,$inout5); # ^ offset_i
+
+ &movdqa ($inout4,$rndkey1);
+ &mov ($out,&DWP($out_off,"esp"));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+ &xorps ($inout0,$inout5); # ^ offset_i
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &movdqa ($rndkey1,$inout4); # pass the checksum
+ &movups (&QWP(0,$out,$inp),$inout0);
+
+ &jmp (&label("done"));
+
+&set_label("two",16);
+ &lea ($i1,&DWP(1,$block));
+ &mov ($key,&DWP($key_off,"esp")); # restore key
+ &bsf ($i1,$i1);
+ &shl ($i1,4);
+ &movdqu ($inout4,&QWP(0,$l_));
+ &movdqu ($inout5,&QWP(0,$l_,$i1));
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &mov ($rounds,&DWP(240,$key));
+
+ &pxor ($inout4,$rndkey0); # ^ last offset_i
+ &pxor ($inout5,$inout4);
+
+ &pxor ($rndkey1,$inout0); # checksum
+ &pxor ($inout0,$inout4); # ^ offset_i
+ &pxor ($rndkey1,$inout1);
+ &pxor ($inout1,$inout5);
+
+ &movdqa ($inout3,$rndkey1)
+ &mov ($out,&DWP($out_off,"esp"));
+ &call ("_aesni_encrypt2");
+
+ &xorps ($inout0,$inout4); # ^ offset_i
+ &xorps ($inout1,$inout5);
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &movdqa ($rndkey1,$inout3); # pass the checksum
+ &movups (&QWP(16*0,$out,$inp),$inout0); # store output
+ &movups (&QWP(16*1,$out,$inp),$inout1);
+
+ &jmp (&label("done"));
+
+&set_label("three",16);
+ &lea ($i1,&DWP(1,$block));
+ &mov ($key,&DWP($key_off,"esp")); # restore key
+ &bsf ($i1,$i1);
+ &shl ($i1,4);
+ &movdqu ($inout3,&QWP(0,$l_));
+ &movdqu ($inout4,&QWP(0,$l_,$i1));
+ &movdqa ($inout5,$inout3);
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &mov ($rounds,&DWP(240,$key));
+
+ &pxor ($inout3,$rndkey0); # ^ last offset_i
+ &pxor ($inout4,$inout3);
+ &pxor ($inout5,$inout4);
+
+ &pxor ($rndkey1,$inout0); # checksum
+ &pxor ($inout0,$inout3); # ^ offset_i
+ &pxor ($rndkey1,$inout1);
+ &pxor ($inout1,$inout4);
+ &pxor ($rndkey1,$inout2);
+ &pxor ($inout2,$inout5);
+
+ &movdqa (&QWP($checksum,"esp"),$rndkey1);
+ &mov ($out,&DWP($out_off,"esp"));
+ &call ("_aesni_encrypt3");
+
+ &xorps ($inout0,$inout3); # ^ offset_i
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+ &movups (&QWP(16*0,$out,$inp),$inout0); # store output
+ &movups (&QWP(16*1,$out,$inp),$inout1);
+ &movups (&QWP(16*2,$out,$inp),$inout2);
+
+ &jmp (&label("done"));
+
+&set_label("four",16);
+ &lea ($i1,&DWP(1,$block));
+ &lea ($i3,&DWP(3,$block));
+ &bsf ($i1,$i1);
+ &bsf ($i3,$i3);
+ &mov ($key,&DWP($key_off,"esp")); # restore key
+ &shl ($i1,4);
+ &shl ($i3,4);
+ &movdqu ($inout2,&QWP(0,$l_));
+ &movdqu ($inout3,&QWP(0,$l_,$i1));
+ &movdqa ($inout4,$inout2);
+ &movdqu ($inout5,&QWP(0,$l_,$i3));
+
+ &pxor ($inout2,$rndkey0); # ^ last offset_i
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &pxor ($inout3,$inout2);
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &pxor ($inout4,$inout3);
+ &movdqa (&QWP(16*0,"esp"),$inout2);
+ &pxor ($inout5,$inout4);
+ &movdqa (&QWP(16*1,"esp"),$inout3);
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &mov ($rounds,&DWP(240,$key));
+
+ &pxor ($rndkey1,$inout0); # checksum
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &pxor ($rndkey1,$inout1);
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($rndkey1,$inout2);
+ &pxor ($inout2,$inout4);
+ &pxor ($rndkey1,$inout3);
+ &pxor ($inout3,$inout5);
+
+ &movdqa (&QWP($checksum,"esp"),$rndkey1)
+ &mov ($out,&DWP($out_off,"esp"));
+ &call ("_aesni_encrypt4");
+
+ &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout4);
+ &movups (&QWP(16*0,$out,$inp),$inout0); # store output
+ &xorps ($inout3,$inout5);
+ &movups (&QWP(16*1,$out,$inp),$inout1);
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &movups (&QWP(16*2,$out,$inp),$inout2);
+ &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+ &movups (&QWP(16*3,$out,$inp),$inout3);
+
+&set_label("done");
+ &mov ($key,&DWP($esp_off,"esp"));
+ &pxor ($inout0,$inout0); # clear register bank
+ &pxor ($inout1,$inout1);
+ &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack
+ &pxor ($inout2,$inout2);
+ &movdqa (&QWP(16*1,"esp"),$inout0);
+ &pxor ($inout3,$inout3);
+ &movdqa (&QWP(16*2,"esp"),$inout0);
+ &pxor ($inout4,$inout4);
+ &movdqa (&QWP(16*3,"esp"),$inout0);
+ &pxor ($inout5,$inout5);
+ &movdqa (&QWP(16*4,"esp"),$inout0);
+ &movdqa (&QWP(16*5,"esp"),$inout0);
+ &movdqa (&QWP(16*6,"esp"),$inout0);
+
+ &lea ("esp",&DWP(0,$key));
+ &mov ($rounds,&wparam(5)); # &offset_i
+ &mov ($rounds_,&wparam(7)); # &checksum
+ &movdqu (&QWP(0,$rounds),$rndkey0);
+ &pxor ($rndkey0,$rndkey0);
+ &movdqu (&QWP(0,$rounds_),$rndkey1);
+ &pxor ($rndkey1,$rndkey1);
+&function_end("aesni_ocb_encrypt");
+
+&function_begin("aesni_ocb_decrypt");
+ &mov ($rounds,&wparam(5)); # &offset_i
+ &mov ($rounds_,&wparam(7)); # &checksum
+
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i
+ &mov ($block,&wparam(4)); # start_block_num
+ &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum
+ &mov ($l_,&wparam(6)); # L_
+
+ &mov ($rounds,"esp");
+ &sub ("esp",$esp_off+4); # alloca
+ &and ("esp",-16); # align stack
+
+ &sub ($out,$inp);
+ &shl ($len,4);
+ &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6
+ &mov (&DWP($out_off,"esp"),$out);
+ &mov (&DWP($end_off,"esp"),$len);
+ &mov (&DWP($esp_off,"esp"),$rounds);
+
+ &mov ($rounds,&DWP(240,$key));
+
+ &test ($block,1);
+ &jnz (&label("odd"));
+
+ &bsf ($i3,$block);
+ &add ($block,1);
+ &shl ($i3,4);
+ &movdqu ($inout5,&QWP(0,$l_,$i3));
+ &mov ($i3,$key); # put aside key
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &lea ($inp,&DWP(16,$inp));
+
+ &pxor ($inout5,$rndkey0); # ^ last offset_i
+ &pxor ($inout0,$inout5); # ^ offset_i
+
+ &movdqa ($inout4,$rndkey1);
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+
+ &xorps ($inout0,$inout5); # ^ offset_i
+ &movaps ($rndkey1,$inout4); # pass the checksum
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &xorps ($rndkey1,$inout0); # checksum
+ &movups (&QWP(-16,$out,$inp),$inout0); # store output
+
+ &mov ($rounds,&DWP(240,$i3));
+ &mov ($key,$i3); # restore key
+ &mov ($len,&DWP($end_off,"esp"));
+
+&set_label("odd");
+ &shl ($rounds,4);
+ &mov ($out,16);
+ &sub ($out,$rounds); # twisted rounds
+ &mov (&DWP($key_off,"esp"),$key);
+ &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule
+ &mov (&DWP($rounds_off,"esp"),$out);
+
+ &cmp ($inp,$len);
+ &ja (&label("short"));
+ &jmp (&label("grandloop"));
+
+&set_label("grandloop",32);
+ &lea ($i1,&DWP(1,$block));
+ &lea ($i3,&DWP(3,$block));
+ &lea ($i5,&DWP(5,$block));
+ &add ($block,6);
+ &bsf ($i1,$i1);
+ &bsf ($i3,$i3);
+ &bsf ($i5,$i5);
+ &shl ($i1,4);
+ &shl ($i3,4);
+ &shl ($i5,4);
+ &movdqu ($inout0,&QWP(0,$l_));
+ &movdqu ($inout1,&QWP(0,$l_,$i1));
+ &mov ($rounds,&DWP($rounds_off,"esp"));
+ &movdqa ($inout2,$inout0);
+ &movdqu ($inout3,&QWP(0,$l_,$i3));
+ &movdqa ($inout4,$inout0);
+ &movdqu ($inout5,&QWP(0,$l_,$i5));
+
+ &pxor ($inout0,$rndkey0); # ^ last offset_i
+ &pxor ($inout1,$inout0);
+ &movdqa (&QWP(16*0,"esp"),$inout0);
+ &pxor ($inout2,$inout1);
+ &movdqa (&QWP(16*1,"esp"),$inout1);
+ &pxor ($inout3,$inout2);
+ &movdqa (&QWP(16*2,"esp"),$inout2);
+ &pxor ($inout4,$inout3);
+ &movdqa (&QWP(16*3,"esp"),$inout3);
+ &pxor ($inout5,$inout4);
+ &movdqa (&QWP(16*4,"esp"),$inout4);
+ &movdqa (&QWP(16*5,"esp"),$inout5);
+
+ &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &movdqu ($inout5,&QWP(16*5,$inp));
+ &lea ($inp,&DWP(16*6,$inp));
+
+ &movdqa (&QWP($checksum,"esp"),$rndkey1);
+ &pxor ($inout0,$rndkey0); # ^ roundkey[0]
+ &pxor ($inout1,$rndkey0);
+ &pxor ($inout2,$rndkey0);
+ &pxor ($inout3,$rndkey0);
+ &pxor ($inout4,$rndkey0);
+ &pxor ($inout5,$rndkey0);
+
+ &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &pxor ($inout5,&QWP(16*5,"esp"));
+
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
+ &aesdec ($inout0,$rndkey1);
+ &aesdec ($inout1,$rndkey1);
+ &aesdec ($inout2,$rndkey1);
+ &aesdec ($inout3,$rndkey1);
+ &aesdec ($inout4,$rndkey1);
+ &aesdec ($inout5,$rndkey1);
+
+ &mov ($out,&DWP($out_off,"esp"));
+ &mov ($len,&DWP($end_off,"esp"));
+ &call ("_aesni_decrypt6_enter");
+
+ &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &movdqa ($rndkey1,&QWP($checksum,"esp"));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &pxor ($inout5,$rndkey0);
+
+ &pxor ($rndkey1,$inout0); # checksum
+ &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output
+ &pxor ($rndkey1,$inout1);
+ &movdqu (&QWP(-16*5,$out,$inp),$inout1);
+ &pxor ($rndkey1,$inout2);
+ &movdqu (&QWP(-16*4,$out,$inp),$inout2);
+ &pxor ($rndkey1,$inout3);
+ &movdqu (&QWP(-16*3,$out,$inp),$inout3);
+ &pxor ($rndkey1,$inout4);
+ &movdqu (&QWP(-16*2,$out,$inp),$inout4);
+ &pxor ($rndkey1,$inout5);
+ &movdqu (&QWP(-16*1,$out,$inp),$inout5);
+ &cmp ($inp,$len); # done yet?
+ &jb (&label("grandloop"));
+
+&set_label("short");
+ &add ($len,16*6);
+ &sub ($len,$inp);
+ &jz (&label("done"));
+
+ &cmp ($len,16*2);
+ &jb (&label("one"));
+ &je (&label("two"));
+
+ &cmp ($len,16*4);
+ &jb (&label("three"));
+ &je (&label("four"));
+
+ &lea ($i1,&DWP(1,$block));
+ &lea ($i3,&DWP(3,$block));
+ &bsf ($i1,$i1);
+ &bsf ($i3,$i3);
+ &shl ($i1,4);
+ &shl ($i3,4);
+ &movdqu ($inout0,&QWP(0,$l_));
+ &movdqu ($inout1,&QWP(0,$l_,$i1));
+ &mov ($rounds,&DWP($rounds_off,"esp"));
+ &movdqa ($inout2,$inout0);
+ &movdqu ($inout3,&QWP(0,$l_,$i3));
+ &movdqa ($inout4,$inout0);
+
+ &pxor ($inout0,$rndkey0); # ^ last offset_i
+ &pxor ($inout1,$inout0);
+ &movdqa (&QWP(16*0,"esp"),$inout0);
+ &pxor ($inout2,$inout1);
+ &movdqa (&QWP(16*1,"esp"),$inout1);
+ &pxor ($inout3,$inout2);
+ &movdqa (&QWP(16*2,"esp"),$inout2);
+ &pxor ($inout4,$inout3);
+ &movdqa (&QWP(16*3,"esp"),$inout3);
+ &pxor ($inout5,$inout4);
+ &movdqa (&QWP(16*4,"esp"),$inout4);
+
+ &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout5,$inout5);
+
+ &movdqa (&QWP($checksum,"esp"),$rndkey1);
+ &pxor ($inout0,$rndkey0); # ^ roundkey[0]
+ &pxor ($inout1,$rndkey0);
+ &pxor ($inout2,$rndkey0);
+ &pxor ($inout3,$rndkey0);
+ &pxor ($inout4,$rndkey0);
+
+ &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,&QWP(16*4,"esp"));
+
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
+ &aesdec ($inout0,$rndkey1);
+ &aesdec ($inout1,$rndkey1);
+ &aesdec ($inout2,$rndkey1);
+ &aesdec ($inout3,$rndkey1);
+ &aesdec ($inout4,$rndkey1);
+ &aesdec ($inout5,$rndkey1);
+
+ &mov ($out,&DWP($out_off,"esp"));
+ &call ("_aesni_decrypt6_enter");
+
+ &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &movdqa ($rndkey1,&QWP($checksum,"esp"));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,$rndkey0);
+
+ &pxor ($rndkey1,$inout0); # checksum
+ &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output
+ &pxor ($rndkey1,$inout1);
+ &movdqu (&QWP(16*1,$out,$inp),$inout1);
+ &pxor ($rndkey1,$inout2);
+ &movdqu (&QWP(16*2,$out,$inp),$inout2);
+ &pxor ($rndkey1,$inout3);
+ &movdqu (&QWP(16*3,$out,$inp),$inout3);
+ &pxor ($rndkey1,$inout4);
+ &movdqu (&QWP(16*4,$out,$inp),$inout4);
+
+ &jmp (&label("done"));
+
+&set_label("one",16);
+ &movdqu ($inout5,&QWP(0,$l_));
+ &mov ($key,&DWP($key_off,"esp")); # restore key
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &mov ($rounds,&DWP(240,$key));
+
+ &pxor ($inout5,$rndkey0); # ^ last offset_i
+ &pxor ($inout0,$inout5); # ^ offset_i
+
+ &movdqa ($inout4,$rndkey1);
+ &mov ($out,&DWP($out_off,"esp"));
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+
+ &xorps ($inout0,$inout5); # ^ offset_i
+ &movaps ($rndkey1,$inout4); # pass the checksum
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &xorps ($rndkey1,$inout0); # checksum
+ &movups (&QWP(0,$out,$inp),$inout0);
+
+ &jmp (&label("done"));
+
+&set_label("two",16);
+ &lea ($i1,&DWP(1,$block));
+ &mov ($key,&DWP($key_off,"esp")); # restore key
+ &bsf ($i1,$i1);
+ &shl ($i1,4);
+ &movdqu ($inout4,&QWP(0,$l_));
+ &movdqu ($inout5,&QWP(0,$l_,$i1));
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &mov ($rounds,&DWP(240,$key));
+
+ &movdqa ($inout3,$rndkey1);
+ &pxor ($inout4,$rndkey0); # ^ last offset_i
+ &pxor ($inout5,$inout4);
+
+ &pxor ($inout0,$inout4); # ^ offset_i
+ &pxor ($inout1,$inout5);
+
+ &mov ($out,&DWP($out_off,"esp"));
+ &call ("_aesni_decrypt2");
+
+ &xorps ($inout0,$inout4); # ^ offset_i
+ &xorps ($inout1,$inout5);
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &xorps ($inout3,$inout0); # checksum
+ &movups (&QWP(16*0,$out,$inp),$inout0); # store output
+ &xorps ($inout3,$inout1);
+ &movups (&QWP(16*1,$out,$inp),$inout1);
+ &movaps ($rndkey1,$inout3); # pass the checksum
+
+ &jmp (&label("done"));
+
+&set_label("three",16);
+ &lea ($i1,&DWP(1,$block));
+ &mov ($key,&DWP($key_off,"esp")); # restore key
+ &bsf ($i1,$i1);
+ &shl ($i1,4);
+ &movdqu ($inout3,&QWP(0,$l_));
+ &movdqu ($inout4,&QWP(0,$l_,$i1));
+ &movdqa ($inout5,$inout3);
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &mov ($rounds,&DWP(240,$key));
+
+ &movdqa (&QWP($checksum,"esp"),$rndkey1);
+ &pxor ($inout3,$rndkey0); # ^ last offset_i
+ &pxor ($inout4,$inout3);
+ &pxor ($inout5,$inout4);
+
+ &pxor ($inout0,$inout3); # ^ offset_i
+ &pxor ($inout1,$inout4);
+ &pxor ($inout2,$inout5);
+
+ &mov ($out,&DWP($out_off,"esp"));
+ &call ("_aesni_decrypt3");
+
+ &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+ &xorps ($inout0,$inout3); # ^ offset_i
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out,$inp),$inout0); # store output
+ &pxor ($rndkey1,$inout0); # checksum
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &movups (&QWP(16*1,$out,$inp),$inout1);
+ &pxor ($rndkey1,$inout1);
+ &movups (&QWP(16*2,$out,$inp),$inout2);
+ &pxor ($rndkey1,$inout2);
+
+ &jmp (&label("done"));
+
+&set_label("four",16);
+ &lea ($i1,&DWP(1,$block));
+ &lea ($i3,&DWP(3,$block));
+ &bsf ($i1,$i1);
+ &bsf ($i3,$i3);
+ &mov ($key,&DWP($key_off,"esp")); # restore key
+ &shl ($i1,4);
+ &shl ($i3,4);
+ &movdqu ($inout2,&QWP(0,$l_));
+ &movdqu ($inout3,&QWP(0,$l_,$i1));
+ &movdqa ($inout4,$inout2);
+ &movdqu ($inout5,&QWP(0,$l_,$i3));
+
+ &pxor ($inout2,$rndkey0); # ^ last offset_i
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &pxor ($inout3,$inout2);
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &pxor ($inout4,$inout3);
+ &movdqa (&QWP(16*0,"esp"),$inout2);
+ &pxor ($inout5,$inout4);
+ &movdqa (&QWP(16*1,"esp"),$inout3);
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &mov ($rounds,&DWP(240,$key));
+
+ &movdqa (&QWP($checksum,"esp"),$rndkey1);
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,$inout4);
+ &pxor ($inout3,$inout5);
+
+ &mov ($out,&DWP($out_off,"esp"));
+ &call ("_aesni_decrypt4");
+
+ &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+ &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout4);
+ &movups (&QWP(16*0,$out,$inp),$inout0); # store output
+ &pxor ($rndkey1,$inout0); # checksum
+ &xorps ($inout3,$inout5);
+ &movups (&QWP(16*1,$out,$inp),$inout1);
+ &pxor ($rndkey1,$inout1);
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &movups (&QWP(16*2,$out,$inp),$inout2);
+ &pxor ($rndkey1,$inout2);
+ &movups (&QWP(16*3,$out,$inp),$inout3);
+ &pxor ($rndkey1,$inout3);
+
+&set_label("done");
+ &mov ($key,&DWP($esp_off,"esp"));
+ &pxor ($inout0,$inout0); # clear register bank
+ &pxor ($inout1,$inout1);
+ &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack
+ &pxor ($inout2,$inout2);
+ &movdqa (&QWP(16*1,"esp"),$inout0);
+ &pxor ($inout3,$inout3);
+ &movdqa (&QWP(16*2,"esp"),$inout0);
+ &pxor ($inout4,$inout4);
+ &movdqa (&QWP(16*3,"esp"),$inout0);
+ &pxor ($inout5,$inout5);
+ &movdqa (&QWP(16*4,"esp"),$inout0);
+ &movdqa (&QWP(16*5,"esp"),$inout0);
+ &movdqa (&QWP(16*6,"esp"),$inout0);
+
+ &lea ("esp",&DWP(0,$key));
+ &mov ($rounds,&wparam(5)); # &offset_i
+ &mov ($rounds_,&wparam(7)); # &checksum
+ &movdqu (&QWP(0,$rounds),$rndkey0);
+ &pxor ($rndkey0,$rndkey0);
+ &movdqu (&QWP(0,$rounds_),$rndkey1);
+ &pxor ($rndkey1,$rndkey1);
+&function_end("aesni_ocb_decrypt");
+}
}
######################################################################
@@ -2419,7 +3294,7 @@ if ($PREFIX eq "aesni") {
&pxor ("xmm3","xmm3");
&aesenclast ("xmm2","xmm3");
- &movdqa ("xmm3","xmm1")
+ &movdqa ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm3","xmm1");
&pslldq ("xmm1",4);
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index 6037e9e..6e41a1ae 100644
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -157,17 +157,22 @@
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.
+# November 2015
+#
+# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
+# chosen to be 6x.
+
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
-# CBC en-/decrypt CTR XTS ECB
+# CBC en-/decrypt CTR XTS ECB OCB
# Westmere 3.77/1.25 1.25 1.25 1.26
-# * Bridge 5.07/0.74 0.75 0.90 0.85
-# Haswell 4.44/0.63 0.63 0.73 0.63
+# * Bridge 5.07/0.74 0.75 0.90 0.85 0.98
+# Haswell 4.44/0.63 0.63 0.73 0.63 0.70
# Skylake 2.62/0.63 0.63 0.63 0.63
-# Silvermont 5.75/3.54 3.56 4.12 3.87(*)
-# Bulldozer 5.77/0.70 0.72 0.90 0.70
+# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11
+# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95
#
# (*) Atom Silvermont ECB result is suboptimal because of penalties
# incurred by operations on %xmm8-15. As ECB is not considered
@@ -2709,6 +2714,925 @@ $code.=<<___;
ret
.size aesni_xts_decrypt,.-aesni_xts_decrypt
___
+}
+
+######################################################################
+# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
+# const AES_KEY *key, unsigned int start_block_num,
+# unsigned char offset_i[16], const unsigned char L_[][16],
+# unsigned char checksum[16]);
+#
+{
+my @offset=map("%xmm$_",(10..15));
+my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
+my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
+my ($L_p,$checksum_p) = ("%rbx","%rbp");
+my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
+my $seventh_arg = $win64 ? 56 : 8;
+my $blocks = $len;
+
+$code.=<<___;
+.globl aesni_ocb_encrypt
+.type aesni_ocb_encrypt,\@function,6
+.align 32
+aesni_ocb_encrypt:
+ lea (%rsp),%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp),%rsp
+ movaps %xmm6,0x00(%rsp) # offload everything
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,0x70(%rsp)
+ movaps %xmm14,0x80(%rsp)
+ movaps %xmm15,0x90(%rsp)
+.Locb_enc_body:
+___
+$code.=<<___;
+ mov $seventh_arg(%rax),$L_p # 7th argument
+ mov $seventh_arg+8(%rax),$checksum_p# 8th argument
+
+ mov 240($key),$rnds_
+ mov $key,$key_
+ shl \$4,$rnds_
+ $movkey ($key),$rndkey0l # round[0]
+ $movkey 16($key,$rnds_),$rndkey1 # round[last]
+
+ movdqu ($offset_p),@offset[5] # load last offset_i
+ pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
+ pxor $rndkey1,@offset[5] # offset_i ^ round[last]
+
+ mov \$16+32,$rounds
+ lea 32($key_,$rnds_),$key
+ $movkey 16($key_),$rndkey1 # round[1]
+ sub %r10,%rax # twisted $rounds
+ mov %rax,%r10 # backup twisted $rounds
+
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ movdqu ($checksum_p),$checksum # load checksum
+
+ test \$1,$block_num # is first block number odd?
+ jnz .Locb_enc_odd
+
+ bsf $block_num,$i1
+ add \$1,$block_num
+ shl \$4,$i1
+ movdqu ($L_p,$i1),$inout5 # borrow
+ movdqu ($inp),$inout0
+ lea 16($inp),$inp
+
+ call __ocb_encrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,($out)
+ lea 16($out),$out
+ sub \$1,$blocks
+ jz .Locb_enc_done
+
+.Locb_enc_odd:
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ lea 6($block_num),$block_num
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ shl \$4,$i5
+
+ sub \$6,$blocks
+ jc .Locb_enc_short
+ jmp .Locb_enc_grandloop
+
+.align 32
+.Locb_enc_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqu `16*1`($inp),$inout1
+ movdqu `16*2`($inp),$inout2
+ movdqu `16*3`($inp),$inout3
+ movdqu `16*4`($inp),$inout4
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+
+ call __ocb_encrypt6
+
+ movups $inout0,`16*0`($out) # store output
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+ movups $inout4,`16*4`($out)
+ movups $inout5,`16*5`($out)
+ lea `16*6`($out),$out
+ sub \$6,$blocks
+ jnc .Locb_enc_grandloop
+
+.Locb_enc_short:
+ add \$6,$blocks
+ jz .Locb_enc_done
+
+ movdqu `16*0`($inp),$inout0
+ cmp \$2,$blocks
+ jb .Locb_enc_one
+ movdqu `16*1`($inp),$inout1
+ je .Locb_enc_two
+
+ movdqu `16*2`($inp),$inout2
+ cmp \$4,$blocks
+ jb .Locb_enc_three
+ movdqu `16*3`($inp),$inout3
+ je .Locb_enc_four
+
+ movdqu `16*4`($inp),$inout4
+ pxor $inout5,$inout5
+
+ call __ocb_encrypt6
+
+ movdqa @offset[4],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+ movups $inout4,`16*4`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_one:
+ movdqa @offset[0],$inout5 # borrow
+
+ call __ocb_encrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,`16*0`($out)
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_two:
+ pxor $inout2,$inout2
+ pxor $inout3,$inout3
+
+ call __ocb_encrypt4
+
+ movdqa @offset[1],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_three:
+ pxor $inout3,$inout3
+
+ call __ocb_encrypt4
+
+ movdqa @offset[2],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_four:
+ call __ocb_encrypt4
+
+ movdqa @offset[3],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+
+.Locb_enc_done:
+ pxor $rndkey0,@offset[5] # "remove" round[last]
+ movdqu $checksum,($checksum_p) # store checksum
+ movdqu @offset[5],($offset_p) # store last offset_i
+
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+ movaps 0x00(%rsp),%xmm6
+ movaps %xmm0,0x00(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ movaps 0x40(%rsp),%xmm10
+ movaps %xmm0,0x40(%rsp)
+ movaps 0x50(%rsp),%xmm11
+ movaps %xmm0,0x50(%rsp)
+ movaps 0x60(%rsp),%xmm12
+ movaps %xmm0,0x60(%rsp)
+ movaps 0x70(%rsp),%xmm13
+ movaps %xmm0,0x70(%rsp)
+ movaps 0x80(%rsp),%xmm14
+ movaps %xmm0,0x80(%rsp)
+ movaps 0x90(%rsp),%xmm15
+ movaps %xmm0,0x90(%rsp)
+ lea 0xa0+0x28(%rsp),%rax
+.Locb_enc_pop:
+ lea 0xa0(%rsp),%rsp
+___
+$code.=<<___;
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+.Locb_enc_epilogue:
+ ret
+.size aesni_ocb_encrypt,.-aesni_ocb_encrypt
+
+.type __ocb_encrypt6,\@abi-omnipotent
+.align 32
+__ocb_encrypt6:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ movdqa @offset[0],@offset[4]
+ pxor @offset[5],@offset[0]
+ movdqu ($L_p,$i5),@offset[5]
+ pxor @offset[0],@offset[1]
+ pxor $inout0,$checksum # accumulate checksum
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor $inout1,$checksum
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor $inout2,$checksum
+ pxor @offset[2],$inout2
+ pxor @offset[3],@offset[4]
+ pxor $inout3,$checksum
+ pxor @offset[3],$inout3
+ pxor @offset[4],@offset[5]
+ pxor $inout4,$checksum
+ pxor @offset[4],$inout4
+ pxor $inout5,$checksum
+ pxor @offset[5],$inout5
+ $movkey 32($key_),$rndkey0
+
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ add \$6,$block_num
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ aesenc $rndkey1,$inout4
+ pxor $rndkey0l,@offset[3]
+ pxor $rndkey0l,@offset[4]
+ aesenc $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,@offset[5]
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ jmp .Locb_enc_loop6
+
+.align 32
+.Locb_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop6
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+ shl \$4,$i5
+
+ aesenclast @offset[0],$inout0
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ mov %r10,%rax # restore twisted rounds
+ aesenclast @offset[1],$inout1
+ aesenclast @offset[2],$inout2
+ aesenclast @offset[3],$inout3
+ aesenclast @offset[4],$inout4
+ aesenclast @offset[5],$inout5
+ ret
+.size __ocb_encrypt6,.-__ocb_encrypt6
+
+.type __ocb_encrypt4,\@abi-omnipotent
+.align 32
+__ocb_encrypt4:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ pxor @offset[5],@offset[0]
+ pxor @offset[0],@offset[1]
+ pxor $inout0,$checksum # accumulate checksum
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor $inout1,$checksum
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor $inout2,$checksum
+ pxor @offset[2],$inout2
+ pxor $inout3,$checksum
+ pxor @offset[3],$inout3
+ $movkey 32($key_),$rndkey0
+
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ pxor $rndkey0l,@offset[3]
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey 48($key_),$rndkey1
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_enc_loop4
+
+.align 32
+.Locb_enc_loop4:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop4
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey 16($key_),$rndkey1
+ mov %r10,%rax # restore twisted rounds
+
+ aesenclast @offset[0],$inout0
+ aesenclast @offset[1],$inout1
+ aesenclast @offset[2],$inout2
+ aesenclast @offset[3],$inout3
+ ret
+.size __ocb_encrypt4,.-__ocb_encrypt4
+
+.type __ocb_encrypt1,\@abi-omnipotent
+.align 32
+__ocb_encrypt1:
+ pxor @offset[5],$inout5 # offset_i
+ pxor $rndkey0l,$inout5 # offset_i ^ round[0]
+ pxor $inout0,$checksum # accumulate checksum
+ pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
+ $movkey 32($key_),$rndkey0
+
+ aesenc $rndkey1,$inout0
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,$inout5 # offset_i ^ round[last]
+
+ aesenc $rndkey0,$inout0
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_enc_loop1
+
+.align 32
+.Locb_enc_loop1:
+ aesenc $rndkey1,$inout0
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop1
+
+ aesenc $rndkey1,$inout0
+ $movkey 16($key_),$rndkey1 # redundant in tail
+ mov %r10,%rax # restore twisted rounds
+
+ aesenclast $inout5,$inout0
+ ret
+.size __ocb_encrypt1,.-__ocb_encrypt1
+
+.globl aesni_ocb_decrypt
+.type aesni_ocb_decrypt,\@function,6
+.align 32
+aesni_ocb_decrypt:
+ lea (%rsp),%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp),%rsp
+ movaps %xmm6,0x00(%rsp) # offload everything
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,0x70(%rsp)
+ movaps %xmm14,0x80(%rsp)
+ movaps %xmm15,0x90(%rsp)
+.Locb_dec_body:
+___
+$code.=<<___;
+ mov $seventh_arg(%rax),$L_p # 7th argument
+ mov $seventh_arg+8(%rax),$checksum_p# 8th argument
+
+ mov 240($key),$rnds_
+ mov $key,$key_
+ shl \$4,$rnds_
+ $movkey ($key),$rndkey0l # round[0]
+ $movkey 16($key,$rnds_),$rndkey1 # round[last]
+
+ movdqu ($offset_p),@offset[5] # load last offset_i
+ pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
+ pxor $rndkey1,@offset[5] # offset_i ^ round[last]
+
+ mov \$16+32,$rounds
+ lea 32($key_,$rnds_),$key
+ $movkey 16($key_),$rndkey1 # round[1]
+ sub %r10,%rax # twisted $rounds
+ mov %rax,%r10 # backup twisted $rounds
+
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ movdqu ($checksum_p),$checksum # load checksum
+
+ test \$1,$block_num # is first block number odd?
+ jnz .Locb_dec_odd
+
+ bsf $block_num,$i1
+ add \$1,$block_num
+ shl \$4,$i1
+ movdqu ($L_p,$i1),$inout5 # borrow
+ movdqu ($inp),$inout0
+ lea 16($inp),$inp
+
+ call __ocb_decrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,($out)
+ xorps $inout0,$checksum # accumulate checksum
+ lea 16($out),$out
+ sub \$1,$blocks
+ jz .Locb_dec_done
+
+.Locb_dec_odd:
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ lea 6($block_num),$block_num
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ shl \$4,$i5
+
+ sub \$6,$blocks
+ jc .Locb_dec_short
+ jmp .Locb_dec_grandloop
+
+.align 32
+.Locb_dec_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqu `16*1`($inp),$inout1
+ movdqu `16*2`($inp),$inout2
+ movdqu `16*3`($inp),$inout3
+ movdqu `16*4`($inp),$inout4
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+
+ call __ocb_decrypt6
+
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+ movups $inout4,`16*4`($out)
+ pxor $inout4,$checksum
+ movups $inout5,`16*5`($out)
+ pxor $inout5,$checksum
+ lea `16*6`($out),$out
+ sub \$6,$blocks
+ jnc .Locb_dec_grandloop
+
+.Locb_dec_short:
+ add \$6,$blocks
+ jz .Locb_dec_done
+
+ movdqu `16*0`($inp),$inout0
+ cmp \$2,$blocks
+ jb .Locb_dec_one
+ movdqu `16*1`($inp),$inout1
+ je .Locb_dec_two
+
+ movdqu `16*2`($inp),$inout2
+ cmp \$4,$blocks
+ jb .Locb_dec_three
+ movdqu `16*3`($inp),$inout3
+ je .Locb_dec_four
+
+ movdqu `16*4`($inp),$inout4
+ pxor $inout5,$inout5
+
+ call __ocb_decrypt6
+
+ movdqa @offset[4],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+ movups $inout4,`16*4`($out)
+ pxor $inout4,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_one:
+ movdqa @offset[0],$inout5 # borrow
+
+ call __ocb_decrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_two:
+ pxor $inout2,$inout2
+ pxor $inout3,$inout3
+
+ call __ocb_decrypt4
+
+ movdqa @offset[1],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ xorps $inout1,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_three:
+ pxor $inout3,$inout3
+
+ call __ocb_decrypt4
+
+ movdqa @offset[2],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ xorps $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ xorps $inout2,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_four:
+ call __ocb_decrypt4
+
+ movdqa @offset[3],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+
+.Locb_dec_done:
+ pxor $rndkey0,@offset[5] # "remove" round[last]
+ movdqu $checksum,($checksum_p) # store checksum
+ movdqu @offset[5],($offset_p) # store last offset_i
+
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+ movaps 0x00(%rsp),%xmm6
+ movaps %xmm0,0x00(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ movaps 0x40(%rsp),%xmm10
+ movaps %xmm0,0x40(%rsp)
+ movaps 0x50(%rsp),%xmm11
+ movaps %xmm0,0x50(%rsp)
+ movaps 0x60(%rsp),%xmm12
+ movaps %xmm0,0x60(%rsp)
+ movaps 0x70(%rsp),%xmm13
+ movaps %xmm0,0x70(%rsp)
+ movaps 0x80(%rsp),%xmm14
+ movaps %xmm0,0x80(%rsp)
+ movaps 0x90(%rsp),%xmm15
+ movaps %xmm0,0x90(%rsp)
+ lea 0xa0+0x28(%rsp),%rax
+.Locb_dec_pop:
+ lea 0xa0(%rsp),%rsp
+___
+$code.=<<___;
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+.Locb_dec_epilogue:
+ ret
+.size aesni_ocb_decrypt,.-aesni_ocb_decrypt
+
+.type __ocb_decrypt6,\@abi-omnipotent
+.align 32
+__ocb_decrypt6:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ movdqa @offset[0],@offset[4]
+ pxor @offset[5],@offset[0]
+ movdqu ($L_p,$i5),@offset[5]
+ pxor @offset[0],@offset[1]
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor @offset[2],$inout2
+ pxor @offset[3],@offset[4]
+ pxor @offset[3],$inout3
+ pxor @offset[4],@offset[5]
+ pxor @offset[4],$inout4
+ pxor @offset[5],$inout5
+ $movkey 32($key_),$rndkey0
+
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ add \$6,$block_num
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ aesdec $rndkey1,$inout4
+ pxor $rndkey0l,@offset[3]
+ pxor $rndkey0l,@offset[4]
+ aesdec $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,@offset[5]
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ jmp .Locb_dec_loop6
+
+.align 32
+.Locb_dec_loop6:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop6
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+ shl \$4,$i5
+
+ aesdeclast @offset[0],$inout0
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ mov %r10,%rax # restore twisted rounds
+ aesdeclast @offset[1],$inout1
+ aesdeclast @offset[2],$inout2
+ aesdeclast @offset[3],$inout3
+ aesdeclast @offset[4],$inout4
+ aesdeclast @offset[5],$inout5
+ ret
+.size __ocb_decrypt6,.-__ocb_decrypt6
+
+.type __ocb_decrypt4,\@abi-omnipotent
+.align 32
+__ocb_decrypt4:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ pxor @offset[5],@offset[0]
+ pxor @offset[0],@offset[1]
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor @offset[2],$inout2
+ pxor @offset[3],$inout3
+ $movkey 32($key_),$rndkey0
+
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ pxor $rndkey0l,@offset[3]
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey 48($key_),$rndkey1
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_dec_loop4
+
+.align 32
+.Locb_dec_loop4:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop4
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey 16($key_),$rndkey1
+ mov %r10,%rax # restore twisted rounds
+
+ aesdeclast @offset[0],$inout0
+ aesdeclast @offset[1],$inout1
+ aesdeclast @offset[2],$inout2
+ aesdeclast @offset[3],$inout3
+ ret
+.size __ocb_decrypt4,.-__ocb_decrypt4
+
+.type __ocb_decrypt1,\@abi-omnipotent
+.align 32
+__ocb_decrypt1:
+ pxor @offset[5],$inout5 # offset_i
+ pxor $rndkey0l,$inout5 # offset_i ^ round[0]
+ pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
+ $movkey 32($key_),$rndkey0
+
+ aesdec $rndkey1,$inout0
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,$inout5 # offset_i ^ round[last]
+
+ aesdec $rndkey0,$inout0
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_dec_loop1
+
+.align 32
+.Locb_dec_loop1:
+ aesdec $rndkey1,$inout0
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop1
+
+ aesdec $rndkey1,$inout0
+ $movkey 16($key_),$rndkey1 # redundant in tail
+ mov %r10,%rax # restore twisted rounds
+
+ aesdeclast $inout5,$inout0
+ ret
+.size __ocb_decrypt1,.-__ocb_decrypt1
+___
} }}
########################################################################
@@ -3820,6 +4744,65 @@ ctr_xts_se_handler:
jmp .Lcommon_rbp_tail
.size ctr_xts_se_handler,.-ctr_xts_se_handler
+
+.type ocb_se_handler,\@abi-omnipotent
+.align 16
+ocb_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue lable
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 8(%r11),%r10d # HandlerData[2]
+ lea (%rsi,%r10),%r10
+ cmp %r10,%rbx # context->Rip>=pop label
+ jae .Locb_no_xmm
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea (%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xa0+0x28(%rax),%rax
+
+.Locb_no_xmm:
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+
+ jmp .Lcommon_seh_tail
+.size ocb_se_handler,.-ocb_se_handler
___
$code.=<<___;
.type cbc_se_handler,\@abi-omnipotent
@@ -3933,6 +4916,14 @@ $code.=<<___ if ($PREFIX eq "aesni");
.rva .LSEH_begin_aesni_xts_decrypt
.rva .LSEH_end_aesni_xts_decrypt
.rva .LSEH_info_xts_dec
+
+ .rva .LSEH_begin_aesni_ocb_encrypt
+ .rva .LSEH_end_aesni_ocb_encrypt
+ .rva .LSEH_info_ocb_enc
+
+ .rva .LSEH_begin_aesni_ocb_decrypt
+ .rva .LSEH_end_aesni_ocb_decrypt
+ .rva .LSEH_info_ocb_dec
___
$code.=<<___;
.rva .LSEH_begin_${PREFIX}_cbc_encrypt
@@ -3974,6 +4965,18 @@ $code.=<<___ if ($PREFIX eq "aesni");
.byte 9,0,0,0
.rva ctr_xts_se_handler
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+.LSEH_info_ocb_enc:
+ .byte 9,0,0,0
+ .rva ocb_se_handler
+ .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
+ .rva .Locb_enc_pop
+ .long 0
+.LSEH_info_ocb_dec:
+ .byte 9,0,0,0
+ .rva ocb_se_handler
+ .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[]
+ .rva .Locb_dec_pop
+ .long 0
___
$code.=<<___;
.LSEH_info_cbc:
diff --git a/crypto/evp/e_aes.c b/crypto/evp/e_aes.c
index efa724a..c356c9a 100644
--- a/crypto/evp/e_aes.c
+++ b/crypto/evp/e_aes.c
@@ -461,6 +461,19 @@ static int aesni_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t len);
# ifndef OPENSSL_NO_OCB
+void aesni_ocb_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const void *key,
+ size_t start_block_num,
+ unsigned char offset_i[16],
+ const unsigned char L_[][16],
+ unsigned char checksum[16]);
+void aesni_ocb_decrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const void *key,
+ size_t start_block_num,
+ unsigned char offset_i[16],
+ const unsigned char L_[][16],
+ unsigned char checksum[16]);
+
static int aesni_ocb_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc)
{
@@ -479,7 +492,9 @@ static int aesni_ocb_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
if (!CRYPTO_ocb128_init(&octx->ocb,
&octx->ksenc.ks, &octx->ksdec.ks,
(block128_f) aesni_encrypt,
- (block128_f) aesni_decrypt))
+ (block128_f) aesni_decrypt,
+ enc ? aesni_ocb_encrypt
+ : aesni_ocb_decrypt))
return 0;
}
while (0);
@@ -2348,7 +2363,8 @@ static int aes_ocb_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
if (!CRYPTO_ocb128_init(&octx->ocb,
&octx->ksenc.ks, &octx->ksdec.ks,
(block128_f) vpaes_encrypt,
- (block128_f) vpaes_decrypt))
+ (block128_f) vpaes_decrypt,
+ NULL))
return 0;
break;
}
@@ -2358,7 +2374,8 @@ static int aes_ocb_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
if (!CRYPTO_ocb128_init(&octx->ocb,
&octx->ksenc.ks, &octx->ksdec.ks,
(block128_f) AES_encrypt,
- (block128_f) AES_decrypt))
+ (block128_f) AES_decrypt,
+ NULL))
return 0;
}
while (0);
diff --git a/crypto/modes/modes_lcl.h b/crypto/modes/modes_lcl.h
index 2f61afe..071b014 100644
--- a/crypto/modes/modes_lcl.h
+++ b/crypto/modes/modes_lcl.h
@@ -164,6 +164,7 @@ struct ocb128_context {
block128_f decrypt;
void *keyenc;
void *keydec;
+ ocb128_f stream; /* direction dependent */
/* Key dependent variables. Can be reused if key remains the same */
size_t l_index;
size_t max_l_index;
diff --git a/crypto/modes/ocb128.c b/crypto/modes/ocb128.c
index 3a3f7a8..c3daf7c 100644
--- a/crypto/modes/ocb128.c
+++ b/crypto/modes/ocb128.c
@@ -159,7 +159,7 @@ static OCB_BLOCK *ocb_lookup_l(OCB128_CONTEXT *ctx, size_t idx)
ctx->max_l_index += (idx - ctx->max_l_index + 4) & ~3;
ctx->l =
OPENSSL_realloc(ctx->l, ctx->max_l_index * sizeof(OCB_BLOCK));
- if (!ctx->l)
+ if (ctx->l == NULL)
return NULL;
}
while (l_index < idx) {
@@ -172,34 +172,18 @@ static OCB_BLOCK *ocb_lookup_l(OCB128_CONTEXT *ctx, size_t idx)
}
/*
- * Encrypt a block from |in| and store the result in |out|
- */
-static void ocb_encrypt(OCB128_CONTEXT *ctx, OCB_BLOCK *in, OCB_BLOCK *out,
- void *keyenc)
-{
- ctx->encrypt(in->c, out->c, keyenc);
-}
-
-/*
- * Decrypt a block from |in| and store the result in |out|
- */
-static void ocb_decrypt(OCB128_CONTEXT *ctx, OCB_BLOCK *in, OCB_BLOCK *out,
- void *keydec)
-{
- ctx->decrypt(in->c, out->c, keydec);
-}
-
-/*
* Create a new OCB128_CONTEXT
*/
OCB128_CONTEXT *CRYPTO_ocb128_new(void *keyenc, void *keydec,
- block128_f encrypt, block128_f decrypt)
+ block128_f encrypt, block128_f decrypt,
+ ocb128_f stream)
{
OCB128_CONTEXT *octx;
int ret;
if ((octx = OPENSSL_malloc(sizeof(*octx))) != NULL) {
- ret = CRYPTO_ocb128_init(octx, keyenc, keydec, encrypt, decrypt);
+ ret = CRYPTO_ocb128_init(octx, keyenc, keydec, encrypt, decrypt,
+ stream);
if (ret)
return octx;
OPENSSL_free(octx);
@@ -212,7 +196,8 @@ OCB128_CONTEXT *CRYPTO_ocb128_new(void *keyenc, void *keydec,
* Initialise an existing OCB128_CONTEXT
*/
int CRYPTO_ocb128_init(OCB128_CONTEXT *ctx, void *keyenc, void *keydec,
- block128_f encrypt, block128_f decrypt)
+ block128_f encrypt, block128_f decrypt,
+ ocb128_f stream)
{
memset(ctx, 0, sizeof(*ctx));
ctx->l_index = 0;
@@ -228,11 +213,12 @@ int CRYPTO_ocb128_init(OCB128_CONTEXT *ctx, void *keyenc, void *keydec,
*/
ctx->encrypt = encrypt;
ctx->decrypt = decrypt;
+ ctx->stream = stream;
ctx->keyenc = keyenc;
ctx->keydec = keydec;
/* L_* = ENCIPHER(K, zeros(128)) */
- ocb_encrypt(ctx, &ctx->l_star, &ctx->l_star, ctx->keyenc);
+ ctx->encrypt(ctx->l_star.c, ctx->l_star.c, ctx->keyenc);
/* L_$ = double(L_*) */
ocb_double(&ctx->l_star, &ctx->l_dollar);
@@ -324,11 +310,10 @@ int CRYPTO_ocb128_setiv(OCB128_CONTEXT *ctx, const unsigned char *iv,
int CRYPTO_ocb128_aad(OCB128_CONTEXT *ctx, const unsigned char *aad,
size_t len)
{
- u64 all_num_blocks, num_blocks;
- u64 i;
+ u64 i, all_num_blocks;
+ size_t num_blocks, last_len;
OCB_BLOCK tmp1;
OCB_BLOCK tmp2;
- int last_len;
/* Calculate the number of blocks of AAD provided now, and so far */
num_blocks = len / 16;
@@ -341,14 +326,14 @@ int CRYPTO_ocb128_aad(OCB128_CONTEXT *ctx, const unsigned char *aad,
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
lookup = ocb_lookup_l(ctx, ocb_ntz(i));
- if (!lookup)
+ if (lookup == NULL)
return 0;
ocb_block16_xor(&ctx->offset_aad, lookup, &ctx->offset_aad);
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
aad_block = (OCB_BLOCK *)(aad + ((i - ctx->blocks_hashed - 1) * 16));
ocb_block16_xor(&ctx->offset_aad, aad_block, &tmp1);
- ocb_encrypt(ctx, &tmp1, &tmp2, ctx->keyenc);
+ ctx->encrypt(tmp1.c, tmp2.c, ctx->keyenc);
ocb_block16_xor(&ctx->sum, &tmp2, &ctx->sum);
}
@@ -369,7 +354,7 @@ int CRYPTO_ocb128_aad(OCB128_CONTEXT *ctx, const unsigned char *aad,
ocb_block16_xor(&ctx->offset_aad, &tmp1, &tmp2);
/* Sum = Sum_m xor ENCIPHER(K, CipherInput) */
- ocb_encrypt(ctx, &tmp2, &tmp1, ctx->keyenc);
+ ctx->encrypt(tmp2.c, tmp1.c, ctx->keyenc);
ocb_block16_xor(&ctx->sum, &tmp1, &ctx->sum);
}
@@ -386,12 +371,11 @@ int CRYPTO_ocb128_encrypt(OCB128_CONTEXT *ctx,
const unsigned char *in, unsigned char *out,
size_t len)
{
- u64 i;
- u64 all_num_blocks, num_blocks;
+ u64 i, all_num_blocks;
+ size_t num_blocks, last_len;
OCB_BLOCK tmp1;
OCB_BLOCK tmp2;
OCB_BLOCK pad;
- int last_len;
/*
* Calculate the number of blocks of data to be encrypted provided now, and
@@ -400,28 +384,46 @@ int CRYPTO_ocb128_encrypt(OCB128_CONTEXT *ctx,
num_blocks = len / 16;
all_num_blocks = num_blocks + ctx->blocks_processed;
- /* Loop through all full blocks to be encrypted */
- for (i = ctx->blocks_processed + 1; i <= all_num_blocks; i++) {
- OCB_BLOCK *lookup;
- OCB_BLOCK *inblock;
- OCB_BLOCK *outblock;
+ if (num_blocks && all_num_blocks == (size_t)all_num_blocks
+ && ctx->stream != NULL) {
+ size_t max_idx = 0, top = (size_t)all_num_blocks;
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- lookup = ocb_lookup_l(ctx, ocb_ntz(i));
- if (!lookup)
+ /*
+ * See how many L_{i} entries we need to process data at hand
+ * and pre-compute missing entries in the table [if any]...
+ */
+ while (top >>= 1)
+ max_idx++;
+ if (ocb_lookup_l(ctx, max_idx) == NULL)
return 0;
- ocb_block16_xor(&ctx->offset, lookup, &ctx->offset);
-
- /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
- inblock = (OCB_BLOCK *)(in + ((i - ctx->blocks_processed - 1) * 16));
- ocb_block16_xor_misaligned(&ctx->offset, inblock, &tmp1);
- /* Checksum_i = Checksum_{i-1} xor P_i */
- ocb_block16_xor_misaligned(&ctx->checksum, inblock, &ctx->checksum);
- ocb_encrypt(ctx, &tmp1, &tmp2, ctx->keyenc);
- outblock =
- (OCB_BLOCK *)(out + ((i - ctx->blocks_processed - 1) * 16));
- ocb_block16_xor_misaligned(&ctx->offset, &tmp2, outblock);
+ ctx->stream(in, out, num_blocks, ctx->keyenc,
+ (size_t)ctx->blocks_processed + 1, ctx->offset.c,
+ (const unsigned char (*)[16])ctx->l, ctx->checksum.c);
+ } else {
+ /* Loop through all full blocks to be encrypted */
+ for (i = ctx->blocks_processed + 1; i <= all_num_blocks; i++) {
+ OCB_BLOCK *lookup;
+ OCB_BLOCK *inblock;
+ OCB_BLOCK *outblock;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ lookup = ocb_lookup_l(ctx, ocb_ntz(i));
+ if (lookup == NULL)
+ return 0;
+ ocb_block16_xor(&ctx->offset, lookup, &ctx->offset);
+
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ inblock =
+ (OCB_BLOCK *)(in + ((i - ctx->blocks_processed - 1) * 16));
+ ocb_block16_xor_misaligned(&ctx->offset, inblock, &tmp1);
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ocb_block16_xor_misaligned(&ctx->checksum, inblock, &ctx->checksum);
+ ctx->encrypt(tmp1.c, tmp2.c, ctx->keyenc);
+ outblock =
+ (OCB_BLOCK *)(out + ((i - ctx->blocks_processed - 1) * 16));
+ ocb_block16_xor_misaligned(&ctx->offset, &tmp2, outblock);
+ }
}
/*
@@ -435,7 +437,7 @@ int CRYPTO_ocb128_encrypt(OCB128_CONTEXT *ctx,
ocb_block16_xor(&ctx->offset, &ctx->l_star, &ctx->offset);
/* Pad = ENCIPHER(K, Offset_*) */
- ocb_encrypt(ctx, &ctx->offset, &pad, ctx->keyenc);
+ ctx->encrypt(ctx->offset.c, pad.c, ctx->keyenc);
/* C_* = P_* xor Pad[1..bitlen(P_*)] */
ocb_block_xor(in + (len / 16) * 16, (unsigned char *)&pad, last_len,
@@ -461,12 +463,12 @@ int CRYPTO_ocb128_decrypt(OCB128_CONTEXT *ctx,
const unsigned char *in, unsigned char *out,
size_t len)
{
- u64 i;
- u64 all_num_blocks, num_blocks;
+ u64 i, all_num_blocks;
+ size_t num_blocks, last_len;
OCB_BLOCK tmp1;
OCB_BLOCK tmp2;
OCB_BLOCK pad;
- int last_len;
+
/*
* Calculate the number of blocks of data to be decrypted provided now, and
* so far
@@ -474,27 +476,46 @@ int CRYPTO_ocb128_decrypt(OCB128_CONTEXT *ctx,
num_blocks = len / 16;
all_num_blocks = num_blocks + ctx->blocks_processed;
- /* Loop through all full blocks to be decrypted */
- for (i = ctx->blocks_processed + 1; i <= all_num_blocks; i++) {
- OCB_BLOCK *inblock;
- OCB_BLOCK *outblock;
+ if (num_blocks && all_num_blocks == (size_t)all_num_blocks
+ && ctx->stream != NULL) {
+ size_t max_idx = 0, top = (size_t)all_num_blocks;
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- OCB_BLOCK *lookup = ocb_lookup_l(ctx, ocb_ntz(i));
- if (!lookup)
+ /*
+ * See how many L_{i} entries we need to process data at hand
+ * and pre-compute missing entries in the table [if any]...
+ */
+ while (top >>= 1)
+ max_idx++;
+ if (ocb_lookup_l(ctx, max_idx) == NULL)
return 0;
- ocb_block16_xor(&ctx->offset, lookup, &ctx->offset);
-
- /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
- inblock = (OCB_BLOCK *)(in + ((i - ctx->blocks_processed - 1) * 16));
- ocb_block16_xor_misaligned(&ctx->offset, inblock, &tmp1);
- ocb_decrypt(ctx, &tmp1, &tmp2, ctx->keydec);
- outblock =
- (OCB_BLOCK *)(out + ((i - ctx->blocks_processed - 1) * 16));
- ocb_block16_xor_misaligned(&ctx->offset, &tmp2, outblock);
-
- /* Checksum_i = Checksum_{i-1} xor P_i */
- ocb_block16_xor_misaligned(&ctx->checksum, outblock, &ctx->checksum);
+
+ ctx->stream(in, out, num_blocks, ctx->keydec,
+ (size_t)ctx->blocks_processed + 1, ctx->offset.c,
+ (const unsigned char (*)[16])ctx->l, ctx->checksum.c);
+ } else {
+ /* Loop through all full blocks to be decrypted */
+ for (i = ctx->blocks_processed + 1; i <= all_num_blocks; i++) {
+ OCB_BLOCK *inblock;
+ OCB_BLOCK *outblock;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ OCB_BLOCK *lookup = ocb_lookup_l(ctx, ocb_ntz(i));
+ if (lookup == NULL)
+ return 0;
+ ocb_block16_xor(&ctx->offset, lookup, &ctx->offset);
+
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+ inblock =
+ (OCB_BLOCK *)(in + ((i - ctx->blocks_processed - 1) * 16));
+ ocb_block16_xor_misaligned(&ctx->offset, inblock, &tmp1);
+ ctx->decrypt(tmp1.c, tmp2.c, ctx->keydec);
+ outblock =
+ (OCB_BLOCK *)(out + ((i - ctx->blocks_processed - 1) * 16));
+ ocb_block16_xor_misaligned(&ctx->offset, &tmp2, outblock);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ocb_block16_xor_misaligned(&ctx->checksum, outblock, &ctx->checksum);
+ }
}
/*
@@ -508,7 +529,7 @@ int CRYPTO_ocb128_decrypt(OCB128_CONTEXT *ctx,
ocb_block16_xor(&ctx->offset, &ctx->l_star, &ctx->offset);
/* Pad = ENCIPHER(K, Offset_*) */
- ocb_encrypt(ctx, &ctx->offset, &pad, ctx->keyenc);
+ ctx->encrypt(ctx->offset.c, pad.c, ctx->keyenc);
/* P_* = C_* xor Pad[1..bitlen(C_*)] */
ocb_block_xor(in + (len / 16) * 16, (unsigned char *)&pad, last_len,
@@ -539,7 +560,7 @@ int CRYPTO_ocb128_finish(OCB128_CONTEXT *ctx, const unsigned char *tag,
*/
ocb_block16_xor(&ctx->checksum, &ctx->offset, &tmp1);
ocb_block16_xor(&tmp1, &ctx->l_dollar, &tmp2);
- ocb_encrypt(ctx, &tmp2, &tmp1, ctx->keyenc);
+ ctx->encrypt(tmp2.c, tmp1.c, ctx->keyenc);
ocb_block16_xor(&tmp1, &ctx->sum, &ctx->tag);
if (len > 16 || len < 1) {