From 453060a9062959ceb1522b8b99adeb01b2a3f7b7 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Thu, 8 Dec 2016 16:37:54 +0000 Subject: runtime: copy memory hash code from Go 1.7 Rewrite the AES hashing code from gc assembler to C code using intrinsics. The resulting code generates the same hash code for the same input as the gc code--that doesn't matter as such, but testing it ensures that the C code does something useful. Also change mips64pe32le to mips64p32le in configure script--noticed during CL review. Reviewed-on: https://go-review.googlesource.com/34022 From-SVN: r243445 --- libgo/Makefile.am | 1 + libgo/Makefile.in | 11 +- libgo/configure | 2 +- libgo/configure.ac | 2 +- libgo/go/runtime/alg.go | 45 +++ libgo/go/runtime/hash32.go | 94 +++++++ libgo/go/runtime/hash64.go | 94 +++++++ libgo/go/runtime/os_gccgo.go | 23 ++ libgo/go/runtime/runtime2.go | 12 +- libgo/go/runtime/stubs.go | 6 + libgo/go/runtime/unaligned1.go | 17 ++ libgo/go/runtime/unaligned2.go | 20 ++ libgo/runtime/aeshash.c | 583 +++++++++++++++++++++++++++++++++++++++ libgo/runtime/go-libmain.c | 1 + libgo/runtime/go-main.c | 1 + libgo/runtime/go-type-identity.c | 40 +-- libgo/runtime/go-type.h | 1 - libgo/runtime/proc.c | 3 +- libgo/runtime/runtime.h | 6 + libgo/runtime/runtime_c.c | 19 ++ 20 files changed, 935 insertions(+), 46 deletions(-) create mode 100644 libgo/go/runtime/hash32.go create mode 100644 libgo/go/runtime/hash64.go create mode 100644 libgo/go/runtime/os_gccgo.go create mode 100644 libgo/go/runtime/unaligned1.go create mode 100644 libgo/go/runtime/unaligned2.go create mode 100644 libgo/runtime/aeshash.c (limited to 'libgo') diff --git a/libgo/Makefile.am b/libgo/Makefile.am index 7165dfd..b9aee9d 100644 --- a/libgo/Makefile.am +++ b/libgo/Makefile.am @@ -422,6 +422,7 @@ endif endif runtime_files = \ + runtime/aeshash.c \ runtime/go-assert.c \ runtime/go-breakpoint.c \ runtime/go-caller.c \ diff --git a/libgo/Makefile.in b/libgo/Makefile.in index 9b87db0..86d7aa8 100644 
--- a/libgo/Makefile.in +++ b/libgo/Makefile.in @@ -189,7 +189,7 @@ libgo_llgo_la_DEPENDENCIES = $(am__DEPENDENCIES_4) @LIBGO_IS_DARWIN_TRUE@@LIBGO_IS_LINUX_FALSE@am__objects_4 = \ @LIBGO_IS_DARWIN_TRUE@@LIBGO_IS_LINUX_FALSE@ getncpu-bsd.lo @LIBGO_IS_LINUX_TRUE@am__objects_4 = getncpu-linux.lo -am__objects_5 = go-assert.lo go-breakpoint.lo go-caller.lo \ +am__objects_5 = aeshash.lo go-assert.lo go-breakpoint.lo go-caller.lo \ go-callers.lo go-cdiv.lo go-cgo.lo go-construct-map.lo \ go-ffi.lo go-fieldtrack.lo go-matherr.lo go-memclr.lo \ go-memcmp.lo go-memequal.lo go-memmove.lo go-nanotime.lo \ @@ -767,6 +767,7 @@ toolexeclibgounicode_DATA = \ @LIBGO_IS_DARWIN_TRUE@@LIBGO_IS_LINUX_FALSE@runtime_getncpu_file = runtime/getncpu-bsd.c @LIBGO_IS_LINUX_TRUE@runtime_getncpu_file = runtime/getncpu-linux.c runtime_files = \ + runtime/aeshash.c \ runtime/go-assert.c \ runtime/go-breakpoint.c \ runtime/go-caller.c \ @@ -1446,6 +1447,7 @@ mostlyclean-compile: distclean-compile: -rm -f *.tab.c +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/aeshash.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/env_posix.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getncpu-bsd.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getncpu-irix.Plo@am__quote@ @@ -1573,6 +1575,13 @@ libgolibbegin_a-go-libmain.obj: runtime/go-libmain.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libgolibbegin_a_CFLAGS) $(CFLAGS) -c -o libgolibbegin_a-go-libmain.obj `if test -f 'runtime/go-libmain.c'; then $(CYGPATH_W) 'runtime/go-libmain.c'; else $(CYGPATH_W) '$(srcdir)/runtime/go-libmain.c'; fi` +aeshash.lo: runtime/aeshash.c +@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT aeshash.lo 
-MD -MP -MF $(DEPDIR)/aeshash.Tpo -c -o aeshash.lo `test -f 'runtime/aeshash.c' || echo '$(srcdir)/'`runtime/aeshash.c +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/aeshash.Tpo $(DEPDIR)/aeshash.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='runtime/aeshash.c' object='aeshash.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o aeshash.lo `test -f 'runtime/aeshash.c' || echo '$(srcdir)/'`runtime/aeshash.c + go-assert.lo: runtime/go-assert.c @am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT go-assert.lo -MD -MP -MF $(DEPDIR)/go-assert.Tpo -c -o go-assert.lo `test -f 'runtime/go-assert.c' || echo '$(srcdir)/'`runtime/go-assert.c @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/go-assert.Tpo $(DEPDIR)/go-assert.Plo diff --git a/libgo/configure b/libgo/configure index 9eac5c0..7789c120 100755 --- a/libgo/configure +++ b/libgo/configure @@ -13624,7 +13624,7 @@ esac # supported by the gofrontend and all architectures supported by the # gc toolchain. # N.B. Keep in sync with gcc/testsuite/go.test/go-test.exp (go-set-goarch). -ALLGOARCH="386 alpha amd64 amd64p32 arm armbe arm64 arm64be ia64 m68k mipso32 mipsn32 mipso64 mipsn64 mips mipsle mips64 mips64le mips64p32 mips64pe32le ppc ppc64 ppc64le s390 s390x sparc sparc64" +ALLGOARCH="386 alpha amd64 amd64p32 arm armbe arm64 arm64be ia64 m68k mipso32 mipsn32 mipso64 mipsn64 mips mipsle mips64 mips64le mips64p32 mips64p32le ppc ppc64 ppc64le s390 s390x sparc sparc64" # All known GOARCH_FAMILY values. 
ALLGOARCHFAMILY="I386 ALPHA AMD64 ARM ARM64 IA64 M68K MIPS MIPS64 PPC PPC64 S390 S390X SPARC SPARC64" diff --git a/libgo/configure.ac b/libgo/configure.ac index 9e76540..77a744e 100644 --- a/libgo/configure.ac +++ b/libgo/configure.ac @@ -197,7 +197,7 @@ AC_SUBST(USE_DEJAGNU) # supported by the gofrontend and all architectures supported by the # gc toolchain. # N.B. Keep in sync with gcc/testsuite/go.test/go-test.exp (go-set-goarch). -ALLGOARCH="386 alpha amd64 amd64p32 arm armbe arm64 arm64be ia64 m68k mipso32 mipsn32 mipso64 mipsn64 mips mipsle mips64 mips64le mips64p32 mips64pe32le ppc ppc64 ppc64le s390 s390x sparc sparc64" +ALLGOARCH="386 alpha amd64 amd64p32 arm armbe arm64 arm64be ia64 m68k mipso32 mipsn32 mipso64 mipsn64 mips mipsle mips64 mips64le mips64p32 mips64p32le ppc ppc64 ppc64le s390 s390x sparc sparc64" # All known GOARCH_FAMILY values. ALLGOARCHFAMILY="I386 ALPHA AMD64 ARM ARM64 IA64 M68K MIPS MIPS64 PPC PPC64 S390 S390X SPARC SPARC64" diff --git a/libgo/go/runtime/alg.go b/libgo/go/runtime/alg.go index 8f7c3c0..5ec19d0 100644 --- a/libgo/go/runtime/alg.go +++ b/libgo/go/runtime/alg.go @@ -23,12 +23,29 @@ import ( //go:linkname efacevaleq runtime.efacevaleq //go:linkname eqstring runtime.eqstring //go:linkname cmpstring runtime.cmpstring +// +// Temporary to be called from C code. 
+//go:linkname alginit runtime.alginit const ( c0 = uintptr((8-sys.PtrSize)/4*2860486313 + (sys.PtrSize-4)/4*33054211828000289) c1 = uintptr((8-sys.PtrSize)/4*3267000013 + (sys.PtrSize-4)/4*23344194077549503) ) +var useAeshash bool + +// in C code +func aeshashbody(p unsafe.Pointer, h, s uintptr, sched []byte) uintptr + +func aeshash(p unsafe.Pointer, h, s uintptr) uintptr { + return aeshashbody(p, h, s, aeskeysched[:]) +} + +func aeshashstr(p unsafe.Pointer, h uintptr) uintptr { + ps := (*stringStruct)(p) + return aeshashbody(unsafe.Pointer(ps.str), h, uintptr(ps.len), aeskeysched[:]) +} + func interhash(p unsafe.Pointer, h uintptr, size uintptr) uintptr { a := (*iface)(p) tab := a.tab @@ -198,7 +215,35 @@ func cmpstring(x, y string) int { // Force the creation of function descriptors for equality and hash // functions. These will be referenced directly by the compiler. +var _ = memhash var _ = interhash var _ = interequal var _ = nilinterhash var _ = nilinterequal + +const hashRandomBytes = sys.PtrSize / 4 * 64 + +// used in asm_{386,amd64}.s to seed the hash function +var aeskeysched [hashRandomBytes]byte + +// used in hash{32,64}.go to seed the hash function +var hashkey [4]uintptr + +func alginit() { + // Install aes hash algorithm if we have the instructions we need + if (GOARCH == "386" || GOARCH == "amd64") && + GOOS != "nacl" && + cpuid_ecx&(1<<25) != 0 && // aes (aesenc) + cpuid_ecx&(1<<9) != 0 && // sse3 (pshufb) + cpuid_ecx&(1<<19) != 0 { // sse4.1 (pinsr{d,q}) + useAeshash = true + // Initialize with random data so hash collisions will be hard to engineer. 
+ getRandomData(aeskeysched[:]) + return + } + getRandomData((*[len(hashkey) * sys.PtrSize]byte)(unsafe.Pointer(&hashkey))[:]) + hashkey[0] |= 1 // make sure these numbers are odd + hashkey[1] |= 1 + hashkey[2] |= 1 + hashkey[3] |= 1 +} diff --git a/libgo/go/runtime/hash32.go b/libgo/go/runtime/hash32.go new file mode 100644 index 0000000..cfb3a58 --- /dev/null +++ b/libgo/go/runtime/hash32.go @@ -0,0 +1,94 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Hashing algorithm inspired by +// xxhash: https://code.google.com/p/xxhash/ +// cityhash: https://code.google.com/p/cityhash/ + +// +build 386 arm armbe m68k mipso32 mipsn32 mips mipsle ppc s390 sparc + +package runtime + +import "unsafe" + +// For gccgo, use go:linkname to rename compiler-called functions to +// themselves, so that the compiler will export them. +// +//go:linkname memhash runtime.memhash + +const ( + // Constants for multiplication: four random odd 32-bit numbers. 
+ m1 = 3168982561 + m2 = 3339683297 + m3 = 832293441 + m4 = 2336365089 +) + +func memhash(p unsafe.Pointer, seed, s uintptr) uintptr { + if GOARCH == "386" && GOOS != "nacl" && useAeshash { + return aeshash(p, seed, s) + } + h := uint32(seed + s*hashkey[0]) +tail: + switch { + case s == 0: + case s < 4: + h ^= uint32(*(*byte)(p)) + h ^= uint32(*(*byte)(add(p, s>>1))) << 8 + h ^= uint32(*(*byte)(add(p, s-1))) << 16 + h = rotl_15(h*m1) * m2 + case s == 4: + h ^= readUnaligned32(p) + h = rotl_15(h*m1) * m2 + case s <= 8: + h ^= readUnaligned32(p) + h = rotl_15(h*m1) * m2 + h ^= readUnaligned32(add(p, s-4)) + h = rotl_15(h*m1) * m2 + case s <= 16: + h ^= readUnaligned32(p) + h = rotl_15(h*m1) * m2 + h ^= readUnaligned32(add(p, 4)) + h = rotl_15(h*m1) * m2 + h ^= readUnaligned32(add(p, s-8)) + h = rotl_15(h*m1) * m2 + h ^= readUnaligned32(add(p, s-4)) + h = rotl_15(h*m1) * m2 + default: + v1 := h + v2 := uint32(seed * hashkey[1]) + v3 := uint32(seed * hashkey[2]) + v4 := uint32(seed * hashkey[3]) + for s >= 16 { + v1 ^= readUnaligned32(p) + v1 = rotl_15(v1*m1) * m2 + p = add(p, 4) + v2 ^= readUnaligned32(p) + v2 = rotl_15(v2*m2) * m3 + p = add(p, 4) + v3 ^= readUnaligned32(p) + v3 = rotl_15(v3*m3) * m4 + p = add(p, 4) + v4 ^= readUnaligned32(p) + v4 = rotl_15(v4*m4) * m1 + p = add(p, 4) + s -= 16 + } + h = v1 ^ v2 ^ v3 ^ v4 + goto tail + } + h ^= h >> 17 + h *= m3 + h ^= h >> 13 + h *= m4 + h ^= h >> 16 + return uintptr(h) +} + +// Note: in order to get the compiler to issue rotl instructions, we +// need to constant fold the shift amount by hand. +// TODO: convince the compiler to issue rotl instructions after inlining. +func rotl_15(x uint32) uint32 { + return (x << 15) | (x >> (32 - 15)) +} diff --git a/libgo/go/runtime/hash64.go b/libgo/go/runtime/hash64.go new file mode 100644 index 0000000..551d5b5 --- /dev/null +++ b/libgo/go/runtime/hash64.go @@ -0,0 +1,94 @@ +// Copyright 2014 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Hashing algorithm inspired by +// xxhash: https://code.google.com/p/xxhash/ +// cityhash: https://code.google.com/p/cityhash/ + +// +build amd64 amd64p32 arm64 mips64 mips64le ppc64 ppc64le s390x alpha arm64be ia64 mipso64 mipsn64 mips64p32 mips64p32le sparc64 + +package runtime + +import "unsafe" + +// For gccgo, use go:linkname to rename compiler-called functions to +// themselves, so that the compiler will export them. +// +//go:linkname memhash runtime.memhash + +const ( + // Constants for multiplication: four random odd 64-bit numbers. + m1 = 16877499708836156737 + m2 = 2820277070424839065 + m3 = 9497967016996688599 + m4 = 15839092249703872147 +) + +func memhash(p unsafe.Pointer, seed, s uintptr) uintptr { + if GOARCH == "amd64" && GOOS != "nacl" && useAeshash { + return aeshash(p, seed, s) + } + h := uint64(seed + s*hashkey[0]) +tail: + switch { + case s == 0: + case s < 4: + h ^= uint64(*(*byte)(p)) + h ^= uint64(*(*byte)(add(p, s>>1))) << 8 + h ^= uint64(*(*byte)(add(p, s-1))) << 16 + h = rotl_31(h*m1) * m2 + case s <= 8: + h ^= uint64(readUnaligned32(p)) + h ^= uint64(readUnaligned32(add(p, s-4))) << 32 + h = rotl_31(h*m1) * m2 + case s <= 16: + h ^= readUnaligned64(p) + h = rotl_31(h*m1) * m2 + h ^= readUnaligned64(add(p, s-8)) + h = rotl_31(h*m1) * m2 + case s <= 32: + h ^= readUnaligned64(p) + h = rotl_31(h*m1) * m2 + h ^= readUnaligned64(add(p, 8)) + h = rotl_31(h*m1) * m2 + h ^= readUnaligned64(add(p, s-16)) + h = rotl_31(h*m1) * m2 + h ^= readUnaligned64(add(p, s-8)) + h = rotl_31(h*m1) * m2 + default: + v1 := h + v2 := uint64(seed * hashkey[1]) + v3 := uint64(seed * hashkey[2]) + v4 := uint64(seed * hashkey[3]) + for s >= 32 { + v1 ^= readUnaligned64(p) + v1 = rotl_31(v1*m1) * m2 + p = add(p, 8) + v2 ^= readUnaligned64(p) + v2 = rotl_31(v2*m2) * m3 + p = add(p, 8) + v3 ^= readUnaligned64(p) + v3 = rotl_31(v3*m3) * m4 + p = add(p, 8) + v4 
^= readUnaligned64(p) + v4 = rotl_31(v4*m4) * m1 + p = add(p, 8) + s -= 32 + } + h = v1 ^ v2 ^ v3 ^ v4 + goto tail + } + + h ^= h >> 29 + h *= m3 + h ^= h >> 32 + return uintptr(h) +} + +// Note: in order to get the compiler to issue rotl instructions, we +// need to constant fold the shift amount by hand. +// TODO: convince the compiler to issue rotl instructions after inlining. +func rotl_31(x uint64) uint64 { + return (x << 31) | (x >> (64 - 31)) +} diff --git a/libgo/go/runtime/os_gccgo.go b/libgo/go/runtime/os_gccgo.go new file mode 100644 index 0000000..4609432 --- /dev/null +++ b/libgo/go/runtime/os_gccgo.go @@ -0,0 +1,23 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "unsafe" +) + +var urandom_dev = []byte("/dev/urandom\x00") + +func getRandomData(r []byte) { + if startupRandomData != nil { + n := copy(r, startupRandomData) + extendRandom(r, n) + return + } + fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0) + n := read(fd, unsafe.Pointer(&r[0]), int32(len(r))) + closefd(fd) + extendRandom(r, int(n)) +} diff --git a/libgo/go/runtime/runtime2.go b/libgo/go/runtime/runtime2.go index c8db7ad..4712318 100644 --- a/libgo/go/runtime/runtime2.go +++ b/libgo/go/runtime/runtime2.go @@ -5,6 +5,7 @@ package runtime import ( + "runtime/internal/sys" "unsafe" ) @@ -668,7 +669,6 @@ type forcegcstate struct { // the ELF AT_RANDOM auxiliary vector (vdso_linux_amd64.go or os_linux_386.go). var startupRandomData []byte -/* // extendRandom extends the random numbers in r[:n] to the whole slice r. // Treats n<0 as n==0. func extendRandom(r []byte, n int) { @@ -689,7 +689,6 @@ func extendRandom(r []byte, n int) { } } } -*/ // deferred subroutine calls // This is the gccgo version. @@ -770,11 +769,12 @@ var ( sched schedt -// newprocs int32 + // newprocs int32 + + // Information about what cpu features are available. 
+ // Set on startup. + cpuid_ecx uint32 -// Information about what cpu features are available. -// Set on startup in asm_{x86,amd64}.s. -// cpuid_ecx uint32 // cpuid_edx uint32 // cpuid_ebx7 uint32 // lfenceBeforeRdtsc bool diff --git a/libgo/go/runtime/stubs.go b/libgo/go/runtime/stubs.go index b2f1829..b4fee6b 100644 --- a/libgo/go/runtime/stubs.go +++ b/libgo/go/runtime/stubs.go @@ -248,6 +248,12 @@ func funcPC(f interface{}) uintptr { return **(**uintptr)(i.data) } +// For gccgo, to communicate from the C code to the Go code. +//go:linkname setCpuidECX runtime.setCpuidECX +func setCpuidECX(v uint32) { + cpuid_ecx = v +} + // typedmemmove copies a typed value. // For gccgo for now. //go:nosplit diff --git a/libgo/go/runtime/unaligned1.go b/libgo/go/runtime/unaligned1.go new file mode 100644 index 0000000..c94f19e --- /dev/null +++ b/libgo/go/runtime/unaligned1.go @@ -0,0 +1,17 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build 386 amd64 amd64p32 arm64 ppc64 ppc64le s390x ppc s390 arm64be + +package runtime + +import "unsafe" + +func readUnaligned32(p unsafe.Pointer) uint32 { + return *(*uint32)(p) +} + +func readUnaligned64(p unsafe.Pointer) uint64 { + return *(*uint64)(p) +} diff --git a/libgo/go/runtime/unaligned2.go b/libgo/go/runtime/unaligned2.go new file mode 100644 index 0000000..e52d6ce --- /dev/null +++ b/libgo/go/runtime/unaligned2.go @@ -0,0 +1,20 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build arm mips64 mips64le armbe m68k mipso32 mipsn32 mips mipsle sparc alpha ia64 mipso64 mipsn64 mips64p32 mips64p32le sparc64 + +package runtime + +import "unsafe" + +// Note: These routines perform the read with an unspecified endianness. 
+func readUnaligned32(p unsafe.Pointer) uint32 { + q := (*[4]byte)(p) + return uint32(q[0]) + uint32(q[1])<<8 + uint32(q[2])<<16 + uint32(q[3])<<24 +} + +func readUnaligned64(p unsafe.Pointer) uint64 { + q := (*[8]byte)(p) + return uint64(q[0]) + uint64(q[1])<<8 + uint64(q[2])<<16 + uint64(q[3])<<24 + uint64(q[4])<<32 + uint64(q[5])<<40 + uint64(q[6])<<48 + uint64(q[7])<<56 +} diff --git a/libgo/runtime/aeshash.c b/libgo/runtime/aeshash.c new file mode 100644 index 0000000..faa90e0 --- /dev/null +++ b/libgo/runtime/aeshash.c @@ -0,0 +1,583 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Hash code using AES intrinsics. + +#include "runtime.h" + +uintptr aeshashbody(void*, uintptr, uintptr, Slice) + __asm__(GOSYM_PREFIX "runtime.aeshashbody"); + +uintptr aeshashbody(void*, uintptr, uintptr, Slice) + __attribute__((no_split_stack)); + +#if defined(__i386__) || defined(__x86_64__) + +#include <emmintrin.h> +#include <tmmintrin.h> +#include <wmmintrin.h> + +// Force appropriate CPU level. We won't call here unless the CPU +// supports it. + +#pragma GCC target("ssse3", "aes") + +#ifdef __x86_64__ + +// aeshashbody implements a hash function using AES instructions +// available in recent x86 processors. Note this is not encryption, +// just hashing. +// +// This is written to produce exactly the same results as the gc +// implementation, not because that matters, but just to ensure that +// this does something reasonable. +uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) { + __m128i mseed, mseed2, mseed3, mseed4, mseed5, mseed6, mseed7, mseed8; + __m128i mval, mval2, mval3, mval4, mval5, mval6, mval7, mval8; + + // Start with hash seed. + mseed = _mm_cvtsi64_si128(seed); + // Get 16 bits of length. + mseed = _mm_insert_epi16(mseed, size, 4); + // Repeat length 4 times total. + mseed = _mm_shufflehi_epi16(mseed, 0); + // Save unscrambled seed. 
+ mseed2 = mseed; + // XOR in per-process seed. + mseed ^= _mm_loadu_si128(aeskeysched.__values); + // Scramble seed. + mseed = _mm_aesenc_si128(mseed, mseed); + + if (size <= 16) { + if (size == 0) { + // Return scrambled input seed. + return _mm_cvtsi128_si64(_mm_aesenc_si128(mseed, mseed)); + } else if (size < 16) { + if ((((uintptr)(p) + 16) & 0xff0) != 0) { + static const uint64 masks[32] + __attribute__ ((aligned(16))) = + { + 0x0000000000000000, 0x0000000000000000, + 0x00000000000000ff, 0x0000000000000000, + 0x000000000000ffff, 0x0000000000000000, + 0x0000000000ffffff, 0x0000000000000000, + 0x00000000ffffffff, 0x0000000000000000, + 0x000000ffffffffff, 0x0000000000000000, + 0x0000ffffffffffff, 0x0000000000000000, + 0x00ffffffffffffff, 0x0000000000000000, + 0xffffffffffffffff, 0x0000000000000000, + 0xffffffffffffffff, 0x00000000000000ff, + 0xffffffffffffffff, 0x000000000000ffff, + 0xffffffffffffffff, 0x0000000000ffffff, + 0xffffffffffffffff, 0x00000000ffffffff, + 0xffffffffffffffff, 0x000000ffffffffff, + 0xffffffffffffffff, 0x0000ffffffffffff, + 0xffffffffffffffff, 0x00ffffffffffffff + }; + + // 16 bytes loaded at p won't cross a page + // boundary, so we can load directly. 
+ mval = _mm_loadu_si128(p); + mval &= *(const __m128i*)(&masks[size*2]); + } else { + static const uint64 shifts[32] + __attribute__ ((aligned(16))) = + { + 0x0000000000000000, 0x0000000000000000, + 0xffffffffffffff0f, 0xffffffffffffffff, + 0xffffffffffff0f0e, 0xffffffffffffffff, + 0xffffffffff0f0e0d, 0xffffffffffffffff, + 0xffffffff0f0e0d0c, 0xffffffffffffffff, + 0xffffff0f0e0d0c0b, 0xffffffffffffffff, + 0xffff0f0e0d0c0b0a, 0xffffffffffffffff, + 0xff0f0e0d0c0b0a09, 0xffffffffffffffff, + 0x0f0e0d0c0b0a0908, 0xffffffffffffffff, + 0x0e0d0c0b0a090807, 0xffffffffffffff0f, + 0x0d0c0b0a09080706, 0xffffffffffff0f0e, + 0x0c0b0a0908070605, 0xffffffffff0f0e0d, + 0x0b0a090807060504, 0xffffffff0f0e0d0c, + 0x0a09080706050403, 0xffffff0f0e0d0c0b, + 0x0908070605040302, 0xffff0f0e0d0c0b0a, + 0x0807060504030201, 0xff0f0e0d0c0b0a09, + }; + + // address ends in 1111xxxx. Might be + // up against a page boundary, so load + // ending at last byte. Then shift + // bytes down using pshufb. + mval = _mm_loadu_si128((void*)((char*)p - 16 + size)); + mval = _mm_shuffle_epi8(mval, *(const __m128i*)(&shifts[size*2])); + } + } else { + mval = _mm_loadu_si128(p); + } + + // XOR data with seed. + mval ^= mseed; + // Scramble combo 3 times. + mval = _mm_aesenc_si128(mval, mval); + mval = _mm_aesenc_si128(mval, mval); + mval = _mm_aesenc_si128(mval, mval); + return _mm_cvtsi128_si64(mval); + } else if (size <= 32) { + // Make second starting seed. + mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16)); + mseed2 = _mm_aesenc_si128(mseed2, mseed2); + // Load data to be hashed. + mval = _mm_loadu_si128(p); + mval2 = _mm_loadu_si128((void*)((char*)p + size - 16)); + // XOR with seed. + mval ^= mseed; + mval2 ^= mseed2; + // Scramble 3 times. 
+ mval = _mm_aesenc_si128(mval, mval); + mval2 = _mm_aesenc_si128(mval2, mval2); + mval = _mm_aesenc_si128(mval, mval); + mval2 = _mm_aesenc_si128(mval2, mval2); + mval = _mm_aesenc_si128(mval, mval); + mval2 = _mm_aesenc_si128(mval2, mval2); + // Combine results. + mval ^= mval2; + return _mm_cvtsi128_si64(mval); + } else if (size <= 64) { + // Make 3 more starting seeds. + mseed3 = mseed2; + mseed4 = mseed2; + mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16)); + mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32)); + mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48)); + mseed2 = _mm_aesenc_si128(mseed2, mseed2); + mseed3 = _mm_aesenc_si128(mseed3, mseed3); + mseed4 = _mm_aesenc_si128(mseed4, mseed4); + + mval = _mm_loadu_si128(p); + mval2 = _mm_loadu_si128((void*)((char*)p + 16)); + mval3 = _mm_loadu_si128((void*)((char*)p + size - 32)); + mval4 = _mm_loadu_si128((void*)((char*)p + size - 16)); + + mval ^= mseed; + mval2 ^= mseed2; + mval3 ^= mseed3; + mval4 ^= mseed4; + + mval = _mm_aesenc_si128(mval, mval); + mval2 = _mm_aesenc_si128(mval2, mval2); + mval3 = _mm_aesenc_si128(mval3, mval3); + mval4 = _mm_aesenc_si128(mval4, mval4); + + mval = _mm_aesenc_si128(mval, mval); + mval2 = _mm_aesenc_si128(mval2, mval2); + mval3 = _mm_aesenc_si128(mval3, mval3); + mval4 = _mm_aesenc_si128(mval4, mval4); + + mval = _mm_aesenc_si128(mval, mval); + mval2 = _mm_aesenc_si128(mval2, mval2); + mval3 = _mm_aesenc_si128(mval3, mval3); + mval4 = _mm_aesenc_si128(mval4, mval4); + + mval ^= mval3; + mval2 ^= mval4; + mval ^= mval2; + return _mm_cvtsi128_si64(mval); + } else if (size <= 128) { + // Make 7 more starting seeds. 
+ mseed3 = mseed2; + mseed4 = mseed2; + mseed5 = mseed2; + mseed6 = mseed2; + mseed7 = mseed2; + mseed8 = mseed2; + mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16)); + mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32)); + mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48)); + mseed5 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 64)); + mseed6 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 80)); + mseed7 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 96)); + mseed8 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 112)); + mseed2 = _mm_aesenc_si128(mseed2, mseed2); + mseed3 = _mm_aesenc_si128(mseed3, mseed3); + mseed4 = _mm_aesenc_si128(mseed4, mseed4); + mseed5 = _mm_aesenc_si128(mseed5, mseed5); + mseed6 = _mm_aesenc_si128(mseed6, mseed6); + mseed7 = _mm_aesenc_si128(mseed7, mseed7); + mseed8 = _mm_aesenc_si128(mseed8, mseed8); + + // Load data. + mval = _mm_loadu_si128(p); + mval2 = _mm_loadu_si128((void*)((char*)p + 16)); + mval3 = _mm_loadu_si128((void*)((char*)p + 32)); + mval4 = _mm_loadu_si128((void*)((char*)p + 48)); + mval5 = _mm_loadu_si128((void*)((char*)p + size - 64)); + mval6 = _mm_loadu_si128((void*)((char*)p + size - 48)); + mval7 = _mm_loadu_si128((void*)((char*)p + size - 32)); + mval8 = _mm_loadu_si128((void*)((char*)p + size - 16)); + + // XOR with seed. + mval ^= mseed; + mval2 ^= mseed2; + mval3 ^= mseed3; + mval4 ^= mseed4; + mval5 ^= mseed5; + mval6 ^= mseed6; + mval7 ^= mseed7; + mval8 ^= mseed8; + + // Scramble 3 times. 
+ mval = _mm_aesenc_si128(mval, mval); + mval2 = _mm_aesenc_si128(mval2, mval2); + mval3 = _mm_aesenc_si128(mval3, mval3); + mval4 = _mm_aesenc_si128(mval4, mval4); + mval5 = _mm_aesenc_si128(mval5, mval5); + mval6 = _mm_aesenc_si128(mval6, mval6); + mval7 = _mm_aesenc_si128(mval7, mval7); + mval8 = _mm_aesenc_si128(mval8, mval8); + + mval = _mm_aesenc_si128(mval, mval); + mval2 = _mm_aesenc_si128(mval2, mval2); + mval3 = _mm_aesenc_si128(mval3, mval3); + mval4 = _mm_aesenc_si128(mval4, mval4); + mval5 = _mm_aesenc_si128(mval5, mval5); + mval6 = _mm_aesenc_si128(mval6, mval6); + mval7 = _mm_aesenc_si128(mval7, mval7); + mval8 = _mm_aesenc_si128(mval8, mval8); + + mval = _mm_aesenc_si128(mval, mval); + mval2 = _mm_aesenc_si128(mval2, mval2); + mval3 = _mm_aesenc_si128(mval3, mval3); + mval4 = _mm_aesenc_si128(mval4, mval4); + mval5 = _mm_aesenc_si128(mval5, mval5); + mval6 = _mm_aesenc_si128(mval6, mval6); + mval7 = _mm_aesenc_si128(mval7, mval7); + mval8 = _mm_aesenc_si128(mval8, mval8); + + // Combine results. + mval ^= mval5; + mval2 ^= mval6; + mval3 ^= mval7; + mval4 ^= mval8; + mval ^= mval3; + mval2 ^= mval4; + mval ^= mval2; + return _mm_cvtsi128_si64(mval); + } else { + // Make 7 more starting seeds. 
+ mseed3 = mseed2; + mseed4 = mseed2; + mseed5 = mseed2; + mseed6 = mseed2; + mseed7 = mseed2; + mseed8 = mseed2; + mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16)); + mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32)); + mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48)); + mseed5 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 64)); + mseed6 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 80)); + mseed7 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 96)); + mseed8 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 112)); + mseed2 = _mm_aesenc_si128(mseed2, mseed2); + mseed3 = _mm_aesenc_si128(mseed3, mseed3); + mseed4 = _mm_aesenc_si128(mseed4, mseed4); + mseed5 = _mm_aesenc_si128(mseed5, mseed5); + mseed6 = _mm_aesenc_si128(mseed6, mseed6); + mseed7 = _mm_aesenc_si128(mseed7, mseed7); + mseed8 = _mm_aesenc_si128(mseed8, mseed8); + + // Start with last (possibly overlapping) block. + mval = _mm_loadu_si128((void*)((char*)p + size - 128)); + mval2 = _mm_loadu_si128((void*)((char*)p + size - 112)); + mval3 = _mm_loadu_si128((void*)((char*)p + size - 96)); + mval4 = _mm_loadu_si128((void*)((char*)p + size - 80)); + mval5 = _mm_loadu_si128((void*)((char*)p + size - 64)); + mval6 = _mm_loadu_si128((void*)((char*)p + size - 48)); + mval7 = _mm_loadu_si128((void*)((char*)p + size - 32)); + mval8 = _mm_loadu_si128((void*)((char*)p + size - 16)); + + // XOR in seed. + mval ^= mseed; + mval2 ^= mseed2; + mval3 ^= mseed3; + mval4 ^= mseed4; + mval5 ^= mseed5; + mval6 ^= mseed6; + mval7 ^= mseed7; + mval8 ^= mseed8; + + // Compute number of remaining 128-byte blocks. + size--; + size >>= 7; + do { + // Scramble state. 
+ mval = _mm_aesenc_si128(mval, mval); + mval2 = _mm_aesenc_si128(mval2, mval2); + mval3 = _mm_aesenc_si128(mval3, mval3); + mval4 = _mm_aesenc_si128(mval4, mval4); + mval5 = _mm_aesenc_si128(mval5, mval5); + mval6 = _mm_aesenc_si128(mval6, mval6); + mval7 = _mm_aesenc_si128(mval7, mval7); + mval8 = _mm_aesenc_si128(mval8, mval8); + + // Scramble state, XOR in a block. + mval = _mm_aesenc_si128(mval, _mm_loadu_si128(p)); + mval2 = _mm_aesenc_si128(mval2, _mm_loadu_si128((void*)((char*)p + 16))); + mval3 = _mm_aesenc_si128(mval3, _mm_loadu_si128((void*)((char*)p + 32))); + mval4 = _mm_aesenc_si128(mval4, _mm_loadu_si128((void*)((char*)p + 48))); + mval5 = _mm_aesenc_si128(mval5, _mm_loadu_si128((void*)((char*)p + 64))); + mval6 = _mm_aesenc_si128(mval6, _mm_loadu_si128((void*)((char*)p + 80))); + mval7 = _mm_aesenc_si128(mval7, _mm_loadu_si128((void*)((char*)p + 96))); + mval8 = _mm_aesenc_si128(mval8, _mm_loadu_si128((void*)((char*)p + 112))); + + p = (void*)((char*)p + 128); + } while (--size > 0); + + // 3 more scrambles to finish. 
+		mval = _mm_aesenc_si128(mval, mval);
+		mval2 = _mm_aesenc_si128(mval2, mval2);
+		mval3 = _mm_aesenc_si128(mval3, mval3);
+		mval4 = _mm_aesenc_si128(mval4, mval4);
+		mval5 = _mm_aesenc_si128(mval5, mval5);
+		mval6 = _mm_aesenc_si128(mval6, mval6);
+		mval7 = _mm_aesenc_si128(mval7, mval7);
+		mval8 = _mm_aesenc_si128(mval8, mval8);
+		mval = _mm_aesenc_si128(mval, mval);
+		mval2 = _mm_aesenc_si128(mval2, mval2);
+		mval3 = _mm_aesenc_si128(mval3, mval3);
+		mval4 = _mm_aesenc_si128(mval4, mval4);
+		mval5 = _mm_aesenc_si128(mval5, mval5);
+		mval6 = _mm_aesenc_si128(mval6, mval6);
+		mval7 = _mm_aesenc_si128(mval7, mval7);
+		mval8 = _mm_aesenc_si128(mval8, mval8);
+		mval = _mm_aesenc_si128(mval, mval);
+		mval2 = _mm_aesenc_si128(mval2, mval2);
+		mval3 = _mm_aesenc_si128(mval3, mval3);
+		mval4 = _mm_aesenc_si128(mval4, mval4);
+		mval5 = _mm_aesenc_si128(mval5, mval5);
+		mval6 = _mm_aesenc_si128(mval6, mval6);
+		mval7 = _mm_aesenc_si128(mval7, mval7);
+		mval8 = _mm_aesenc_si128(mval8, mval8);
+
+		mval ^= mval5;
+		mval2 ^= mval6;
+		mval3 ^= mval7;
+		mval4 ^= mval8;
+		mval ^= mval3;
+		mval2 ^= mval4;
+		mval ^= mval2;
+		return _mm_cvtsi128_si64(mval);
+	}
+}
+
+#else // !defined(__x86_64__)
+
+// The 32-bit version of aeshashbody.
+//
+// Hashes the size bytes at p with AES rounds and returns the low 32
+// bits of the final 128-bit state.  seed is the caller's hash seed.
+// aeskeysched.__values supplies the per-process key material: bytes
+// 0-15 are always read, bytes 16-31 when size > 16, and bytes 32-63
+// when size > 32.  The input is handled in size classes 0, 1-15, 16,
+// 17-32, 33-64 and 65+.
+
+uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
+	__m128i mseed, mseed2, mseed3, mseed4;
+	__m128i mval, mval2, mval3, mval4;
+
+	// Start with hash seed.
+	mseed = _mm_cvtsi32_si128(seed);
+	// Get 16 bits of length.
+	mseed = _mm_insert_epi16(mseed, size, 4);
+	// Replace size with its low 2 bytes repeated 4 times.
+	mseed = _mm_shufflehi_epi16(mseed, 0);
+	// Save unscrambled seed.
+	mseed2 = mseed;
+	// XOR in per-process seed.
+	mseed ^= _mm_loadu_si128(aeskeysched.__values);
+	// Scramble seed.
+	mseed = _mm_aesenc_si128(mseed, mseed);
+
+	if (size <= 16) {
+		if (size == 0) {
+			// Return scrambled input seed.
+			return _mm_cvtsi128_si32(_mm_aesenc_si128(mseed, mseed));
+		} else if (size < 16) {
+			// (p + 16) & 0xff0 is zero only when bits 4-11 of p
+			// are all ones, i.e. p lies in the last 16 bytes
+			// before a 4096-byte boundary.
+			if ((((uintptr)(p) + 16) & 0xff0) != 0) {
+				// masks[2*n] and masks[2*n+1] form a 128-bit
+				// mask that keeps only the low n bytes.
+				static const uint64 masks[32]
+				  __attribute__ ((aligned(16))) =
+					{
+						0x0000000000000000, 0x0000000000000000,
+						0x00000000000000ff, 0x0000000000000000,
+						0x000000000000ffff, 0x0000000000000000,
+						0x0000000000ffffff, 0x0000000000000000,
+						0x00000000ffffffff, 0x0000000000000000,
+						0x000000ffffffffff, 0x0000000000000000,
+						0x0000ffffffffffff, 0x0000000000000000,
+						0x00ffffffffffffff, 0x0000000000000000,
+						0xffffffffffffffff, 0x0000000000000000,
+						0xffffffffffffffff, 0x00000000000000ff,
+						0xffffffffffffffff, 0x000000000000ffff,
+						0xffffffffffffffff, 0x0000000000ffffff,
+						0xffffffffffffffff, 0x00000000ffffffff,
+						0xffffffffffffffff, 0x000000ffffffffff,
+						0xffffffffffffffff, 0x0000ffffffffffff,
+						0xffffffffffffffff, 0x00ffffffffffffff
+					};
+
+				// 16 bytes loaded at p won't cross a page
+				// boundary, so we can load it directly.
+				// The mask then clears the bytes past size.
+				mval = _mm_loadu_si128(p);
+				mval &= *(const __m128i*)(&masks[size*2]);
+			} else {
+				// shifts[2*n] and shifts[2*n+1] form a pshufb
+				// control that moves the top n bytes of the
+				// register to the bottom; the 0xff control
+				// bytes zero the remaining output bytes.
+				static const uint64 shifts[32]
+				  __attribute__ ((aligned(16))) =
+					{
+						0x0000000000000000, 0x0000000000000000,
+						0xffffffffffffff0f, 0xffffffffffffffff,
+						0xffffffffffff0f0e, 0xffffffffffffffff,
+						0xffffffffff0f0e0d, 0xffffffffffffffff,
+						0xffffffff0f0e0d0c, 0xffffffffffffffff,
+						0xffffff0f0e0d0c0b, 0xffffffffffffffff,
+						0xffff0f0e0d0c0b0a, 0xffffffffffffffff,
+						0xff0f0e0d0c0b0a09, 0xffffffffffffffff,
+						0x0f0e0d0c0b0a0908, 0xffffffffffffffff,
+						0x0e0d0c0b0a090807, 0xffffffffffffff0f,
+						0x0d0c0b0a09080706, 0xffffffffffff0f0e,
+						0x0c0b0a0908070605, 0xffffffffff0f0e0d,
+						0x0b0a090807060504, 0xffffffff0f0e0d0c,
+						0x0a09080706050403, 0xffffff0f0e0d0c0b,
+						0x0908070605040302, 0xffff0f0e0d0c0b0a,
+						0x0807060504030201, 0xff0f0e0d0c0b0a09,
+					};
+
+				// Bits 4-11 of the address are all ones.
+				// Might be up against a page boundary, so
+				// load ending at last byte. Then shift
+				// bytes down using pshufb.
+				mval = _mm_loadu_si128((void*)((char*)p - 16 + size));
+				mval = _mm_shuffle_epi8(mval, *(const __m128i*)(&shifts[size*2]));
+			}
+		} else {
+			mval = _mm_loadu_si128(p);
+		}
+
+		// Scramble input, XOR in seed.
+		mval = _mm_aesenc_si128(mval, mseed);
+		mval = _mm_aesenc_si128(mval, mval);
+		mval = _mm_aesenc_si128(mval, mval);
+		return _mm_cvtsi128_si32(mval);
+	} else if (size <= 32) {
+		// Make second starting seed.
+		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+		// Load data to be hashed.  The second load ends at the
+		// last input byte, so it may overlap the first.
+		mval = _mm_loadu_si128(p);
+		mval2 = _mm_loadu_si128((void*)((char*)p + size - 16));
+
+		// Scramble 3 times.
+		mval = _mm_aesenc_si128(mval, mseed);
+		mval2 = _mm_aesenc_si128(mval2, mseed2);
+		mval = _mm_aesenc_si128(mval, mval);
+		mval2 = _mm_aesenc_si128(mval2, mval2);
+		mval = _mm_aesenc_si128(mval, mval);
+		mval2 = _mm_aesenc_si128(mval2, mval2);
+
+		// Combine results.
+		mval ^= mval2;
+		return _mm_cvtsi128_si32(mval);
+	} else if (size <= 64) {
+		// Make 3 more starting seeds.
+		mseed3 = mseed2;
+		mseed4 = mseed2;
+		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
+		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
+		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
+		mseed4 = _mm_aesenc_si128(mseed4, mseed4);
+
+		// First 32 bytes and last 32 bytes; the two halves may
+		// overlap when size < 64.
+		mval = _mm_loadu_si128(p);
+		mval2 = _mm_loadu_si128((void*)((char*)p + 16));
+		mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
+		mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));
+
+		mval = _mm_aesenc_si128(mval, mseed);
+		mval2 = _mm_aesenc_si128(mval2, mseed2);
+		mval3 = _mm_aesenc_si128(mval3, mseed3);
+		mval4 = _mm_aesenc_si128(mval4, mseed4);
+
+		mval = _mm_aesenc_si128(mval, mval);
+		mval2 = _mm_aesenc_si128(mval2, mval2);
+		mval3 = _mm_aesenc_si128(mval3, mval3);
+		mval4 = _mm_aesenc_si128(mval4, mval4);
+
+		mval = _mm_aesenc_si128(mval, mval);
+		mval2 = _mm_aesenc_si128(mval2, mval2);
+		mval3 = _mm_aesenc_si128(mval3, mval3);
+		mval4 = _mm_aesenc_si128(mval4, mval4);
+
+		mval ^= mval3;
+		mval2 ^= mval4;
+		mval ^= mval2;
+		return _mm_cvtsi128_si32(mval);
+	} else {
+		// Make 3 more starting seeds.
+		mseed3 = mseed2;
+		mseed4 = mseed2;
+		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
+		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
+		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
+		mseed4 = _mm_aesenc_si128(mseed4, mseed4);
+
+		// Start with last (possibly overlapping) block.
+		mval = _mm_loadu_si128((void*)((char*)p + size - 64));
+		mval2 = _mm_loadu_si128((void*)((char*)p + size - 48));
+		mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
+		mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));
+
+		// Scramble state once.
+		mval = _mm_aesenc_si128(mval, mseed);
+		mval2 = _mm_aesenc_si128(mval2, mseed2);
+		mval3 = _mm_aesenc_si128(mval3, mseed3);
+		mval4 = _mm_aesenc_si128(mval4, mseed4);
+
+		// Compute number of remaining 64-byte blocks.
+		// (size - 1) / 64 is at least 1 here because size > 64;
+		// the trailing 64 bytes were already loaded above.
+		size--;
+		size >>= 6;
+		do {
+			// Scramble state, XOR in a block.
+			mval = _mm_aesenc_si128(mval, _mm_loadu_si128(p));
+			mval2 = _mm_aesenc_si128(mval2, _mm_loadu_si128((void*)((char*)p + 16)));
+			mval3 = _mm_aesenc_si128(mval3, _mm_loadu_si128((void*)((char*)p + 32)));
+			mval4 = _mm_aesenc_si128(mval4, _mm_loadu_si128((void*)((char*)p + 48)));
+
+			// Scramble state.
+			mval = _mm_aesenc_si128(mval, mval);
+			mval2 = _mm_aesenc_si128(mval2, mval2);
+			mval3 = _mm_aesenc_si128(mval3, mval3);
+			mval4 = _mm_aesenc_si128(mval4, mval4);
+
+			p = (void*)((char*)p + 64);
+		} while (--size > 0);
+
+		// 2 more scrambles to finish.
+		mval = _mm_aesenc_si128(mval, mval);
+		mval2 = _mm_aesenc_si128(mval2, mval2);
+		mval3 = _mm_aesenc_si128(mval3, mval3);
+		mval4 = _mm_aesenc_si128(mval4, mval4);
+
+		mval = _mm_aesenc_si128(mval, mval);
+		mval2 = _mm_aesenc_si128(mval2, mval2);
+		mval3 = _mm_aesenc_si128(mval3, mval3);
+		mval4 = _mm_aesenc_si128(mval4, mval4);
+
+		mval ^= mval3;
+		mval2 ^= mval4;
+		mval ^= mval2;
+		return _mm_cvtsi128_si32(mval);
+	}
+}
+
+#endif // !defined(__x86_64__)
+
+#else // !defined(__i386__) && !defined(__x86_64__)
+
+// Stub for targets without the x86 AES instructions.  The parameters
+// are marked unused so that the stub does not trigger
+// -Wunused-parameter.
+
+uintptr aeshashbody(void* p __attribute__((unused)),
+		    uintptr seed __attribute__((unused)),
+		    uintptr size __attribute__((unused)),
+		    Slice aeskeysched __attribute__((unused))) {
+	// We should never get here on a non-x86 system.
+	runtime_throw("impossible call to aeshashbody");
+}
+
+#endif // !defined(__i386__) && !defined(__x86_64__)
diff --git a/libgo/runtime/go-libmain.c b/libgo/runtime/go-libmain.c
index 6884f3a..c62ad93 100644
--- a/libgo/runtime/go-libmain.c
+++ b/libgo/runtime/go-libmain.c
@@ -61,6 +61,7 @@ initfn (int argc, char **argv, char** env __attribute__ ((unused)))
 
   runtime_isarchive = true;
 
+  runtime_cpuinit ();
   runtime_initsig(true);
 
   a = (struct args *) malloc (sizeof *a);
diff --git a/libgo/runtime/go-main.c b/libgo/runtime/go-main.c
index ff2958c..622a77d 100644
--- a/libgo/runtime/go-main.c
+++ b/libgo/runtime/go-main.c
@@ -47,6 +47,7 @@ main (int argc, char **argv)
   runtime_isstarted = true;
 
   __go_end = (uintptr)_end;
+  runtime_cpuinit ();
   runtime_check ();
   runtime_args (argc, (byte **) argv);
   runtime_osinit ();
diff --git a/libgo/runtime/go-type-identity.c b/libgo/runtime/go-type-identity.c
index d58aa75..842fa24 100644
--- a/libgo/runtime/go-type-identity.c
+++ b/libgo/runtime/go-type-identity.c
@@ -9,44 +9,14 @@
 #include "runtime.h"
 #include "go-type.h"
 
-/* An identity hash function for a type.  This is used for types where
-   we can simply use the type value itself as a hash code.  This is
-   true of, e.g., integers and pointers.  */
+/* The hash function for types that can be compared as identity is
+   written in Go (runtime.memhash).  */
 
-uintptr_t
-__go_type_hash_identity (const void *key, uintptr_t seed, uintptr_t key_size)
-{
-  uintptr_t ret;
-  uintptr_t i;
-  const unsigned char *p;
-
-  if (key_size <= 8)
-    {
-      union
-      {
-	uint64 v;
-	unsigned char a[8];
-      } u;
-      u.v = 0;
-#ifdef WORDS_BIGENDIAN
-      __builtin_memcpy (&u.a[8 - key_size], key, key_size);
-#else
-      __builtin_memcpy (&u.a[0], key, key_size);
-#endif
-      if (sizeof (uintptr_t) >= 8)
-	return (uintptr_t) u.v ^ seed;
-      else
-	return (uintptr_t) ((u.v >> 32) ^ (u.v & 0xffffffff)) ^ seed;
-    }
-
-  ret = seed;
-  for (i = 0, p = (const unsigned char *) key; i < key_size; i++, p++)
-    ret = ret * 33 + *p;
-  return ret;
-}
+extern uintptr runtime_memhash(void *, uintptr, uintptr)
+  __asm__ (GOSYM_PREFIX "runtime.memhash");
 
 const FuncVal __go_type_hash_identity_descriptor =
-  { (void *) __go_type_hash_identity };
+  { (void *) runtime_memhash };
 
 /* An identity equality function for a type.  This is used for types
    where we can check for equality by checking that the values have
diff --git a/libgo/runtime/go-type.h b/libgo/runtime/go-type.h
index 7c3149b..2d5965c 100644
--- a/libgo/runtime/go-type.h
+++ b/libgo/runtime/go-type.h
@@ -362,7 +362,6 @@ extern _Bool
 __go_type_descriptors_equal(const struct __go_type_descriptor*,
 			    const struct __go_type_descriptor*);
 
-extern uintptr_t __go_type_hash_identity (const void *, uintptr_t, uintptr_t);
 extern const FuncVal __go_type_hash_identity_descriptor;
 extern _Bool __go_type_equal_identity (const void *, const void *, uintptr_t);
 extern const FuncVal __go_type_equal_identity_descriptor;
diff --git a/libgo/runtime/proc.c b/libgo/runtime/proc.c
index dd5562b..be7e083 100644
--- a/libgo/runtime/proc.c
+++ b/libgo/runtime/proc.c
@@ -455,7 +455,8 @@ runtime_schedinit(void)
 	//	runtime_symtabinit();
 	runtime_mallocinit();
 	mcommoninit(m);
-	
+	runtime_alginit(); // maps must not be used before this call
+
 	// Initialize the itable value for newErrorCString,
 	// so that the next time it gets called, possibly
 	// in a
fault during a garbage collection, it will not
diff --git a/libgo/runtime/runtime.h b/libgo/runtime/runtime.h
index f793fea..424b429 100644
--- a/libgo/runtime/runtime.h
+++ b/libgo/runtime/runtime.h
@@ -265,6 +265,8 @@ struct __go_func_type;
 void	runtime_args(int32, byte**)
   __asm__ (GOSYM_PREFIX "runtime.args");
 void	runtime_osinit();
+void	runtime_alginit(void)
+  __asm__ (GOSYM_PREFIX "runtime.alginit");
 void	runtime_goargs(void)
   __asm__ (GOSYM_PREFIX "runtime.goargs");
 void	runtime_goenvs(void);
@@ -592,3 +594,7 @@ extern void *getitab(const struct __go_type_descriptor *,
 		     const struct __go_type_descriptor *,
 		     _Bool)
   __asm__ (GOSYM_PREFIX "runtime.getitab");
+
+extern void runtime_cpuinit(void);
+extern void setCpuidECX(uint32)
+  __asm__ (GOSYM_PREFIX "runtime.setCpuidECX");
diff --git a/libgo/runtime/runtime_c.c b/libgo/runtime/runtime_c.c
index 16be089..3387401 100644
--- a/libgo/runtime/runtime_c.c
+++ b/libgo/runtime/runtime_c.c
@@ -6,6 +6,10 @@
 #include <signal.h>
 #include <unistd.h>
 
+#if defined(__i386__) || defined(__x86_64__)
+#include <cpuid.h>
+#endif
+
 #include "config.h"
 
 #include "runtime.h"
@@ -204,3 +208,18 @@ go_errno()
 {
 	return (intgo)errno;
 }
+
+// CPU-specific initialization.
+// On x86, pass the ECX value returned by CPUID leaf 1 to setCpuidECX.
+
+void
+runtime_cpuinit()
+{
+#if defined(__i386__) || defined(__x86_64__)
+	unsigned int eax, ebx, ecx, edx;
+
+	if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
+		setCpuidECX(ecx);
+	}
+#endif
+}
-- 
cgit v1.1