543 lines
12 KiB
Go
543 lines
12 KiB
Go
package xxh3
|
|
|
|
// avx512Switch is the size at which the avx512 code is used.
|
|
// Bigger blocks benefit more.
|
|
const avx512Switch = 1 << 10
|
|
|
|
func accumScalar(accs *[8]u64, p, secret ptr, l u64) {
|
|
if secret != key {
|
|
accumScalarSeed(accs, p, secret, l)
|
|
return
|
|
}
|
|
for l > _block {
|
|
k := secret
|
|
|
|
// accs
|
|
for i := 0; i < 16; i++ {
|
|
dv0 := readU64(p, 8*0)
|
|
dk0 := dv0 ^ readU64(k, 8*0)
|
|
accs[1] += dv0
|
|
accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
|
|
|
|
dv1 := readU64(p, 8*1)
|
|
dk1 := dv1 ^ readU64(k, 8*1)
|
|
accs[0] += dv1
|
|
accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
|
|
|
|
dv2 := readU64(p, 8*2)
|
|
dk2 := dv2 ^ readU64(k, 8*2)
|
|
accs[3] += dv2
|
|
accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
|
|
|
|
dv3 := readU64(p, 8*3)
|
|
dk3 := dv3 ^ readU64(k, 8*3)
|
|
accs[2] += dv3
|
|
accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
|
|
|
|
dv4 := readU64(p, 8*4)
|
|
dk4 := dv4 ^ readU64(k, 8*4)
|
|
accs[5] += dv4
|
|
accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
|
|
|
|
dv5 := readU64(p, 8*5)
|
|
dk5 := dv5 ^ readU64(k, 8*5)
|
|
accs[4] += dv5
|
|
accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
|
|
|
|
dv6 := readU64(p, 8*6)
|
|
dk6 := dv6 ^ readU64(k, 8*6)
|
|
accs[7] += dv6
|
|
accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
|
|
|
|
dv7 := readU64(p, 8*7)
|
|
dk7 := dv7 ^ readU64(k, 8*7)
|
|
accs[6] += dv7
|
|
accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
|
|
|
|
l -= _stripe
|
|
if l > 0 {
|
|
p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
|
|
}
|
|
}
|
|
|
|
// scramble accs
|
|
accs[0] ^= accs[0] >> 47
|
|
accs[0] ^= key64_128
|
|
accs[0] *= prime32_1
|
|
|
|
accs[1] ^= accs[1] >> 47
|
|
accs[1] ^= key64_136
|
|
accs[1] *= prime32_1
|
|
|
|
accs[2] ^= accs[2] >> 47
|
|
accs[2] ^= key64_144
|
|
accs[2] *= prime32_1
|
|
|
|
accs[3] ^= accs[3] >> 47
|
|
accs[3] ^= key64_152
|
|
accs[3] *= prime32_1
|
|
|
|
accs[4] ^= accs[4] >> 47
|
|
accs[4] ^= key64_160
|
|
accs[4] *= prime32_1
|
|
|
|
accs[5] ^= accs[5] >> 47
|
|
accs[5] ^= key64_168
|
|
accs[5] *= prime32_1
|
|
|
|
accs[6] ^= accs[6] >> 47
|
|
accs[6] ^= key64_176
|
|
accs[6] *= prime32_1
|
|
|
|
accs[7] ^= accs[7] >> 47
|
|
accs[7] ^= key64_184
|
|
accs[7] *= prime32_1
|
|
}
|
|
|
|
if l > 0 {
|
|
t, k := (l-1)/_stripe, secret
|
|
|
|
for i := u64(0); i < t; i++ {
|
|
dv0 := readU64(p, 8*0)
|
|
dk0 := dv0 ^ readU64(k, 8*0)
|
|
accs[1] += dv0
|
|
accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
|
|
|
|
dv1 := readU64(p, 8*1)
|
|
dk1 := dv1 ^ readU64(k, 8*1)
|
|
accs[0] += dv1
|
|
accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
|
|
|
|
dv2 := readU64(p, 8*2)
|
|
dk2 := dv2 ^ readU64(k, 8*2)
|
|
accs[3] += dv2
|
|
accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
|
|
|
|
dv3 := readU64(p, 8*3)
|
|
dk3 := dv3 ^ readU64(k, 8*3)
|
|
accs[2] += dv3
|
|
accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
|
|
|
|
dv4 := readU64(p, 8*4)
|
|
dk4 := dv4 ^ readU64(k, 8*4)
|
|
accs[5] += dv4
|
|
accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
|
|
|
|
dv5 := readU64(p, 8*5)
|
|
dk5 := dv5 ^ readU64(k, 8*5)
|
|
accs[4] += dv5
|
|
accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
|
|
|
|
dv6 := readU64(p, 8*6)
|
|
dk6 := dv6 ^ readU64(k, 8*6)
|
|
accs[7] += dv6
|
|
accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
|
|
|
|
dv7 := readU64(p, 8*7)
|
|
dk7 := dv7 ^ readU64(k, 8*7)
|
|
accs[6] += dv7
|
|
accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
|
|
|
|
l -= _stripe
|
|
if l > 0 {
|
|
p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
|
|
}
|
|
}
|
|
|
|
if l > 0 {
|
|
p = ptr(ui(p) - uintptr(_stripe-l))
|
|
|
|
dv0 := readU64(p, 8*0)
|
|
dk0 := dv0 ^ key64_121
|
|
accs[1] += dv0
|
|
accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
|
|
|
|
dv1 := readU64(p, 8*1)
|
|
dk1 := dv1 ^ key64_129
|
|
accs[0] += dv1
|
|
accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
|
|
|
|
dv2 := readU64(p, 8*2)
|
|
dk2 := dv2 ^ key64_137
|
|
accs[3] += dv2
|
|
accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
|
|
|
|
dv3 := readU64(p, 8*3)
|
|
dk3 := dv3 ^ key64_145
|
|
accs[2] += dv3
|
|
accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
|
|
|
|
dv4 := readU64(p, 8*4)
|
|
dk4 := dv4 ^ key64_153
|
|
accs[5] += dv4
|
|
accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
|
|
|
|
dv5 := readU64(p, 8*5)
|
|
dk5 := dv5 ^ key64_161
|
|
accs[4] += dv5
|
|
accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
|
|
|
|
dv6 := readU64(p, 8*6)
|
|
dk6 := dv6 ^ key64_169
|
|
accs[7] += dv6
|
|
accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
|
|
|
|
dv7 := readU64(p, 8*7)
|
|
dk7 := dv7 ^ key64_177
|
|
accs[6] += dv7
|
|
accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
|
|
}
|
|
}
|
|
}
|
|
|
|
func accumBlockScalar(accs *[8]u64, p, secret ptr) {
|
|
if secret != key {
|
|
accumBlockScalarSeed(accs, p, secret)
|
|
return
|
|
}
|
|
// accs
|
|
for i := 0; i < 16; i++ {
|
|
dv0 := readU64(p, 8*0)
|
|
dk0 := dv0 ^ readU64(secret, 8*0)
|
|
accs[1] += dv0
|
|
accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
|
|
|
|
dv1 := readU64(p, 8*1)
|
|
dk1 := dv1 ^ readU64(secret, 8*1)
|
|
accs[0] += dv1
|
|
accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
|
|
|
|
dv2 := readU64(p, 8*2)
|
|
dk2 := dv2 ^ readU64(secret, 8*2)
|
|
accs[3] += dv2
|
|
accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
|
|
|
|
dv3 := readU64(p, 8*3)
|
|
dk3 := dv3 ^ readU64(secret, 8*3)
|
|
accs[2] += dv3
|
|
accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
|
|
|
|
dv4 := readU64(p, 8*4)
|
|
dk4 := dv4 ^ readU64(secret, 8*4)
|
|
accs[5] += dv4
|
|
accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
|
|
|
|
dv5 := readU64(p, 8*5)
|
|
dk5 := dv5 ^ readU64(secret, 8*5)
|
|
accs[4] += dv5
|
|
accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
|
|
|
|
dv6 := readU64(p, 8*6)
|
|
dk6 := dv6 ^ readU64(secret, 8*6)
|
|
accs[7] += dv6
|
|
accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
|
|
|
|
dv7 := readU64(p, 8*7)
|
|
dk7 := dv7 ^ readU64(secret, 8*7)
|
|
accs[6] += dv7
|
|
accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
|
|
|
|
p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8)
|
|
}
|
|
|
|
// scramble accs
|
|
accs[0] ^= accs[0] >> 47
|
|
accs[0] ^= key64_128
|
|
accs[0] *= prime32_1
|
|
|
|
accs[1] ^= accs[1] >> 47
|
|
accs[1] ^= key64_136
|
|
accs[1] *= prime32_1
|
|
|
|
accs[2] ^= accs[2] >> 47
|
|
accs[2] ^= key64_144
|
|
accs[2] *= prime32_1
|
|
|
|
accs[3] ^= accs[3] >> 47
|
|
accs[3] ^= key64_152
|
|
accs[3] *= prime32_1
|
|
|
|
accs[4] ^= accs[4] >> 47
|
|
accs[4] ^= key64_160
|
|
accs[4] *= prime32_1
|
|
|
|
accs[5] ^= accs[5] >> 47
|
|
accs[5] ^= key64_168
|
|
accs[5] *= prime32_1
|
|
|
|
accs[6] ^= accs[6] >> 47
|
|
accs[6] ^= key64_176
|
|
accs[6] *= prime32_1
|
|
|
|
accs[7] ^= accs[7] >> 47
|
|
accs[7] ^= key64_184
|
|
accs[7] *= prime32_1
|
|
}
|
|
|
|
// accumScalarSeed should be used with custom key.
|
|
func accumScalarSeed(accs *[8]u64, p, secret ptr, l u64) {
|
|
for l > _block {
|
|
k := secret
|
|
|
|
// accs
|
|
for i := 0; i < 16; i++ {
|
|
dv0 := readU64(p, 8*0)
|
|
dk0 := dv0 ^ readU64(k, 8*0)
|
|
accs[1] += dv0
|
|
accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
|
|
|
|
dv1 := readU64(p, 8*1)
|
|
dk1 := dv1 ^ readU64(k, 8*1)
|
|
accs[0] += dv1
|
|
accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
|
|
|
|
dv2 := readU64(p, 8*2)
|
|
dk2 := dv2 ^ readU64(k, 8*2)
|
|
accs[3] += dv2
|
|
accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
|
|
|
|
dv3 := readU64(p, 8*3)
|
|
dk3 := dv3 ^ readU64(k, 8*3)
|
|
accs[2] += dv3
|
|
accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
|
|
|
|
dv4 := readU64(p, 8*4)
|
|
dk4 := dv4 ^ readU64(k, 8*4)
|
|
accs[5] += dv4
|
|
accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
|
|
|
|
dv5 := readU64(p, 8*5)
|
|
dk5 := dv5 ^ readU64(k, 8*5)
|
|
accs[4] += dv5
|
|
accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
|
|
|
|
dv6 := readU64(p, 8*6)
|
|
dk6 := dv6 ^ readU64(k, 8*6)
|
|
accs[7] += dv6
|
|
accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
|
|
|
|
dv7 := readU64(p, 8*7)
|
|
dk7 := dv7 ^ readU64(k, 8*7)
|
|
accs[6] += dv7
|
|
accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
|
|
|
|
l -= _stripe
|
|
if l > 0 {
|
|
p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
|
|
}
|
|
}
|
|
|
|
// scramble accs
|
|
accs[0] ^= accs[0] >> 47
|
|
accs[0] ^= readU64(secret, 128)
|
|
accs[0] *= prime32_1
|
|
|
|
accs[1] ^= accs[1] >> 47
|
|
accs[1] ^= readU64(secret, 136)
|
|
accs[1] *= prime32_1
|
|
|
|
accs[2] ^= accs[2] >> 47
|
|
accs[2] ^= readU64(secret, 144)
|
|
accs[2] *= prime32_1
|
|
|
|
accs[3] ^= accs[3] >> 47
|
|
accs[3] ^= readU64(secret, 152)
|
|
accs[3] *= prime32_1
|
|
|
|
accs[4] ^= accs[4] >> 47
|
|
accs[4] ^= readU64(secret, 160)
|
|
accs[4] *= prime32_1
|
|
|
|
accs[5] ^= accs[5] >> 47
|
|
accs[5] ^= readU64(secret, 168)
|
|
accs[5] *= prime32_1
|
|
|
|
accs[6] ^= accs[6] >> 47
|
|
accs[6] ^= readU64(secret, 176)
|
|
accs[6] *= prime32_1
|
|
|
|
accs[7] ^= accs[7] >> 47
|
|
accs[7] ^= readU64(secret, 184)
|
|
accs[7] *= prime32_1
|
|
}
|
|
|
|
if l > 0 {
|
|
t, k := (l-1)/_stripe, secret
|
|
|
|
for i := u64(0); i < t; i++ {
|
|
dv0 := readU64(p, 8*0)
|
|
dk0 := dv0 ^ readU64(k, 8*0)
|
|
accs[1] += dv0
|
|
accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
|
|
|
|
dv1 := readU64(p, 8*1)
|
|
dk1 := dv1 ^ readU64(k, 8*1)
|
|
accs[0] += dv1
|
|
accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
|
|
|
|
dv2 := readU64(p, 8*2)
|
|
dk2 := dv2 ^ readU64(k, 8*2)
|
|
accs[3] += dv2
|
|
accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
|
|
|
|
dv3 := readU64(p, 8*3)
|
|
dk3 := dv3 ^ readU64(k, 8*3)
|
|
accs[2] += dv3
|
|
accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
|
|
|
|
dv4 := readU64(p, 8*4)
|
|
dk4 := dv4 ^ readU64(k, 8*4)
|
|
accs[5] += dv4
|
|
accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
|
|
|
|
dv5 := readU64(p, 8*5)
|
|
dk5 := dv5 ^ readU64(k, 8*5)
|
|
accs[4] += dv5
|
|
accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
|
|
|
|
dv6 := readU64(p, 8*6)
|
|
dk6 := dv6 ^ readU64(k, 8*6)
|
|
accs[7] += dv6
|
|
accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
|
|
|
|
dv7 := readU64(p, 8*7)
|
|
dk7 := dv7 ^ readU64(k, 8*7)
|
|
accs[6] += dv7
|
|
accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
|
|
|
|
l -= _stripe
|
|
if l > 0 {
|
|
p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
|
|
}
|
|
}
|
|
|
|
if l > 0 {
|
|
p = ptr(ui(p) - uintptr(_stripe-l))
|
|
|
|
dv0 := readU64(p, 8*0)
|
|
dk0 := dv0 ^ readU64(secret, 121)
|
|
accs[1] += dv0
|
|
accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
|
|
|
|
dv1 := readU64(p, 8*1)
|
|
dk1 := dv1 ^ readU64(secret, 129)
|
|
accs[0] += dv1
|
|
accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
|
|
|
|
dv2 := readU64(p, 8*2)
|
|
dk2 := dv2 ^ readU64(secret, 137)
|
|
accs[3] += dv2
|
|
accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
|
|
|
|
dv3 := readU64(p, 8*3)
|
|
dk3 := dv3 ^ readU64(secret, 145)
|
|
accs[2] += dv3
|
|
accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
|
|
|
|
dv4 := readU64(p, 8*4)
|
|
dk4 := dv4 ^ readU64(secret, 153)
|
|
accs[5] += dv4
|
|
accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
|
|
|
|
dv5 := readU64(p, 8*5)
|
|
dk5 := dv5 ^ readU64(secret, 161)
|
|
accs[4] += dv5
|
|
accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
|
|
|
|
dv6 := readU64(p, 8*6)
|
|
dk6 := dv6 ^ readU64(secret, 169)
|
|
accs[7] += dv6
|
|
accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
|
|
|
|
dv7 := readU64(p, 8*7)
|
|
dk7 := dv7 ^ readU64(secret, 177)
|
|
accs[6] += dv7
|
|
accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
|
|
}
|
|
}
|
|
}
|
|
|
|
// accumBlockScalarSeed should be used with custom key.
|
|
func accumBlockScalarSeed(accs *[8]u64, p, secret ptr) {
|
|
// accs
|
|
{
|
|
secret := secret
|
|
for i := 0; i < 16; i++ {
|
|
dv0 := readU64(p, 8*0)
|
|
dk0 := dv0 ^ readU64(secret, 8*0)
|
|
accs[1] += dv0
|
|
accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
|
|
|
|
dv1 := readU64(p, 8*1)
|
|
dk1 := dv1 ^ readU64(secret, 8*1)
|
|
accs[0] += dv1
|
|
accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
|
|
|
|
dv2 := readU64(p, 8*2)
|
|
dk2 := dv2 ^ readU64(secret, 8*2)
|
|
accs[3] += dv2
|
|
accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
|
|
|
|
dv3 := readU64(p, 8*3)
|
|
dk3 := dv3 ^ readU64(secret, 8*3)
|
|
accs[2] += dv3
|
|
accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
|
|
|
|
dv4 := readU64(p, 8*4)
|
|
dk4 := dv4 ^ readU64(secret, 8*4)
|
|
accs[5] += dv4
|
|
accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
|
|
|
|
dv5 := readU64(p, 8*5)
|
|
dk5 := dv5 ^ readU64(secret, 8*5)
|
|
accs[4] += dv5
|
|
accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
|
|
|
|
dv6 := readU64(p, 8*6)
|
|
dk6 := dv6 ^ readU64(secret, 8*6)
|
|
accs[7] += dv6
|
|
accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
|
|
|
|
dv7 := readU64(p, 8*7)
|
|
dk7 := dv7 ^ readU64(secret, 8*7)
|
|
accs[6] += dv7
|
|
accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
|
|
|
|
p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8)
|
|
}
|
|
}
|
|
|
|
// scramble accs
|
|
accs[0] ^= accs[0] >> 47
|
|
accs[0] ^= readU64(secret, 128)
|
|
accs[0] *= prime32_1
|
|
|
|
accs[1] ^= accs[1] >> 47
|
|
accs[1] ^= readU64(secret, 136)
|
|
accs[1] *= prime32_1
|
|
|
|
accs[2] ^= accs[2] >> 47
|
|
accs[2] ^= readU64(secret, 144)
|
|
accs[2] *= prime32_1
|
|
|
|
accs[3] ^= accs[3] >> 47
|
|
accs[3] ^= readU64(secret, 152)
|
|
accs[3] *= prime32_1
|
|
|
|
accs[4] ^= accs[4] >> 47
|
|
accs[4] ^= readU64(secret, 160)
|
|
accs[4] *= prime32_1
|
|
|
|
accs[5] ^= accs[5] >> 47
|
|
accs[5] ^= readU64(secret, 168)
|
|
accs[5] *= prime32_1
|
|
|
|
accs[6] ^= accs[6] >> 47
|
|
accs[6] ^= readU64(secret, 176)
|
|
accs[6] *= prime32_1
|
|
|
|
accs[7] ^= accs[7] >> 47
|
|
accs[7] ^= readU64(secret, 184)
|
|
accs[7] *= prime32_1
|
|
}
|