1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // +build s390x,!gccgo,!appengine
10 // This is an implementation of the ChaCha20 encryption algorithm as
11 // specified in RFC 7539. It uses vector instructions to compute
12 // 4 keystream blocks in parallel (256 bytes) which are then XORed
13 // with the bytes in the input slice.
// ·constants<> is a file-local, read-only, pointer-free table of 32 bytes:
// two 16-byte vectors, each written as four 4-byte words.
//   +0x00: byte-permutation mask (BSWAP)
//   +0x10: the four ChaCha20 constant words (J0)
15 GLOBL ·constants<>(SB), RODATA|NOPTR, $32
16 // BSWAP: swap bytes in each 4-byte element
// Each literal lists the byte indices of one 4-byte element in reverse
// order (03 02 01 00, 07 06 05 04, ...). XORV passes a mask named BSWAP to
// PERMUTE/VPERM; presumably that register is loaded from here (the load is
// not visible in this excerpt).
17 DATA ·constants<>+0x00(SB)/4, $0x03020100
18 DATA ·constants<>+0x04(SB)/4, $0x07060504
19 DATA ·constants<>+0x08(SB)/4, $0x0b0a0908
20 DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c
21 // J0: [j0, j1, j2, j3]
// Read as little-endian bytes these four words spell the ASCII strings
// "expa", "nd 3", "2-by", "te k".
22 DATA ·constants<>+0x10(SB)/4, $0x61707865
23 DATA ·constants<>+0x14(SB)/4, $0x3320646e
24 DATA ·constants<>+0x18(SB)/4, $0x79622d32
25 DATA ·constants<>+0x1c(SB)/4, $0x6b206574
// mvcSrcToBuf is declared frameless and non-splitting with a zero-size frame.
// In the visible code it is never called directly; xorKeyStreamVX uses it as
// the target of an EXRL with R12 while setting up the tail buffer.
// NOTE(review): the body is not visible in this excerpt; the name suggests a
// single MVC that copies from src into buf — confirm against the full file.
28 TEXT ·mvcSrcToBuf(SB), NOFRAME|NOSPLIT, $0
// mvcBufToDst is declared frameless and non-splitting with a zero-size frame.
// In the visible code it is never called directly; xorKeyStreamVX uses it as
// the target of an EXRL with R12 after the main loop, when R12 != 255.
// NOTE(review): the body is not visible in this excerpt; the name suggests a
// single MVC that copies from buf into dst — confirm against the full file.
32 TEXT ·mvcBufToDst(SB), NOFRAME|NOSPLIT, $0
// ROUND4 takes sixteen vector registers as four groups of four
// (a0-a3, b0-b3, c0-c3, d0-d3). xorKeyStreamVX invokes it twice in a row
// with two different register-to-group assignments.
// NOTE(review): the macro body is not visible in this excerpt; presumably each
// group undergoes one ChaCha20 quarter round, four groups in parallel —
// confirm against the body.
66 #define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \
// PERMUTE applies VPERM with the same selection mask to each of v0..v3.
// Every register is used as both VPERM source operands and as the
// destination, so the four registers are rewritten in place; mask is only
// read. (Do not add standalone comment lines inside the macro: a line without
// a trailing backslash ends the definition.)
116 #define PERMUTE(mask, v0, v1, v2, v3) \
117 VPERM v0, v0, mask, v0 \
118 VPERM v1, v1, mask, v1 \
119 VPERM v2, v2, mask, v2 \
120 VPERM v3, v3, mask, v3
// ADDV takes one operand x and four vector registers v0..v3.
// xorKeyStreamVX invokes it after each SHUFFLE with x = J0, KEY0, KEY1 or
// NONCE and the four registers that were just shuffled.
// NOTE(review): the macro body is not visible in this excerpt; presumably it
// adds x to each of v0..v3 — confirm against the body.
122 #define ADDV(x, v0, v1, v2, v3) \
// XORV processes the four vectors (64 bytes) starting at byte offset off:
//   1. VLM loads the range M0..M3 from off(src);
//   2. PERMUTE byte-permutes v0..v3 in place using the BSWAP mask;
//   3. VSTM stores M0..M3 to off(dst).
// Clobbers M0..M3 and modifies v0..v3.
// NOTE(review): the lines between steps 2 and 3 are not visible in this
// excerpt; presumably they XOR v0..v3 into M0..M3 — confirm against the full
// macro.
128 #define XORV(off, dst, src, v0, v1, v2, v3) \
129 VLM off(src), M0, M3 \
130 PERMUTE(BSWAP, v0, v1, v2, v3) \
135 VSTM M0, M3, off(dst)
// SHUFFLE transposes the 4x4 matrix of 4-byte elements whose rows are the
// vectors a, b, c, d: afterwards a holds element 0 of each original row,
// b element 1, c element 2 and d element 3 (see the per-line comments).
// It works in two merge passes: first a/c and b/d are interleaved into the
// scratch registers t, u, v, w, then t/u and v/w are interleaved back into
// a, b, c, d. The scratch registers t, u, v, w are clobbered.
137 #define SHUFFLE(a, b, c, d, t, u, v, w) \
138 VMRHF a, c, t \ // t = {a[0], c[0], a[1], c[1]}
139 VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]}
140 VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]}
141 VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]}
142 VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]}
143 VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]}
144 VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]}
145 VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]}
// xorKeyStreamVX loads its seven arguments into R2-R9, sets up the tail
// buffer, runs the ChaCha20 rounds on the sixteen vector registers X0-X15 and
// then passes them to XORV together with src (R3) and dst (R2), 256 bytes
// (four 64-byte XORV invocations) per pass.
//
// The names X0-X15, M0-M3, J0, KEY0, KEY1, NONCE, BSWAP and NUM_ROUNDS are
// macros defined outside the visible code.
//
// NOTE(review): this excerpt omits a number of instructions as well as the
// definitions of the labels `aligned`, `chacha` and `return`. The comments
// below describe only what the visible lines establish; anything that depends
// on the missing lines is marked as an assumption.
147 // func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int)
148 TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
149 MOVD $·constants<>(SB), R1 // R1=&constants (R1 is reused for the round count below)
150 MOVD dst+0(FP), R2 // R2=&dst[0]
151 LMG src+24(FP), R3, R4 // R3=&src[0] R4=len(src)
152 MOVD key+48(FP), R5 // R5=key
153 MOVD nonce+56(FP), R6 // R6=nonce
154 MOVD counter+64(FP), R7 // R7=counter
155 MOVD buf+72(FP), R8 // R8=buf
156 MOVD len+80(FP), R9 // R9=len
161 // set up tail buffer
// Skip the tail-buffer setup when R12 == 255.
// NOTE(review): R12 is computed by instructions not visible here; presumably
// it is (len(src)-1) truncated to a byte, so 255 would mean len(src) is a
// multiple of 256 — confirm.
164 CMPUBEQ R12, $255, aligned
// EXRL executes the single instruction at ·mvcSrcToBuf with the low byte of
// R12 OR-ed into that instruction's second byte (presumably the length field
// of an MVC, see the note on ·mvcSrcToBuf).
168 EXRL $·mvcSrcToBuf(SB), R12
// Store R0 through the len pointer; R0 is set by instructions not visible here.
171 MOVD R0, (R9) // update len
// NOTE(review): M0 is prepared by instructions not visible here; this shifts
// NONCE right by the byte count held in M0 — presumably to leave room for the
// block counter in the first element. Confirm.
180 VSRLB M0, NONCE, NONCE
182 // initialize counter values
// R1 no longer holds &constants from here on: it now carries NUM_ROUNDS/2.
// Each pass over the two ROUND4 lines below therefore presumably accounts for
// two rounds (the loop branch itself is not visible).
209 MOVD $(NUM_ROUNDS/2), R1
// First invocation: each group is {Xi, Xi+4, Xi+8, Xi+12}, i.e. one column of
// the 4x4 state. Second invocation: each group is one diagonal
// ({X0,X5,X10,X15}, {X1,X6,X11,X12}, {X2,X7,X8,X13}, {X3,X4,X9,X14}).
212 ROUND4(X0, X4, X12, X8, X1, X5, X13, X9, X2, X6, X14, X10, X3, X7, X15, X11)
213 ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8, X3, X4, X14, X9)
// Transpose each group of four state registers (M0-M3 are scratch) and then
// apply ADDV with the matching input vector: J0 for X0-X3, KEY0 for X4-X7,
// KEY1 for X8-X11 and NONCE for X12-X15.
224 SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
225 ADDV(J0, X0, X1, X2, X3)
226 SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
227 ADDV(KEY0, X4, X5, X6, X7)
228 SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
229 ADDV(KEY1, X8, X9, X10, X11)
231 SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
232 ADDV(NONCE, X12, X13, X14, X15)
234 // increment counters
237 // xor keystream with plaintext
// Four 64-byte blocks at offsets 0, 64, 128 and 192 of src/dst. Block i uses
// the registers Xi, Xi+4, Xi+8, Xi+12. Each XORV clobbers M0-M3.
238 XORV(0*64, R2, R3, X0, X4, X8, X12)
239 XORV(1*64, R2, R3, X1, X5, X9, X13)
240 XORV(2*64, R2, R3, X2, X6, X10, X14)
241 XORV(3*64, R2, R3, X3, X7, X11, X15)
243 // increment pointers
// Branch back to `chacha` while R4 is non-zero. R4 was loaded with len(src)
// above; the instructions that reduce it are not visible in this excerpt.
247 CMPBNE R4, $0, chacha
// Same R12 == 255 test as in the setup: when it holds, the copy back from the
// tail buffer is skipped.
248 CMPUBEQ R12, $255, return
249 EXRL $·mvcBufToDst(SB), R12 // len was updated during setup