1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // +build s390x,go1.11,!gccgo,!appengine
9 // Implementation of Poly1305 using the vector facility (vx).
// keyMask is ANDed with the loaded key to clamp the r portion as the
// Poly1305 spec requires (clear the top 4 bits of each 32-bit word and
// the low 2 bits of words 1-3); stored byte-reversed to match how the
// key is permuted before use — NOTE(review): confirm byte order against
// the (elided) key-load code.
GLOBL ·keyMask<>(SB), RODATA, $16
DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
// bswapMask is a VPERM control vector that reverses all 16 bytes of a
// vector register (indices 15..0), used below to fix the tag's byte order.
GLOBL ·bswapMask<>(SB), RODATA, $16
DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
// constants holds, in order: the 26-bit limb mask (MOD26) and the three
// VPERM control vectors (EX0, EX1, EX2) used by EXPAND to split two
// 16-byte blocks into 26-bit limbs (see the "load MOD26, EX0, EX1 and
// EX2" comment in poly1305vx below).
GLOBL ·constants<>(SB), RODATA, $64
// MOD26: 0x3ffffff in both doublewords — masks a limb to its low 26 bits.
DATA ·constants<>+0(SB)/8, $0x3ffffff
DATA ·constants<>+8(SB)/8, $0x3ffffff
// EX0: byte-selection mask (indices 0x0n pick block 0, 0x1n pick block 1).
DATA ·constants<>+16(SB)/8, $0x0006050403020100
DATA ·constants<>+24(SB)/8, $0x1016151413121110
// EX1
DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706
DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716
// EX2
DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d
DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
// h = (f*g) % (2**130-5) [partial reduction]
//
// f0..f4 and g0..g4 are 5x26-bit limb representations. g51..g54 are
// presumably the precomputed 5*g1..5*g4 multiples used to fold products
// that overflow 2**130 back down mod 2**130-5 — NOTE(review): confirm
// against the (elided) setup code that fills them. VMALOF accumulates
// odd-indexed 32-bit lane products into the 64-bit h/T accumulators;
// T_0..T_4 are scratch accumulators merged into h0..h4 elsewhere in the
// macro body.
#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
	VMALOF f2, g53, h0, h0 \
	VMALOF f2, g54, h1, h1 \
	VMALOF f2, g0, h2, h2 \
	VMALOF f2, g1, h3, h3 \
	VMALOF f2, g2, h4, h4 \
	VMALOF f3, g52, T_0, T_0 \
	VMALOF f3, g53, T_1, T_1 \
	VMALOF f3, g54, T_2, T_2 \
	VMALOF f3, g0, T_3, T_3 \
	VMALOF f3, g1, T_4, T_4 \
	VMALOF f4, g51, h0, h0 \
	VMALOF f4, g52, h1, h1 \
	VMALOF f4, g53, h2, h2 \
	VMALOF f4, g54, h3, h3 \
	VMALOF f4, g0, h4, h4 \
// carry h0->h1 h3->h4, h1->h2 h4->h0, h0->h1 h2->h3, h3->h4
//
// Partially reduces h by propagating carries between its 26-bit limbs:
// each VESRLG $26 extracts the bits above a limb's low 26 into a scratch
// register (T_0..T_3) which is then added into the next limb (additions
// interleaved in the macro body). The h4->h0 carry is where the
// mod 2**130-5 wraparound happens.
#define REDUCE(h0, h1, h2, h3, h4) \
	VESRLG $26, h0, T_0 \
	VESRLG $26, h3, T_1 \
	VESRLG $26, h1, T_2 \
	VESRLG $26, h4, T_3 \
	VESRLG $26, h2, T_0 \
	VESRLG $26, h0, T_1 \
	VESRLG $26, h3, T_2 \
// expand in0 into d[0] and in1 into d[1]
//
// Splits the two 16-byte blocks in0 and in1 into five 26-bit limb
// vectors d0..d4 (block 0 in one doubleword lane, block 1 in the other)
// using the EX0/EX1/EX2 VPERM control vectors from ·constants<>.
#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
	VGBM $0x0707, d1 \ // d1=tmp
	VPERM in0, in1, EX2, d4 \
	VPERM in0, in1, EX0, d0 \
	VPERM in0, in1, EX1, d2 \
154 // pack h4:h0 into h1:h0 (no carry)
155 #define PACK(h0, h1, h2, h3, h4) \
170 // if h > 2**130-5 then h -= 2**130-5
171 #define MOD(h0, h1, t0, t1, t2) \
// func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]key)
//
// One-shot Poly1305 tag computation using the z/Architecture vector
// facility. Register roles after the LMG below: R1=out, R2=m, R3=mlen
// (bytes remaining), R4=key. The accumulator h and the multiplier r are
// kept as 5x26-bit limbs in vector registers, two message blocks per
// vector (one per doubleword lane).
TEXT ·poly1305vx(SB), $0-32
	// This code processes up to 2 blocks (32 bytes) per iteration
	// using the algorithm described in:
	// NEON crypto, Daniel J. Bernstein & Peter Schwabe
	// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
	LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key

	// load MOD26, EX0, EX1 and EX2
	MOVD $·constants<>(SB), R5

	// address of the r-clamping mask (applied to the key before use)
	MOVD $·keyMask<>(SB), R6

	// split the clamped r into 26-bit limbs R_0..R_4
	EXPAND(T_0, T_0, R_0, R_1, R_2, R_3, R_4)

	// store r (for final block)
	// NOTE(review): T_0 presumably holds the constant 5 at this point
	// (set in elided code), making R5SAVE_n = 5*r_n — the multiples
	// MULTIPLY needs for the mod 2**130-5 folding. Confirm.
	VMLOF T_0, R_1, R5SAVE_1
	VMLOF T_0, R_2, R5SAVE_2
	VMLOF T_0, R_3, R5SAVE_3
	VMLOF T_0, R_4, R5SAVE_4
	// keep scalar copies of r's limbs for the tail-block fix-ups below
	VLGVG $0, R_0, RSAVE_0
	VLGVG $0, R_1, RSAVE_1
	VLGVG $0, R_2, RSAVE_2
	VLGVG $0, R_3, RSAVE_3
	VLGVG $0, R_4, RSAVE_4

	// skip r**2 calculation
	// (this MULTIPLY squares r: the f and g arguments are both R_0..R_4)
	MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5SAVE_1, R5SAVE_2, R5SAVE_3, R5SAVE_4, H_0, H_1, H_2, H_3, H_4)
	REDUCE(H_0, H_1, H_2, H_3, H_4)

	// split the message blocks in T_0/T_1 into limbs F_0..F_4
	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)

	// multiply by the per-lane powers of r and partially reduce
	MULTIPLY(F_0, F_1, F_2, F_3, F_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
	REDUCE(H_0, H_1, H_2, H_3, H_4)

	// h may be >= 2*(2**130-5) so we need to reduce it again
	REDUCE(H_0, H_1, H_2, H_3, H_4)

	// h is now < 2*(2**130-5)
	// pack h into h1 (hi) and h0 (lo)
	PACK(H_0, H_1, H_2, H_3, H_4)

	// if h > 2**130-5 then h -= 2**130-5
	MOD(H_0, H_1, T_0, T_1, T_2)

	// byte-swap via VPERM with ·bswapMask<> to get the tag byte order
	// right — NOTE(review): T_1 is presumably loaded from (R5) in elided
	// code between these instructions.
	MOVD $·bswapMask<>(SB), R5
	VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big)
	VPERM H_0, H_0, T_1, H_0 // reverse bytes (to little)

	// 2 blocks remaining
	CMPBEQ R3, $16, 2(PC) // skip the next instruction iff exactly 16 bytes remain
	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
	CMPBNE R3, $16, 2(PC) // skip the next instruction unless exactly 16 bytes remain
	// insert the saved r limbs into doubleword lane 1 so the final
	// iteration multiplies the two lanes by different powers of r
	VLVGG $1, RSAVE_0, R_0
	VLVGG $1, RSAVE_1, R_1
	VLVGG $1, RSAVE_2, R_2
	VLVGG $1, RSAVE_3, R_3
	VLVGG $1, RSAVE_4, R_4
	// merge the saved 5*r limbs in as well (VPDI doubleword permute)
	VPDI $0, R5_1, R5SAVE_1, R5_1
	VPDI $0, R5_2, R5SAVE_2, R5_2
	VPDI $0, R5_3, R5SAVE_3, R5_3
	VPDI $0, R5_4, R5SAVE_4, R5_4

	CMPBEQ R3, $0, finish // all input consumed

	// tail handling for the remaining input; structure mirrors the
	// 2-block case above but targets doubleword lane 0
	CMPBEQ R3, $16, 2(PC) // skip the next instruction iff exactly 16 bytes remain
	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
	CMPBNE R3, $16, 2(PC) // skip the next instruction unless exactly 16 bytes remain
	// place the saved r limbs in doubleword lane 0 for the last multiply
	VLVGG $0, RSAVE_0, R_0
	VLVGG $0, RSAVE_1, R_1
	VLVGG $0, RSAVE_2, R_2
	VLVGG $0, RSAVE_3, R_3
	VLVGG $0, RSAVE_4, R_4
	// note the swapped VPDI operand order relative to the 2-block case
	VPDI $0, R5SAVE_1, R5_1, R5_1
	VPDI $0, R5SAVE_2, R5_2, R5_2
	VPDI $0, R5SAVE_3, R5_3, R5_3
	VPDI $0, R5SAVE_4, R5_4, R5_4