// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"
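
// chachaConst holds the ChaCha constants: "expand 32-byte k" as four
// little-endian 32-bit words.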
DATA ·chachaConst+0x00(SB)/4, $0x61707865
DATA ·chachaConst+0x04(SB)/4, $0x3320646e
DATA ·chachaConst+0x08(SB)/4, $0x79622d32
DATA ·chachaConst+0x0c(SB)/4, $0x6b206574
GLOBL ·chachaConst(SB), NOPTR|RODATA, $16

// chachaIncRot holds the per-lane counter increments {0, 1, 2, 3}.
DATA ·chachaIncRot+0x00(SB)/4, $0x00000000
DATA ·chachaIncRot+0x04(SB)/4, $0x00000001
DATA ·chachaIncRot+0x08(SB)/4, $0x00000002
DATA ·chachaIncRot+0x0c(SB)/4, $0x00000003
GLOBL ·chachaIncRot(SB), NOPTR|RODATA, $16

// QR is the ChaCha8 quarter-round on a, b, c, and d.
#define QR(a, b, c, d) \
	VADDW	a, b, a; \
	VXORV	d, a, d; \
	VROTRW	$16, d; \
	VADDW	c, d, c; \
	VXORV	b, c, b; \
	VROTRW	$20, b; \
	VADDW	a, b, a; \
	VXORV	d, a, d; \
	VROTRW	$24, d; \
	VADDW	c, d, c; \
	VXORV	b, c, b; \
	VROTRW	$25, b
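
// For reference, QR computes the standard ChaCha quarter-round; in Go terms
// (illustrative only, using math/bits):
//
//	a += b; d ^= a; d = bits.RotateLeft32(d, 16)
//	c += d; b ^= c; b = bits.RotateLeft32(b, 12)
//	a += b; d ^= a; d = bits.RotateLeft32(d, 8)
//	c += d; b ^= c; b = bits.RotateLeft32(b, 7)
//
// VROTRW rotates right, so a left-rotation by k is written as VROTRW $(32-k):
// left 16, 12, 8, 7 become $16, $20, $24, $25 above.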

// func block(seed *[8]uint32, blocks *[4][16]uint32, counter uint32)
TEXT ·block<ABIInternal>(SB), NOSPLIT, $0
	// seed in R4
	// blocks in R5
	// counter in R6

	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
	BNE	R7, lsx_chacha8

	// no LSX: tail-call the portable Go implementation
	JMP	·block_generic<ABIInternal>(SB)
	RET
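
// In Go terms, the dispatch above is roughly the following sketch
// (illustrative only; blockLSX is a hypothetical name for the
// lsx_chacha8 path below):
//
//	func block(seed *[8]uint32, blocks *[4][16]uint32, counter uint32) {
//		if cpu.Loong64.HasLSX {
//			blockLSX(seed, blocks, counter)
//			return
//		}
//		block_generic(seed, blocks, counter)
//	}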

lsx_chacha8:
	MOVV	$·chachaConst(SB), R10
	MOVV	$·chachaIncRot(SB), R11

	// load constants, broadcasting each 32-bit word to all four lanes
	// (in these VMOVQ broadcast forms the offset counts elements, not bytes)
	VMOVQ	(R10), V0.W4
	VMOVQ	1(R10), V1.W4
	VMOVQ	2(R10), V2.W4
	VMOVQ	3(R10), V3.W4

	// load the per-lane counter increments {0, 1, 2, 3} for use below
	VMOVQ	(R11), V30

	// load seed, one 32-bit word per vector, broadcast to all four lanes
	VMOVQ	(R4), V4.W4
	VMOVQ	1(R4), V5.W4
	VMOVQ	2(R4), V6.W4
	VMOVQ	3(R4), V7.W4
	VMOVQ	4(R4), V8.W4
	VMOVQ	5(R4), V9.W4
	VMOVQ	6(R4), V10.W4
	VMOVQ	7(R4), V11.W4

	// broadcast the counter and add the per-lane increments
	VMOVQ	R6, V12.W4
	VADDW	V12, V30, V12
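	// V12 now holds {counter+0, counter+1, counter+2, counter+3}, giving
	// each of the four blocks computed in parallel its own block counter.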

	// zeros for remaining three matrix entries
	VXORV	V13, V13, V13
	VXORV	V14, V14, V14
	VXORV	V15, V15, V15

	// save seed state for adding back later
	VORV	V4, V13, V20
	VORV	V5, V13, V21
	VORV	V6, V13, V22
	VORV	V7, V13, V23
	VORV	V8, V13, V24
	VORV	V9, V13, V25
	VORV	V10, V13, V26
	VORV	V11, V13, V27
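	// (V13 is zero at this point, so each VORV above is just a register copy.)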

	// 4 iterations of 8 quarter-rounds each (4 down the columns, then 4
	// along the diagonals), for ChaCha8's total of 8 rounds.
	MOVV	$4, R7
loop:
	QR(V0, V4, V8, V12)
	QR(V1, V5, V9, V13)
	QR(V2, V6, V10, V14)
	QR(V3, V7, V11, V15)

	QR(V0, V5, V10, V15)
	QR(V1, V6, V11, V12)
	QR(V2, V7, V8, V13)
	QR(V3, V4, V9, V14)

	SUBV	$1, R7
	BNE	R7, R0, loop
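
	// Illustrative Go sketch of one loop iteration for a single block
	// (x is that block's [16]uint32 state and qr is a hypothetical scalar
	// quarter-round; the vector code above runs four such blocks at once):
	//
	//	qr(&x[0], &x[4], &x[8], &x[12])  // columns
	//	qr(&x[1], &x[5], &x[9], &x[13])
	//	qr(&x[2], &x[6], &x[10], &x[14])
	//	qr(&x[3], &x[7], &x[11], &x[15])
	//	qr(&x[0], &x[5], &x[10], &x[15]) // diagonals
	//	qr(&x[1], &x[6], &x[11], &x[12])
	//	qr(&x[2], &x[7], &x[8], &x[13])
	//	qr(&x[3], &x[4], &x[9], &x[14])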

	// add the original seed back into the key rows; as in the ChaCha8Rand
	// generator, the constant and counter rows are not added back
	VADDW	V4, V20, V4
	VADDW	V5, V21, V5
	VADDW	V6, V22, V6
	VADDW	V7, V23, V7
	VADDW	V8, V24, V8
	VADDW	V9, V25, V9
	VADDW	V10, V26, V10
	VADDW	V11, V27, V11

	// store the blocks to the output buffer; each vector holds one state
	// word for all four blocks, so the output is written word-major
	// (interleaved across the four blocks)
	VMOVQ	V0, (R5)
	VMOVQ	V1, 16(R5)
	VMOVQ	V2, 32(R5)
	VMOVQ	V3, 48(R5)
	VMOVQ	V4, 64(R5)
	VMOVQ	V5, 80(R5)
	VMOVQ	V6, 96(R5)
	VMOVQ	V7, 112(R5)
	VMOVQ	V8, 128(R5)
	VMOVQ	V9, 144(R5)
	VMOVQ	V10, 160(R5)
	VMOVQ	V11, 176(R5)
	VMOVQ	V12, 192(R5)
	VMOVQ	V13, 208(R5)
	VMOVQ	V14, 224(R5)
	VMOVQ	V15, 240(R5)
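
	// Indexing sketch (illustrative only): viewing the output as a flat
	// [64]uint32, state word w of block b lands at index w*4 + b:
	//
	//	func word(buf *[64]uint32, b, w int) uint32 {
	//		return buf[w*4+b]
	//	}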

	RET