doc.go

     1  // Copyright 2024 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  /*
     6  Package loong64 implements a LoongArch64 assembler. Go assembly syntax is different from
     7  GNU LoongArch64 syntax, but we can still follow the general rules to map between them.
     8  
     9  # Instruction mnemonics mapping rules
    10  
    11  1. Bit widths represented by various instruction suffixes and prefixes
    12  V (vlong)     = 64 bit
    13  WU (word)     = 32 bit unsigned
    14  W (word)      = 32 bit
    15  H (half word) = 16 bit
    16  HU            = 16 bit unsigned
    17  B (byte)      = 8 bit
    18  BU            = 8 bit unsigned
    19  F (float)     = 32 bit float
    20  D (double)    = 64 bit float
    21  
    22  V  (LSX)      = 128 bit
    23  XV (LASX)     = 256 bit
    24  
    25  Examples:
    26  
    27  	MOVB  (R2), R3  // Load 8 bit memory data into R3 register
    28  	MOVH  (R2), R3  // Load 16 bit memory data into R3 register
    29  	MOVW  (R2), R3  // Load 32 bit memory data into R3 register
    30  	MOVV  (R2), R3  // Load 64 bit memory data into R3 register
    31  	VMOVQ  (R2), V1 // Load 128 bit memory data into V1 register
    32  	XVMOVQ (R2), X1 // Load 256 bit memory data into X1 register
    33  
    34  2. Align directive
    35  Go asm supports the PCALIGN directive, which indicates that the next instruction should
    36  be aligned to a specified boundary by padding with NOOP instructions. The alignment value
    37  supported on loong64 must be a power of 2 and in the range of [8, 2048].
    38  
    39  Examples:
    40  
    41  	PCALIGN	$16
    42  	MOVV	$2, R4	// This instruction is aligned with 16 bytes.
    43  	PCALIGN	$1024
    44  	MOVV	$3, R5	// This instruction is aligned with 1024 bytes.
    45  
    46  # On loong64, auto-align loop heads to 16-byte boundaries
    47  
    48  Examples:
    49  
    50  	TEXT ·Add(SB),NOSPLIT|NOFRAME,$0
    51  
    52  start:
    53  
    54  	MOVV	$1, R4	// This instruction is aligned with 16 bytes.
    55  	MOVV	$-1, R5
    56  	BNE	R5, start
    57  	RET
    58  
    59  # Register mapping rules
    60  
    61  1. All general-purpose register names are written as Rn.
    62  
    63  2. All floating-point register names are written as Fn.
    64  
    65  3. All LSX register names are written as Vn.
    66  
    67  4. All LASX register names are written as Xn.
    68  
    69  # Argument mapping rules
    70  
    71  1. The operands appear in left-to-right assignment order.
    72  
    73  Go reverses the arguments of most instructions.
    74  
    75  Examples:
    76  
    77  	ADDV	R11, R12, R13 <=> add.d R13, R12, R11
    78  	LLV	(R4), R7      <=> ll.d R7, R4
    79  	OR	R5, R6        <=> or R6, R6, R5
    80  
    81  Special Cases.
    82  (1) Argument order is the same as in the GNU LoongArch64 syntax for jump instructions.
    83  
    84  Examples:
    85  
    86  	BEQ	R0, R4, label1  <=>  beq R0, R4, label1
    87  	JMP	label1          <=>  b label1
    88  
    89  (2) BSTRINSW, BSTRINSV, BSTRPICKW, BSTRPICKV $<msb>, <Rj>, $<lsb>, <Rd>
    90  
    91  Examples:
    92  
    93  	BSTRPICKW $15, R4, $6, R5  <=>  bstrpick.w r5, r4, 15, 6
    94  
    95  2. Expressions for special arguments.
    96  
    97  Memory references: a base register and an offset register are written as (Rbase)(Roff).
    98  
    99  Examples:
   100  
   101  	MOVB (R4)(R5), R6  <=>  ldx.b R6, R4, R5
   102  	MOVV (R4)(R5), R6  <=>  ldx.d R6, R4, R5
   103  	MOVD (R4)(R5), F6  <=>  fldx.d F6, R4, R5
   104  	MOVB R6, (R4)(R5)  <=>  stx.b R6, R4, R5
   105  	MOVV R6, (R4)(R5)  <=>  stx.d R6, R4, R5
   106  	MOVD F6, (R4)(R5)  <=>  fstx.d F6, R4, R5
   107  
   108  3. Alphabetical list of SIMD instructions
   109  
   110  Note: In the following sections 3.1 to 3.6, "ui4" (4-bit unsigned int immediate),
   111  "ui3", "ui2", and "ui1" represent the related "index".
   112  
   113  3.1 Move general-purpose register to a vector element:
   114  
   115  	Instruction format:
   116  	        VMOVQ  Rj, <Vd>.<T>[index]
   117  
   118  	Mapping between Go and platform assembly:
   119  	       Go assembly       |      platform assembly     |          semantics
   120  	-------------------------------------------------------------------------------------
   121  	 VMOVQ  Rj, Vd.B[index]  |  vinsgr2vr.b  Vd, Rj, ui4  |  VR[vd].b[ui4] = GR[rj][7:0]
   122  	 VMOVQ  Rj, Vd.H[index]  |  vinsgr2vr.h  Vd, Rj, ui3  |  VR[vd].h[ui3] = GR[rj][15:0]
   123  	 VMOVQ  Rj, Vd.W[index]  |  vinsgr2vr.w  Vd, Rj, ui2  |  VR[vd].w[ui2] = GR[rj][31:0]
   124  	 VMOVQ  Rj, Vd.V[index]  |  vinsgr2vr.d  Vd, Rj, ui1  |  VR[vd].d[ui1] = GR[rj][63:0]
   125  	XVMOVQ  Rj, Xd.W[index]  | xvinsgr2vr.w  Xd, Rj, ui3  |  XR[xd].w[ui3] = GR[rj][31:0]
   126  	XVMOVQ  Rj, Xd.V[index]  | xvinsgr2vr.d  Xd, Rj, ui2  |  XR[xd].d[ui2] = GR[rj][63:0]
   127  
   128  3.2 Move vector element to general-purpose register
   129  
   130  	Instruction format:
   131  	        VMOVQ     <Vj>.<T>[index], Rd
   132  
   133  	Mapping between Go and platform assembly:
   134  	        Go assembly       |       platform assembly      |            semantics
   135  	---------------------------------------------------------------------------------------------
   136  	 VMOVQ  Vj.B[index],  Rd  |   vpickve2gr.b   rd, vj, ui4 | GR[rd] = SignExtend(VR[vj].b[ui4])
   137  	 VMOVQ  Vj.H[index],  Rd  |   vpickve2gr.h   rd, vj, ui3 | GR[rd] = SignExtend(VR[vj].h[ui3])
   138  	 VMOVQ  Vj.W[index],  Rd  |   vpickve2gr.w   rd, vj, ui2 | GR[rd] = SignExtend(VR[vj].w[ui2])
   139  	 VMOVQ  Vj.V[index],  Rd  |   vpickve2gr.d   rd, vj, ui1 | GR[rd] = SignExtend(VR[vj].d[ui1])
   140  	 VMOVQ  Vj.BU[index], Rd  |   vpickve2gr.bu  rd, vj, ui4 | GR[rd] = ZeroExtend(VR[vj].bu[ui4])
   141  	 VMOVQ  Vj.HU[index], Rd  |   vpickve2gr.hu  rd, vj, ui3 | GR[rd] = ZeroExtend(VR[vj].hu[ui3])
   142  	 VMOVQ  Vj.WU[index], Rd  |   vpickve2gr.wu  rd, vj, ui2 | GR[rd] = ZeroExtend(VR[vj].wu[ui2])
   143  	 VMOVQ  Vj.VU[index], Rd  |   vpickve2gr.du  rd, vj, ui1 | GR[rd] = ZeroExtend(VR[vj].du[ui1])
   144  	XVMOVQ  Xj.W[index],  Rd  |  xvpickve2gr.w   rd, xj, ui3 | GR[rd] = SignExtend(XR[xj].w[ui3])
   145  	XVMOVQ  Xj.V[index],  Rd  |  xvpickve2gr.d   rd, xj, ui2 | GR[rd] = SignExtend(XR[xj].d[ui2])
   146  	XVMOVQ  Xj.WU[index], Rd  |  xvpickve2gr.wu  rd, xj, ui3 | GR[rd] = ZeroExtend(XR[xj].wu[ui3])
   147  	XVMOVQ  Xj.VU[index], Rd  |  xvpickve2gr.du  rd, xj, ui2 | GR[rd] = ZeroExtend(XR[xj].du[ui2])
   148  
   149  3.3 Duplicate general-purpose register to vector.
   150  
   151  	Instruction format:
   152  	        VMOVQ    Rj, <Vd>.<T>
   153  
   154  	Mapping between Go and platform assembly:
   155  	   Go assembly      |    platform assembly    |                    semantics
   156  	------------------------------------------------------------------------------------------------
   157  	 VMOVQ  Rj, Vd.B16  |   vreplgr2vr.b  Vd, Rj  |  for i in range(16): VR[vd].b[i] = GR[rj][7:0]
   158  	 VMOVQ  Rj, Vd.H8   |   vreplgr2vr.h  Vd, Rj  |  for i in range(8) : VR[vd].h[i] = GR[rj][15:0]
   159  	 VMOVQ  Rj, Vd.W4   |   vreplgr2vr.w  Vd, Rj  |  for i in range(4) : VR[vd].w[i] = GR[rj][31:0]
   160  	 VMOVQ  Rj, Vd.V2   |   vreplgr2vr.d  Vd, Rj  |  for i in range(2) : VR[vd].d[i] = GR[rj][63:0]
   161  	XVMOVQ  Rj, Xd.B32  |  xvreplgr2vr.b  Xd, Rj  |  for i in range(32): XR[xd].b[i] = GR[rj][7:0]
   162  	XVMOVQ  Rj, Xd.H16  |  xvreplgr2vr.h  Xd, Rj  |  for i in range(16): XR[xd].h[i] = GR[rj][15:0]
   163  	XVMOVQ  Rj, Xd.W8   |  xvreplgr2vr.w  Xd, Rj  |  for i in range(8) : XR[xd].w[i] = GR[rj][31:0]
   164  	XVMOVQ  Rj, Xd.V4   |  xvreplgr2vr.d  Xd, Rj  |  for i in range(4) : XR[xd].d[i] = GR[rj][63:0]
   165  
   166  3.4 Replace vector elements
   167  
   168  	Instruction format:
   169  	        XVMOVQ    Xj, <Xd>.<T>
   170  
   171  	Mapping between Go and platform assembly:
   172  	   Go assembly      |   platform assembly   |                semantics
   173  	------------------------------------------------------------------------------------------------
   174  	XVMOVQ  Xj, Xd.B32  |  xvreplve0.b  Xd, Xj  | for i in range(32): XR[xd].b[i] = XR[xj].b[0]
   175  	XVMOVQ  Xj, Xd.H16  |  xvreplve0.h  Xd, Xj  | for i in range(16): XR[xd].h[i] = XR[xj].h[0]
   176  	XVMOVQ  Xj, Xd.W8   |  xvreplve0.w  Xd, Xj  | for i in range(8) : XR[xd].w[i] = XR[xj].w[0]
   177  	XVMOVQ  Xj, Xd.V4   |  xvreplve0.d  Xd, Xj  | for i in range(4) : XR[xd].d[i] = XR[xj].d[0]
   178  	XVMOVQ  Xj, Xd.Q2   |  xvreplve0.q  Xd, Xj  | for i in range(2) : XR[xd].q[i] = XR[xj].q[0]
   179  
   180  3.5 Move vector element to scalar
   181  
   182  	Instruction format:
   183  	        XVMOVQ  Xj, <Xd>.<T>[index]
   184  	        XVMOVQ  Xj.<T>[index], Xd
   185  
   186  	Mapping between Go and platform assembly:
   187  	       Go assembly        |     platform assembly     |               semantics
   188  	------------------------------------------------------------------------------------------------
   189  	 XVMOVQ  Xj, Xd.W[index]  |  xvinsve0.w   xd, xj, ui3 | XR[xd].w[ui3] = XR[xj].w[0]
   190  	 XVMOVQ  Xj, Xd.V[index]  |  xvinsve0.d   xd, xj, ui2 | XR[xd].d[ui2] = XR[xj].d[0]
   191  	 XVMOVQ  Xj.W[index], Xd  |  xvpickve.w   xd, xj, ui3 | XR[xd].w[0] = XR[xj].w[ui3], XR[xd][255:32] = 0
   192  	 XVMOVQ  Xj.V[index], Xd  |  xvpickve.d   xd, xj, ui2 | XR[xd].d[0] = XR[xj].d[ui2], XR[xd][255:64] = 0
   193  
   194  3.6 Move vector element to vector register.
   195  
   196  	Instruction format:
   197  	VMOVQ     <Vj>.<T>[index], <Vd>.<T>
   198  
   199  	Mapping between Go and platform assembly:
   200  	         Go assembly      |    platform assembly   |               semantics
   201  	VMOVQ Vj.B[index], Vd.B16 | vreplvei.b vd, vj, ui4 | for i in range(16): VR[vd].b[i] = VR[vj].b[ui4]
   202  	VMOVQ Vj.H[index], Vd.H8  | vreplvei.h vd, vj, ui3 | for i in range(8) : VR[vd].h[i] = VR[vj].h[ui3]
   203  	VMOVQ Vj.W[index], Vd.W4  | vreplvei.w vd, vj, ui2 | for i in range(4) : VR[vd].w[i] = VR[vj].w[ui2]
   204  	VMOVQ Vj.V[index], Vd.V2  | vreplvei.d vd, vj, ui1 | for i in range(2) : VR[vd].d[i] = VR[vj].d[ui1]
   205  
   206  3.7 Load data from memory and broadcast to each element of a vector register.
   207  
   208  	Instruction format:
   209  	        VMOVQ    offset(Rj), <Vd>.<T>
   210  
   211  	Mapping between Go and platform assembly:
   212  	   Go assembly              |     platform assembly      |                                semantics
   213  	-------------------------------------------------------------------------------------------------------------------------------------------------------
   214  	 VMOVQ  offset(Rj), Vd.B16  |   vldrepl.b  Vd, Rj, si12  |  for i in range(16): VR[vd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12))
   215  	 VMOVQ  offset(Rj), Vd.H8   |   vldrepl.h  Vd, Rj, si11  |  for i in range(8) : VR[vd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1))
   216  	 VMOVQ  offset(Rj), Vd.W4   |   vldrepl.w  Vd, Rj, si10  |  for i in range(4) : VR[vd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
   217  	 VMOVQ  offset(Rj), Vd.V2   |   vldrepl.d  Vd, Rj, si9   |  for i in range(2) : VR[vd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
   218  	XVMOVQ  offset(Rj), Xd.B32  |  xvldrepl.b  Xd, Rj, si12  |  for i in range(32): XR[xd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12))
   219  	XVMOVQ  offset(Rj), Xd.H16  |  xvldrepl.h  Xd, Rj, si11  |  for i in range(16): XR[xd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1))
   220  	XVMOVQ  offset(Rj), Xd.W8   |  xvldrepl.w  Xd, Rj, si10  |  for i in range(8) : XR[xd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
   221  	XVMOVQ  offset(Rj), Xd.V4   |  xvldrepl.d  Xd, Rj, si9   |  for i in range(4) : XR[xd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
   222  
   223  # Special instruction encoding definition and description on LoongArch
   224  
   225   1. DBAR hint encoding for LA664(Loongson 3A6000) and later micro-architectures, paraphrased
   226      from the Linux kernel implementation: https://git.kernel.org/torvalds/c/e031a5f3f1ed
   227  
   228      - Bit4: ordering or completion (0: completion, 1: ordering)
   229      - Bit3: barrier for previous read (0: true, 1: false)
   230      - Bit2: barrier for previous write (0: true, 1: false)
   231      - Bit1: barrier for succeeding read (0: true, 1: false)
   232      - Bit0: barrier for succeeding write (0: true, 1: false)
   233      - Hint 0x700: barrier for "read after read" from the same address
   234  
   235      Traditionally, on micro-architectures that do not support dbar grading such as LA464
   236      (Loongson 3A5000, 3C5000) all variants are treated as “dbar 0” (full barrier).
   237  
   238  2. Notes on using atomic operation instructions
   239  
   240    - AM*_DB.W[U]/V[U] instructions such as AMSWAPDBW not only complete the corresponding
   241      atomic operation sequence, but also implement the complete full data barrier function.
   242  
   243    - When using the AM*_.W[U]/D[U] instruction, registers rd and rj cannot be the same,
   244      otherwise an exception is triggered, and rd and rk cannot be the same, otherwise
   245      the execution result is uncertain.
   246  
   247  3. Prefetch instructions
   248      Instruction format:
   249        PRELD	offset(Rbase), $hint
   250        PRELDX	offset(Rbase), $n, $hint
   251  
   252      Mapping between Go and platform assembly:
   253                 Go assembly            |    platform assembly
   254        PRELD  offset(Rbase), $hint     | preld hint, Rbase, offset
   255        PRELDX offset(Rbase), $n, $hint | move rk, $x; preldx hint, Rbase, rk
   256  
   257        note: $x is the value after $n and offset are reassembled
   258  
   259      Definition of hint value:
   260        0: load to L1
   261        2: load to L3
   262        8: store to L1
   263  
   264        The meaning of the remaining values is not defined yet; the processor executes them as NOP
   265  
   266      Definition of $n in the PRELDX instruction:
   267        bit[0]: address sequence, 0 indicating ascending and 1 indicating descending
   268        bits[11:1]:  block size, the value range is [16, 1024], and it must be an integer multiple of 16
   269        bits[20:12]: block num, the value range is [1, 256]
   270        bits[36:21]: stride, the value range is [0, 0xffff]
   271  
   272  4. ShiftAdd instructions
   273      Mapping between Go and platform assembly:
   274                  Go assembly            |    platform assembly
   275       ALSL.W/WU/V $Imm, Rj, Rk, Rd      |    alsl.w/wu/d rd, rj, rk, $imm
   276  
   277      Instruction encoding format is as follows:
   278  
   279  	| 31 ~ 17 | 16 ~ 15 | 14 ~ 10 | 9 ~ 5 | 4 ~ 0 |
   280  	|  opcode |   sa2   |   rk    |   rj  |   rd  |
   281  
   282      The alsl.w/wu/d series of instructions shift the data in rj left by sa2+1, add the value
   283      in rk, and write the result to rd.
   284  
   285      To allow programmers to directly write the desired shift amount in assembly code, we actually write
   286      the value of sa2+1 in the assembly code and then include the value of sa2 in the instruction encoding.
   287  
   288      For example:
   289  
   290              Go assembly      | instruction Encoding
   291          ALSLV $4, R4, R5, R6 |      002d9486
   292  */
   293  
   294  package loong64
   295
View as plain text