author     Bill Traynor <wmat@riscv.org>  2023-08-01 13:15:58 -0400
committer  Bill Traynor <wmat@riscv.org>  2023-08-01 13:15:58 -0400
commit     bebbad41087bbfb713c15db173cc96daf2bd1a81 (patch)
tree       1f3305bac68564041004111f1d34e893e0c9642f /src/example
parent     9cd24d514bc52cb3863b8a547cd1ed40d07727ea (diff)
Setting up the inclusion of Vector.
Added Vector and all supporting files.
Diffstat (limited to 'src/example')
-rw-r--r--  src/example/memcpy.s        17
-rw-r--r--  src/example/saxpy.s         29
-rw-r--r--  src/example/sgemm.S        221
-rw-r--r--  src/example/strcmp.s        34
-rw-r--r--  src/example/strcpy.s        20
-rw-r--r--  src/example/strlen.s        22
-rw-r--r--  src/example/strncpy.s       36
-rw-r--r--  src/example/vvaddint32.s    22
8 files changed, 401 insertions, 0 deletions
diff --git a/src/example/memcpy.s b/src/example/memcpy.s
new file mode 100644
index 0000000..5f6318a
--- /dev/null
+++ b/src/example/memcpy.s
@@ -0,0 +1,17 @@
+    .text
+    .balign 4
+    .global memcpy
+    # void *memcpy(void* dest, const void* src, size_t n)
+    # a0=dest, a1=src, a2=n
+    #
+memcpy:
+    mv a3, a0                       # Copy destination
+loop:
+    vsetvli t0, a2, e8, m8, ta, ma  # Vectors of 8-bit elements
+    vle8.v v0, (a1)                 # Load bytes
+    add a1, a1, t0                  # Bump pointer
+    sub a2, a2, t0                  # Decrement count
+    vse8.v v0, (a3)                 # Store bytes
+    add a3, a3, t0                  # Bump pointer
+    bnez a2, loop                   # Any more?
+    ret                             # Return
diff --git a/src/example/saxpy.s b/src/example/saxpy.s
new file mode 100644
index 0000000..de7f224
--- /dev/null
+++ b/src/example/saxpy.s
@@ -0,0 +1,29 @@
+    .text
+    .balign 4
+    .global saxpy
+# void
+# saxpy(size_t n, const float a, const float *x, float *y)
+# {
+#   size_t i;
+#   for (i=0; i<n; i++)
+#     y[i] = a * x[i] + y[i];
+# }
+#
+# register arguments:
+#     a0      n
+#     fa0     a
+#     a1      x
+#     a2      y
+
+saxpy:
+    vsetvli a4, a0, e32, m8, ta, ma
+    vle32.v v0, (a1)
+    sub a0, a0, a4
+    slli a4, a4, 2
+    add a1, a1, a4
+    vle32.v v8, (a2)
+    vfmacc.vf v8, fa0, v0
+    vse32.v v8, (a2)
+    add a2, a2, a4
+    bnez a0, saxpy
+    ret
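The header comments above give C prototypes for both routines, so they can be checked from C against scalar references. The harness below is a hypothetical sketch, not part of this commit: it assumes the .s files are assembled and linked with a vector-enabled RV64 toolchain (e.g. -march=rv64gcv) and that this memcpy preempts the libc version at link time. The odd buffer length exercises the tail strip of the stripmined loops, and the saxpy check uses a small tolerance because vfmacc.vf fuses the multiply-add while the scalar reference may round the multiply separately.

// Hypothetical harness (not part of this commit) for the two routines above.
#include <stdio.h>
#include <string.h>
#include <math.h>

void *memcpy(void *dest, const void *src, size_t n);   // vector routine above
void saxpy(size_t n, const float a, const float *x, float *y);

int main(void) {
    enum { N = 1027 };                 // odd length exercises the stripmine tail
    unsigned char src[N], dst[N];
    float x[N], y[N], yref[N];

    for (size_t i = 0; i < N; i++) {
        src[i] = (unsigned char)(i * 7);
        x[i] = 0.5f * (float)i;
        yref[i] = y[i] = (float)(N - i);
    }

    memcpy(dst, src, N);               // assumed to resolve to the vector memcpy
    for (size_t i = 0; i < N; i++)
        if (dst[i] != src[i]) { printf("memcpy mismatch at %zu\n", i); return 1; }

    saxpy(N, 2.0f, x, y);              // y = 2*x + y
    for (size_t i = 0; i < N; i++)
        if (fabsf(y[i] - (2.0f * x[i] + yref[i])) > 1e-5f) {
            printf("saxpy mismatch at %zu\n", i); return 1;
        }

    printf("memcpy and saxpy OK\n");
    return 0;
}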
diff --git a/src/example/sgemm.S b/src/example/sgemm.S
new file mode 100644
index 0000000..e29cc8d
--- /dev/null
+++ b/src/example/sgemm.S
@@ -0,0 +1,221 @@
+    .text
+    .balign 4
+    .global sgemm_nn
+# RV64IDV system
+#
+# void
+# sgemm_nn(size_t n,
+#          size_t m,
+#          size_t k,
+#          const float*a,   // m * k matrix
+#          size_t lda,
+#          const float*b,   // k * n matrix
+#          size_t ldb,
+#          float*c,         // m * n matrix
+#          size_t ldc)
+#
+#  c += a*b (alpha=1, no transpose on input matrices)
+#  matrices stored in C row-major order
+
+#define n a0
+#define m a1
+#define k a2
+#define ap a3
+#define astride a4
+#define bp a5
+#define bstride a6
+#define cp a7
+#define cstride t0
+#define kt t1
+#define nt t2
+#define bnp t3
+#define cnp t4
+#define akp t5
+#define bkp s0
+#define nvl s1
+#define ccp s2
+#define amp s3
+
+# Use args as additional temporaries
+#define ft12 fa0
+#define ft13 fa1
+#define ft14 fa2
+#define ft15 fa3
+
+# Callee-saved register spill area (the original left FRAMESIZE and the
+# offsets symbolic; concrete values are filled in here)
+#define FRAMESIZE 32
+
+# This version holds a 16*VLMAX block of the C matrix in vector registers
+# in the inner loop, but otherwise does no cache or TLB tiling.
+
+sgemm_nn:
+    addi sp, sp, -FRAMESIZE
+    sd s0, 0(sp)
+    sd s1, 8(sp)
+    sd s2, 16(sp)
+    sd s3, 24(sp)
+
+    # Check for zero-size matrices
+    beqz n, exit
+    beqz m, exit
+    beqz k, exit
+
+    # Convert element strides to byte strides.
+    ld cstride, FRAMESIZE(sp)  # Get ldc, the ninth argument, from the caller's stack frame
+    slli astride, astride, 2
+    slli bstride, bstride, 2
+    slli cstride, cstride, 2
+
+    slti t6, m, 16
+    bnez t6, end_rows
+
+c_row_loop:                    # Loop across rows of C blocks
+
+    mv nt, n                   # Initialize n counter for next row of C blocks
+
+    mv bnp, bp                 # Initialize B n-loop pointer to start
+    mv cnp, cp                 # Initialize C n-loop pointer
+
+c_col_loop:                    # Loop across one row of C blocks
+    vsetvli nvl, nt, e32, m1, ta, ma  # 32-bit vectors, LMUL=1
+
+    mv akp, ap                 # Reset pointer into A to beginning
+    mv bkp, bnp                # Step to next column in B matrix
+
+    # Initialize current C submatrix block from memory.
+    vle32.v v0, (cnp); add ccp, cnp, cstride;
+    vle32.v v1, (ccp); add ccp, ccp, cstride;
+    vle32.v v2, (ccp); add ccp, ccp, cstride;
+    vle32.v v3, (ccp); add ccp, ccp, cstride;
+    vle32.v v4, (ccp); add ccp, ccp, cstride;
+    vle32.v v5, (ccp); add ccp, ccp, cstride;
+    vle32.v v6, (ccp); add ccp, ccp, cstride;
+    vle32.v v7, (ccp); add ccp, ccp, cstride;
+    vle32.v v8, (ccp); add ccp, ccp, cstride;
+    vle32.v v9, (ccp); add ccp, ccp, cstride;
+    vle32.v v10, (ccp); add ccp, ccp, cstride;
+    vle32.v v11, (ccp); add ccp, ccp, cstride;
+    vle32.v v12, (ccp); add ccp, ccp, cstride;
+    vle32.v v13, (ccp); add ccp, ccp, cstride;
+    vle32.v v14, (ccp); add ccp, ccp, cstride;
+    vle32.v v15, (ccp)
+
+    mv kt, k                   # Initialize inner loop counter
+
+    # Inner loop scheduled assuming 4-clock occupancy of the vfmacc
+    # instruction and a single-issue pipeline.  Software-pipeline the loads.
+    flw ft0, (akp); add amp, akp, astride;
+    flw ft1, (amp); add amp, amp, astride;
+    flw ft2, (amp); add amp, amp, astride;
+    flw ft3, (amp); add amp, amp, astride;
+    # Get vector from B matrix
+    vle32.v v16, (bkp)
+
+    # Loop on inner dimension for current C block
+k_loop:
+    vfmacc.vf v0, ft0, v16
+    add bkp, bkp, bstride
+    flw ft4, (amp)
+    add amp, amp, astride
+    vfmacc.vf v1, ft1, v16
+    addi kt, kt, -1            # Decrement k counter
+    flw ft5, (amp)
+    add amp, amp, astride
+    vfmacc.vf v2, ft2, v16
+    flw ft6, (amp)
+    add amp, amp, astride
+    flw ft7, (amp)
+    vfmacc.vf v3, ft3, v16
+    add amp, amp, astride
+    flw ft8, (amp)
+    add amp, amp, astride
+    vfmacc.vf v4, ft4, v16
+    flw ft9, (amp)
+    add amp, amp, astride
+    vfmacc.vf v5, ft5, v16
+    flw ft10, (amp)
+    add amp, amp, astride
+    vfmacc.vf v6, ft6, v16
+    flw ft11, (amp)
+    add amp, amp, astride
+    vfmacc.vf v7, ft7, v16
+    flw ft12, (amp)
+    add amp, amp, astride
+    vfmacc.vf v8, ft8, v16
+    flw ft13, (amp)
+    add amp, amp, astride
+    vfmacc.vf v9, ft9, v16
+    flw ft14, (amp)
+    add amp, amp, astride
+    vfmacc.vf v10, ft10, v16
+    flw ft15, (amp)
+    add amp, amp, astride
+    addi akp, akp, 4           # Move to next column of A
+    vfmacc.vf v11, ft11, v16
+    beqz kt, 1f                # Don't load past end of matrix
+    flw ft0, (akp)
+    add amp, akp, astride
+1:  vfmacc.vf v12, ft12, v16
+    beqz kt, 1f
+    flw ft1, (amp)
+    add amp, amp, astride
+1:  vfmacc.vf v13, ft13, v16
+    beqz kt, 1f
+    flw ft2, (amp)
+    add amp, amp, astride
+1:  vfmacc.vf v14, ft14, v16
+    beqz kt, 1f                # Exit out of loop
+    flw ft3, (amp)
+    add amp, amp, astride
+    vfmacc.vf v15, ft15, v16
+    vle32.v v16, (bkp)         # Get next vector from B matrix; overlap load with jump stalls
+    j k_loop
+
+1:  vfmacc.vf v15, ft15, v16
+
+    # Save C matrix block back to memory
+    vse32.v v0, (cnp); add ccp, cnp, cstride;
+    vse32.v v1, (ccp); add ccp, ccp, cstride;
+    vse32.v v2, (ccp); add ccp, ccp, cstride;
+    vse32.v v3, (ccp); add ccp, ccp, cstride;
+    vse32.v v4, (ccp); add ccp, ccp, cstride;
+    vse32.v v5, (ccp); add ccp, ccp, cstride;
+    vse32.v v6, (ccp); add ccp, ccp, cstride;
+    vse32.v v7, (ccp); add ccp, ccp, cstride;
+    vse32.v v8, (ccp); add ccp, ccp, cstride;
+    vse32.v v9, (ccp); add ccp, ccp, cstride;
+    vse32.v v10, (ccp); add ccp, ccp, cstride;
+    vse32.v v11, (ccp); add ccp, ccp, cstride;
+    vse32.v v12, (ccp); add ccp, ccp, cstride;
+    vse32.v v13, (ccp); add ccp, ccp, cstride;
+    vse32.v v14, (ccp); add ccp, ccp, cstride;
+    vse32.v v15, (ccp)
+
+    # The following tail instructions could be scheduled into free slots
+    # during the C block save; left here for clarity.
+
+    # Bump pointers for loop across blocks in one row
+    slli t6, nvl, 2
+    add cnp, cnp, t6           # Move C block pointer over
+    add bnp, bnp, t6           # Move B block pointer over
+    sub nt, nt, nvl            # Decrement element count in n dimension
+    bnez nt, c_col_loop        # Any more to do?
+
+    # Move to next set of rows
+    addi m, m, -16             # Did 16 rows above
+    slli t6, astride, 4        # Multiply astride by 16
+    add ap, ap, t6             # Move A matrix pointer down 16 rows
+    slli t6, cstride, 4        # Multiply cstride by 16
+    add cp, cp, t6             # Move C matrix pointer down 16 rows
+
+    slti t6, m, 16
+    beqz t6, c_row_loop
+
+    # Handle the end of the matrix with fewer than 16 rows.
+    # Smaller versions of the loop above, decreasing in powers of 2,
+    # could be used here depending on code-size concerns.
+end_rows:
+    # Not implemented in this example.
+
+exit:
+    ld s0, 0(sp)
+    ld s1, 8(sp)
+    ld s2, 16(sp)
+    ld s3, 24(sp)
+    addi sp, sp, FRAMESIZE
+    ret
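Because sgemm_nn above is heavily software-pipelined, a plain scalar reference is handy for validation. The sketch below is hypothetical and not part of this commit; it implements the same contract described in the header comment (row-major c += a*b with element strides lda/ldb/ldc). Since the assembly leaves the fewer-than-16-rows tail unimplemented, comparisons should use m values that are multiples of 16.

// Hypothetical scalar reference (not part of this commit) for checking sgemm_nn.
#include <stddef.h>

void sgemm_nn_ref(size_t n, size_t m, size_t k,
                  const float *a, size_t lda,
                  const float *b, size_t ldb,
                  float *c, size_t ldc)
{
    for (size_t i = 0; i < m; i++)          // rows of c
        for (size_t j = 0; j < n; j++) {    // columns of c
            float acc = c[i * ldc + j];     // c += a*b, alpha = 1
            for (size_t kk = 0; kk < k; kk++)
                acc += a[i * lda + kk] * b[kk * ldb + j];
            c[i * ldc + j] = acc;
        }
}

Results may differ from the vector version in the last bits because vfmacc.vf fuses each multiply-add, so an elementwise tolerance is more appropriate than exact comparison.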
diff --git a/src/example/strcmp.s b/src/example/strcmp.s
new file mode 100644
index 0000000..c657703
--- /dev/null
+++ b/src/example/strcmp.s
@@ -0,0 +1,34 @@
+    .text
+    .balign 4
+    .global strcmp
+    # int strcmp(const char *src1, const char* src2)
+strcmp:
+    ## Using LMUL=2, but the same register names work for larger LMULs
+    li t1, 0                        # Initial pointer bump
+loop:
+    vsetvli t0, x0, e8, m2, ta, ma  # Max-length vectors of bytes
+    add a0, a0, t1                  # Bump src1 pointer
+    vle8ff.v v8, (a0)               # Get src1 bytes
+    add a1, a1, t1                  # Bump src2 pointer
+    vle8ff.v v16, (a1)              # Get src2 bytes
+
+    vmseq.vi v0, v8, 0              # Flag zero bytes in src1
+    vmsne.vv v1, v8, v16            # Flag if src1 != src2
+    vmor.mm v0, v0, v1              # Combine exit conditions
+
+    vfirst.m a2, v0                 # Find first zero byte or mismatch
+    csrr t1, vl                     # Get number of bytes fetched
+
+    bltz a2, loop                   # Loop if all same and no zero byte
+
+    add a0, a0, a2                  # Get src1 element address
+    lbu a3, (a0)                    # Get src1 byte from memory
+
+    add a1, a1, a2                  # Get src2 element address
+    lbu a4, (a1)                    # Get src2 byte from memory
+
+    sub a0, a3, a4                  # Return value
+
+    ret
diff --git a/src/example/strcpy.s b/src/example/strcpy.s
new file mode 100644
index 0000000..109112d
--- /dev/null
+++ b/src/example/strcpy.s
@@ -0,0 +1,20 @@
+    .text
+    .balign 4
+    .global strcpy
+    # char* strcpy(char *dst, const char* src)
+strcpy:
+    mv a2, a0                       # Copy dst
+    li t0, -1                       # Infinite AVL
+loop:
+    vsetvli x0, t0, e8, m8, ta, ma  # Max-length vectors of bytes
+    vle8ff.v v8, (a1)               # Get src bytes
+    csrr t1, vl                     # Get number of bytes fetched
+    vmseq.vi v1, v8, 0              # Flag zero bytes
+    vfirst.m a3, v1                 # Zero found?
+    add a1, a1, t1                  # Bump pointer
+    vmsif.m v0, v1                  # Set mask up to and including zero byte
+    vse8.v v8, (a2), v0.t           # Write out bytes
+    add a2, a2, t1                  # Bump pointer
+    bltz a3, loop                   # Zero byte not found, so loop
+
+    ret
diff --git a/src/example/strlen.s b/src/example/strlen.s
new file mode 100644
index 0000000..1c3af4b
--- /dev/null
+++ b/src/example/strlen.s
@@ -0,0 +1,22 @@
+    .text
+    .balign 4
+    .global strlen
+# size_t strlen(const char *str)
+# a0 holds *str
+
+strlen:
+    mv a3, a0                       # Save start
+loop:
+    vsetvli a1, x0, e8, m8, ta, ma  # Vector of bytes of maximum length
+    vle8ff.v v8, (a3)               # Load bytes
+    csrr a1, vl                     # Get bytes actually read
+    vmseq.vi v0, v8, 0              # Set v0[i] where v8[i] = 0
+    vfirst.m a2, v0                 # Find first set bit
+    add a3, a3, a1                  # Bump pointer
+    bltz a2, loop                   # Loop if no zero byte found yet
+
+    add a0, a0, a1                  # Start address + last bump
+    add a3, a3, a2                  # Add index of zero byte
+    sub a0, a3, a0                  # Length = end - (start + last bump)
+
+    ret
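All three routines use the standard libc names and signatures, so once the objects are linked in they can be smoke-tested directly from C. The sketch below is hypothetical and not part of this commit, with the same RV64+V toolchain assumption as above; expected results are noted in the comments.

// Hypothetical smoke test (not part of this commit) for strcmp/strcpy/strlen above.
#include <stdio.h>
#include <stddef.h>

int strcmp(const char *src1, const char *src2);
char *strcpy(char *dst, const char *src);
size_t strlen(const char *str);

int main(void) {
    char buf[16];

    // "vector" vs "vectors": src1 ends first, so the result is
    // '\0' - 's' (negative); equal strings compare as 0.
    printf("%d %d\n", strcmp("vector", "vectors"), strcmp("vector", "vector"));

    strcpy(buf, "vector");             // the vmsif mask copies the trailing NUL too
    printf("%s\n", buf);               // expect: vector

    printf("%zu\n", strlen("vector")); // expect: 6
    return 0;
}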
diff --git a/src/example/strncpy.s b/src/example/strncpy.s
new file mode 100644
index 0000000..87e5410
--- /dev/null
+++ b/src/example/strncpy.s
@@ -0,0 +1,36 @@
+    .text
+    .balign 4
+    .global strncpy
+    # char* strncpy(char *dst, const char* src, size_t n)
+strncpy:
+    mv a3, a0                       # Copy dst
+loop:
+    vsetvli x0, a2, e8, m8, ta, ma  # Vectors of bytes
+    vle8ff.v v8, (a1)               # Get src bytes
+    vmseq.vi v1, v8, 0              # Flag zero bytes
+    csrr t1, vl                     # Get number of bytes fetched
+    vfirst.m a4, v1                 # Zero found?
+    vmsbf.m v0, v1                  # Set mask up to before zero byte
+    vse8.v v8, (a3), v0.t           # Write out non-zero bytes
+    bgez a4, zero_tail              # Zero found; zero out remaining bytes
+    sub a2, a2, t1                  # Decrement count
+    add a3, a3, t1                  # Bump dest pointer
+    add a1, a1, t1                  # Bump src pointer
+    bnez a2, loop                   # Any more?
+
+    ret
+
+zero_tail:
+    sub a2, a2, a4                  # Subtract count of non-zero bytes
+    add a3, a3, a4                  # Advance past non-zero bytes
+    vsetvli t1, a2, e8, m8, ta, ma  # Vectors of bytes
+    vmv.v.i v0, 0                   # Splat zero
+
+zero_loop:
+    vse8.v v0, (a3)                 # Store zeros
+    sub a2, a2, t1                  # Decrement count
+    add a3, a3, t1                  # Bump pointer
+    vsetvli t1, a2, e8, m8, ta, ma  # Vectors of bytes
+    bnez a2, zero_loop              # Any more?
+
+    ret
diff --git a/src/example/vvaddint32.s b/src/example/vvaddint32.s
new file mode 100644
index 0000000..22305d9
--- /dev/null
+++ b/src/example/vvaddint32.s
@@ -0,0 +1,22 @@
+    .text
+    .balign 4
+    .global vvaddint32
+    # vector-vector add routine of 32-bit integers
+    # void vvaddint32(size_t n, const int*x, const int*y, int*z)
+    # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
+    #
+    # a0 = n, a1 = x, a2 = y, a3 = z
+    # Non-vector instructions are indented
+vvaddint32:
+    vsetvli t0, a0, e32, m1, ta, ma # Set vector length based on 32-bit vectors
+    vle32.v v0, (a1)                # Get first vector
+      sub a0, a0, t0                # Decrement number done
+      slli t0, t0, 2                # Multiply number done by 4 bytes
+      add a1, a1, t0                # Bump pointer
+    vle32.v v1, (a2)                # Get second vector
+      add a2, a2, t0                # Bump pointer
+    vadd.vv v2, v0, v1              # Sum vectors
+    vse32.v v2, (a3)                # Store result
+      add a3, a3, t0                # Bump pointer
+      bnez a0, vvaddint32           # Loop back
+      ret                           # Finished
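The same approach works for the last two files. The hypothetical harness below (not part of this commit) checks the strncpy zero-fill behavior implemented by the zero_tail path above, and verifies vvaddint32 against the C loop given in its header comment.

// Hypothetical harness (not part of this commit) for strncpy and vvaddint32 above.
#include <stdio.h>
#include <stddef.h>

char *strncpy(char *dst, const char *src, size_t n);
void vvaddint32(size_t n, const int *x, const int *y, int *z);

int main(void) {
    // strncpy zero-fills everything from the source's NUL to dst[n-1]
    // (the zero_tail path above).
    char buf[8];
    strncpy(buf, "abc", sizeof buf);
    for (size_t i = 3; i < sizeof buf; i++)
        if (buf[i] != 0) { printf("tail not zeroed at %zu\n", i); return 1; }

    enum { N = 100 };
    int x[N], y[N], z[N];
    for (size_t i = 0; i < N; i++) { x[i] = (int)i; y[i] = 2 * (int)i; }

    vvaddint32(N, x, y, z);            // z[i] = x[i] + y[i]
    for (size_t i = 0; i < N; i++)
        if (z[i] != 3 * (int)i) { printf("vvadd mismatch at %zu\n", i); return 1; }

    printf("strncpy and vvaddint32 OK\n");
    return 0;
}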