author     Bill Traynor <wmat@riscv.org>   2023-08-01 13:15:58 -0400
committer  Bill Traynor <wmat@riscv.org>   2023-08-01 13:15:58 -0400
commit     bebbad41087bbfb713c15db173cc96daf2bd1a81 (patch)
tree       1f3305bac68564041004111f1d34e893e0c9642f /src/example
parent     9cd24d514bc52cb3863b8a547cd1ed40d07727ea (diff)
Setting up the inclusion of Vector.
Added Vector and all supporting files.
Diffstat (limited to 'src/example')
-rw-r--r--   src/example/memcpy.s        17
-rw-r--r--   src/example/saxpy.s         29
-rw-r--r--   src/example/sgemm.S        221
-rw-r--r--   src/example/strcmp.s        34
-rw-r--r--   src/example/strcpy.s        20
-rw-r--r--   src/example/strlen.s        22
-rw-r--r--   src/example/strncpy.s       36
-rw-r--r--   src/example/vvaddint32.s    22
8 files changed, 401 insertions, 0 deletions
diff --git a/src/example/memcpy.s b/src/example/memcpy.s
new file mode 100644
index 0000000..5f6318a
--- /dev/null
+++ b/src/example/memcpy.s
@@ -0,0 +1,17 @@
+ .text
+ .balign 4
+ .global memcpy
+ # void *memcpy(void* dest, const void* src, size_t n)
+ # a0=dest, a1=src, a2=n
+ #
+ memcpy:
+ mv a3, a0 # Copy destination
+ loop:
+ vsetvli t0, a2, e8, m8, ta, ma # Vectors of 8b
+ vle8.v v0, (a1) # Load bytes
+ add a1, a1, t0 # Bump pointer
+ sub a2, a2, t0 # Decrement count
+ vse8.v v0, (a3) # Store bytes
+ add a3, a3, t0 # Bump pointer
+ bnez a2, loop # Any more?
+ ret # Return
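
For reference when reading the hunk above: a scalar C model of the same strip-mined structure, where the CHUNK constant stands in for the vl that vsetvli returns in t0 each pass (CHUNK and the memcpy_model name are illustrative, not part of this commit).

    #include <stddef.h>

    #define CHUNK 64   /* illustrative stand-in for the hardware-chosen vl */

    void *memcpy_model(void *dest, const void *src, size_t n)
    {
        unsigned char *d = dest;                /* a3: running dest pointer */
        const unsigned char *s = src;           /* a1: running src pointer  */
        while (n) {
            size_t vl = n < CHUNK ? n : CHUNK;  /* vsetvli t0, a2, e8, m8   */
            for (size_t i = 0; i < vl; i++)     /* vle8.v / vse8.v          */
                d[i] = s[i];
            s += vl; d += vl; n -= vl;          /* bump pointers, decrement count */
        }
        return dest;                            /* a0 still holds the original dest */
    }
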
diff --git a/src/example/saxpy.s b/src/example/saxpy.s
new file mode 100644
index 0000000..de7f224
--- /dev/null
+++ b/src/example/saxpy.s
@@ -0,0 +1,29 @@
+ .text
+ .balign 4
+ .global saxpy
+# void
+# saxpy(size_t n, const float a, const float *x, float *y)
+# {
+# size_t i;
+# for (i=0; i<n; i++)
+# y[i] = a * x[i] + y[i];
+# }
+#
+# register arguments:
+# a0 n
+# fa0 a
+# a1 x
+# a2 y
+
+saxpy:
+ vsetvli a4, a0, e32, m8, ta, ma # Set vl for 32-bit floats, LMUL=8
+ vle32.v v0, (a1) # Load vl elements of x
+ sub a0, a0, a4 # Decrement element count
+ slli a4, a4, 2 # Convert vl to a byte offset
+ add a1, a1, a4 # Bump x pointer
+ vle32.v v8, (a2) # Load vl elements of y
+ vfmacc.vf v8, fa0, v0 # y[i] += a * x[i]
+ vse32.v v8, (a2) # Store updated y
+ add a2, a2, a4 # Bump y pointer
+ bnez a0, saxpy # Any more?
+ ret
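
Assuming the file above is assembled for an RV64 target with the V extension and run on such hardware or an emulator like QEMU, it can be called from C through an ordinary prototype; a minimal test sketch with illustrative values:

    #include <stddef.h>
    #include <stdio.h>

    /* Prototype matching the register comments in saxpy.s above. */
    void saxpy(size_t n, const float a, const float *x, float *y);

    int main(void)
    {
        float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float y[4] = {10.0f, 20.0f, 30.0f, 40.0f};
        saxpy(4, 2.0f, x, y);            /* y[i] = 2*x[i] + y[i] */
        for (int i = 0; i < 4; i++)
            printf("%g\n", y[i]);        /* expect 12 24 36 48 */
        return 0;
    }
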
diff --git a/src/example/sgemm.S b/src/example/sgemm.S
new file mode 100644
index 0000000..e29cc8d
--- /dev/null
+++ b/src/example/sgemm.S
@@ -0,0 +1,221 @@
+ .text
+ .balign 4
+ .global sgemm_nn
+# RV64IDV system
+#
+# void
+# sgemm_nn(size_t n,
+# size_t m,
+# size_t k,
+# const float*a, // m * k matrix
+# size_t lda,
+# const float*b, // k * n matrix
+# size_t ldb,
+# float*c, // m * n matrix
+# size_t ldc)
+#
+# c += a*b (alpha=1, no transpose on input matrices)
+# matrices stored in C row-major order
+
+#define n a0
+#define m a1
+#define k a2
+#define ap a3
+#define astride a4
+#define bp a5
+#define bstride a6
+#define cp a7
+#define cstride t0
+#define kt t1
+#define nt t2
+#define bnp t3
+#define cnp t4
+#define akp t5
+#define bkp s0
+#define nvl s1
+#define ccp s2
+#define amp s3
+
+# Use args as additional temporaries
+#define ft12 fa0
+#define ft13 fa1
+#define ft14 fa2
+#define ft15 fa3
+
+# This version holds a 16*VLMAX block of C matrix in vector registers
+# in the inner loop, but otherwise does not perform cache or TLB tiling.
+
+sgemm_nn:
+ addi sp, sp, -FRAMESIZE
+ sd s0, OFFSET(sp)
+ sd s1, OFFSET(sp)
+ sd s2, OFFSET(sp)
+
+ # Check for zero size matrices
+ beqz n, exit
+ beqz m, exit
+ beqz k, exit
+
+ # Convert elements strides to byte strides.
+ ld cstride, OFFSET(sp) # Get arg from stack frame
+ slli astride, astride, 2
+ slli bstride, bstride, 2
+ slli cstride, cstride, 2
+
+ slti t6, m, 16
+ bnez t6, end_rows
+
+c_row_loop: # Loop across rows of C blocks
+
+ mv nt, n # Initialize n counter for next row of C blocks
+
+ mv bnp, bp # Initialize B n-loop pointer to start
+ mv cnp, cp # Initialize C n-loop pointer
+
+c_col_loop: # Loop across one row of C blocks
+ vsetvli nvl, nt, e32, ta, ma # 32-bit vectors, LMUL=1
+
+ mv akp, ap # reset pointer into A to beginning
+ mv bkp, bnp # step to next column in B matrix
+
+ # Initialize current C submatrix block from memory.
+ vle32.v v0, (cnp); add ccp, cnp, cstride;
+ vle32.v v1, (ccp); add ccp, ccp, cstride;
+ vle32.v v2, (ccp); add ccp, ccp, cstride;
+ vle32.v v3, (ccp); add ccp, ccp, cstride;
+ vle32.v v4, (ccp); add ccp, ccp, cstride;
+ vle32.v v5, (ccp); add ccp, ccp, cstride;
+ vle32.v v6, (ccp); add ccp, ccp, cstride;
+ vle32.v v7, (ccp); add ccp, ccp, cstride;
+ vle32.v v8, (ccp); add ccp, ccp, cstride;
+ vle32.v v9, (ccp); add ccp, ccp, cstride;
+ vle32.v v10, (ccp); add ccp, ccp, cstride;
+ vle32.v v11, (ccp); add ccp, ccp, cstride;
+ vle32.v v12, (ccp); add ccp, ccp, cstride;
+ vle32.v v13, (ccp); add ccp, ccp, cstride;
+ vle32.v v14, (ccp); add ccp, ccp, cstride;
+ vle32.v v15, (ccp)
+
+
+ mv kt, k # Initialize inner loop counter
+
+ # Inner loop scheduled assuming 4-clock occupancy of vfmacc instruction and single-issue pipeline
+ # Software pipeline loads
+ flw ft0, (akp); add amp, akp, astride;
+ flw ft1, (amp); add amp, amp, astride;
+ flw ft2, (amp); add amp, amp, astride;
+ flw ft3, (amp); add amp, amp, astride;
+ # Get vector from B matrix
+ vle32.v v16, (bkp)
+
+ # Loop on inner dimension for current C block
+ k_loop:
+ vfmacc.vf v0, ft0, v16
+ add bkp, bkp, bstride
+ flw ft4, (amp)
+ add amp, amp, astride
+ vfmacc.vf v1, ft1, v16
+ addi kt, kt, -1 # Decrement k counter
+ flw ft5, (amp)
+ add amp, amp, astride
+ vfmacc.vf v2, ft2, v16
+ flw ft6, (amp)
+ add amp, amp, astride
+ flw ft7, (amp)
+ vfmacc.vf v3, ft3, v16
+ add amp, amp, astride
+ flw ft8, (amp)
+ add amp, amp, astride
+ vfmacc.vf v4, ft4, v16
+ flw ft9, (amp)
+ add amp, amp, astride
+ vfmacc.vf v5, ft5, v16
+ flw ft10, (amp)
+ add amp, amp, astride
+ vfmacc.vf v6, ft6, v16
+ flw ft11, (amp)
+ add amp, amp, astride
+ vfmacc.vf v7, ft7, v16
+ flw ft12, (amp)
+ add amp, amp, astride
+ vfmacc.vf v8, ft8, v16
+ flw ft13, (amp)
+ add amp, amp, astride
+ vfmacc.vf v9, ft9, v16
+ flw ft14, (amp)
+ add amp, amp, astride
+ vfmacc.vf v10, ft10, v16
+ flw ft15, (amp)
+ add amp, amp, astride
+ addi akp, akp, 4 # Move to next column of a
+ vfmacc.vf v11, ft11, v16
+ beqz kt, 1f # Don't load past end of matrix
+ flw ft0, (akp)
+ add amp, akp, astride
+1: vfmacc.vf v12, ft12, v16
+ beqz kt, 1f
+ flw ft1, (amp)
+ add amp, amp, astride
+1: vfmacc.vf v13, ft13, v16
+ beqz kt, 1f
+ flw ft2, (amp)
+ add amp, amp, astride
+1: vfmacc.vf v14, ft14, v16
+ beqz kt, 1f # Exit out of loop
+ flw ft3, (amp)
+ add amp, amp, astride
+ vfmacc.vf v15, ft15, v16
+ vle32.v v16, (bkp) # Get next vector from B matrix, overlap loads with jump stalls
+ j k_loop
+
+1: vfmacc.vf v15, ft15, v16
+
+ # Save C matrix block back to memory
+ vse32.v v0, (cnp); add ccp, cnp, cstride;
+ vse32.v v1, (ccp); add ccp, ccp, cstride;
+ vse32.v v2, (ccp); add ccp, ccp, cstride;
+ vse32.v v3, (ccp); add ccp, ccp, cstride;
+ vse32.v v4, (ccp); add ccp, ccp, cstride;
+ vse32.v v5, (ccp); add ccp, ccp, cstride;
+ vse32.v v6, (ccp); add ccp, ccp, cstride;
+ vse32.v v7, (ccp); add ccp, ccp, cstride;
+ vse32.v v8, (ccp); add ccp, ccp, cstride;
+ vse32.v v9, (ccp); add ccp, ccp, cstride;
+ vse32.v v10, (ccp); add ccp, ccp, cstride;
+ vse32.v v11, (ccp); add ccp, ccp, cstride;
+ vse32.v v12, (ccp); add ccp, ccp, cstride;
+ vse32.v v13, (ccp); add ccp, ccp, cstride;
+ vse32.v v14, (ccp); add ccp, ccp, cstride;
+ vse32.v v15, (ccp)
+
+ # Following tail instructions should be scheduled earlier in free slots during C block save.
+ # Leaving here for clarity.
+
+ # Bump pointers for loop across blocks in one row
+ slli t6, nvl, 2
+ add cnp, cnp, t6 # Move C block pointer over
+ add bnp, bnp, t6 # Move B block pointer over
+ sub nt, nt, nvl # Decrement element count in n dimension
+ bnez nt, c_col_loop # Any more to do?
+
+ # Move to next set of rows
+ addi m, m, -16 # Did 16 rows above
+ slli t6, astride, 4 # Multiply astride by 16
+ add ap, ap, t6 # Move A matrix pointer down 16 rows
+ slli t6, cstride, 4 # Multiply cstride by 16
+ add cp, cp, t6 # Move C matrix pointer down 16 rows
+
+ slti t6, m, 16
+ beqz t6, c_row_loop
+
+ # Handle end of matrix with fewer than 16 rows.
+ # Can use smaller versions of above decreasing in powers-of-2 depending on code-size concerns.
+end_rows:
+ # Not done.
+
+exit:
+ ld s0, OFFSET(sp)
+ ld s1, OFFSET(sp)
+ ld s2, OFFSET(sp)
+ addi sp, sp, FRAMESIZE
+ ret
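
The argument order above (n, m, k, then pointer/stride pairs) makes ldc the ninth argument, which is why the code reloads cstride from the caller's stack frame. A plain-C reference of the computation the routine performs (sgemm_nn_ref is an illustrative name, and the naive loop order below omits the register blocking used above):

    #include <stddef.h>

    /* c += a*b, alpha = 1, no transpose; all matrices row-major,
     * strides given in elements (converted to bytes in the assembly). */
    void sgemm_nn_ref(size_t n, size_t m, size_t k,
                      const float *a, size_t lda,   /* a: m x k */
                      const float *b, size_t ldb,   /* b: k x n */
                      float *c, size_t ldc)         /* c: m x n */
    {
        for (size_t i = 0; i < m; i++)
            for (size_t j = 0; j < n; j++) {
                float acc = c[i * ldc + j];
                for (size_t p = 0; p < k; p++)
                    acc += a[i * lda + p] * b[p * ldb + j];
                c[i * ldc + j] = acc;
            }
    }
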
diff --git a/src/example/strcmp.s b/src/example/strcmp.s
new file mode 100644
index 0000000..c657703
--- /dev/null
+++ b/src/example/strcmp.s
@@ -0,0 +1,34 @@
+ .text
+ .balign 4
+ .global strcmp
+ # int strcmp(const char *src1, const char* src2)
+strcmp:
+ ## Using LMUL=2, but same register names work for larger LMULs
+ li t1, 0 # Initial pointer bump
+loop:
+ vsetvli t0, x0, e8, m2, ta, ma # Max length vectors of bytes
+ add a0, a0, t1 # Bump src1 pointer
+ vle8ff.v v8, (a0) # Get src1 bytes
+ add a1, a1, t1 # Bump src2 pointer
+ vle8ff.v v16, (a1) # Get src2 bytes
+
+ vmseq.vi v0, v8, 0 # Flag zero bytes in src1
+ vmsne.vv v1, v8, v16 # Flag if src1 != src2
+ vmor.mm v0, v0, v1 # Combine exit conditions
+
+ vfirst.m a2, v0 # Find first zero byte or mismatch
+ csrr t1, vl # Get number of bytes fetched
+
+ bltz a2, loop # Loop if all same and no zero byte
+
+ add a0, a0, a2 # Get src1 element address
+ lbu a3, (a0) # Get src1 byte from memory
+
+ add a1, a1, a2 # Get src2 element address
+ lbu a4, (a1) # Get src2 byte from memory
+
+ sub a0, a3, a4 # Return value.
+
+ ret
+
+
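
A byte-at-a-time C model of the exit logic above: each vector pass checks, across vl bytes, for the first position where src1 holds a NUL or the two strings differ, using fault-only-first loads (vle8ff.v) so a full vector can be attempted without faulting past the end of either string. In the scalar sketch below, CHUNK and strcmp_model are illustrative names only.

    #include <stddef.h>

    enum { CHUNK = 32 };   /* illustrative stand-in for vl */

    int strcmp_model(const char *src1, const char *src2)
    {
        for (;;) {
            for (size_t i = 0; i < CHUNK; i++) {     /* vmseq/vmsne + vfirst */
                unsigned char c1 = (unsigned char)src1[i];
                unsigned char c2 = (unsigned char)src2[i];
                if (c1 == 0 || c1 != c2)             /* combined exit condition */
                    return (int)c1 - (int)c2;
            }
            src1 += CHUNK;                           /* bump both pointers by vl */
            src2 += CHUNK;
        }
    }
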
diff --git a/src/example/strcpy.s b/src/example/strcpy.s
new file mode 100644
index 0000000..109112d
--- /dev/null
+++ b/src/example/strcpy.s
@@ -0,0 +1,20 @@
+ .text
+ .balign 4
+ .global strcpy
+ # char* strcpy(char *dst, const char* src)
+strcpy:
+ mv a2, a0 # Copy dst
+ li t0, -1 # Infinite AVL
+loop:
+ vsetvli x0, t0, e8, m8, ta, ma # Max length vectors of bytes
+ vle8ff.v v8, (a1) # Get src bytes
+ csrr t1, vl # Get number of bytes fetched
+ vmseq.vi v1, v8, 0 # Flag zero bytes
+ vfirst.m a3, v1 # Zero found?
+ add a1, a1, t1 # Bump pointer
+ vmsif.m v0, v1 # Set mask up to and including zero byte.
+ vse8.v v8, (a2), v0.t # Write out bytes
+ add a2, a2, t1 # Bump pointer
+ bltz a3, loop # Zero byte not found, so loop
+
+ ret
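
The key step above is vmsif.m, which turns the zero-byte flag into a store mask covering every byte up to and including the NUL. A scalar sketch of that behavior (CHUNK and strcpy_model are illustrative, not part of this commit):

    #include <stddef.h>

    enum { CHUNK = 32 };   /* illustrative stand-in for vl */

    char *strcpy_model(char *dst, const char *src)
    {
        char *d = dst;                                   /* a2: running dst pointer */
        for (;;) {
            size_t nul = CHUNK;                          /* vfirst.m; CHUNK means not found */
            for (size_t i = 0; i < CHUNK; i++)
                if (src[i] == '\0') { nul = i; break; }
            size_t keep = (nul == CHUNK) ? CHUNK : nul + 1;  /* vmsif.m mask length */
            for (size_t i = 0; i < keep; i++)            /* masked vse8.v */
                d[i] = src[i];
            if (nul != CHUNK)
                return dst;                              /* a0: original dst */
            src += CHUNK; d += CHUNK;                    /* bump by vl */
        }
    }
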
diff --git a/src/example/strlen.s b/src/example/strlen.s
new file mode 100644
index 0000000..1c3af4b
--- /dev/null
+++ b/src/example/strlen.s
@@ -0,0 +1,22 @@
+ .text
+ .balign 4
+ .global strlen
+# size_t strlen(const char *str)
+# a0 holds *str
+
+strlen:
+ mv a3, a0 # Save start
+loop:
+ vsetvli a1, x0, e8, m8, ta, ma # Vector of bytes of maximum length
+ vle8ff.v v8, (a3) # Load bytes
+ csrr a1, vl # Get bytes read
+ vmseq.vi v0, v8, 0 # Set v0[i] where v8[i] = 0
+ vfirst.m a2, v0 # Find first set bit
+ add a3, a3, a1 # Bump pointer
+ bltz a2, loop # Not found?
+
+ add a0, a0, a1 # Sum start + bump
+ add a3, a3, a2 # Add index
+ sub a0, a3, a0 # Subtract start address+bump
+
+ ret
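
The final address arithmetic above just turns "bytes skipped plus index within the last chunk" into a length; in scalar C the same computation looks like this (CHUNK and strlen_model are illustrative names):

    #include <stddef.h>

    enum { CHUNK = 32 };   /* illustrative stand-in for vl */

    size_t strlen_model(const char *str)
    {
        const char *p = str;                      /* a3: running pointer */
        for (;;) {
            for (size_t i = 0; i < CHUNK; i++)    /* vmseq.vi + vfirst.m */
                if (p[i] == '\0')
                    return (size_t)(p - str) + i; /* bytes skipped + index */
            p += CHUNK;                           /* bump by vl */
        }
    }
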
diff --git a/src/example/strncpy.s b/src/example/strncpy.s
new file mode 100644
index 0000000..87e5410
--- /dev/null
+++ b/src/example/strncpy.s
@@ -0,0 +1,36 @@
+ .text
+ .balign 4
+ .global strncpy
+ # char* strncpy(char *dst, const char* src, size_t n)
+strncpy:
+ mv a3, a0 # Copy dst
+loop:
+ vsetvli x0, a2, e8, m8, ta, ma # Vectors of bytes.
+ vle8ff.v v8, (a1) # Get src bytes
+ vmseq.vi v1, v8, 0 # Flag zero bytes
+ csrr t1, vl # Get number of bytes fetched
+ vfirst.m a4, v1 # Zero found?
+ vmsbf.m v0, v1 # Set mask up to before zero byte.
+ vse8.v v8, (a3), v0.t # Write out non-zero bytes
+ bgez a4, zero_tail # Zero remaining bytes.
+ sub a2, a2, t1 # Decrement count.
+ add a3, a3, t1 # Bump dest pointer
+ add a1, a1, t1 # Bump src pointer
+ bnez a2, loop # Any more?
+
+ ret
+
+zero_tail:
+ sub a2, a2, a4 # Subtract count on non-zero bytes.
+ add a3, a3, a4 # Advance past non-zero bytes.
+ vsetvli t1, a2, e8, m8, ta, ma # Vectors of bytes.
+ vmv.v.i v0, 0 # Splat zero.
+
+zero_loop:
+ vse8.v v0, (a3) # Store zero.
+ sub a2, a2, t1 # Decrement count.
+ add a3, a3, t1 # Bump pointer
+ vsetvli t1, a2, e8, m8, ta, ma # Vectors of bytes.
+ bnez a2, zero_loop # Any more?
+
+ ret
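
strncpy.s above has two phases: a masked copy loop that stops at the NUL or when n runs out, and a zero_tail loop that pads the remainder with zeros, as strncpy requires. A scalar C model of both phases (CHUNK and strncpy_model are illustrative):

    #include <stddef.h>

    enum { CHUNK = 32 };   /* illustrative stand-in for vl */

    char *strncpy_model(char *dst, const char *src, size_t n)
    {
        char *d = dst;
        while (n) {
            size_t vl = n < CHUNK ? n : CHUNK;        /* vsetvli x0, a2, e8, m8 */
            size_t nul = vl;                          /* vfirst.m; vl means not found */
            for (size_t i = 0; i < vl; i++)
                if (src[i] == '\0') { nul = i; break; }
            for (size_t i = 0; i < nul; i++)          /* vmsbf.m-masked vse8.v */
                d[i] = src[i];
            if (nul != vl) {                          /* zero_tail:          */
                for (size_t i = nul; i < n; i++)      /* splat + store zeros */
                    d[i] = '\0';
                return dst;
            }
            src += vl; d += vl; n -= vl;              /* bump pointers, decrement count */
        }
        return dst;
    }
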
diff --git a/src/example/vvaddint32.s b/src/example/vvaddint32.s
new file mode 100644
index 0000000..22305d9
--- /dev/null
+++ b/src/example/vvaddint32.s
@@ -0,0 +1,22 @@
+ .text
+ .balign 4
+ .global vvaddint32
+ # vector-vector add routine of 32-bit integers
+ # void vvaddint32(size_t n, const int*x, const int*y, int*z)
+ # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
+ #
+ # a0 = n, a1 = x, a2 = y, a3 = z
+ # Non-vector instructions are indented
+vvaddint32:
+ vsetvli t0, a0, e32, ta, ma # Set vector length based on 32-bit vectors
+ vle32.v v0, (a1) # Get first vector
+   sub a0, a0, t0 # Decrement number done
+   slli t0, t0, 2 # Multiply number done by 4 bytes
+   add a1, a1, t0 # Bump pointer
+ vle32.v v1, (a2) # Get second vector
+   add a2, a2, t0 # Bump pointer
+ vadd.vv v2, v0, v1 # Sum vectors
+ vse32.v v2, (a3) # Store result
+   add a3, a3, t0 # Bump pointer
+   bnez a0, vvaddint32 # Loop back
+   ret # Finished
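
As with saxpy, the routine above can be driven from C once assembled for a V-enabled target; a minimal test sketch with illustrative values:

    #include <stddef.h>
    #include <stdio.h>

    /* Prototype matching the register comments in vvaddint32.s above. */
    void vvaddint32(size_t n, const int *x, const int *y, int *z);

    int main(void)
    {
        int x[5] = {1, 2, 3, 4, 5};
        int y[5] = {10, 20, 30, 40, 50};
        int z[5];
        vvaddint32(5, x, y, z);
        for (int i = 0; i < 5; i++)
            printf("%d\n", z[i]);        /* expect 11 22 33 44 55 */
        return 0;
    }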