aboutsummaryrefslogtreecommitdiff
path: root/src/rv32.tex
blob: 11cb0635624a1f74c06b01a666c3c02405f20763 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
\chapter{RV32I Base Integer Instruction Set, Version 2.0}
\label{rv32}

This chapter describes version 2.0 of the RV32I base integer
instruction set.  Much of the commentary also applies to the RV64I
variant.

\begin{commentary}
RV32I was designed to be sufficient to form a compiler target and to
support modern operating system environments.  The ISA was also
designed to reduce the hardware required in a minimal implementation.
RV32I contains 47 unique instructions, though a simple implementation
might cover the eight ECALL/EBREAK/CSRR* instructions with a single
SYSTEM hardware instruction that always traps and might be able to
implement the FENCE and FENCE.I instructions as NOPs, reducing
hardware instruction count to 38 total.  RV32I can emulate almost any
other ISA extension (except the A extension, which requires additional
hardware support for atomicity).

Subsets of the base integer ISA might be useful for pedagogical
purposes, but the base has been defined such that there should be
little incentive to subset a real hardware implementation beyond
omitting support for misaligned memory accesses and treating all SYSTEM
instructions as a single trap.
\end{commentary}

\section{Programmers' Model for Base Integer Subset}

Figure~\ref{gprs} shows the user-visible state for the base integer
subset.  There are 31 general-purpose registers {\tt x1}--{\tt x31},
which hold integer values.  Register {\tt x0} is hardwired to the
constant 0.  There is no hardwired subroutine return address link
register, but the standard software calling convention uses register
{\tt x1} to hold the return address on a call.  For RV32, the {\tt x}
registers are 32 bits wide, and for RV64, they are 64 bits wide.  This
document uses the term XLEN to refer to the current width of an {\tt
  x} register in bits (either 32 or 64).

There is one additional user-visible register: the program counter {\tt pc}
holds the address of the current instruction.

\begin{commentary}
The number of available architectural registers can have large impacts
on code size, performance, and energy consumption.  Although 16
registers would arguably be sufficient for an integer ISA running
compiled code, it is impossible to encode a complete ISA with 16
registers in 16-bit instructions using a 3-address format.  Although a
2-address format would be possible, it would increase instruction
count and lower efficiency.  We wanted to avoid intermediate
instruction sizes (such as Xtensa's 24-bit instructions) to simplify
base hardware implementations, and once a 32-bit instruction size was
adopted, it was straightforward to support 32 integer registers.  A
larger number of integer registers also helps performance on
high-performance code, where there can be extensive use of loop
unrolling, software pipelining, and cache tiling.

For these reasons, we chose a conventional size of 32 integer
registers for the base ISA.  Dynamic register usage tends to be
dominated by a few frequently accessed registers, and regfile
implementations can be optimized to reduce access energy for the
frequently accessed registers~\cite{jtseng:sbbci}.  The optional
compressed 16-bit instruction format mostly only accesses 8 registers
and hence can provide a dense instruction encoding, while additional
instruction-set extensions could support a much larger register space
(either flat or hierarchical) if desired.

For resource-constrained embedded applications, we have defined the
RV32E subset, which only has 16 registers (Chapter~\ref{rv32e}).
\end{commentary}

\begin{figure}[H]
{\footnotesize
\begin{center}
\begin{tabular}{p{2in}}
\instbitrange{XLEN-1}{0}                                  \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ \ \ \ x0 / zero}}      \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ \ x1\ \ \ \ \ }}            \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ \ x2\ \ \ \ \ }}       \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ \ x3\ \ \ \ \ }}       \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ \ x4\ \ \ \ \ }}       \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ \ x5\ \ \ \ \ }}       \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ \ x6\ \ \ \ \ }}       \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ \ x7\ \ \ \ \ }}       \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ \ x8\ \ \ \ \ }}       \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ \ x9\ \ \ \ \ }}       \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x10\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x11\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x12\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x13\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x14\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x15\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x16\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x17\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x18\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x19\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x20\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x21\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x22\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x23\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x24\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x25\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x26\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x27\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x28\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x29\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x30\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{\ \ \ x31\ \ \ \ \ }}        \\ \cline{1-1}
\multicolumn{1}{c}{XLEN}                                  \\

\instbitrange{XLEN-1}{0}                                  \\ \cline{1-1}
\multicolumn{1}{|c|}{\reglabel{pc}}                         \\ \cline{1-1}
\multicolumn{1}{c}{XLEN}                                  \\
\end{tabular}
\end{center}
}
\caption{RISC-V user-level base integer register state.}
\label{gprs}
\end{figure}

\newpage

\section{Base Instruction Formats}

In the base ISA, there are four core instruction formats (R/I/S/U), as
shown in Figure~\ref{fig:baseinstformats}.  All are a fixed 32 bits in
length and must be aligned on a four-byte boundary in memory.  An
instruction-address-misaligned exception is generated on a taken
branch or unconditional jump if the target address is not four-byte
aligned.  No instruction-address-misaligned exception is generated for
a conditional branch that is not taken.

\begin{commentary}
The alignment constraint for base ISA instructions is relaxed to a
two-byte boundary when instruction extensions with 16-bit lengths or
other odd multiples of 16-bit lengths are added (i.e., IALIGN=16).
\end{commentary}

\vspace{-0.2in}
\begin{figure}[h]
\begin{center}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{p{1.2in}@{}p{0.8in}@{}p{0.8in}@{}p{0.6in}@{}p{0.8in}@{}p{1in}l}
\\
\instbitrange{31}{25} &
\instbitrange{24}{20} &
\instbitrange{19}{15} &
\instbitrange{14}{12} &
\instbitrange{11}{7} &
\instbitrange{6}{0} \\
\cline{1-6}
\multicolumn{1}{|c|}{funct7} &
\multicolumn{1}{c|}{rs2} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{rd} &
\multicolumn{1}{c|}{opcode} &
R-type \\
\cline{1-6}
\\
\cline{1-6}
\multicolumn{2}{|c|}{imm[11:0]} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{rd} &
\multicolumn{1}{c|}{opcode} &
I-type \\
\cline{1-6}
\\
\cline{1-6}
\multicolumn{1}{|c|}{imm[11:5]} &
\multicolumn{1}{c|}{rs2} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{imm[4:0]} &
\multicolumn{1}{c|}{opcode} &
S-type \\
\cline{1-6}
\\
\cline{1-6}
\multicolumn{4}{|c|}{imm[31:12]} &
\multicolumn{1}{c|}{rd} &
\multicolumn{1}{c|}{opcode} &
U-type \\
\cline{1-6}
\end{tabular}
\end{center}
\caption{RISC-V base instruction formats.  Each immediate subfield is
  labeled with the bit position (imm[{\em x}\,]) in the immediate
  value being produced, rather than the bit position within the
  instruction's immediate field as is usually done.  }
\label{fig:baseinstformats}
\end{figure}

The RISC-V ISA keeps the source ({\em rs1} and {\em rs2}) and
destination ({\em rd}) registers at the same position in all formats
to simplify decoding.  Except for the 5-bit immediates used in CSR
instructions (Section~\ref{sec:csrinsts}), immediates are always
sign-extended, and are generally packed towards the leftmost available
bits in the instruction and have been allocated to reduce hardware
complexity.  In particular, the sign bit for all immediates is always
in bit 31 of the instruction to speed sign-extension circuitry.

\begin{commentary}
Decoding register specifiers is usually on the critical paths in
implementations, and so the instruction format was chosen to keep all
register specifiers at the same position in all formats at the expense
of having to move immediate bits across formats (a property shared
with RISC-IV aka. SPUR~\cite{spur-jsscc1989}).

In practice, most immediates are either small or require all XLEN
bits.  We chose an asymmetric immediate split (12 bits in regular
instructions plus a special load upper immediate instruction with 20
bits) to increase the opcode space available for regular instructions.

Immediates are sign-extended because we did not observe a benefit to
using zero-extension for some immediates as in the MIPS ISA and wanted
to keep the ISA as simple as possible.
\end{commentary}

\section{Immediate Encoding Variants}

There are a further two variants of the instruction formats (B/J)
based on the handling of immediates, as shown in
Figure~\ref{fig:baseinstformatsimm}.

The only difference between the S and B formats is that the 12-bit
immediate field is used to encode branch offsets in multiples of 2 in
the B format.  Instead of shifting all bits in the
instruction-encoded immediate left by one in hardware as is
conventionally done, the middle bits (imm[10:1]) and sign bit stay in
fixed positions, while the lowest bit in S format (inst[7]) encodes a
high-order bit in B format.

Similarly, the only difference between the U and J formats is
that the 20-bit immediate is shifted left by 12 bits to form U
immediates and by 1 bit to form J immediates.  The location of
instruction bits in the U and J format immediates is chosen to
maximize overlap with the other formats and with each other.

\begin{figure}[h]
\begin{small}
\begin{center}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{p{0.3in}@{}p{0.8in}@{}p{0.6in}@{}p{0.18in}@{}p{0.7in}@{}p{0.6in}@{}p{0.6in}@{}p{0.3in}@{}p{0.5in}l}
\\
\multicolumn{1}{c}{\instbit{31}} &
\instbitrange{30}{25} &
\instbitrange{24}{21} &
\multicolumn{1}{c}{\instbit{20}} &
\instbitrange{19}{15} &
\instbitrange{14}{12} &
\instbitrange{11}{8} &
\multicolumn{1}{c}{\instbit{7}} &
\instbitrange{6}{0} \\
\cline{1-9}
\multicolumn{2}{|c|}{funct7} &
\multicolumn{2}{c|}{rs2} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{2}{c|}{rd} &
\multicolumn{1}{c|}{opcode} &
R-type \\
\cline{1-9}
\\
\cline{1-9}
\multicolumn{4}{|c|}{imm[11:0]} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{2}{c|}{rd} &
\multicolumn{1}{c|}{opcode} &
I-type \\
\cline{1-9}
\\
\cline{1-9}
\multicolumn{2}{|c|}{imm[11:5]} &
\multicolumn{2}{c|}{rs2} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{2}{c|}{imm[4:0]} &
\multicolumn{1}{c|}{opcode} &
S-type \\
\cline{1-9}
\\
\cline{1-9}
\multicolumn{1}{|c|}{imm[12]} &
\multicolumn{1}{c|}{imm[10:5]} &
\multicolumn{2}{c|}{rs2} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{imm[4:1]} &
\multicolumn{1}{c|}{imm[11]} &
\multicolumn{1}{c|}{opcode} &
B-type \\
\cline{1-9}
\\
\cline{1-9}
\multicolumn{6}{|c|}{imm[31:12]} &
\multicolumn{2}{c|}{rd} &
\multicolumn{1}{c|}{opcode} &
U-type \\
\cline{1-9}
\\
\cline{1-9}
\multicolumn{1}{|c|}{imm[20]} &
\multicolumn{2}{c|}{imm[10:1]} &
\multicolumn{1}{c|}{imm[11]} &
\multicolumn{2}{c|}{imm[19:12]} &
\multicolumn{2}{c|}{rd} &
\multicolumn{1}{c|}{opcode} &
J-type \\
\cline{1-9}
\end{tabular}
\end{center}
\end{small}
\caption{RISC-V base instruction formats showing immediate variants.}
\label{fig:baseinstformatsimm}
\end{figure}

Figure~\ref{fig:immtypes} shows the immediates produced by each of the
base instruction formats, and is labeled to show which instruction
bit (inst[{\em y}\,]) produces each bit of the immediate value.

\begin{figure}[h]
\begin{center}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{p{0.2in}@{}p{1.2in}@{}p{1.0in}@{}p{0.2in}@{}p{0.7in}@{}p{0.7in}@{}p{0.2in}l}
\\
\multicolumn{1}{c}{\instbit{31}} &
\instbitrange{30}{20} &
\instbitrange{19}{12} &
\multicolumn{1}{c}{\instbit{11}} &
\instbitrange{10}{5} &
\instbitrange{4}{1} &
\multicolumn{1}{c}{\instbit{0}} &
\\
\cline{1-7}
\multicolumn{4}{|c|}{--- inst[31] ---} &
\multicolumn{1}{c|}{inst[30:25]} &
\multicolumn{1}{c|}{inst[24:21]} &
\multicolumn{1}{c|}{inst[20]} &
I-immediate \\
\cline{1-7}
\\
\cline{1-7}
\multicolumn{4}{|c|}{--- inst[31] ---} &
\multicolumn{1}{c|}{inst[30:25]} &
\multicolumn{1}{c|}{inst[11:8]} &
\multicolumn{1}{c|}{inst[7]} &
S-immediate \\
\cline{1-7}
\\
\cline{1-7}
\multicolumn{3}{|c|}{--- inst[31] ---} &
\multicolumn{1}{c|}{inst[7]} &
\multicolumn{1}{c|}{inst[30:25]} &
\multicolumn{1}{c|}{inst[11:8]} &
\multicolumn{1}{c|}{0} &
B-immediate \\
\cline{1-7}
\\
\cline{1-7}
\multicolumn{1}{|c|}{inst[31]} &
\multicolumn{1}{c|}{inst[30:20]} &
\multicolumn{1}{c|}{inst[19:12]} &
\multicolumn{4}{c|}{--- 0 ---} &
U-immediate \\
\cline{1-7}
\\
\cline{1-7}
\multicolumn{2}{|c|}{--- inst[31] ---} &
\multicolumn{1}{c|}{inst[19:12]} &
\multicolumn{1}{c|}{inst[20]} &
\multicolumn{1}{c|}{inst[30:25]} &
\multicolumn{1}{c|}{inst[24:21]} &
\multicolumn{1}{c|}{0} &
J-immediate \\
\cline{1-7}
\end{tabular}
\end{center}
\caption{Types of immediate produced by RISC-V instructions.  The fields are labeled with the
  instruction bits used to construct their value.  Sign extension
  always uses inst[31].}
\label{fig:immtypes}
\end{figure}

\begin{commentary}
Sign-extension is one of the most critical operations on immediates
(particularly in RV64I), and in RISC-V the sign bit for all immediates
is always held in bit 31 of the instruction to allow sign-extension to
proceed in parallel with instruction decoding.

Although more complex implementations might have separate adders for
branch and jump calculations and so would not benefit from keeping the
location of immediate bits constant across types of instruction, we
wanted to reduce the hardware cost of the simplest implementations.
By rotating bits in the instruction encoding of B and J immediates
instead of using dynamic hardware muxes to multiply the immediate by
2, we reduce instruction signal fanout and immediate mux costs by
around a factor of 2.  The scrambled immediate encoding will add
negligible time to static or ahead-of-time compilation.  For dynamic
generation of instructions, there is some small additional
overhead, but the most common short forward branches have
straightforward immediate encodings.
\end{commentary}

\section{Integer Computational Instructions}

Most integer computational instructions operate on XLEN bits of values
held in the integer register file.  Integer computational instructions
are either encoded as register-immediate operations using the I-type
format or as register-register operations using the R-type format.
The destination is register {\em rd} for both register-immediate and
register-register instructions.  No integer computational instructions
cause arithmetic exceptions.

\begin{commentary}
We did not include special instruction-set support for overflow checks
on integer arithmetic operations in the base instruction set, as many
overflow checks can be cheaply implemented using RISC-V branches.
Overflow checking for unsigned addition requires only a single
additional branch instruction after the addition:
\verb! add t0, t1, t2; bltu t0, t1, overflow!.

For signed addition, if one operand's sign is known, overflow checking
requires only a single branch after the addition:
\verb! addi t0, t1, +imm; blt t0, t1, overflow!.  This covers the
common case of addition with an immediate operand.

For general signed addition, three additional instructions after the
addition are required, leveraging the observation that the sum should
be less than one of the operands if and only if the other operand is
negative.
\begin{verbatim}
         add t0, t1, t2
         slti t3, t2, 0
         slt t4, t0, t1
         bne t3, t4, overflow
\end{verbatim}
In RV64, checks of 32-bit signed additions can be optimized further by
comparing the results of ADD and ADDW on the operands.
\end{commentary}

\subsubsection*{Integer Register-Immediate Instructions}
\vspace{-0.4in}
\begin{center}
\begin{tabular}{M@{}R@{}S@{}R@{}O}
\\
\instbitrange{31}{20} &
\instbitrange{19}{15} &
\instbitrange{14}{12} &
\instbitrange{11}{7} &
\instbitrange{6}{0} \\
\hline
\multicolumn{1}{|c|}{imm[11:0]} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{rd} &
\multicolumn{1}{c|}{opcode} \\
\hline
12 & 5 & 3 & 5 & 7 \\
I-immediate[11:0] & src & ADDI/SLTI[U]  & dest & OP-IMM \\
I-immediate[11:0] & src & ANDI/ORI/XORI & dest & OP-IMM \\
\end{tabular}
\end{center}
ADDI adds the sign-extended 12-bit immediate to register {\em rs1}.
Arithmetic overflow is ignored and the result is simply the low
XLEN bits of the result.  ADDI {\em rd, rs1, 0} is used to implement the
MV {\em rd, rs1} assembler pseudoinstruction.

SLTI (set less than immediate) places the value 1 in register {\em rd}
if register {\em rs1} is less than the sign-extended immediate when
both are treated as signed numbers, else 0 is written to {\em rd}.
SLTIU is similar but compares the values as unsigned numbers (i.e.,
the immediate is first sign-extended to XLEN bits then treated as an
unsigned number).  Note, SLTIU {\em rd, rs1, 1} sets {\em rd}
to 1 if {\em rs1} equals zero, otherwise sets {\em rd} to 0 (assembler
pseudoinstruction SEQZ {\em rd, rs}).

ANDI, ORI, XORI are logical operations that perform bitwise AND, OR,
and XOR on register {\em rs1} and the sign-extended 12-bit immediate
and place the result in {\em rd}.  Note, XORI {\em rd, rs1, -1}
performs a bitwise logical inversion of register {\em rs1} (assembler
pseudoinstruction NOT {\em rd, rs}).

\vspace{-0.2in}
\begin{center}
\begin{tabular}{S@{}R@{}R@{}S@{}R@{}O}
\\
\instbitrange{31}{25} &
\instbitrange{24}{20} &
\instbitrange{19}{15} &
\instbitrange{14}{12} &
\instbitrange{11}{7} &
\instbitrange{6}{0} \\
\hline
\multicolumn{1}{|c|}{imm[11:5]} &
\multicolumn{1}{c|}{imm[4:0]} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{rd} &
\multicolumn{1}{c|}{opcode} \\
\hline
7 & 5 & 5 & 3 & 5 & 7 \\
0000000 & shamt[4:0]  & src & SLLI & dest & OP-IMM \\
0000000 & shamt[4:0]  & src & SRLI & dest & OP-IMM \\
0100000 & shamt[4:0]  & src & SRAI & dest & OP-IMM \\
\end{tabular}
\end{center}

Shifts by a constant are encoded as a specialization of the
I-type format.  The operand to be shifted is in {\em rs1}, and the
shift amount is encoded in the lower 5 bits of the I-immediate field.
The right shift type is encoded in bit 30.
SLLI is a logical left shift (zeros are shifted into the lower bits);
SRLI is a logical right shift (zeros are shifted into the upper bits);
and SRAI is an arithmetic right shift (the original sign bit is copied
into the vacated upper bits).

\vspace{-0.2in}
\begin{center}
\begin{tabular}{U@{}R@{}O}
\\
\instbitrange{31}{12} &
\instbitrange{11}{7} &
\instbitrange{6}{0} \\
\hline
\multicolumn{1}{|c|}{imm[31:12]} &
\multicolumn{1}{c|}{rd} &
\multicolumn{1}{c|}{opcode} \\
\hline
20 & 5 & 7 \\
U-immediate[31:12] & dest & LUI \\
U-immediate[31:12] & dest & AUIPC
\end{tabular}
\end{center}

LUI (load upper immediate) is used to build 32-bit constants and uses
the U-type format.  LUI places the U-immediate value in the top 20
bits of the destination register {\em rd}, filling in the lowest 12
bits with zeros.

AUIPC (add upper immediate to {\tt pc}) is used to build {\tt pc}-relative
addresses and uses the U-type format.  AUIPC forms a 32-bit offset from the
20-bit U-immediate, filling in the lowest 12 bits with zeros, adds this offset
to the {\tt pc} of the AUIPC instruction, then places the result in register {\em rd}.

\begin{commentary}
The AUIPC instruction supports two-instruction sequences to access
arbitrary offsets from the PC for both control-flow transfers and data
accesses.  The combination of an AUIPC and the 12-bit immediate in a
JALR can transfer control to any 32-bit PC-relative address, while an
AUIPC plus the 12-bit immediate offset in regular load or store
instructions can access any 32-bit PC-relative data address.

The current PC can be obtained by setting the U-immediate to 0.
Although a JAL +4 instruction could also be used to obtain the local
PC (of the instruction following the JAL), it might cause pipeline
breaks in simpler microarchitectures or pollute BTB structures in more
complex microarchitectures.
\end{commentary}

\subsubsection*{Integer Register-Register Operations}

RV32I defines several arithmetic R-type operations.  All operations
read the {\em rs1} and {\em rs2} registers as source operands and
write the result into register {\em rd}.  The {\em funct7} and {\em
  funct3} fields select the type of operation.

\vspace{-0.2in}
\begin{center}
\begin{tabular}{S@{}R@{}R@{}S@{}R@{}O}
\\
\instbitrange{31}{25} &
\instbitrange{24}{20} &
\instbitrange{19}{15} &
\instbitrange{14}{12} &
\instbitrange{11}{7} &
\instbitrange{6}{0} \\
\hline
\multicolumn{1}{|c|}{funct7} &
\multicolumn{1}{c|}{rs2} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{rd} &
\multicolumn{1}{c|}{opcode} \\
\hline
7 & 5 & 5 & 3 & 5 & 7 \\
0000000 & src2 & src1 & ADD/SLT/SLTU & dest & OP    \\
0000000 & src2 & src1 & AND/OR/XOR  & dest & OP    \\
0000000 & src2 & src1 & SLL/SRL     & dest & OP    \\
0100000 & src2 & src1 & SUB/SRA     & dest & OP    \\
\end{tabular}
\end{center}

ADD performs the addition of {\em rs1} and {\em rs2}. SUB performs the
subtraction of {\em rs2} from {\em rs1}.  Overflows are ignored and the low XLEN
bits of results are written to the destination {\em rd}.
SLT and SLTU perform signed and unsigned compares
respectively, writing 1 to {\em rd} if $\mbox{\em rs1} < \mbox{\em
  rs2}$, 0 otherwise.  Note, SLTU {\em rd}, {\em x0}, {\em rs2} sets
{\em rd} to 1 if {\em rs2} is not equal to zero, otherwise sets {\em
  rd} to zero (assembler pseudoinstruction SNEZ {\em rd, rs}).  AND, OR, and
XOR perform bitwise logical operations.

SLL, SRL, and SRA perform logical left, logical right, and arithmetic
right shifts on the value in register {\em rs1} by the shift amount
held in the lower 5 bits of register {\em rs2}.

\subsubsection*{NOP Instruction}
\vspace{-0.4in}
\begin{center}
\begin{tabular}{M@{}R@{}S@{}R@{}O}
\\
\instbitrange{31}{20} &
\instbitrange{19}{15} &
\instbitrange{14}{12} &
\instbitrange{11}{7} &
\instbitrange{6}{0} \\
\hline
\multicolumn{1}{|c|}{imm[11:0]} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{rd} &
\multicolumn{1}{c|}{opcode} \\
\hline
12 & 5 & 3 & 5 & 7 \\
0 & 0 & ADDI & 0 & OP-IMM \\
\end{tabular}
\end{center}

The NOP instruction does not change any architecturally visible state, except for
advancing the {\tt pc} and incrementing any applicable performance
counters.  NOP is encoded as ADDI {\em x0, x0, 0}.

\begin{commentary}
NOPs can be used to align code segments to microarchitecturally
significant address boundaries, or to leave space for inline code
modifications.  Although there are many possible ways to encode a NOP,
we define a canonical NOP encoding to allow microarchitectural
optimizations as well as for more readable disassembly output.  The
other NOP encodings are made available for HINT instructions
(Section~\ref{sec:rv32i-hints}).

ADDI was chosen for the NOP encoding as this is most likely to take
fewest resources to execute across a range of systems (if not
optimized away in decode).  In particular, the instruction only reads
one register.  Also, an ADDI functional unit is more likely to be
available in a superscalar design as adds are the most common
operation.  In particular, address-generation functional units can
execute ADDI using same hardware needed for base+offset address
calculations, while register-register ADD or logical/shift operations
require additional hardware.
\end{commentary}

\section{Control Transfer Instructions}

RV32I provides two types of control transfer instructions:
unconditional jumps and conditional branches.  Control transfer
instructions in RV32I do {\em not} have architecturally visible delay
slots.

\subsubsection*{Unconditional Jumps}

\vspace{-0.1in} The jump and link (JAL) instruction uses the J-type
format, where the J-immediate encodes a signed offset in multiples of
2 bytes.  The offset is sign-extended and added to the {\tt pc}
to form the jump target address.  Jumps can therefore target a
$\pm$\wunits{1}{MiB} range. JAL stores the address of the instruction
following the jump ({\tt pc}+4) into register {\em rd}.  The standard
software calling convention uses {\tt x1} as the return address
register and {\tt x5} as an alternate link register.

\begin{commentary}
The alternate link register supports calling millicode routines (e.g.,
those to save and restore registers in compressed code) while
preserving the regular return address register.  The register {\tt x5}
was chosen as the alternate link register as it maps to a temporary in
the standard calling convention, and has an encoding that is only one
bit different than the regular link register.
\end{commentary}

Plain unconditional jumps (assembler pseudoinstruction J) are encoded as a JAL
with {\em rd}={\tt x0}.

\vspace{-0.2in}
\begin{center}
\begin{tabular}{W@{}E@{}W@{}R@{}R@{}O}
\\
\multicolumn{1}{c}{\instbit{31}} &
\instbitrange{30}{21} &
\multicolumn{1}{c}{\instbit{20}} &
\instbitrange{19}{12} &
\instbitrange{11}{7} &
\instbitrange{6}{0} \\
\hline
\multicolumn{1}{|c|}{imm[20]} &
\multicolumn{1}{c|}{imm[10:1]} &
\multicolumn{1}{c|}{imm[11]} &
\multicolumn{1}{c|}{imm[19:12]} &
\multicolumn{1}{c|}{rd} &
\multicolumn{1}{c|}{opcode} \\
\hline
1 & 10 & \multicolumn{1}{c}{1} & 8 & 5 & 7 \\
\multicolumn{4}{c}{offset[20:1]} & dest & JAL \\
\end{tabular}
\end{center}

The indirect jump instruction JALR (jump and link register) uses the
I-type encoding.  The target address is obtained by adding the sign-extended
12-bit I-immediate to the register {\em rs1}, then setting the
least-significant bit of the result to zero.  The address of
the instruction following the jump ({\tt pc}+4) is written to register
{\em rd}.  Register {\tt x0} can be used as the destination if the
result is not required.
\vspace{-0.4in}
\begin{center}
\begin{tabular}{M@{}R@{}F@{}R@{}O}
\\
\instbitrange{31}{20} &
\instbitrange{19}{15} &
\instbitrange{14}{12} &
\instbitrange{11}{7} &
\instbitrange{6}{0} \\
\hline
\multicolumn{1}{|c|}{imm[11:0]} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{rd} &
\multicolumn{1}{c|}{opcode} \\
\hline
12 & 5 & 3 & 5 & 7 \\
offset[11:0] & base & 0 & dest & JALR \\
\end{tabular}
\end{center}

\begin{commentary}
The unconditional jump instructions all use PC-relative addressing to
help support position-independent code.  The JALR instruction was
defined to enable a two-instruction sequence to jump anywhere in a
32-bit absolute address range.  A LUI instruction can first load {\em
  rs1} with the upper 20 bits of a target address, then JALR can add
in the lower bits. Similarly, AUIPC then JALR can jump
anywhere in a 32-bit {\tt pc}-relative address range.

Note that the JALR instruction does not treat the 12-bit immediate as
multiples of 2 bytes, unlike the conditional branch instructions.
This avoids one more immediate format in hardware.  In
practice, most uses of JALR will have either a zero immediate or be
paired with a LUI or AUIPC, so the slight reduction in range is not
significant.

Clearing the least-significant bit when calculating the JALR target
address both simplifies the hardware slightly and allows the
low bit of function pointers to be used to store auxiliary
information.  Although there is potentially a slight loss of error
checking in this case, in practice jumps to an incorrect instruction
address will usually quickly raise an exception.

When used with a base {\em rs1}$=${\tt x0}, JALR can be used to implement
a single instruction subroutine call to the lowest \wunits{2}{KiB} or highest
\wunits{2}{KiB} address region from anywhere in the address space, which could
be used to implement fast calls to a small runtime library.
\end{commentary}

The JAL and JALR instructions will generate an
instruction-address-misaligned exception if the target address is not
aligned to a four-byte boundary.

\begin{commentary}
Instruction-address-misaligned exceptions are not possible on machines
that support extensions with 16-bit aligned instructions, such as the
compressed instruction-set extension, C.
\end{commentary}

Return-address prediction stacks are a common feature of
high-performance instruction-fetch units, but require accurate
detection of instructions used for procedure calls and returns to be
effective.  For RISC-V, hints as to the instructions' usage are encoded
implicitly via the register numbers used.  A JAL instruction should
push the return address onto a return-address stack (RAS) only when
{\em rd}$=${\tt x1}/{\tt x5}.  JALR instructions should push/pop a
RAS as shown in the Table~\ref{rashints}.
\begin{table}[hbt]
\centering
\begin{tabular}{|c|c|c|l|}
  \hline
  \em rd & \em rs1 & {\em rs1}$=${\em rd} & RAS action \\
  \hline
  !{\em link} & !{\em link} & - & none \\
  !{\em link} &  {\em link} & - & pop \\
   {\em link} & !{\em link} & - & push  \\
   {\em link} &  {\em link} & 0 & pop, then push \\
   {\em link} &  {\em link} & 1 & push \\
   \hline
\end{tabular}
\caption{Return-address stack prediction hints encoded in register
  specifiers used in the instruction.  In the above, {\em link} is
  true when the register is either {\tt x1} or {\tt x5}.}
\label{rashints}
\end{table}

\begin{commentary}
Some other ISAs added explicit hint bits to their indirect-jump instructions
to guide return-address stack manipulation.  We use implicit hinting tied to
register numbers and the calling convention to reduce the encoding space used
for these hints.

When two different link registers ({\tt x1} and {\tt x5}) are given as
{\em rs1} and {\em rd}, then the RAS is both popped and pushed to
support coroutines.  If {\em rs1} and {\em rd} are the same link
register (either {\tt x1} or {\tt x5}), the RAS is only pushed to
enable macro-op fusion of the sequences:\linebreak
{\tt lui ra, imm20; jalr ra, imm12(ra)} \ and \ 
{\tt auipc ra, imm20; jalr ra, imm12(ra)}
\end{commentary}

\subsubsection*{Conditional Branches}

All branch instructions use the B-type instruction format.  The
12-bit B-immediate encodes signed offsets in multiples of 2, and is
added to the current {\tt pc} to give the target address.  The
conditional branch range is $\pm$\wunits{4}{KiB}.

\vspace{-0.2in}
\begin{center}
\begin{tabular}{W@{}R@{}F@{}F@{}R@{}R@{}F@{}S}
\\
\multicolumn{1}{c}{\instbit{31}} &
\instbitrange{30}{25} &
\instbitrange{24}{20} &
\instbitrange{19}{15} &
\instbitrange{14}{12} &
\instbitrange{11}{8} &
\multicolumn{1}{c}{\instbit{7}} &
\instbitrange{6}{0} \\
\hline
\multicolumn{1}{|c|}{imm[12]} &
\multicolumn{1}{c|}{imm[10:5]} &
\multicolumn{1}{c|}{rs2} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{imm[4:1]} &
\multicolumn{1}{c|}{imm[11]} &
\multicolumn{1}{c|}{opcode} \\
\hline
1 & 6 & 5 & 5 & 3 & 4 & 1 & 7 \\
\multicolumn{2}{c}{offset[12,10:5]} & src2 & src1 & BEQ/BNE & \multicolumn{2}{c}{offset[11,4:1]} & BRANCH \\
\multicolumn{2}{c}{offset[12,10:5]} & src2 & src1 & BLT[U] & \multicolumn{2}{c}{offset[11,4:1]} & BRANCH \\
\multicolumn{2}{c}{offset[12,10:5]} & src2 & src1 & BGE[U]  & \multicolumn{2}{c}{offset[11,4:1]} & BRANCH \\
\end{tabular}
\end{center}

Branch instructions compare two registers.  BEQ and BNE take the
branch if registers {\em rs1} and {\em rs2} are equal or unequal
respectively.  BLT and BLTU take the branch if {\em rs1} is less than
{\em rs2}, using signed and unsigned comparison respectively.  BGE and
BGEU take the branch if {\em rs1} is greater than or equal to {\em rs2},
using signed and unsigned comparison respectively. Note, BGT, BGTU,
BLE, and BLEU can be synthesized by reversing the operands to BLT,
BLTU, BGE, and BGEU, respectively.

\begin{commentary}
Signed array bounds may be checked with a single BLTU instruction, since
any negative index will compare greater than any nonnegative bound.
\end{commentary}

Software should be optimized such that the sequential code path is the
most common path, with less-frequently taken code paths placed out of
line.  Software should also assume that backward branches will be
predicted taken and forward branches as not taken, at least the
first time they are encountered.  Dynamic predictors should quickly
learn any predictable branch behavior.

Unlike some other architectures, the RISC-V jump (JAL with {\em
  rd}={\tt x0}) instruction should always be used for unconditional
branches instead of a conditional branch instruction with an
always-true condition.  RISC-V jumps are also PC-relative and support
a much wider offset range than branches, and will not pollute
conditional-branch prediction tables.

\begin{commentary}
The conditional branches were designed to include arithmetic
comparison operations between two registers (as also done in PA-RISC
and Xtensa ISA), rather than use condition codes (x86, ARM, SPARC,
PowerPC), or to only compare one register against zero (Alpha, MIPS),
or two registers only for equality (MIPS).  This design was motivated
by the observation that a combined compare-and-branch instruction fits
into a regular pipeline, avoids additional condition code state or use
of a temporary register, and reduces static code size and dynamic
instruction fetch traffic.  Another point is that comparisons against
zero require non-trivial circuit delay (especially after the move to
static logic in advanced processes) and so are almost as expensive as
arithmetic magnitude compares.  Another advantage of a fused
compare-and-branch instruction is that branches are observed earlier
in the front-end instruction stream, and so can be predicted earlier.
There is perhaps an advantage to a design with condition codes in the
case where multiple branches can be taken based on the same condition
codes, but we believe this case to be relatively rare.

We considered but did not include static branch hints in the
instruction encoding.  These can reduce the pressure on dynamic
predictors, but require more instruction encoding space and
software profiling for best results, and can result in poor
performance if production runs do not match profiling runs.

We considered but did not include conditional moves or predicated
instructions, which can effectively replace unpredictable short
forward branches.  Conditional moves are the simpler of the two, but
are difficult to use with conditional code that might cause exceptions
(memory accesses and floating-point operations).  Predication adds
additional flag state to a system, additional instructions to set and
clear flags, and additional encoding overhead on every instruction.
Both conditional move and predicated instructions add complexity to
out-of-order microarchitectures, adding an implicit third source
operand due to the need to copy the original value of the destination
architectural register into the renamed destination physical register
if the predicate is false.  Also, static compile-time decisions to use
predication instead of branches can result in lower performance on
inputs not included in the compiler training set, especially given
that unpredictable branches are rare, and becoming rarer as branch
prediction techniques improve.

We note that various microarchitectural techniques exist to
dynamically convert unpredictable short forward branches into
internally predicated code to avoid the cost of flushing pipelines on
a branch mispredict~\cite{heil-tr1996,Klauser-1998,Kim-micro2005} and
have been implemented in commercial processors~\cite{ibmpower7}.
The simplest techniques just reduce the penalty of recovering from a
mispredicted short forward branch by only flushing instructions in the
branch shadow instead of the entire fetch pipeline, or by fetching
instructions from both sides using wide instruction fetch or idle
instruction fetch slots.  More complex techniques for out-of-order
cores add internal predicates on instructions in the branch shadow,
with the internal predicate value written by the branch instruction,
allowing the branch and following instructions to be executed
speculatively and out-of-order with respect to other code~\cite{ibmpower7}.
\end{commentary}

\section{Load and Store Instructions}

RV32I is a load-store architecture, where only load and store
instructions access memory and arithmetic instructions only operate on
CPU registers.  RV32I provides a 32-bit user address space that is
byte-addressed and little-endian.  The execution environment will
define what portions of the address space are legal to access with
which instructions (e.g., some addresses might be read only, or
support word access only).  Loads with a destination of {\tt x0} must
still raise any exceptions and action any other side effects even
though the load value is discarded.

\vspace{-0.4in}
\begin{center}
\begin{tabular}{M@{}R@{}F@{}R@{}O}
\\
\instbitrange{31}{20} &
\instbitrange{19}{15} &
\instbitrange{14}{12} &
\instbitrange{11}{7} &
\instbitrange{6}{0} \\
\hline
\multicolumn{1}{|c|}{imm[11:0]} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{rd} &
\multicolumn{1}{c|}{opcode} \\
\hline
12 & 5 & 3 & 5 & 7 \\
offset[11:0] & base & width & dest & LOAD \\
\end{tabular}
\end{center}

\vspace{-0.2in}
\begin{center}
\begin{tabular}{O@{}R@{}R@{}F@{}R@{}O}
\\
\instbitrange{31}{25} &
\instbitrange{24}{20} &
\instbitrange{19}{15} &
\instbitrange{14}{12} &
\instbitrange{11}{7} &
\instbitrange{6}{0} \\
\hline
\multicolumn{1}{|c|}{imm[11:5]} &
\multicolumn{1}{c|}{rs2} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{imm[4:0]} &
\multicolumn{1}{c|}{opcode} \\
\hline
7 & 5 & 5 & 3 & 5 & 7 \\
offset[11:5] & src & base & width & offset[4:0] & STORE \\
\end{tabular}
\end{center}

Load and store instructions transfer a value between the registers and
memory.  Loads are encoded in the I-type format and stores are
S-type.  The effective byte address is obtained by adding register
{\em rs1} to the sign-extended 12-bit offset.  Loads copy a value
from memory to register {\em rd}.  Stores copy the value in register
{\em rs2} to memory.

The LW instruction loads a 32-bit value from memory into {\em rd}.  LH
loads a 16-bit value from memory, then sign-extends to 32-bits before
storing in {\em rd}. LHU loads a 16-bit value from memory but then
zero extends to 32-bits before storing in {\em rd}.  LB and LBU are
defined analogously for 8-bit values.  The SW, SH, and SB instructions
store 32-bit, 16-bit, and 8-bit values from the low bits of register
{\em rs2} to memory.

For best performance, the effective address for all loads and stores
should be naturally aligned for each data type (i.e., on a four-byte
boundary for 32-bit accesses, and a two-byte boundary for 16-bit
accesses).  The base ISA supports misaligned accesses, but these might
run extremely slowly depending on the implementation.  Furthermore,
naturally aligned loads and stores are guaranteed to execute
atomically, whereas misaligned loads and stores might not, and hence
require additional synchronization to ensure atomicity.

\begin{commentary}
Misaligned accesses are occasionally required when porting legacy
code, and are essential for good performance on many applications when
using any form of packed-SIMD extension.  Our rationale for supporting
misaligned accesses via the regular load and store instructions is to
simplify the addition of misaligned hardware support.  One option
would have been to disallow misaligned accesses in the base ISA and
then provide some separate ISA support for misaligned accesses, either
special instructions to help software handle misaligned accesses or a
new hardware addressing mode for misaligned accesses.  Special
instructions are difficult to use, complicate the ISA, and often add
new processor state (e.g., SPARC VIS align address offset register) or
complicate access to existing processor state (e.g., MIPS LWL/LWR
partial register writes).  In addition, for loop-oriented packed-SIMD
code, the extra overhead when operands are misaligned motivates
software to provide multiple forms of loop depending on operand
alignment, which complicates code generation and adds to loop startup
overhead.  New misaligned hardware addressing modes take considerable
space in the instruction encoding or require very simplified
addressing modes (e.g., register indirect only).

We do not mandate atomicity for misaligned accesses so simple
implementations can just use a machine trap and software handler to
handle some or all misaligned accesses.  If hardware misaligned support is
provided, software can exploit this by simply using regular load and
store instructions.  Hardware can then automatically optimize accesses
depending on whether runtime addresses are aligned.
\end{commentary}

\section{Control and Status Register Instructions}
\label{sec:csrinsts}

SYSTEM instructions are used to access system functionality that might
require privileged access and are encoded using the I-type instruction
format.  These can be divided into two main classes: those that
atomically read-modify-write control and status registers (CSRs), and
all other potentially privileged instructions. CSR instructions are
described in this section, with the two other user-level SYSTEM
instructions described in the following section.

\begin{commentary}
The SYSTEM instructions are defined to allow simpler implementations
to always trap to a single software trap handler.  More sophisticated
implementations might execute more of each system instruction in
hardware.
\end{commentary}

\subsubsection*{CSR Instructions}

We define the full set of CSR instructions here, although in the standard
user-level base ISA, only a handful of read-only counter CSRs are accessible.

\vspace{-0.2in}
\begin{center}
\begin{tabular}{M@{}R@{}F@{}R@{}S}
\\
\instbitrange{31}{20} &
\instbitrange{19}{15} &
\instbitrange{14}{12} &
\instbitrange{11}{7} &
\instbitrange{6}{0} \\
\hline
\multicolumn{1}{|c|}{csr} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{rd} &
\multicolumn{1}{c|}{opcode} \\
\hline
12 & 5 & 3 & 5 & 7 \\
source/dest  & source & CSRRW  & dest & SYSTEM \\
source/dest  & source & CSRRS  & dest & SYSTEM \\
source/dest  & source & CSRRC  & dest & SYSTEM \\
source/dest  & uimm[4:0]   & CSRRWI & dest & SYSTEM \\
source/dest  & uimm[4:0]   & CSRRSI & dest & SYSTEM \\
source/dest  & uimm[4:0]   & CSRRCI & dest & SYSTEM \\
\end{tabular}
\end{center}

The CSRRW (Atomic Read/Write CSR) instruction atomically swaps values
in the CSRs and integer registers. CSRRW reads the old value of the
CSR, zero-extends the value to XLEN bits, then writes it to integer
register {\em rd}.  The initial value in {\em rs1} is written to the
CSR.  If {\em rd}={\tt x0}, then the instruction shall not read the CSR
and shall not cause any of the side-effects that might occur on a CSR
read.

The CSRRS (Atomic Read and Set Bits in CSR) instruction reads the
value of the CSR, zero-extends the value to XLEN bits, and writes it
to integer register {\em rd}.  The initial value in integer register
{\em rs1} is treated as a bit mask that specifies bit positions to be
set in the CSR.  Any bit that is high in {\em rs1} will cause the
corresponding bit to be set in the CSR, if that CSR bit is writable.
Other bits in the CSR are unaffected (though CSRs might have side
effects when written).

The CSRRC (Atomic Read and Clear Bits in CSR) instruction reads the
value of the CSR, zero-extends the value to XLEN bits, and writes it
to integer register {\em rd}.  The initial value in integer register
{\em rs1} is treated as a bit mask that specifies bit positions to be
cleared in the CSR.  Any bit that is high in {\em rs1} will cause the
corresponding bit to be cleared in the CSR, if that CSR bit is
writable.  Other bits in the CSR are unaffected.

For both CSRRS and CSRRC, if {\em rs1}={\tt x0}, then the instruction
will not write to the CSR at all, and so shall not cause any of the
side effects that might otherwise occur on a CSR write, such as
raising illegal instruction exceptions on accesses to read-only CSRs.
Note that if {\em rs1} specifies a register holding a zero value other
than {\tt x0}, the instruction will still attempt to write the
unmodified value back to the CSR and will cause any attendant side effects.

The CSRRWI, CSRRSI, and CSRRCI variants are similar to CSRRW, CSRRS,
and CSRRC respectively, except they update the CSR using an XLEN-bit
value obtained by zero-extending a 5-bit unsigned immediate (uimm[4:0]) field
encoded in the {\em rs1} field instead of a value from an integer
register.  For CSRRSI and CSRRCI, if the uimm[4:0] field is zero, then
these instructions will not write to the CSR, and shall not cause any
of the side effects that might otherwise occur on a CSR write.  For
CSRRWI, if {\em rd}={\tt x0}, then the instruction shall not read the
CSR and shall not cause any of the side-effects that might occur on a
CSR read.

Some CSRs, such as the instructions retired counter, {\tt instret}, may be
modified as side effects of instruction execution.  In these cases, if a CSR
access instruction reads a CSR, it reads the value prior to the execution of
the instruction.  If a CSR access instruction writes a CSR, the update occurs
after the execution of the instruction.  In particular, a value written to
{\tt instret} by one instruction will be the value read by the following
instruction (i.e., the increment of {\tt instret} caused by the first
instruction retiring happens before the write of the new value).

The assembler pseudoinstruction to read a CSR, CSRR {\em rd, csr}, is
encoded as CSRRS {\em rd, csr, x0}.  The assembler pseudoinstruction
to write a CSR, CSRW {\em csr, rs1}, is encoded as CSRRW {\em x0, csr,
  rs1}, while CSRWI {\em csr, uimm}, is encoded as CSRRWI {\em x0,
  csr, uimm}.

Further assembler pseudoinstructions are defined to set and clear
bits in the CSR when the old value is not required: CSRS/CSRC {\em
  csr, rs1}; CSRSI/CSRCI {\em csr, uimm}.

\subsubsection*{Timers and Counters}

\vspace{-0.2in}
\begin{center}
\begin{tabular}{M@{}R@{}F@{}R@{}S}
\\
\instbitrange{31}{20} &
\instbitrange{19}{15} &
\instbitrange{14}{12} &
\instbitrange{11}{7} &
\instbitrange{6}{0} \\
\hline
\multicolumn{1}{|c|}{csr} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{rd} &
\multicolumn{1}{c|}{opcode} \\
\hline
12 & 5 & 3 & 5 & 7 \\
RDCYCLE[H]   & 0 & CSRRS  & dest & SYSTEM \\
RDTIME[H]    & 0 & CSRRS  & dest & SYSTEM \\
RDINSTRET[H] & 0 & CSRRS  & dest & SYSTEM \\
\end{tabular}
\end{center}

RV32I provides a number of 64-bit read-only user-level counters, which
are mapped into the 12-bit CSR address space and accessed in 32-bit
pieces using CSRRS instructions.

\begin{commentary}
Some execution environments might prohibit access to counters to
impede timing side-channel attacks.
\end{commentary}

The RDCYCLE pseudoinstruction reads the low XLEN bits of the {\tt
  cycle} CSR which holds a count of the number of clock cycles
executed by the processor core on which the hart is running from
an arbitrary start time in the past.  RDCYCLEH is
an RV32I-only instruction that reads bits 63--32 of the same cycle
counter.  The underlying 64-bit counter should never overflow in
practice.  The rate at which the cycle counter advances will depend on
the implementation and operating environment.  The execution
environment should provide a means to determine the current rate
(cycles/second) at which the cycle counter is incrementing.

\begin{commentary}
RDCYCLE is intended to return the number of cycles executed by the
processor core, not the hart.  Precisely defining what is a ``core'' is
difficult given some implementation choices (e.g., AMD Bulldozer).
Precisely defining what is a ``clock cycle'' is also difficult given the
range of implementations (including software emulations), but the
intent is that RDCYCLE is used for performance monitoring along with the
other performance counters.  In particular, where there is one
hart/core, one would expect cycle-count/instructions-retired to
measure CPI for a hart.

Cores don't have to be exposed to software at all, and an implementor
might choose to pretend multiple harts on one physical core are
running on separate cores with one hart/core, and provide separate
cycle counters for each hart.  This might make sense in a simple
barrel processor (e.g., CDC 6600 peripheral processors) where
inter-hart timing interactions are non-existent or minimal.

Where there is more than one hart/core and dynamic multithreading, it
is not generally possible to separate out cycles per hart (especially
with SMT).  It might be possible to define a separate performance
counter that tried to capture the number of cycles a particular hart
was running, but this definition would have to be very fuzzy to cover
all the possible threading implementations.  For example, should we
only count cycles for which any instruction was issued to execution
for this hart, and/or cycles any instruction retired, or include
cycles this hart was occupying machine resources but couldn't execute
due to stalls while other harts went into execution? Likely, ``all of
the above'' would be needed to have understandable performance stats.
This complexity of defining a per-hart cycle count, and also the need
in any case for a total per-core cycle count when tuning multithreaded
code led to just standardizing the per-core cycle counter, which also
happens to work well for the common single hart/core case.

Standardizing what happens during ``sleep'' is not practical given
that what ``sleep'' means is not standardized across execution
environments, but if the entire core is paused (entirely clock-gated
or powered-down in deep sleep), then it is not executing clock cycles,
and the cycle count shouldn't be increasing per the spec.  There are
many details, e.g., whether clock cycles required to reset a processor
after waking up from a power-down event should be counted, and these
are considered execution-environment-specific details.

Even though there is no precise definition that works for all
platforms, this is still a useful facility for most platforms, and an
imprecise, common, ``usually correct'' standard here is better than no
standard.  The intent of RDCYCLE was primarily performance
monitoring/tuning, and the specification was written with that goal in
mind.
\end{commentary}

The RDTIME pseudoinstruction reads the low XLEN bits of the {\tt
  time} CSR, which counts wall-clock real time that has passed from an
arbitrary start time in the past.  RDTIMEH is an RV32I-only instruction
that reads bits 63--32 of the same real-time counter.  The underlying 64-bit
counter should never overflow in practice.  The execution environment
should provide a means of determining the period of the real-time
counter (seconds/tick).  The period must be constant.  The
real-time clocks of all harts in a single user application
should be synchronized to within one tick of the real-time clock.  The
environment should provide a means to determine the accuracy of the
clock.

\begin{commentary}
On some simple platforms, cycle count might represent a valid
implementation of RDTIME, but in this case, platforms should implement
the RDTIME instruction as an alias for RDCYCLE to make code more
portable, rather than using RDCYCLE to measure wall-clock time.
\end{commentary}

The RDINSTRET pseudoinstruction reads the low XLEN bits of the {\tt
  instret} CSR, which counts the number of instructions retired by
this hart from some arbitrary start point in the past.  RDINSTRETH is
an RV32I-only instruction that reads bits 63--32 of the same
instruction counter. The underlying 64-bit counter that should never
overflow in practice.

The following code sequence will read a valid 64-bit cycle counter value into
{\tt x3}:{\tt x2}, even if the counter overflows between reading its upper
and lower halves.

\begin{figure}[h!]
\begin{center}
\begin{verbatim}
    again:
        rdcycleh     x3
        rdcycle      x2
        rdcycleh     x4
        bne          x3, x4, again
\end{verbatim}
\end{center}
\caption{Sample code for reading the 64-bit cycle counter in RV32.}
\label{rdcycle}
\end{figure}

\begin{commentary}
We would like these basic counters be provided in all implementations as
they are essential for basic performance analysis, adaptive and
dynamic optimization, and to allow an application to work with
real-time streams.  Additional counters should be provided to help
diagnose performance problems and these should be made accessible from
user-level application code with low overhead.

We required the counters be 64 bits wide, even on RV32, as otherwise
it is very difficult for software to determine if values have
overflowed.  For a low-end implementation, the upper 32 bits of each
counter can be implemented using software counters incremented by a
trap handler triggered by overflow of the lower 32 bits.  The sample
code described above shows how the full 64-bit width value can be
safely read using the individual 32-bit instructions.

In some applications, it is important to be able to read multiple
counters at the same instant in time.  When run under a multitasking
environment, a user thread can suffer a context switch while
attempting to read the counters.  One solution is for the user thread
to read the real-time counter before and after reading the other
counters to determine if a context switch occurred in the middle of the
sequence, in which case the reads can be retried.  We considered
adding output latches to allow a user thread to snapshot the counter
values atomically, but this would increase the size of the user
context, especially for implementations with a richer set of counters.
\end{commentary}


\section{Environment Call and Breakpoints}

\vspace{-0.2in}
\begin{center}
\begin{tabular}{M@{}R@{}F@{}R@{}S}
\\
\instbitrange{31}{20} &
\instbitrange{19}{15} &
\instbitrange{14}{12} &
\instbitrange{11}{7} &
\instbitrange{6}{0} \\
\hline
\multicolumn{1}{|c|}{funct12} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{rd} &
\multicolumn{1}{c|}{opcode} \\
\hline
12 & 5 & 3 & 5 & 7 \\
ECALL   & 0 & PRIV & 0 & SYSTEM \\
EBREAK  & 0 & PRIV & 0 & SYSTEM \\
\end{tabular}
\end{center}

The ECALL instruction is used to make a request to the supporting
execution environment, which is usually an operating system.  The ABI
for the system will define how parameters for the environment request
are passed, but usually these will be in defined locations in the
integer register file.

The EBREAK instruction is used by debuggers to cause control to be
transferred back to a debugging environment.

\begin{commentary}
ECALL and EBREAK were previously named SCALL and SBREAK.  The
instructions have the same functionality and encoding, but were
renamed to reflect that they can be used more generally than to call a
supervisor-level operating system or debugger.
\end{commentary}

\section{Memory Ordering Instructions}
\label{sec:fence}

\vspace{-0.2in}
\begin{center}
\begin{tabular}{F@{}IIIIIIIIF@{}F@{}F@{}S}
\\
\instbitrange{31}{28} &
\multicolumn{1}{c}{\instbit{27}} &
\multicolumn{1}{c}{\instbit{26}} &
\multicolumn{1}{c}{\instbit{25}} &
\multicolumn{1}{c}{\instbit{24}} &
\multicolumn{1}{c}{\instbit{23}} &
\multicolumn{1}{c}{\instbit{22}} &
\multicolumn{1}{c}{\instbit{21}} &
\multicolumn{1}{c}{\instbit{20}} &
\instbitrange{19}{15} &
\instbitrange{14}{12} &
\instbitrange{11}{7} &
\instbitrange{6}{0} \\
\hline
\multicolumn{1}{|c|}{fm} &
\multicolumn{1}{c|}{PI} &
\multicolumn{1}{c|}{PO} &
\multicolumn{1}{c|}{PR} &
\multicolumn{1}{c|}{PW} &
\multicolumn{1}{|c|}{SI} &
\multicolumn{1}{c|}{SO} &
\multicolumn{1}{c|}{SR} &
\multicolumn{1}{c|}{SW} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{rd} &
\multicolumn{1}{c|}{opcode} \\
\hline
4 & 1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 & 5 & 3 & 5 & 7 \\
FM & \multicolumn{4}{c}{predecessor} & \multicolumn{4}{c}{successor} & 0 & FENCE & 0 & MISC-MEM \\
\end{tabular}
\end{center}

The FENCE instruction is used to order device I/O and
memory accesses as viewed by other RISC-V harts and external devices
or coprocessors.  Any combination of device input (I), device output
(O), memory reads (R), and memory writes (W) may be ordered with
respect to any combination of the same.  Informally, no other RISC-V
hart or external device can observe any operation in the {\em
  successor} set following a FENCE before any operation in the {\em
  predecessor} set preceding the FENCE.
Chapter~\ref{ch:memorymodel} provides a precise description of the
RISC-V memory consistency model.
  
The execution environment
will define what I/O operations are possible, and in particular, which
load and store instructions might be treated and ordered as device
input and device output operations respectively rather than memory
reads and writes.  For example, memory-mapped I/O devices will
typically be accessed with uncached loads and stores that are ordered
using the I and O bits rather than the R and W bits.  Instruction-set
extensions might also describe new coprocessor I/O instructions that
will also be ordered using the I and O bits in a FENCE.

\begin{table}[htp]
\begin{small}
\begin{center}
\begin{tabular}{|c|c|l|}
\hline
{\em fm} field & Mnemonic & Meaning \\
\hline
0000 & \em none & Normal Fence \\
\hline
\multirow{2}{*}{1000} & \multirow{2}{*}{TSO} & With FENCE RW,RW: exclude write-to-read ordering \\
                      &                      & Otherwise: \em Reserved for future use. \\
\hline
\multicolumn{2}{|c|}{\em other} & \em Reserved for future use. \\
\hline
\end{tabular}
\end{center}
\end{small}
\caption{Fence mode encoding.}
\label{fm}
\end{table}

The fence mode field {\em fm} defines the semantics of the FENCE.
A FENCE with {\em fm}=0000 orders all memory operations in its predecessor set before all memory operations in its successor set.
A FENCE.TSO instruction orders all load operations in its predecessor set before all memory operations in its successor set, and all store operations in its predecessor set before all store operations in its successor set.
This leaves non-AMO store operations in the FENCE.TSO's predecessor set unordered with non-AMO loads in its successor set.

The unused fields in the FENCE instructions---{\em rs1} and {\em rd}---are
reserved for finer-grain fences in future extensions.  For forward
compatibility, base implementations shall ignore these fields, and standard
software shall zero these fields.  Likewise, many {\em fm} and
predecessor/successor set settings in Table~\ref{fm} are also reserved
for future use.  Base implementations shall treat all such reserved
configurations as normal fences with {\em fm}=0000, and standard
software shall use only non-reserved configurations.

\begin{commentary}
We chose a relaxed memory model to allow high performance from simple
machine implementations and from likely future
coprocessor or accelerator extensions.  We separate out I/O ordering
from memory R/W ordering to avoid unnecessary serialization within a
device-driver hart and also to support alternative non-memory paths
to control added coprocessors or I/O devices.  Simple implementations
may additionally ignore the {\em predecessor} and {\em successor}
fields and always execute a conservative fence on all operations.
\end{commentary}

\vspace{-0.4in}
\begin{center}
\begin{tabular}{M@{}R@{}S@{}R@{}O}
\\
\instbitrange{31}{20} &
\instbitrange{19}{15} &
\instbitrange{14}{12} &
\instbitrange{11}{7} &
\instbitrange{6}{0} \\
\hline
\multicolumn{1}{|c|}{imm[11:0]} &
\multicolumn{1}{c|}{rs1} &
\multicolumn{1}{c|}{funct3} &
\multicolumn{1}{c|}{rd} &
\multicolumn{1}{c|}{opcode} \\
\hline
12 & 5 & 3 & 5 & 7 \\
0 & 0 & FENCE.I & 0 & MISC-MEM \\
\end{tabular}
\end{center}

The FENCE.I instruction is used to synchronize the instruction and
data streams.  RISC-V does not guarantee that stores to instruction
memory will be made visible to instruction fetches on the same RISC-V
hart until a FENCE.I instruction is executed.  A FENCE.I instruction
only ensures that a subsequent instruction fetch on a RISC-V hart
will see any previous data stores already visible to the same RISC-V
hart.  FENCE.I does {\em not} ensure that other RISC-V harts'
instruction fetches will observe the local hart's stores in a
multiprocessor system. To make a store to instruction memory visible
to all RISC-V harts, the writing hart has to execute a data FENCE
before requesting that all remote RISC-V harts execute a FENCE.I.

The unused fields in the FENCE.I instruction, {\em imm[11:0]}, {\em rs1}, and
{\em rd}, are reserved for finer-grain fences in future extensions.  For
forward compatibility, base implementations shall ignore these fields, and
standard software shall zero these fields.

\begin{commentary}
Because FENCE.I only orders stores with a hart's own instruction fetches,
application code should only rely upon FENCE.I if the application thread will
not be migrated to a different hart.  The ABI will provide mechanisms for
multiprocessor instruction-stream synchronization.
\end{commentary}

\begin{commentary}
The FENCE.I instruction was designed to support a wide variety of
implementations.  A simple implementation can flush the local
instruction cache and the instruction pipeline when the FENCE.I is
executed.  A more complex implementation might snoop the instruction
(data) cache on every data (instruction) cache miss, or use an
inclusive unified private L2 cache to invalidate lines from the
primary instruction cache when they are being written by a local store
instruction.  If instruction and data caches are kept coherent in this
way, then only the pipeline needs to be flushed at a FENCE.I.

We considered but did not include a ``store instruction word''
instruction (as in MAJC~\cite{majc}).  JIT compilers may generate a
large trace of instructions before a single FENCE.I, and amortize any
instruction cache snooping/invalidation overhead by writing translated
instructions to memory regions that are known not to reside in the
I-cache.
\end{commentary}

\section{HINT Instructions}
\label{sec:rv32i-hints}

RV32I reserves a large encoding space for HINT instructions, which are
usually used to communicate performance hints to the
microarchitecture.  HINTs are encoded as integer computational
instructions with {\em rd}={\tt x0}.  Hence, like the NOP instruction,
HINTs do not change any architecturally visible state, except for
advancing the {\tt pc} and any applicable performance counters.
Implementations are always allowed to ignore the encoded hints.

\begin{commentary}
This HINT encoding has been chosen so that simple implementations can ignore
HINTs altogether, and instead execute a HINT as a regular computational
instruction that happens not to mutate the architectural state.  For example, ADD is
a HINT if the destination register is {\tt x0}; the five-bit {\em rs1} and {\em
rs2} fields encode arguments to the HINT.  However, a simple implementation can
simply execute the HINT as an ADD of {\em rs1} and {\em rs2} that writes {\tt
x0}, which has no architecturally visible effect.
\end{commentary}

Table~\ref{tab:rv32i-hints} lists all RV32I HINT code points.  91\% of the HINT
space is reserved for standard HINTs, but none are presently defined.  The
remainder of the HINT space is reserved for custom HINTs: no standard HINTs
will ever be defined in this subspace.

\begin{commentary}
No standard hints are presently defined (except the privileged WFI
instruction which uses a separately reserved encoding).  We anticipate
standard hints to eventually include memory-system spatial and
temporal locality hints, branch prediction hints, thread-scheduling
hints, security tags, and instrumentation flags for
simulation/emulation.
\end{commentary}

\begin{table}[hbt]
\centering
\begin{tabular}{|l|l|c|l|}
  \hline
  Instruction           & Constraints                                 & Code Points & Purpose \\ \hline \hline
  LUI                   & {\em rd}={\tt x0}                           & $2^{20}$                    & \multirow{15}{*}{\em Reserved for future standard use} \\ \cline{1-3}
  AUIPC                 & {\em rd}={\tt x0}                           & $2^{20}$                    & \\ \cline{1-3}
  \multirow{2}{*}{ADDI} & {\em rd}={\tt x0}, and either               & \multirow{2}{*}{$2^{17}-1$} & \\
                        & {\em rs1}$\neq${\tt x0} or {\em imm}$\neq$0 &                             & \\ \cline{1-3}
  ANDI                  & {\em rd}={\tt x0}                           & $2^{17}$                    & \\ \cline{1-3}
  ORI                   & {\em rd}={\tt x0}                           & $2^{17}$                    & \\ \cline{1-3}
  XORI                  & {\em rd}={\tt x0}                           & $2^{17}$                    & \\ \cline{1-3}
  ADD                   & {\em rd}={\tt x0}                           & $2^{10}$                    & \\ \cline{1-3}
  SUB                   & {\em rd}={\tt x0}                           & $2^{10}$                    & \\ \cline{1-3}
  AND                   & {\em rd}={\tt x0}                           & $2^{10}$                    & \\ \cline{1-3}
  OR                    & {\em rd}={\tt x0}                           & $2^{10}$                    & \\ \cline{1-3}
  XOR                   & {\em rd}={\tt x0}                           & $2^{10}$                    & \\ \cline{1-3}
  SLL                   & {\em rd}={\tt x0}                           & $2^{10}$                    & \\ \cline{1-3}
  SRL                   & {\em rd}={\tt x0}                           & $2^{10}$                    & \\ \cline{1-3}
  SRA                   & {\em rd}={\tt x0}                           & $2^{10}$                    & \\ \hline \hline
  SLTI                  & {\em rd}={\tt x0}                           & $2^{17}$                    & \multirow{7}{*}{\em Reserved for custom use} \\ \cline{1-3}
  SLTIU                 & {\em rd}={\tt x0}                           & $2^{17}$                    & \\ \cline{1-3}
  SLLI                  & {\em rd}={\tt x0}                           & $2^{10}$                    & \\ \cline{1-3}
  SRLI                  & {\em rd}={\tt x0}                           & $2^{10}$                    & \\ \cline{1-3}
  SRAI                  & {\em rd}={\tt x0}                           & $2^{10}$                    & \\ \cline{1-3}
  SLT                   & {\em rd}={\tt x0}                           & $2^{10}$                    & \\ \cline{1-3}
  SLTU                  & {\em rd}={\tt x0}                           & $2^{10}$                    & \\ \hline
\end{tabular}
\caption{RV32I HINT instructions.}
\label{tab:rv32i-hints}
\end{table}