aboutsummaryrefslogtreecommitdiff
path: root/src/memory.tex
blob: 94e3733084ef6cfe7c6fc7aef3c1c42c2c2ee811 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
\newenvironment{tentative}
{ \vspace{-0.2in}
  \begin{quotation}
  \noindent
  \color{red}MEMORY MODEL TASK GROUP TO-DO

  \small \em
  \rule{\linewidth}{1pt}\\
}
{ 
  \end{quotation}
  \vspace{-0.2in}
}
\lstdefinelanguage{alloy}{
  morekeywords={abstract, sig, extends, pred, fun, fact, no, set, one, lone, let, not, all, iden, some, run, for},
  morecomment=[l]{//},
  morecomment=[s]{/*}{*/},
  commentstyle=\color{green!40!black},
  keywordstyle=\color{blue!40!black},
  moredelim=**[is][\color{red}]{@}{@},
  escapeinside={!}{!},
}
\lstset{language=alloy}
\lstset{aboveskip=0pt}
\lstset{belowskip=0pt}

\newcommand{\diagram}{(picture coming soon)}

\chapter{RVWMO Explanatory Material}
\label{sec:explanation}
This section provides more explanation for the RVWMO memory model, using more informal language and concrete examples.
These are intended to clarify the meaning and intent of the axioms and preserved program order rules.
%In case of any discrepancy between the informal descriptions here and the formal descriptions elsewhere, the formal definitions should be considered authoritative.

\section{Why RVWMO?}
\label{sec:whynottso}

Memory consistency models fall along a loose spectrum from weak to strong.
Weak memory models (e.g., ARMv7, Power, Alpha) allow more hardware implementation flexibility and deliver arguably better performance, performance per watt, power, scalability, and hardware verification overheads than strong models, at the expense of a more complex programming model.
%Models which are too weak may not even be properly analyzable with modern formal analysis techniques.
Strong models (e.g., sequential consistency, TSO) provide simpler programming models, but at the cost of imposing more restrictions on the kinds of hardware optimizations that can be performed in the pipeline and in the memory system, with some cost to power and area overheads, and with some added hardware verification burden.

For the base ISA, RISC-V has chosen the RVWMO memory model, which is a variant of release consistency.
This places it in between the two extremes of the memory model spectrum.
It is not as weak as the Power memory model, and this buys back some programming model simplicity without giving up very much in terms of performance.
RVWMO is also not as restrictive as RVTSO, and hence it remains weak enough to ensure that implementations can be performant and scalable without incurring huge hardware complexity overheads.
RVWMO is similar to the ARMv8 memory model in this regard.

As such, the RVWMO memory model enables architects to build simple implementations, aggressive implementations, implementations embedded deeply inside a much larger system and subject to complex memory system interactions, or any number of other possibilities, all while simultaneously being strong enough to support programming language memory models at high performance.

The risk of a weak memory model lies in the complexity of the programming model.
Buggy code which ``just worked'' on stronger implementations may well break on more aggressive implementations due to the bugs simply not manifesting on the stronger-than-necessary implementations.
For these situations, though, the root cause is the bug in the original software, not the memory model itself.
The risk of finding short-term bugs in code ported from other architectures is outweighed by the long-term benefits that the weak memory model delivers more generally.

To mitigate this risk, some hardware implementations may choose to stick with RVTSO, and that is perfectly acceptable and perfectly compatible with the RVWMO memory model.
The cost that the weak memory model imposes on such implementations is the incremental overhead of fetching instructions (e.g., {\tt fence~r,rw} and {\tt fence rw,w}) which become no-ops on that implementation.
(These fences must remain present in the code to ensure compatibility with other more weakly-ordered RISC-V implementations.)

Most software is also fully compatible with weak memory models.
C/C++, Java, and Linux, to name some of the most notable and more formally analyzed examples, are all entirely compatible with weak non-atomic memory models, as all are designed to run not just on x86 but also on ARM, Power, and many other architectures.
It is true that some code, e.g., code ported from x86, does sometimes (correctly or incorrectly) assume a stronger model such as TSO.
For such code, the RVWMO memory model provides a means for restoring TSO to sections of code through fences and atomics with {\tt .aq} and {\tt .rl} bits in the ``A'' extension, until such code can be ported to RVWMO over time.

Designers who wish to provide drop-in compatibility with x86 code can also implement the Ztso extension which enforces RVTSO.
Code written for RVWMO is automatically and inherently compatible with RVTSO, but code written assuming RVTSO is not guaranteed to run correctly on RVWMO implementations.
In fact, RVWMO implementations will (and should) simply refuse to run TSO-only binaries.
Each implementation must therefore choose whether to prioritize compatibility with RVTSO code (e.g., to facilitate porting from x86) or whether to instead prioritize compatibility with other RISC-V cores implementing RVWMO.

\section{Litmus Tests}
The explanations in this chapter make use of {\em litmus tests}, or small programs designed to test or highlight one particular aspect of a memory model.
Figure~\ref{fig:litmus:sample} shows an example of a litmus test with two harts.
For this figure (and for all figures that follow in this chapter), we assume that {\tt s0}--{\tt s2} are pre-set to the same value in all harts.
As a convention, we will assume that {\tt s0} holds the address labeled {\tt x}, {\tt s1} holds {\tt y}, and {\tt s2} holds {\tt z}, where {\tt x}, {\tt y}, and {\tt z} are different memory addresses.
This figure shows the same program twice: on the left in RISC-V assembly, and again on the right in graphical form.

\begin{figure}[h!]
  \centering
  {
    \tt\small
    \begin{tabular}{cl||cl}
    \multicolumn{2}{c}{Hart 0} & \multicolumn{2}{c}{Hart 1} \\
    \hline
          & $\vdots$    &     & $\vdots$    \\
          & li t1, 1    &     & li t4, 4    \\
      (a) & sw t1,0(s0) & (e) & sw t4,0(s0) \\
          & $\vdots$    &     & $\vdots$    \\
          & li t2, 2    &     &             \\
      (b) & sw t2,0(s0) &     &             \\
          & $\vdots$    &     & $\vdots$    \\
      (c) & lw a0,0(s0) &     &             \\
          & $\vdots$    &     & $\vdots$    \\
          & li t3, 3    &     & li t5, 5    \\
      (d) & sw t3,0(s0) & (f) & sw t5,0(s0) \\
          & $\vdots$    &     & $\vdots$    \\
    \end{tabular}
  }
  ~~~~
  \diagram
  \caption{A sample litmus test}
  \label{fig:litmus:sample}
\end{figure}

Litmus tests are used to understand the implications of the memory model in specific concrete situations.
For example, in the litmus test of Figure~\ref{fig:litmus:sample}, the final value of {\tt a0} in the first hart can be either 2, 4, or 5, depending on the dynamic interleaving of the instruction stream from each hart at runtime.
However, in this example, the final value of {\tt a0} in Hart 0 will never be 1 or 3: the value 1 will no longer be visible at the time the load executes, and the value 3 will not yet be visible by the time the load executes.

We analyze this test and many others below.

\section{Explaining the RVWMO Rules}
In this section, we provide explanation and examples for all of the RVWMO rules and axioms.

\subsection{Preserved Program Order and Global Memory Order}
Preserved program order represents the set of intra-hart orderings that the hart's pipeline must ensure are maintained as the instructions execute, even in the presence of hardware optimizations that might otherwise reorder those operations.
Events from the same hart which are not ordered by preserved program order, on the other hand, may appear reordered from the perspective of other harts and/or observers.

Informally, the global memory order represents the order in which loads and stores perform.
The formal memory model literature has moved away from specifications built around the concept of performing, but the idea is still useful for building up informal intuition.
A load is said to have performed when its return value is determined.
A store is said to have performed not when it has executed inside the pipeline, but rather only when its value has been propagated to globally visible memory.
In this sense, the global memory order also represents the contribution of the coherence protocol and/or the rest of the memory system to interleave the (possibly reordered) memory accesses being issued by each hart into a single total order agreed upon by all harts.

The order in which loads perform does not always directly correspond to the relative age of the values those two loads return.
In particular, a load $b$ may perform before another load $a$ to the same address (i.e., $b$ may execute before $a$, and $b$ may appear before $a$ in the global memory order), but $a$ may nevertheless return an older value than $b$.
This discrepancy captures the reordering effects of store buffers placed between the core and memory: a younger load may read from a value in the store buffer, while an older load which appears before that store in program order may ignore that younger store and read an older value from memory instead.
To account for this, at the time each load performs, the value it returns is determined by the load value axiom, not just strictly by determining the most recent store to the same address in the global memory order, as described below.

\subsection{Store Buffering (Load Value Axiom)}
\begin{tabular}{p{1cm}|p{12cm}} &
Load value axiom:\loadvalueaxiom
\end{tabular}

Preserved program order is {\em not} required to respect the ordering of a store followed by a load to an overlapping address.
This complexity arises due to the ubiquity of store buffers in nearly all implementations.
Informally, the load may perform (return a value) by forwarding from the store while the store is still in the store buffer, and hence before the store itself performs (writes back to globally visible memory).
Any other hart will therefore observe the load as performing before the store.

\begin{figure}[h!]
  \centering
  {
    \tt\small
    \begin{tabular}{cl||cl}
    \multicolumn{2}{c}{Hart 0} & \multicolumn{2}{c}{Hart 1} \\
    \hline
          & li t1, 1    &     & li t1, 1    \\
      (a) & sw t1,0(s0) & (e) & sw t1,0(s1) \\
      (b) & lw a0,0(s0) & (f) & lw a2,0(s1) \\
      (c) & fence r,r   & (g) & fence r,r   \\
      (d) & lw a1,0(s1) & (h) & lw a3,0(s0) \\
    \end{tabular}
  }
  ~~~~
  \diagram
  \caption{A store buffer forwarding litmus test}
  \label{fig:litmus:storebuffer}
\end{figure}

Consider the litmus test of Figure~\ref{fig:litmus:storebuffer}.
When running this program on an implementation with store buffers, it is possible to arrive at the final outcome
{\tt a0=1, a1=0, a2=1, a3=0}
as follows:
\begin{itemize}
  \item (a) executes and enters the first hart's private store buffer
  \item (b) executes and forwards its return value 1 from (a) in the store buffer
  \item (c) executes since all previous loads (i.e., (b)) have completed
  \item (d) executes and reads the value 0 from memory
  \item (e) executes and enters the second hart's private store buffer
  \item (f) executes and forwards its return value 1 from (d) in the store buffer
  \item (g) executes since all previous loads (i.e., (f)) have completed
  \item (h) executes and reads the value 0 from memory
  \item (a) drains from the first hart's store buffer to memory
  \item (e) drains from the second hart's store buffer to memory
\end{itemize}
Therefore, the memory model must be able to account for this behavior.

To put it another way, suppose the definition of preserved program order did include the following hypothetical rule:
memory access $a$ precedes memory access $b$ in preserved program order (and hence also in the global memory order) if $a$ precedes $b$ in program order and $a$ and $b$ are accesses to the same memory location, $a$ is a write, and $b$ is a read.  Call this ``Rule X''.  Then we get the following:

\begin{itemize}
  \item (a) precedes (b): by rule X
  \item (b) precedes (d): by rule \ref{ppo:fence}
  \item (d) precedes (e): by the load value axiom.  Otherwise, if (d) preceded (c), then (d) would be required to return the value 1.  (This is a perfectly legal execution; it's just not the one in question)
  \item (e) precedes (f): by rule X
  \item (f) precedes (h): by rule \ref{ppo:fence}
  \item (h) precedes (a): by the load value axiom, as above.
\end{itemize}
The global memory order must be a total order and cannot be cyclic, because a cycle would imply that every event in the cycle happens before itself, which is impossible.
Therefore, the execution proposed above would be forbidden, and hence the addition of rule X would break the memory model.

Nevertheless, even if (b) precedes (a) and/or (f) precedes (e) in the global memory order, the only sensible possibility in this example is for (b) to return the value written by (a), and likewise for (f) and (e).  This combination of circumstances is what leads to the second option in the definition of the load value axiom.
Even though (b) precedes (a) in the global memory order, (a) will still be visible to (b) by virtue of sitting in the store buffer at the time (b) executes.
Therefore, even if (b) precedes (a) in the global memory order, (b) should return the value written by (a) because (a) precedes (b) in program order.
Likewise for (e) and (f).

\subsection{Same-Address Orderings, Part 1 (Rule~\ref{ppo:->st})}
\begin{tabular}{p{1cm}|p{12cm}} &
Rule \ref{ppo:->st}: \ppost
\end{tabular}

Same-address orderings where the latter is a store are straightforward: a load or store can never be reordered with a later store to an overlapping memory location.  From a microarchitecture perspective, generally speaking, it is difficult or impossible to undo a speculatively reordered store if the speculation turns out to be invalid, so such behavior is simply disallowed by the model.

Same-address load-load orderings are far more subtle; see Chapter~\ref{sec:ppo:rdw}.

\begin{comment}
The formal model captures this as follows:
\begin{itemize}
  \item (a) precedes (b) in preserved program order because both are stores to the same address, and (b) is a store (Rule~\ref{ppo:->st}).  Therefore, (c) cannot return the value written by (a), because (b) is a later store to the same address in both program order and the global memory order, and so returning the value written by (a) would violate the load value axiom.
  \item (c) precedes (d) in preserved program order because both are accesses to the same address, and (d) is a store.  (c) also precedes (d) in program order.  Therefore, (c) is not able to return the value written by (d), because neither option in the load value axiom applies.
\end{itemize}
\end{comment}

\subsection{Fences (Rule~\ref{ppo:fence})}\label{sec:fence}
\begin{tabular}{p{1cm}|p{12cm}} &
Rule \ref{ppo:fence}: \ppofence
\end{tabular}

By default, the {\tt fence} instruction ensures that all memory accesses from instructions preceding the fence in program order (the ``predecessor set'') appear earlier in the global memory order than memory accesses from instructions appearing after the fence in program order (the ``successor set'').
However, fences can optionally further restrict the predecessor set and/or the successor set to  a smaller set of memory accesses in order to provide some speedup.
Specifically, fences have {\tt .pr}, {\tt .pw}, {\tt .sr}, and {\tt .sw} bits which restrict the predecessor and/or successor sets.
The predecessor set includes loads (resp.\@ stores) if and only if {\tt .pr} (resp.\@ {\tt .pw}) is set.
Similarly, the successor set includes loads (resp.\@ stores) if and only if {\tt .sr} (resp.\@ {\tt .sw}) is set.

The full RISC-V opcode encoding currently has nine non-trivial combinations of the four bits {\tt pr}, {\tt pw}, {\tt sr}, and {\tt sw}, plus one extra encoding which is expected to be added to facilitate mapping of ``acquire+release'' or TSO semantics.
The remaining seven combinations have empty predecessor and/or successor sets and hence are no-ops.
Of the ten non-trivial options, only six are commonly used in practice:
{\tt
\begin{itemize}
  \item fence rw,rw
  \item fence.tso \textrm{(i.e., a combined {\tt fence r,rw} $+$ {\tt fence rw,w})}
  \item fence rw,w
  \item fence r,rw
  \item fence r,r
  \item fence w,w
\end{itemize}
}
We strongly recommend that programmers stick to these six, as these are the best understood.  {\tt fence} instructions using any other combination of {\tt .pr}, {\tt .pw}, {\tt .sr}, and {\tt .sw} are reserved.

Finally, we note that since RISC-V uses a multi-copy atomic memory model, programmers can reason about fences and the {\tt .aq} and {\tt .rl} bits in a thread-local manner.  There is no complex notion of ``fence cumulativity'' as found in memory models which are not multi-copy atomic.

\subsection{Acquire/Release Ordering (Rules~\ref{ppo:acquire}--\ref{ppo:amoload})}\label{sec:acqrel}
\begin{tabular}{p{1cm}|p{12cm}}
  & Rule \ref{ppo:acquire}: \ppoacquire \\
  %& Rule \ref{ppo:loadtoacq}: \ppoloadtoacq \\
  & Rule \ref{ppo:release}: \pporelease \\
  & Rule \ref{ppo:strongacqrel}: \ppostrongacqrel \\
  & Rule \ref{ppo:amostore}: \ppoamostore \\
  & Rule \ref{ppo:amoload}: \ppoamoload \\
\end{tabular}

An {\em acquire} operation is used at the start of a critical section.  The general requirement for acquire semantics is that all loads and stores inside the critical section are up to date with respect to the synchronization variable being used to protect it.  In other words, an acquire operation requires load-to-load/store ordering.
Acquire ordering can be enforced in one of two ways: setting {\tt .aq}, which enforces ordering with respect to just the synchronization variable itself, or with a {\tt FENCE r,rw}, which enforces ordering with respect to all previous loads.  

\begin{figure}[h!]
  \centering\small
  \begin{verbatim}
          sd           x1, (a1)     # Random unrelated store
          ld           x2, (a2)     # Random unrelated load
          li           t0, 1        # Initialize swap value.
      again:
          amoswap.w.aq t0, t0, (a0) # Attempt to acquire lock.
          bnez         t0, again    # Retry if held.
          # ...
          # Critical section.
          # ...
          amoswap.w.rl x0, x0, (a0) # Release lock by storing 0.
          sd           x3, (a3)     # Random unrelated store
          ld           x4, (a4)     # Random unrelated load
  \end{verbatim}
  \caption{A spinlock with atomics}
  \label{fig:litmus:spinlock_atomics}
\end{figure}

Consider Figure~\ref{fig:litmus:spinlock_atomics}:
Because this example uses {\tt .aq}, the loads and stores in the critical section are guaranteed to appear in the global memory order after the {\tt amoswap} used to acquire the lock.  However, assuming {\tt a0}, {\tt a1}, and {\tt a2} point to different memory locations, the loads and stores in the critical section may or may not appear after the ``random unrelated load'' at the beginning of the example in the global memory order.

\begin{figure}[h!]
  \centering\small
  \begin{verbatim}
          sd           x1, (a1)     # Random unrelated store
          ld           x2, (a2)     # Random unrelated load
          li           t0, 1        # Initialize swap value.
      again:
          amoswap.w    t0, t0, (a0) # Attempt to acquire lock.
          fence        r, rw        # Enforce "acquire" memory ordering
          bnez         t0, again    # Retry if held.
          # ...
          # Critical section.
          # ...
          fence        rw, w        # Enforce "release" memory ordering
          amoswap.w    x0, x0, (a0) # Release lock by storing 0.
          sd           x3, (a3)     # Random unrelated store
          ld           x4, (a4)     # Random unrelated load
  \end{verbatim}
  \caption{A spinlock with fences}
  \label{fig:litmus:spinlock_fences}
\end{figure}

Now, consider the alternative in Figure~\ref{fig:litmus:spinlock_fences}.
In this case, even though the {\tt amoswap} does not enforce ordering with an {\tt .aq} bit, the fence nevertheless enforces that the acquire {\tt amoswap} appears earlier in the global memory order than all loads and stores in the critical section.
Note, however, that in this case, the fence also enforces additional orderings: it also requires that the ``random unrelated load'' at the start of the program appears also appears earlier in the global memory order than the loads and stores of the critical section.  (This particular fence does not, however, enforce any ordering with respect to the ``random unrelated store'' at the start of the snippet.)
In this way, fence-enforced orderings are slightly coarser than orderings enforced by {\tt.aq}.

\begin{comment}
A load-acquire also appears later in the global memory order than any previous paired loads to overlapping addresses.
This rule is in place primarily to ensure compatibility with C/C++ release sequences.
Consider the example of Figure~\ref{fig:relseq}:

\begin{figure}[h!]
  \centering
  {\tt\small
    \begin{tabular}{cl||cl}
    \multicolumn{2}{c}{Thread 0} & \multicolumn{2}{c}{Thread 1} \\
    \hline
      (a)  & x = 1;                     & (d) & atomic\_exchange(y, 2, memory\_order\_relaxed); \\
      (bc) & atomic\_store(y, 1,        & (e) & atomic\_exchange(y, 2, memory\_order\_acquire); \\
           & ~~memory\_order\_release); & (f) & int a1 = x;            \\
    \end{tabular}
  }

  \bigskip

  {\tt\small
    \begin{tabular}{cl||cl}
    \multicolumn{2}{c}{Hart 0} & \multicolumn{2}{c}{Hart 1} \\
    \hline
          & li t1, 1    &     & li t1, 1    \\
      (a) & sd t1,0(s0) & (d) & amoswap~~~ a0,t1,0(s1) \\
      (b) & fence rw,w  & (e) & amoswap.aq x0,a1,0(s1) \\
      (c) & sd t1,0(s1) & (f) & ld~~~~~~~~ a1,0(s0)    \\
    \end{tabular}
  }
  ~~~~
  \diagram
  \caption{C/C++ release sequence example}
  \label{fig:relseq}
\end{figure}

In Figure~\ref{fig:relseq}, the original C/C++ source code has a ``synchronizes-with'' relationship from (c) to (e) via (d), where the latter is part of the ``release sequence'' of (c).
Therefore, RVWMO must somehow require (c) to appear before (e) in the global memory order.
Without rule~\ref{ppo:loadtoacq}, (c) would be ordered before (d), but (d) would {\em not} be ordered before (e) due to ``fri; rfi'' behavior (Chapter~\ref{sec:ppo:rdw}).
Rule~\ref{ppo:loadtoacq} therefore fixes the missing link by placing both the load and the store part of (d) before (e) in the global memory order.
\end{comment}

Release orderings work exactly the same as acquire orderings, just in the opposite direction.  Release semantics require all loads and stores in the critical section to appear before the lock-releasing store (here, an {\tt amoswap}) in the global memory order.  This can be enforced using the {\tt .rl} bit or with a {\tt fence rw,w} operations.  Likewise, the ordering between the loads and stores in the critical section and the ``random unrelated store'' at the end of the code snippet is enforced only by the {\tt fence rw,w} in the second example, not by the {\tt .rl} in the first example.
%Note that a corollary of rule~\ref{ppo:loadtoacq} is not needed for release operations because it would be redundant with rule~\ref{ppo:->st}.

By default, store-release-to-load-acquire ordering is not enforced.  This facilitates the porting of code written under the TSO and/or RCpc memory models; see Chapter~\ref{sec:porting} for details.
To enforce store-release-to-load-acquire ordering, use store-release-RCsc and load-acquire-RCsc operations, so that PPO rule \ref{ppo:strongacqrel} applies.
The use of only store-release-RCsc and load-acquire-RCsc operations implies sequential consistency, as the combination of PPO rules \ref{ppo:acquire}--\ref{ppo:strongacqrel} implies that all RCsc accesses will respect program order.

AMOs with both {\tt .aq} and {\tt .rl} set are fully-ordered operations.
Treating the load part and the store part as independent RCsc operations is not in and of itself sufficient to enforce full fencing behavior, but this subtle weak behavior is counterintuitive and not much of an advantage architecturally, especially with {\tt lr} and {\tt sc} also available.
For this reason, AMOs annotated with {\tt .aqrl} are strengthened to being fully-ordered under RVWMO.

%The RVWMO memory model rules do not place any explicit restrictions on whether atomics can forward values from stores still in a store buffer, as some particularly aggressive microarchitectures may do this at times.
%Such behavior is compatible with higher-level software memory model such as the one used by C/C++ if the mappings of Chapter~\ref{sec:porting} are used.
%However, such forwarding can be prevented manually if desired by placing a {\tt fence w,r,[addr]} between the store and the load in question.

\subsection{Dependencies (Rules~\ref{ppo:addr}--\ref{ppo:success})}
\label{sec:depspart1}
\begin{tabular}{p{1cm}|p{12cm}}
  & Rule \ref{ppo:addr}: \ppoaddr \\
  & Rule \ref{ppo:data}: \ppodata \\
  & Rule \ref{ppo:ctrl}: \ppoctrl \\
  & Rule \ref{ppo:success}: \pposuccess \\
\end{tabular}

Dependencies from a load to a later memory operation in the same hart are respected by the RVWMO memory model.
The Alpha memory model was notable for choosing {\em not} to enforce the ordering of such dependencies, but most modern hardware and software memory models consider allowing dependent instructions to be reordered too confusing and counterintuitive.
Furthermore, modern code sometimes intentionally uses such dependencies as a particularly lightweight ordering enforcement mechanism.

Like other modern memory models, the RVWMO memory model uses syntactic rather than semantic dependencies.
In other words, this definition depends on the identities of the
registers being accessed by different instructions, not the actual
contents of those registers.  This means that an address, control, or
data dependency must be enforced even if the calculation could seemingly
be ``optimized away''.
This choice ensures that RVWMO remains compatible with programmers that use these false syntactic dependencies intentionally to form a lightweight type of ordering mechanism.

For example, there is a syntactic address
dependency from the first instruction to the last instruction in the
Figure~\ref{fig:litmus:address}, even though {\tt a1} XOR {\tt a1} is zero and
hence has no effect on the address accessed by the second load.
\begin{verbbox}
ld  a1,0(s0)
xor a2,a1,a1
add s1,s1,a2
ld  a5,0(s1)
\end{verbbox}
\begin{figure}[h!]
  \centering\small
  \theverbbox
  \caption{A syntactic address dependency}
  \label{fig:litmus:address}
\end{figure}

The benefit of using dependencies as a lightweight synchronization mechanism is that the ordering enforcement requirement is limited only to the specific two instructions in question.
Other non-dependent instructions may be freely-reordered by aggressive implementations.
One alternative would be to use a load-acquire, but this would enforce ordering for the first load with respect to {\em all} subsequent instructions.
Another would be to use a {\tt fence r,r}, but this would include all previous and all subsequent loads, making this option each more expensive.

Control dependencies behave differently from address and data dependencies in the sense that a control dependency always extends to all instructions following the original target in program order.
Consider Figure~\ref{fig:litmus:control1}: the instruction at {\tt next} will always execute, but it nevertheless still has control dependency from the first instruction.
\begin{verbbox}
      lw  x1,0(x2)
      bne x1,x0,NEXT
      sw  x3,0(x4)
next: sw  x5,0(x6)
\end{verbbox}
\begin{figure}[h!]
  \centering\small
  \theverbbox
  \caption{A syntactic control dependency}
  \label{fig:litmus:control1}
\end{figure}

\begin{verbbox}
        lw  x1,0(x2)
        bne x1,x0,NEXT
  next: sw  x3,0(x4)
\end{verbbox}
\begin{figure}[h!]
  \centering\small
  \theverbbox
  \caption{Another syntactic control dependency}
  \label{fig:litmus:control2}
\end{figure}

Likewise, consider Figure~\ref{fig:litmus:control2}.
Even though both branch outcomes have the same target, there is still a control dependency from the first instruction in this snippet to the last.
This definition of control dependency is subtly stronger than what might be seen in other contexts (e.g., C++), but it conforms with standard definitions of control dependencies in the literature.

\begin{figure}[h!]
  \center
  {
    \tt\small
    \begin{tabular}{cl||cl}
    \multicolumn{2}{c}{Hart 0} & \multicolumn{2}{c}{Hart 1} \\
    \hline
      (a) & ld a0,0(s0)    & (e) & ld a3,0(s2) \\
      (b) & lr a1,0(s1)    & (f) & sd a3,0(s0) \\
      (c) & sc a2,a0,0(s1) &                    \\
      (d) & sd a2,0(s2)    &                    \\
    \end{tabular}
  }
  ~~~~
  \diagram
  \caption{A variant of the LB litmus test}
  \label{fig:litmus:successdeps}
\end{figure}

Finally, we highlight a unique new rule regarding the success registers written by store-conditional instructions.
In certain cases, without PPO rule \ref{ppo:success}, a store conditional could in theory be made to store its own success output value as its data, in a manner reminiscent of so-called out-of-thin-air behavior.
This is shown in Figure~\ref{fig:litmus:successdeps}.
Suppose a hypothetical implementation could occasionally make some early guarantee that a store-conditional operation will succeed.
In this case, (c) could return 0 to {\tt a2} early (before actually executing), allowing the sequence (d), (e), (f), (a), and then (b) to execute, and then (c) might execute (successfully) only at that point.
This would imply that (c) writes its own success value to {\tt 0(s1)}!

To rule out this bizarre behavior, PPO rule~\ref{ppo:success} says that store-conditional instructions may not return success or failure into the destination register until both the address and data for the instruction have been resolved.
In the example above, this would enforce an ordering from (a) to (d), and this would in turn form a cycle that rules out the strange proposed execution.

\subsection{Same-Address Load-Load Ordering (Rule~\ref{ppo:rdw})}
\label{sec:ppo:rdw}
\begin{tabular}{p{1cm}|p{12cm}}
  & Rule \ref{ppo:rdw}: \ppordw \\
\end{tabular}

In contrast to same-address orderings ending in a store, same-address load-load ordering requirements are very subtle.

The basic requirement is that a younger load must not return a value which is older than a value returned by an older load in the same hart to the same address.  This is often known as ``CoRR'' (Coherence for Read-Read pairs), or as part of a broader ``coherence'' or ``sequential consistency per location'' requirement.
Some architectures in the past have relaxed same-address load-load ordering, but in hindsight this is generally considered to complicate the programming model too much, and so RVWMO requires CoRR ordering to be enforced.
However, because the global memory order corresponds to the order in which loads perform rather than the ordering of the values being returned, capturing CoRR requirements in terms of the global memory order requires a bit of indirection.

\begin{figure}[h!]
  \center
  {
    \tt\small
    \begin{tabular}{cl||cl}
    \multicolumn{2}{c}{Hart 0} & \multicolumn{2}{c}{Hart 1} \\
    \hline
          & li t1, 1    &     & li~ t2, 2    \\
      (a) & sw t1,0(s0) & (d) & lw~ a0,0(s1) \\
      (b) & fence w, w  & (e) & sw~ t2,0(s1) \\
      (c) & sw t1,0(s1) & (f) & lw~ a1,0(s1) \\
          &             & (g) & xor t3,a1,a1 \\
          &             & (h) & add s0,s0,t3 \\
          &             & (i) & lw~ a2,0(s0) \\
    \end{tabular}
  }
  ~~~~
  \diagram
  \caption{Litmus test MP+FENCE+fri-rfi-addr}
  \label{fig:litmus:frirfi}
\end{figure}

Consider the litmus test of Figure~\ref{fig:litmus:frirfi}, which is one particular instance of the more general ``fri-rfi'' pattern.
The term ``fri-rfi'' refers to the sequence (d),(e),(f): (d) ``from-reads'' (i.e., reads from an earlier write than) (e) which is the same hart, and (f) reads from (e) which is in the same hart.

From a microarchitectural perspective, outcome {\tt a0=1, a1=2, a2=0} is legal (as are various other less subtle outcomes).  Intuitively, the following would produce the outcome in question:
\begin{itemize}
  \item (a), (b), (c) execute
  \item (d) stalls (for whatever reason; perhaps it's stalled waiting for some other preceding instruction)
  \item (e) executes and enters the store buffer
  \item (f) forwards from (e) in the store buffer
  \item (g), (h), and (i) execute
  \item (d) unstalls and executes
  \item (e) drains from the store buffer to memory
\end{itemize}
This corresponds to a global memory order of (e),(f),(i),(a),(c),(d).
Note that even though (f) performs before (d), the value returned by (f) is newer than the value returned by (d).
Therefore, this execution is legal and does not violate the CoRR requirements even though (f) appears before (d) in global memory order.

Likewise, if two back-to-back loads return the values written by the same store, then they may also appear out-of-order in the global memory order without violating CoRR.  Note that this is not the same as saying that the two loads return the same value, since two different stores may write the same value.   Consider the litmus test of Figure~\ref{fig:litmus:rsw}:

\begin{figure}[h!]
  \centering
  {
    \tt\small
    \begin{tabular}{cl||cl}
    \multicolumn{2}{c}{Hart 0} & \multicolumn{2}{c}{Hart 1} \\
    \hline
          & li t1, 1    & (d) & lw~ a0,0(s1) \\
      (a) & sw t1,0(s0) & (e) & xor t2,a0,a0 \\
      (b) & fence w, w  & (f) & add s2,s2,t2 \\
      (c) & sw t1,0(s1) & (g) & lw~ a1,0(s2) \\
          &             & (h) & lw~ a2,0(s2) \\
          &             & (i) & xor t3,a2,a2 \\
          &             & (j) & add s0,s0,t3 \\
          &             & (k) & lw~ a3,0(s0) \\
    \end{tabular}
  }
  ~~~~
  \diagram
  \caption{Litmus test RSW}
  \label{fig:litmus:rsw}
\end{figure}

The outcome {\tt a0=1,a1=a2,a3=0} can be observed by allowing (g) and (h) to be reordered.  This might be done speculatively, and the speculation can be justified by the microarchitecture (e.g., by snooping for cache invalidations and finding none) because replaying (h) after (g) would return the value written by the same store anyway.
Hence assuming {\tt a1=a2}, (g) and (h) can be reordered.
The global memory order corresponding to this execution would be (h),(k),(a),(c),(d),(g).

Executions of the above test in which {\tt a1} does not equal {\tt a2} do in fact require that (g) appears before (h) in the global memory order.
Allowing (h) to appear before (g) in the global memory order would in fact result in a violation of CoRR, because then (h) would return an older value than that returned by (g).
Therefore, PPO rule~\ref{ppo:rdw} forbids this CoRR violation from occurring.
As such, PPO rule~\ref{ppo:rdw} strikes a careful balance between enforcing CoRR in all cases while simultaneously being weak enough to permit ``RSW'' and ``fri-rfi'' patterns that commonly appear in real microarchitectures.


\subsection{Atomics and LR/SCs (Atomicity Axiom)}
\begin{tabular}{p{1cm}|p{12cm}} &
Atomicity axiom: \atomicityaxiom
\end{tabular}

The RISC-V architecture decouples the notion of atomicity from the notion of ordering.  Unlike architectures such as TSO, RISC-V atomics under RVWMO do not impose any ordering requirements by default.  Ordering semantics are only guaranteed by the PPO rules that otherwise apply.
This relaxed nature allows implementations to be aggressive about forwarding values even before a paired store has been committed to memory.

Roughly speaking, the atomicity rule states that there can be no store from another hart during the time the reservation is held.
For AMOs, the reservation is held as the AMO is being performed.
For successful {\tt lr}/{\tt sc} pairs, the reservation is held between the time the {\tt lr} is performed and the time the {\tt sc} is performed.
In most cases, the atomicity rule states that there can be no store from another hart between the load and its paired store in global memory order.

There is one exception, however: if the paired load returns its value from a store $s$ still in the store buffer (which some implementations may permit), then the reservation may not need to be acquired until $s$ is ready to leave the store buffer, and this may occur after the paired load has already performed.
Therefore, in this case, the requirement is only that no other store from another hart to an overlapping address can appear between time that $s$ performs and the time that the paired store performs.
Consider the example of Figure~\ref{fig:litmus:lateatomic}:

\begin{figure}[h!]
  \centering
  {
    \setlength{\tabcolsep}{2mm}
    \tt\footnotesize
    \begin{tabular}{cl||cl}
    \multicolumn{2}{c}{Hart 0} & \multicolumn{2}{c}{Hart 1} \\
    \hline
          & li~~~~~~ t1, 2       &     & li~~~~~~~~ t3, 2    \\
          & li~~~~~~ t2, 1       &     & li~~~~~~~~ t4, 1    \\
      (a) & sd~~~~~~ t1,0(s0)    & (d) & sd~~~~~~~~ t3,0(s1) \\
      (b) & amoor.aq a0,t2,0(s0) & (e) & amoswap.rl x0,t4,0(s0) \\
      (c) & sd~~~~~~ t2,0(s1)    &     &             \\
    \end{tabular}
  }
  ~~
  \diagram
  \caption{A litmus test where the reservation for {\tt 0(s0)} may not be acquired until after the load of (b) has already completed}
  \label{fig:litmus:lateatomic}
\end{figure}

The outcome {\tt 0(s0)=3, 0(s1)=2} is legal, with the global memory order of (b0),(c),(d),(e),(a),(b1), where (b0) and (b1) represent the load and store parts, respectively, of (b).
The atomic operation (b) does not need to grab the reservation until (a) is ready to leave the store buffer.
Therefore, although (e) is a store to the same address from another hart, and even though (e) lies between (b0) and (b1) in global memory order, this execution does not violate the atomicity axiom because (e) comes after (a) in global memory order.

\begin{verbbox}
(a) lr t0, 0(a0)
(b) sd t1, 0(a0)
(c) sc t2, 0(a0)
\end{verbbox}
\begin{figure}[h!]
  \centering\small
  \theverbbox
  \caption{Store-conditional (c) may succeed on some implementations}
  \label{fig:litmus:lrsdsc}
\end{figure}

The atomicity rule does not forbid loads from being interleaved between the paired operations in program order or in the global memory order, nor does it forbid stores from the same hart from appearing between the paired operations in either program order or in the global memory order.
For example, the sequence in Figure~\ref{fig:litmus:lrsdsc} is legal, and the {\tt sc} may (but is not guaranteed to) succeed.
By preserved program order rule \ref{ppo:->st}, the program order of the three operations must be maintained in the global memory order.  This does not violate the atomicity axiom, because the intervening non-conditional store is from the same hart as the paired load-reserved and store-conditional instructions.

\begin{figure}[h!]
  \centering
  {
    \setlength{\tabcolsep}{1mm}
    \tt\footnotesize
    \begin{tabular}{cl||cl}
    \multicolumn{2}{c}{Hart 0} & \multicolumn{2}{c}{Hart 1} \\
    \hline
          & li t0, 1              &     &             \\
      (a) & amoor.aq a0,t0,0(s0)  & (c) & amoadd.aq a1,x0,0(s1) \\
      (b) & sd~~~~~~ a0,0(s1)     & (d) & ld~~~~~~~ a2,0(s0) \\
    \end{tabular}
  }
  ~~
  \diagram
  \caption{The {\tt .aq} applies only to the load part of (a), and hence it does not order the store part of (a) before (b)}
  \label{fig:litmus:amoaq}
\end{figure}

Likewise, in the test of Figure~\ref{fig:litmus:amoaq}, the following global memory order could result in the outcome {\tt a1=1, a2=0}: (a0), (b), (c), (d), (a1).

Overall, the atomicity rule ensures that non-synchronization atomic operations (e.g., incrementing a counter) can be made as efficient as possible in high-performance implementations, while simultaneously ensuring that the atomicity conditions necessary for achieving consensus are maintained.


\begin{comment}
%\subsection{Atomics and Store Forwarding (Rules \ref{ppo:rmwrfi}--\ref{ppo:rfiaq})}
\subsection{Atomics and Store Forwarding (Rule \ref{ppo:rfiaq})}
\begin{tabular}{p{1cm}|p{12cm}}
%  & Rule \ref{ppo:rmwrfi}: \ppormwrfi \\
  & Rule \ref{ppo:rfiaq}: \pporfiaq \\
\end{tabular}

  There is one exception to the rule that ``fri-rfi'' reordering from Chapter~\ref{sec:ppo:rdw} is permitted: sequences in which the first memory access is part of an AMO or {\tt lr}/{\tt sc} pair and the second is a load with its {\tt .aq} bit set.  Rule~\ref{ppo:rfiaq} ensures compatibility with causality chains of the form ({\tt rf; ppo; rf; ppo;} $\dots$), and with C/C++ release sequences in particular.  Consider the following variant of test MP+FENCE+fri-rfi-addr (with labels reused from the earlier example):

\begin{center}
  \tt\footnotesize
  \begin{tabular}{cl||cl}
  \multicolumn{2}{c}{Hart 0} & \multicolumn{2}{c}{Hart 1} \\
  \hline
        & li t1, 1    &      & li    t2, 2    \\
    (a) & sw t1,0(s0) &      & li    t3, 1    \\
    (b) & fence w, w  & (de) & amoor t3,a0,0(s1) \\
    (c) & sw t1,0(s1) & (f)  & lw.aq a1,0(s1) \\
        &             & (i)  & lw    a2,0(s0) \\
  \end{tabular}
  ~~~~
  \begin{tabular}{cl||cl}
  \multicolumn{4}{c}{\rm Abstracted assembly} \\
  \\
  \multicolumn{2}{c}{Hart 0} & \multicolumn{2}{c}{Hart 1} \\
  \hline
        &            &      &            \\
    (a) & St [x], 1  &      &            \\
    (b) & Fence w,w  & (de) & AMOOr a0, 1, [y] \\ 
    (c) & St [y], 1  & (f)  & Ld.aq a1, [y] \\
        &            & (i)  & Ld a2, [x] \\
  \end{tabular}
\end{center}

Programmers (and C/C++) expect the causality chain from (a) to (c) to (de) to (f) to (i) to be enforced.
However, the PPO rules covered so far only enforce global memory ordering from (a) to (c) to (d) (the load of the AMOOr) to (e) (the store of the AMOOr), and from (f) to (i), but not from (e) to (f).
\ref{ppo:rfiaq} fills this missing link by ensuring that the ordering from (e) to (f) is respected, and hence that the entire ordering chain from (a) to (i) is respected.
\end{comment}


\subsection{Pipeline Dependency Artifacts (Rules~\ref{ppo:ld->st->ld}--\ref{ppo:addrpocfence})}
\label{sec:ppopipeline}
\begin{tabular}{p{1cm}|p{12cm}}
  & Rule \ref{ppo:ld->st->ld}: \ppoldstld \\
  & Rule \ref{ppo:addrpo}: \ppoaddrpo \\
  & Rule \ref{ppo:ctrlcfence}: \ppoctrlcfence \\
  & Rule \ref{ppo:addrpocfence}: \ppoaddrpocfence \\
\end{tabular}

These four ``compound dependency'' rules reflect behaviors of almost all real processor pipeline implementations, and they are added into the model explicitly to simplify the definition of the formal operational memory model and to improve compatibility with known patterns on other architectures.

\begin{figure}[h!]
  \centering
  {
    \tt\small
    \begin{tabular}{cl}
      (a) & lw a0, 0(s0)   \\
      (b) & sw a0, 0(s1)   \\
      (c) & lw a1, 0(s1)   \\
    \end{tabular}
  }
  ~~~~
  \diagram
  \caption{Because of the data dependency from (a) to (b), (a) is also ordered before (c)}
  \label{fig:litmus:addrdatarfi}
\end{figure}

Rule~\ref{ppo:ld->st->ld} states that a load forward from a store until the address and data for that store are known.
Consider Figure~\ref{fig:litmus:addrdatarfi}:
(c) cannot be executed until the data for (b) has been resolved, because (c) must return the value written by (b) (or by something even later in the global memory order).  Therefore, (c) will never execute before (a) has executed.

\begin{figure}[h!]
  \centering
  {
    \tt\small
    \begin{tabular}{cl}
          & li t1, 1       \\
      (a) & lw a0, 0(s0)   \\
      (b) & sw a0, 0(s1)   \\
          & sw t1, 0(s1)   \\
      (c) & lw a1, 0(s1)   \\
    \end{tabular}
  }
  ~~~~
  \diagram
  \caption{Because of the extra store between (b) and (c), (a) is no longer necessarily ordered before (c)}
  \label{fig:litmus:addrdatarfi_no}
\end{figure}

If there were another store to the same address in between (b) and (c), as in Figure~\ref{fig:litmus:addrdatarfi_no}, then (c) would no longer dependent on the data of (b) being resolved, and hence the dependency of (c) on (a), which produces the data for (b), would be broken.

One subtle related note is that {\tt amoswap} does not contain a data dependency from its load to its store.  Nor does every {\tt sc} have a data dependency on its paired {\tt lr}.
Therefore, Rule~\ref{ppo:ld->st->ld} does not enforce an ordering from paired loads of this category to subsequent loads to overlapping addresses.
%Rule~\ref{ppo:loadtoacq} is therefore not quite redundant with Rule~\ref{ppo:ld->st->ld}, even when the first load and the store in this example are paired.

Rule~\ref{ppo:addrpo} makes a similar observation to the previous rule: a store cannot be performed at memory until all previous loads which might access the same address have themselves been performed.
Such a load must appear to execute before the store, but it cannot do so if the store were to overwrite the value in memory before the load had a chance to read the old value.

\begin{figure}[h!]
  \centering
  {
    \tt\small
    \begin{tabular}{cl}
        & li t1, 1       \\
    (a) & lw a0, 0(s0)   \\
    (b) & lw a1, 0(a0)   \\
    (c) & sw t1, 0(s1)   \\
    \end{tabular}
  }
  ~~~~
  \diagram
  \caption{Because of the address dependency from (a) to (b), (a) is also ordered before (c)}
  \label{fig:litmus:addrpo}
\end{figure}

Consider Figure~\ref{fig:litmus:addrpo}:
(c) cannot be executed until the address for (b) is resolved, because it may turn out that the addresses match; i.e., that {\tt a0=s1}.  Therefore, (c) cannot be sent to memory before (a) has executed and confirmed whether the addresses to indeed overlap.

\begin{figure}[h!]
  \centering
  {
    \tt\small
    \begin{tabular}{rl}
    (a)       & ld a0, 0(s0) \\
              & xor a1,a0,a0 \\
              & bne a1, critical \\
    critical: & fence.i \\
    (c)       & ld a1, 0(s1) \\
    \end{tabular}
  }
  ~~~~
  \diagram
  \caption{Because of the control dependency from (a) to (c), (a) is also ordered before (c)}
  \label{fig:litmus:ctrlcfence}
\end{figure}

Rule~\ref{ppo:ctrlcfence} reflects the idiom of Figure~\ref{fig:litmus:ctrlcfence} for a lightweight acquire fence:
In this code snippet, (c) cannot execute until the {\tt fence.i} is cleared.  The {\tt fence.i} cannot clear until the branch has executed and drained.  The branch cannot execute until it receives the value from (a) through the {\tt xor}.  Therefore, (a) must be ordered before (c) in the global memory order.

\begin{figure}[h!]
  \centering
  {
    \tt\small
    \begin{tabular}{cl}
    (a)       & ld a0, 0(s0) \\
    (b)       & ld a1, 0(a0) \\
              & fence.i \\
    (c)       & ld a1, 0(s1) \\
    \end{tabular}
  }
  ~~~~
  \diagram
  \caption{Because of the address dependency from (a) to (b) and the {\tt fence.i} between (b) and (c), (a) is also ordered before (c)}
  \label{fig:litmus:addrpocfence}
\end{figure}

Rule~\ref{ppo:addrpocfence} and Figure~\ref{fig:litmus:addrpocfence} present a similar situation:
Once again, (c) cannot execute until the {\tt fence.i} is cleared.  The {\tt fence.i} cannot clear until both (a) and (b) have at least issued (even if they have not yet returned a value).  Finally, (b) cannot issue until it receives its address from (a).  Therefore, (a) must be ordered before (c).


\section{FENCE.I, SFENCE.VMA, and I/O Fences}

In this section, we provide an informal description of how the {\tt fence.i}, {\tt sfence.vma}, and I/O fences interact with the memory model.

Instruction fetches and address translation operations (where applicable) follow the RISC-V memory model as well as the rules below.
\begin{itemize}
  \item {\tt fence.i}: Conceptually, {\tt fence.i} ensures that no instructions following the {\tt fence.i} are issued until all instructions prior to the {\tt fence.i} have executed (but not necessarily performed globally).
    This implies that the fetch of each instruction following the {\tt fence.i} in program order appears later in the global memory order than all stores prior to the {\tt fence.i} in program order.
    That in turn means that instruction caches which hardware does not keep coherent with normal memory must be flushed when a {\tt fence.i} instruction is executed.
    ({\tt fence.i} is also used form the patterns of Chapter~\ref{sec:ppopipeline}.)
  \item {\tt sfence.vma}: Conceptually, the instruction fetch and address translation operations of each instruction following the {\tt sfence.vma} in program order appears later in the global memory order than all stores prior to the {\tt sfence.vma} in program order.
    This implies that stale entries in the local hart's TLBs must be invalidated.
  \item Conceptually, updates to the page table made by a hardware page table walker form a paired atomic read-modify-write operation subject to the rules of the atomicity axiom
\end{itemize}

\subsection{Coherence and Cacheability}

The RISC-V ISA defines Physical Memory Attributes (PMAs) which specify, among other things, whether portions of the address space are coherent and/or cacheable.
See the privileged spec for the complete details.
Here, we simply discuss how the various details in each PMA relate to the memory model:

\begin{itemize}
  \item Main memory vs.\@ I/O, and I/O memory ordering PMAs: the memory model as defined applies to main memory regions.  I/O ordering is discussed below.
  \item Supported access types and atomicity PMAs: the memory model is simply applied on top of whatever primitives each region supports.
  \item Coherence and cacheability PMAs: neither the coherence nor the cacheability PMAs affect the memory model.  The RISC-V privileged specification suggests that hardware-incoherent regions of main memory are discouraged, but the memory model is compatible with hardware coherence, software coherence, implicit coherence due to read-only memory, implicit coherence due to only one agent having access, or otherwise.  Likewise, non-cacheable regions may have more restrictive behavior than cacheable regions, but the set of allowed behaviors does not change regardless.
  \item Idempotency PMAs: Idempotency PMAs are used to specify memory regions for which loads and/or stores may have side effects, and this in turn is used by the microarchitecture to determine, e.g., whether prefetches are legal.  This distinction does not affect the memory model.
\end{itemize}


\subsection{I/O Ordering}

For I/O, the load value axiom and atomicity axiom in general do not apply, as both reads and writes might have device-specific side effects.
The preserved program order rules do not generally apply to I/O either.
Instead, we informally say that memory access $a$ is ordered before memory access $b$ if $a$ precedes $b$ in program order and one or more of the following holds:
\begin{enumerate}
  \item $a$ and $b$ are accesses to overlapping addresses in an I/O region
  \item $a$ and $b$ are accesses to the same strongly-ordered I/O region
  \item $a$ and $b$ are accesses to I/O regions, and the channel associated with the I/O region accessed by either $a$ or $b$ is channel 1
  \item $a$ and $b$ are accesses to I/O regions associated with the same channel (except for channel 0)
  \item $a$ and $b$ are separated in program order by a FENCE, $a$ is in the predecessor set of the FENCE, and $b$ is in the successor set of the FENCE.  The predecessor and successor sets include the sets described by all eight FENCE bits {\tt .pr}, {\tt .pw}, {\tt .pi}, {\tt .po}, {\tt .sr}, {\tt .sw}, {\tt .si}, and {\tt .so}.
  \item $a$ and $b$ are accesses to I/O regions, and $a$ has {\tt .aq} set
  \item $a$ and $b$ are accesses to I/O regions, and $b$ has {\tt .rl} set
  \item $a$ and $b$ are accesses to I/O regions, and $a$ and $b$ both have {\tt .aq} and {\tt .rl} set
  \item $a$ and $b$ are accesses to I/O regions, and $a$ is an AMO that has {\tt .aq} and {\tt .rl} set
  \item $a$ and $b$ are accesses to I/O regions, and $b$ is an AMO that has {\tt .aq} and {\tt .rl} set
\end{enumerate}

As described above, accesses to I/O memory require stronger synchronization that what is enforced by the RVWMO PPO rules.
For such cases, {\tt FENCE} operations with {\tt .pi}, {\tt .po}, {\tt .si}, and/or {\tt .so} are needed.
For example, to enforce ordering between a write to normal memory and an MMIO write to a device register, a {\tt FENCE w,o} or stronger is needed.
Even {\tt .aq} and {\tt .rl} do not enforce ordering between normal memory accesses and accesses to I/O memory.
When a fence is in fact used, implementations must assume that the device may attempt to access memory immediately after receiving the MMIO signal, and subsequent memory accesses from that device to memory must observe the effects of all accesses ordered prior to that MMIO operation.

\begin{verbbox}
  sd t0, 0(a0)
  fence w,o
  sd a0, 0(a1)
\end{verbbox}
\begin{figure}[h!]
  \centering\small
  \theverbbox
  \caption{Ordering memory and I/O accesses}
  \label{fig:litmus:wo}
\end{figure}

In other words, in Figure~\ref{fig:litmus:wo}, suppose {\tt 0(a0)} is in normal memory and {\tt 0(a1)} is the address of a device register in I/O memory.
If the device accesses {\tt 0(a0)} upon receiving the MMIO write, then that load must conceptually appear after the first store to {\tt 0(a0)} according to the rules of the RVWMO memory model.
In some implementations, the only way to ensure this will be to require that the first store does in fact complete before the MMIO write is issued.
Other implementations may find ways to be more aggressive, while others still may not need to do anything different at all for I/O and normal memory accesses.
Nevertheless, the RVWMO memory model does not distinguish between these options; it simply provides an implementation-agnostic mechanism to specify the orderings that must be enforced.

Many architectures include separate notions of ``ordering'' and ``completion'' fences, especially as it relates to I/O (as opposed to normal memory).
Ordering fences simply ensure that memory operations stay in order, while completion fences ensure that predecessor accesses have all completed before any successors are made visible.
RISC-V does not explicitly distinguish between ordering and completion fences.
Instead, this distinction is simply inferred from different uses of the FENCE bits.

For implementations that conform to the RISC-V Unix Platform Specification, I/O devices, DMA operations, etc.\@ are required to access memory coherently and via strongly-ordered I/O channels.
Therefore, accesses to normal memory regions that are shared with I/O devices can also use the standard synchronization mechanisms.
Implementations which do not conform to the Unix Platform Specification and/or in which devices do not access memory coherently will need to use platform-specific mechanisms (such as cache flushes) to enforce coherency.

I/O regions in the address space should be considered non-cacheable regions in the PMAs for those regions.  Such regions can be considered coherent by the PMA if they are not cached by any agent.

The ordering guarantees in this section may not apply beyond a platform-specific boundary between the RISC-V cores and the device.  In particular, I/O accesses sent across an external bus (e.g., PCIe) may be reordered before they reach their ultimate destination.  Ordering must be enforced in such situations according to the platform-specific rules of those external devices and buses.

\section{Code Examples}
\label{sec:mmcode}

\subsection{Compare and Swap}
An example
using {\tt lr}/{\tt sc} to implement a compare-and-swap function is shown in
Figure~\ref{cas}.  If inlined, compare-and-swap functionality need
only take three instructions.

\begin{figure}[h!]
\begin{center}
\begin{verbatim}
        # a0 holds address of memory location 
        # a1 holds expected value
        # a2 holds desired value
        # a0 holds return value, 0 if successful, !0 otherwise
    cas:
        lr.w t0, (a0)        # Load original value.
        bne t0, a1, fail     # Doesn't match, so fail.
        sc.w a0, a2, (a0)    # Try to update.
        jr ra                # Return.
    fail:
        li a0, 1             # Set return to failure.
        jr ra                # Return.
\end{verbatim}
\end{center}
  \caption{Sample code for compare-and-swap function using {\tt lr}/{\tt sc}.}
\label{cas}
\end{figure}

\subsection{Spinlocks}
\label{sec:spinlock}

An example code sequence for a critical section guarded by a
test-and-set spinlock is shown in Figure~\ref{critical}.  Note the
first AMO is marked {\tt .aq} to order the lock acquisition before the
critical section, and the second AMO is marked {\tt .rl} to order
the critical section before the lock relinquishment.

\begin{figure}[h!]
\begin{center}
\begin{verbatim}
        li           t0, 1        # Initialize swap value.
    again:
        amoswap.w.aq t0, t0, (a0) # Attempt to acquire lock.
        bnez         t0, again    # Retry if held.
        # ...
        # Critical section.
        # ...
        amoswap.w.rl x0, x0, (a0) # Release lock by storing 0.
\end{verbatim}
\end{center}
\caption{Sample code for mutual exclusion.  {\tt a0} contains the address of the lock.}
\label{critical}
\end{figure}

\section{Code Porting Guidelines}
\label{sec:porting}

Normal x86 loads and stores are all inherently acquire and release operations: TSO enforces all load-load, load-store, and store-store ordering by default.
All TSO loads must be mapped onto {\tt l\{b|h|w|d\}; fence r,rw}, and all TSO stores must either be mapped onto {\tt amoswap.rl x0} or onto {\tt fence rw,w; s\{b|h|w|d\}}.
Alternatively, TSO loads and stores can be mapped onto {\tt l\{b|h|w|d\}.aq} and {\tt s\{b|h|w|d\}.rl} assembler pseudoinstructions to facilitate forwards compatibility in case such instructions are added to the ISA one day.
However, in the meantime, the assembler will generate the same fence-based and/or {\tt amoswap}-based versions for these pseudoinstructions.
%However, the more correct solution to porting code from x86-TSO (which is generally overly-constrained at the assembly level compared to DRF software requirements) is to rewrite the algorithm to determine which orderings the original algorithm actually required, and then to re-code the algorithm in terms of the RVWMO memory model.
x86 atomics using the LOCK prefix are all sequentially consistent and when ported naively to RISC-V must be marked as {\tt .aqrl}.

A Power {\tt sync}/{\tt hwsync} fence, an ARM {\tt dmb} fence, and an x86 {\tt mfence} are all equivalent to a RISC-V {\tt fence rw,rw}.
Power {\tt isync} and ARM {\tt isb} map to RISC-V {\tt fence.i}.
A Power {\tt lwsync} map onto {\tt fence.tso}, or onto {\tt fence rw,rw} when {\tt fence.tso} is not available.
ARM {\tt dmb ld} and {\tt dmb st} fences map to RISC-V {\tt fence r,rw} and {\tt fence w,w}, respectively.

A direct mapping of ARMv8 atomics that maps unordered instructions to unordered instructions, RCpc instructions to RCpc instructions, and RCsc instructions to RCsc instructions is likely to work in the majority of cases.
Mapping even unordered load-reserved instructions onto {\tt lr.aq} (particularly for LR/SC pairs without internal data dependencies) is an even safer bet, as this ensures C/C++ release sequences will be respected.
However, due to a subtle mismatch between the two models, strict theoretical compatibility with the ARMv8 memory model requires that a naive mapping translate all ARMv8 store conditional and load-acquire operations map onto RISC-V RCsc operations.
Any atomics which are naively ported into RCsc operations may revert back to the straightforward mapping if the programmer can verify that the code is not relying on an ordering from the store-conditional to the load-acquire (as this is not common).

%ARMv8 solves the C/C++ release sequence problem of Chapter~\ref{sec:acqrel} through a rule that is different from rule~\ref{ppo:loadtoacq}.
%Therefore, strict formal compatibility requires naively-ported ARMv8 load-acquire operations to be preceded by a {\tt fence w,r,[addr]} or stronger.
%The naive translations of ARM {\tt ldapr}, {\tt ldar}, and {\tt stlr} are therefore ``{\tt fence w,r,[addr]; amoor.aq rd,[addr],x0}'', ``{\tt fence w,r,[addr]; amoor.aq.rl rd,[addr],x0}'' and ``{\tt amoswap.aq.rl} with {\tt rd=x0}'', respectively.
%In general the extra fence would not have been necessary if the original source were recompiled to RISC-V natively, because the RVWMO memory model already solves the same underlying problem just in a different way.
%Naive ports may choose whether to stick to a strict naive port or to assume that the (cheaper) mapping without the fence is more than likely sufficient.

The Linux fences {\tt smp\_mb()}, {\tt smp\_wmb()}, {\tt smp\_rmb()} map onto {\tt fence rw,rw}, {\tt fence w,w}, and {\tt fence r,r}, respectively.  The fence {\tt smp\_read\_barrier\_depends()} map to a no-op due to preserved program order rules \ref{ppo:addr}--\ref{ppo:ctrl}.
The Linux fences {\tt dma\_rmb()} and {\tt dma\_wmb()} map onto {\tt fence r,r} and {\tt fence w,w}, respectively, since the RISC-V Unix Platform requires coherent DMA.
The Linux fences {\tt rmb()}, {\tt wmb()}, and {\tt mb()} map onto {\tt fence ri,ri}, {\tt fence wo,wo}, and {\tt fence rwio,rwio}, respectively.

%\begin{table}[h!]
%  \begin{tabular}{|l|l|l|}
%    \hline
%    C/C++ Construct                            & Base ISA Mapping & `A' Extension Mapping \\
%    \hline
%    \hline
%    Non-atomic load                            & \multicolumn{2}{l|}{\tt ld}               \\
%    \hline
%    \tt atomic\_load(memory\_order\_relaxed)   & \multicolumn{2}{l|}{\tt ld}               \\
%    \hline
%    %\tt atomic\_load(memory\_order\_consume)   & \multicolumn{2}{l|}{\tt ld; fence r,rw}   \\
%    %\hline
%    \tt atomic\_load(memory\_order\_acquire)   & \tt fence r,r,[addr]; & \tt ld.aq   \\
%                                               & \tt ld; fence r,rw    &                       \\
%    \hline
%    \tt atomic\_load(memory\_order\_seq\_cst)  & \tt fence rw,rw; ld; & \tt ld.aq.rl rs2=x0   \\
%                                               & \tt fence r,rw       &                       \\
%    \hline
%    \hline
%    Non-atomic store                           & \multicolumn{2}{l|}{\tt sd}               \\
%    \hline
%    \tt atomic\_store(memory\_order\_relaxed)  & \multicolumn{2}{l|}{\tt sd}               \\
%    \hline
%    \tt atomic\_store(memory\_order\_release)  & \tt fence rw,w; sd  & \tt sd.rl x0   \\
%    \hline
%    \tt atomic\_store(memory\_order\_seq\_cst) & \tt fence rw,rw; sd & \tt sd.aq.rl x0  \\
%    \hline
%    \hline
%    \tt atomic\_thread\_fence(memory\_order\_acquire)  & \multicolumn{2}{l|}{\tt fence r,rw} \\
%    \hline
%    \tt atomic\_thread\_fence(memory\_order\_release)  & \multicolumn{2}{l|}{\tt fence rw,w} \\
%    \hline
%    \tt atomic\_thread\_fence(memory\_order\_acq\_rel) & \multicolumn{2}{l|}{{\tt fence rw,rw~} or {~\tt fence rw,w; fence r,rw}} \\
%    \hline
%    \tt atomic\_thread\_fence(memory\_order\_seq\_cst) & \multicolumn{2}{l|}{\tt fence rw,rw} \\
%    \hline
%  \end{tabular}
%  \caption{Mappings from C/C++ primitives to RISC-V primitives.  The atomics mapping is preferable where available.}
%  \label{tab:mappings}
%\end{table}

\begin{table}[h!]
  \begin{tabular}{|l|l|}
    \hline
    C/C++ Construct                            & RVWMO Mapping \\
    \hline
    \hline
    Non-atomic load                            & \tt l\{b|h|w|d\}               \\
    \hline
    \tt atomic\_load(memory\_order\_relaxed)   & \tt l\{b|h|w|d\}               \\
    \hline
    %\tt atomic\_load(memory\_order\_consume)   & \multicolumn{2}{l|}{\tt l\{b|h|w|d\}; fence r,rw}   \\
    %\hline
    \tt atomic\_load(memory\_order\_acquire)   & \tt l\{b|h|w|d\}; fence r,rw    \\
    \hline
    \tt atomic\_load(memory\_order\_seq\_cst)  & \tt fence rw,rw; l\{b|h|w|d\}; fence r,rw       \\
    \hline
    \hline
    Non-atomic store                           & \tt s\{b|h|w|d\}               \\
    \hline
    \tt atomic\_store(memory\_order\_relaxed)  & \tt s\{b|h|w|d\}               \\
    \hline
    \tt atomic\_store(memory\_order\_release)  & \tt fence rw,w; s\{b|h|w|d\}  \\
    \hline
    \tt atomic\_store(memory\_order\_seq\_cst) & \tt fence rw,rw; s\{b|h|w|d\} \\
    \hline
    \hline
    \tt atomic\_thread\_fence(memory\_order\_acquire)  & \tt fence r,rw \\
    \hline
    \tt atomic\_thread\_fence(memory\_order\_release)  & \tt fence rw,w \\
    \hline
    \tt atomic\_thread\_fence(memory\_order\_acq\_rel) & {\tt fence.tso} \\
    \hline
    \tt atomic\_thread\_fence(memory\_order\_seq\_cst) & \tt fence rw,rw \\
    \hline
  \end{tabular}
  \caption{Mappings from C/C++ primitives to RISC-V primitives.} %  The atomics mapping is preferable where available.}
  \label{tab:mappings}
\end{table}

The C11/C++11 {\tt memory\_order\_*} primitives should be mapped as shown in Table~\ref{tab:mappings}.
The {\tt memory\_order\_acquire} orderings in particular must use fences rather than atomics to ensure that release sequences behave correctly even in the presence of {\tt amoswap}.
The {\tt memory\_order\_release} mappings may use {\tt .rl} as an alternative.

\begin{table}[h!]
\centering
  \begin{tabular}{|l|l|}
    \hline
    Ordering Annotation & Fence-based Equivalent \\
    \hline
    \tt l\{b|h|w|d|r\}.aq        & \tt l\{b|h|w|d|r\}; fence r,rw \\
    \hline
    \tt l\{b|h|w|d|r\}.aqrl      & \tt fence rw,rw; l\{b|h|w|d|r\}; fence r,rw \\
    \hline
    \tt s\{b|h|w|d|c\}.rl        & \tt fence rw,w; s\{b|h|w|d|c\} \\
    \hline
    \tt s\{b|h|w|d|c\}.aqrl      & \tt fence rw,w; s\{b|h|w|d|c\} \\
    \hline
    \tt amo<op>.aq               & \tt amo<op>; fence r,rw \\
    \hline
    \tt amo<op>.rl               & \tt fence rw,w; amo<op> \\
    \hline
    \tt amo<op>.aqrl             & \tt fence rw,rw; amo<op>; fence rw,rw \\
    \hline
  \end{tabular}
  \caption{Mappings from {\tt .aq} and/or {\tt .rl} to fence-based equivalents.  An alternative mapping places a {\tt fence rw,rw} after the existing {\tt s\{b|h|w|d|c\}} mapping rather than at the front of the {\tt l\{b|h|w|d|r\}} mapping.}
  \label{tab:aqrltofence}
\end{table}

It is also safe to translate any {\tt .aq}, {\tt .rl}, or {\tt .aqrl} annotation into the fence-based snippets of Table~\ref{tab:aqrltofence}.
These can also be used as a legal implementation of {\tt l\{b|h|w|d\}} or {\tt s\{b|h|w|d\}} pseudoinstructions for as long as those instructions are not added to the ISA.

\section{Implementation Guidelines}

The RVWMO and RVTSO memory models by no means preclude microarchitectures from employing sophisticated speculation techniques or other forms of optimization in order to deliver higher performance.
The models also do not impose any requirement to use any one particular cache hierarchy, nor even to use a cache coherence protocol at all.
Instead, these models only specify the behaviors that can be exposed to software.
Microarchitectures are free to use any pipeline design, any coherent or non-coherent cache hierarchy, any on-chip interconnect, etc., as long as the design satisfy the memory model rules.
That said, to help people understand the actual implementations of the memory model, in this section we provide some guidelines below on how architects and programmers should interpret the models' rules.

Both RVWMO and RVTSO are multi-copy atomic (or ``other-multi-copy-atomic''): any store value which is visible to a hart other than the one that originally issued it must also be conceptually visible to all other harts in the system.
In other words, harts may forward from their own previous stores before those stores have become globally visible to all harts, but no other early intra-hart forwarding is permitted.
Multi-copy atomicity may be enforced in a number of ways.
It might hold inherently due to the physical design of the caches and store buffers, it may be enforced via a single-writer/multiple-reader cache coherence protocol, or it might hold due to some other mechanism.

Although multi-copy atomicity does impose some restrictions on the microarchitecture, it is one of the key properties keeping the memory model from becoming extremely complicated.
For example, a hart may not legally forward a value from a neighbor hart's private store buffer, unless those two harts are the only two in the system.
Nor may a cache coherence protocol forward a value from one hart to another until the coherence protocol has invalidated all older copies from other caches.
Of course, microarchitectures may (and high-performance implementations likely will) violate these rules under the covers through speculation or other optimizations, as long as any non-compliant behaviors are not exposed to the programmer.

As a rough guideline for interpreting the PPO rules in RVWMO, we expect the following from the software perspective:
\begin{itemize}
  \item programmers will use PPO rules \ref{ppo:->st}--\ref{ppo:amoload} regularly and actively.
  \item expert programmers will use PPO rules \ref{ppo:addr}--\ref{ppo:ctrl} to speed up critical paths of important data structures.
  %\item expert programmers will occasionally use PPO rules \ref{ppo:rdw}--\ref{ppo:rfiaq} in very aggressive code and/or as part of a longer chain of synchronization.
  \item even expert programmers will rarely if ever use PPO rules \ref{ppo:success}--\ref{ppo:addrpocfence} directly.  These are included to facilitate common microarchitectural optimizations (rule~\ref{ppo:rdw}) and the operational formal modeling approach (rules \ref{ppo:success} and \ref{ppo:ld->st->ld}--\ref{ppo:addrpocfence}) described in Chapter~\ref{sec:operational}.  They also facilitate the process of porting code from other architectures which have similar rules.
\end{itemize}

We also expect the following from the hardware perspective:
\begin{itemize}
  \item PPO rules \ref{ppo:->st}--\ref{ppo:release} and \ref{ppo:amostore}--\ref{ppo:amoload} reflect well-understood rules that should pose few surprises to architects.
  \item PPO rule \ref{ppo:strongacqrel} may not be immediately obvious to architects, but is somewhat standard nevertheless
  \item The load value axiom, the atomicity axiom, and PPO rules \ref{ppo:addr}--\ref{ppo:ctrl} and \ref{ppo:ld->st->ld}--\ref{ppo:addrpocfence} reflect rules that most hardware implementations will enforce naturally, unless they contain extreme optimizations.  Of course, implementations should make sure to double check these rules nevertheless.  Hardware must also ensure that syntactic dependencies are not ``optimized away''.
  %\item PPO rules \ref{ppo:strongacqrel} and \ref{ppo:rfiaq} may not be obvious or intuitive, and hence they deserve particular attention.
  %\item PPO rules \ref{ppo:strongacqrel} and \ref{ppo:rmwrfi}--\ref{ppo:rfiaq} may not be obvious or intuitive, and hence they deserve particular attention.
  \item PPO rule \ref{ppo:success} is not obvious, but it is necessary to avoid certain out-of-thin-air-like behavior that appears with store-conditional success values
  \item PPO rule \ref{ppo:rdw} reflects a natural and common hardware optimization, but one that is very subtle and hence is worth double checking carefully.
\end{itemize}

Architectures are free to implement any of the memory model rules as conservatively as they choose.  For example, a hardware implementation may choose to do any or all of the following:
  \begin{itemize}
    \item interpret all fences as if they were {\tt fence rw,rw} (or {\tt fence iorw,iorw}, if I/O is involved), regardless of the bits actually set
    \item implement all fences with {\tt .pw} and {\tt .sr} as if they were {\tt fence~rw,rw} (or {\tt fence~iorw,iorw}, if I/O is involved), as ``{\tt w,r}'' is the most expensive of the four possible normal memory orderings anyway
    \item ignore any addresses passed to a fence instruction and simply implement the fence for all addresses
    \item implement an instruction with {\tt .aq} set as being preceded immediately by {\tt fence r,rw}
    \item implement an instruction with {\tt .rl} set as being succeeded immediately by {\tt fence rw,w}
    \item enforcing all same-address load-load ordering, even in the presence of patterns such as ``fri-rfi'' and ``RSW''
    \item forbid any forwarding of a value from a store in the store buffer to a subsequent AMO or {\tt lr} to the same address
    \item forbid any forwarding of a value from an AMO or {\tt sc} in the store buffer to a subsequent load to the same address
    \item implement TSO on all memory accesses, and ignore any normal memory fences that do not include ``{\tt w,r}'' ordering
    \item implement all atomics to be RCsc; i.e., always enforce all store-release-to-load-acquire ordering
  \end{itemize}
%PPO rules~\ref{ppo:ld->st->ld}--\ref{ppo:addrpocfence} are not intended to impose any ordering requirements onto a processor pipeline beyond constraints which arise naturally, but extremely-optimized pipelines should be careful not to violate these rules nevertheless (or to ensure that any speculation-based optimizations do not make illegal behaviors visible to software). 

Architectures which implement RVTSO can safely do the following:
\begin{itemize}
  \item Ignore all {\tt .aq} and {\tt .rl} bits, since these are implicitly always set under RVTSO.  ({\tt .aqrl} cannot be ignored, however, due to PPO rules \ref{ppo:strongacqrel}--\ref{ppo:amoload}.)
  \item Ignore all fences which do not have both {\tt .pw} and {\tt .sr} (unless the fence also orders I/O)
  \item Ignore PPO rules \ref{ppo:->st} and \ref{ppo:addr}--\ref{ppo:addrpocfence}, since these are redundant with other PPO rules under RVTSO assumptions
\end{itemize}

Other general notes:

\begin{itemize}
  \item Silent stores (i.e., stores which write the same value that already exists at a memory location) do not have any special behavior from a memory model point of view.  Microarchitectures that attempt to implement silent stores must take care to ensure that the memory model is still obeyed, particularly in cases such as RSW (Chapter~\ref{sec:ppo:rdw}) which tend to be incompatible with silent stores.
  \item Writes may be merged (i.e., two consecutive writes to the same address may be merged) or subsumed (i.e., the earlier of two back-to-back writes to the same address may be elided) as long as the resulting behavior does not otherwise violate the memory model semantics.
\end{itemize}

The question of write subsumption can be understood from the following example:
\begin{figure}[h!]
  \centering
  {
    \tt\small
    \begin{tabular}{cl||cl}
    \multicolumn{2}{c}{Hart 0} & \multicolumn{2}{c}{Hart 1} \\
    \hline
        & li t1, 3    &     & li  t3, 2    \\
        & li t2, 1    &     &              \\
    (a) & sw t1,0(s0) & (d) & lw  a0,0(s1) \\
    (b) & fence w, w  & (e) & sw  a0,0(s0) \\
    (c) & sw t2,0(s1) & (f) & lw  t3,0(s0) \\
    \end{tabular}
  }
  ~~~~
  \diagram
  \caption{Write subsumption litmus test}
  \label{fig:litmus:subsumption}
\end{figure}

As written, (a) must follow (f) in the global memory order:
\begin{itemize}
  \item (a) follows (c) in the global memory order because of rule 2
  \item (c) follows (d) in the global memory order because of the Load Value axiom
  \item (d) follows (e) in the global memory order because of rule 7
  \item (e) follows (f) in the global memory order because of rule 1
\end{itemize}

A very aggressive microarchitecture might erroneously decide to discard (e), as (f) supersedes it, and this may in turn lead the microarchitecture to break the now-eliminated dependency between (d) and (f) (and hence also between (a) and (f)).
This would violate the memory model rules, and hence it is forbidden.
Write subsumption may in other cases be legal, if for example there were no data dependency between (d) and (e).

\section{Summary of New/Modified ISA Features}

At a high level, PPO rules \ref{ppo:strongacqrel}, \ref{ppo:success}, and \ref{ppo:ld->st->ld}--\ref{ppo:addrpocfence} are all new rules that did not exist in the original ISA spec.  Rule~\ref{ppo:rdw} and the specifics of the atomicity axiom were addressed but not stated in detail.

Other new or modified ISA details:
\begin{itemize}
  \item There is an RCpc ({\tt .aq} and {\tt .rl}) vs.\@ RCsc ({\tt .aqrl}) distinction
  \item Load-release and store-acquire are deprecated
  \item {\tt lr}/{\tt sc} behavior was clarified
  %\item Fences reserve two bits for platform-specific use
\end{itemize}

\subsection{Possible Future Extensions}

We expect that any or all of the following possible future extensions would be compatible with the RVWMO memory model:

\begin{itemize}
  \item `V' vector ISA extensions
  \item A transactional memory subset of the `T' ISA extension
  \item `J' JIT extension
  \item Native encodings for {\tt l\{b|h|w|d\}.aq}/{\tt s\{b|h|w|d\}.rl}
  \item Fences limited to certain addresses
  \item Cache writeback/flush/invalidate/etc.\@ hints, but these should be considered hints, not functional requirements.  Any cache management operations which are required for basic correctness should be described as (possibly address range-limited) fences to comply with the RISC-V philosophy (see also {\tt fence.i} and {\tt sfence.vma}).  For example, a functional cache writeback instruction might instead be written as ``{\tt fence~rw[addr],w[addr]}''.
\end{itemize}

\section{Litmus Tests}

These litmus tests represent some of the better-known litmus tests in the field, plus some tests that are randomly-generated, plus some tests that are generated to be particularly relevant to the RVWMO memory model.

All will be made available for download once they are generated.

We expect that these tests will one day serve as part of a compliance test suite, and we expect that many architects will use them for verification purposes as well.

COMING SOON!

\chapter{Formal Memory Model Specifications}

\begin{commentary}
  To facilitate formal analysis of RVWMO, we present a set of formalizations in this chapter.  Any discrepancies are unintended; the expectation is that the models will describe exactly the same sets of legal behaviors, pending some memory model changes that have not quite been added to all of the formalizations yet.

  As such, these formalizations should be considered snapshots from some point in time during the development process rather than finalized specifications.

  At this point, no individual formalization is considered authoritative, but we may designate one as such in collaboration with the ISA specification and/or formalization task groups.
\end{commentary}

\section{Formal Axiomatic Specification in Alloy}
\label{sec:alloy}

We present two formal specifications of the RVWMO memory model in Alloy (\url{http://alloy.mit.edu}).

The first corresponds directly to the natural language model earlier in this chapter.

\begin{figure}[h!]
  {
  \tt\bfseries\centering\footnotesize
  \begin{lstlisting}
////////////////////////////////////////////////////////////////////////////////
// =RISC-V RVWMO axioms=

// Preserved Program Order
fun ppo : Event->Event {
  // same-address ordering
  po_loc :> Store

  // explicit synchronization
  + ppo_fence
  + Load.aq <: ^po
  + ^po :> Store.rl
  + Store.aq.rl <: ^po :> Load.aq.rl
  + ^po :> Load.sc
  + Store.sc <: ^po

  // dependencies
  + addr
  + data
  + ctrl :> Store
  + (addr+data).successdep

  // CoRR
  + rdw & po_loc_no_intervening_write

  // pipeline dependency artifacts
  + (addr+data).rfi
  + addr.^po :> Store
  + ctrl.(FenceI <: ^po)
  + addr.^po.(FenceI <: ^po)
}

// the global memory order respects preserved program order
fact { ppo in gmo }
\end{lstlisting}}
  \caption{The RVWMO memory model formalized in Alloy (1/4: PPO)}
  \label{fig:alloy1}
\end{figure}
\begin{figure}[h!]
  {
  \tt\bfseries\centering\footnotesize
  \begin{lstlisting}
// Load value axiom
fun candidates[r: Load] : set Store {
  (r.~gmo & Store & same_addr[r]) // writes preceding r in gmo
  + (r.^~po & Store & same_addr[r]) // writes preceding r in po
}

fun latest_among[s: set Event] : Event { s - s.~gmo }

pred LoadValue {
  all w: Store | all r: Load |
    w->r in rf <=> w = latest_among[candidates[r]]
}

fun after_reserve_of[r: Load] : Event { latest_among[r + r.~rf].gmo }

pred Atomicity {
  all r: Store.~rmw |               // starting from the read r of an atomic,
    no x: Store & same_addr[r + r.rmw] | // there is no write x to the same addr
      x not in same_hart[r]         // from a different hart, such that
      and x in after_reserve_of[r]  // x follows (the write r reads from) in gmo
      and r.rmw in x.gmo            // and r follows x in gmo
}

pred RISCV_mm { LoadValue and Atomicity }
\end{lstlisting}}
  \caption{The RVWMO memory model formalized in Alloy (2/4: Axioms)}
  \label{fig:alloy2}
\end{figure}
\begin{figure}[h!]
  {
  \tt\bfseries\centering\footnotesize
  \begin{lstlisting}
////////////////////////////////////////////////////////////////////////////////
// Basic model of memory

sig Hart {  // hardware thread
  start : one Event
}
sig Address {}
abstract sig Event {
  po: lone Event // program order
}

abstract sig MemoryEvent extends Event {
  address: one Address,
  aq: lone MemoryEvent, // opcode bit
  rl: lone MemoryEvent, // opcode bit
  sc: lone MemoryEvent, // for AMOs with .aq and .rl, to distinguish from lr/sc
  gmo: set MemoryEvent   // global memory order
}
sig Load extends MemoryEvent {
  addr: set Event,
  ctrl: set Event,
  data: set Store,
  successdep: set Event,
  rmw: lone Store
}
sig Store extends MemoryEvent {
  rf: set Load
}
sig Fence extends Event {
  pr: lone Fence, // opcode bit
  pw: lone Fence, // opcode bit
  sr: lone Fence, // opcode bit
  sw: lone Fence  // opcode bit
}
sig FenceI extends Event {}
   
// FENCE PPO
fun FencePRSR : Fence { Fence.(pr & sr) } 
fun FencePRSW : Fence { Fence.(pr & sw) } 
fun FencePWSR : Fence { Fence.(pw & sr) } 
fun FencePWSW : Fence { Fence.(pw & sw) } 

fun ppo_fence : MemoryEvent->MemoryEvent {
    (Load  <: ^po :> FencePRSR).(^po :> Load)
  + (Load  <: ^po :> FencePRSW).(^po :> Store)
  + (Store <: ^po :> FencePWSR).(^po :> Load)
  + (Store <: ^po :> FencePWSW).(^po :> Store)
}
\end{lstlisting}}
  \caption{The RVWMO memory model formalized in Alloy (3/4: model of memory)}
  \label{fig:alloy3}
\end{figure}
\begin{figure}[h!]
  {
  \tt\bfseries\centering\footnotesize
  \begin{lstlisting}
// auxiliary definitions
fun po_loc_no_intervening_write : MemoryEvent->MemoryEvent {
  po_loc - ((po_loc :> Store).po_loc)
}

fun RFInit : Load { Load - Store.rf }
fun rsw : Load->Load { ~rf.rf + (RFInit <: address.~address :> RFInit) }
fun rdw : Load->Load { (Load <: po_loc :> Load) - rsw }

fun po_loc : Event->Event { ^po & address.~address }
fun same_hart[e: Event] : set Event { e + e.^~po + e.^po }
fun same_addr[e: Event] : set Event { e.address.~address }

// basic facts about well-formed execution candidates
fact { acyclic[po] }
fact { all e: Event | one e.*~po.~start }  // each event is in exactly one hart
fact { rf.~rf in iden } // each read returns the value of only one write
fact { total[gmo, MemoryEvent] } // gmo is a total order over all MemoryEvents

//rf
fact { rf in address.~address }
fun rfi : Store->Load { Store <: po_loc_no_intervening_write :> Load }

//dep
fact { addr + ctrl + data in ^po }
fact { successdep in (Write.~rmw) <: ^po }
fact { ctrl.*po in ctrl }
fact { rmw in ^po }

////////////////////////////////////////////////////////////////////////////////
// =Opcode encoding restrictions=

// opcode bits are either set (encoded, e.g., as f.pr in iden) or unset
// (f.pr not in iden).  The bits cannot be used for anything else
fact { pr + pw + sr + sw + aq + rl + sc in iden }
fact { sc in aq + rl }
fact { Load.sc.rmw in Store.sc and Store.sc.~rmw in Load.sc }

// Fences must have either pr or pw set, and either sr or sw set
fact { Fence in Fence.(pr + pw) & Fence.(sr + sw) }

// there is no write-acquire, but there is write-strong-acquire
fact { Store & Acquire in Release }
fact { Load & Release in Acquire }

////////////////////////////////////////////////////////////////////////////////
// =Alloy shortcuts=
pred acyclic[rel: Event->Event] { no iden & ^rel }
pred total[rel: Event->Event, bag: Event] {
  all disj e, e': bag | e->e' in rel + ~rel
  acyclic[rel]
}
\end{lstlisting}}
  \caption{The RVWMO memory model formalized in Alloy (4/4: Auxiliaries)}
  \label{fig:alloy4}
\end{figure}

\clearpage
The second is an equivalent formulation which is slightly more complex but which is more computationally efficient.  We expect that analysis tools will be built off of this second formulation.  Also included are empirical checks that the two models match.

This formulation, however, does not apply when mixed-size accesses are used, nor when {\tt lr}/{\tt sc} to different addresses are used.

\begin{figure}[h!]
  {
  \tt\bfseries\centering\footnotesize
  \begin{lstlisting}
// coherence order: a total order on the writes to each address
fun co : Write->Write { Write <: ((address.~address) & gmo) :> Write }
// from-read: from a read to the coherence successors of the rf-source of the write
fun fr : Read->Write { ~rf.co + ((Read - Write.rf) <: address.~address :> Write) }

// e = external; i.e., from a different hart
fun rfe : Store->Load  { rf - iden - ^po - ^~po }
fun coe : Store->Store { co - iden - ^po - ^~po }
fun fre :  Load->Store { fr - iden - ^po - ^~po }

pred sc_per_location  { acyclic[rf + co + fr + po_loc] }
pred atomicity { no rmw & fre.coe }
pred causality { acyclic[rfe + co + fr + ppo] }

// equality checks
run RISCV_mm_com_sanity { RISCV_mm_com } for 3
check RISCV_mm_gmo_com { RISCV_mm => RISCV_mm_com } for 6
check RISCV_mm_com_gmo {
  rmw in address.~address => // the rf/co/fr model assumes rmw in same addr
  RISCV_mm_com =>
  rfe + co + fr in gmo =>    // pick a gmo which matches rfe+co+fr
  RISCV_mm
} for 6
\end{lstlisting}
  }
  \caption{An alternative, more computationally efficient but less complete axiomatic definition}
  \label{fig:com}
\end{figure}

\clearpage
\section{Formal Axiomatic Specification in Herd}

See also: \url{http://moscova.inria.fr/~maranget/cats7/riscv}

(This herd model is not yet updated to account for rules \ref{ppo:amostore}--\ref{ppo:amoload} and \ref{ppo:success}, and rule {\tt r4} has been tentatively removed from RVWMO.  Updates to come...)

\begin{figure}[h!]
  {
  \tt\bfseries\centering\footnotesize
  \begin{lstlisting}
(*************)
(* Utilities *)
(*************)

let fence.r.r = [R];fencerel(Fence.r.r);[R]
let fence.r.w = [R];fencerel(Fence.r.w);[W]
let fence.r.rw = [R];fencerel(Fence.r.rw);[M]
let fence.w.r = [W];fencerel(Fence.w.r);[R]
let fence.w.w = [W];fencerel(Fence.w.w);[W]
let fence.w.rw = [W];fencerel(Fence.w.rw);[M]
let fence.rw.r = [M];fencerel(Fence.rw.r);[R]
let fence.rw.w = [M];fencerel(Fence.rw.w);[W]
let fence.rw.rw = [M];fencerel(Fence.rw.rw);[M]

let fence = 
  fence.r.r | fence.r.w | fence.r.rw |
  fence.w.r | fence.w.w | fence.w.rw |
  fence.rw.r | fence.rw.w | fence.rw.rw


let po-loc-no-w = po-loc \ (po-loc;[W];po-loc)
let rsw = rf^-1;rf

let LD-ACQ = R & (Acq|AcqRel)
and ST-REL = W & (Rel|AcqRel)

(*************)
(* ppo rules *)
(*************)

let r1 = [M];po-loc;[W]
and r2 = fence
and r3 = [LD-ACQ];po;[M]
and r4 = [R];po-loc;[LD-ACQ]
and r5 = [M];po;[ST-REL]
and r6 = [W & AcqRel];po;[R & AcqRel]
and r7 = [R];addr;[M]
and r8 = [R];data;[W]
and r9 = [R];ctrl;[W]
and r10 = ([R];po-loc-no-w;[R]) \ rsw
and r11 = [R];(addr|data);[W];po-loc-no-w;[R]
and r12 = [R];addr;[M];po;[W]
and r13 = [R];ctrl;[Fence.i];po;[R]
and r14 = [R];addr;[M];po;[Fence.i];po;[M]

let ppo =
 r1
| r2
| r3
| r4
| r5
| r6
| r7
| r8
| r9
| r10
| r11
| r12
| r13
| r14
\end{lstlisting}
  }
  \caption{{\tt riscv-defs.cat}, part of a herd version of the RVWMO memory model (1/3)}
  \label{fig:herd1}
\end{figure}

\begin{figure}[h!]
  {
  \tt\bfseries\centering\footnotesize
  \begin{lstlisting}
Total

(* Notice that herd has defined its own rf relation *)

(* Define ppo *)
include "riscv-defs.cat"

(********************************)
(* Generate global memory order *)
(********************************)

let gmo0 = (* precursor: ie build gmo as an total order that include gmo0 *)
  loc & (W\FW) * FW | # Final write before any write to the same location
  ppo |               # ppo compatible
  rfe                 # first half of 

(* Walk over all linear extensions of gmo0 *)
with  gmo from linearisations(M\IW,gmo0)

(* Add initial writes upfront -- convenient for computing rfGMO *)
let gmo = gmo | loc & IW * (M\IW)

(**********)
(* Axioms *)
(**********)

(* Compute rf according to the load value axiom, aka rfGMO *)
let WR = loc & ([W];(gmo|po);[R])
let rfGMO = WR \ (loc&([W];gmo);WR)

(* Check equality of herd rf and of rfGMO *)
empty (rf\rfGMO)|(rfGMO\rf) as RfCons

(* Atomic axion *)
let infloc = (gmo & loc)^-1
let inflocext = infloc & ext

let winside  = (infloc;rmw;inflocext) & (infloc;rf;rmw;inflocext) & [W]
empty winside as Atomic
\end{lstlisting}
  }
  \caption{{\tt riscv.cat}, a herd version of the RVWMO memory model (2/3)}
  \label{fig:herd2}
\end{figure}

\begin{figure}[h!]
  {
  \tt\bfseries\centering\footnotesize
  \begin{lstlisting}
Partial

(***************)
(* Definitions *)
(***************)

(* Define ppo *)
include "riscv-defs.cat"

(* Compute coherence relation *)
include "cos-opt.cat"

(**********)
(* Axioms *)
(**********)

(* Sc per location *)
acyclic co|rf|fr|po-loc as Coherence

(* Main model axiom *)
acyclic co|rfe|fr|ppo as Model

(* Atomicity axiom *)
empty rmw & (fre;coe) as Atomic
\end{lstlisting}
  }
  \caption{{\tt riscv.cat}, part of an alternative herd presentation of the RVWMO memory model (2/3)}
  \label{fig:herd3}
\end{figure}