Remove Duff's Device in main simulation loop (#721)

With recent compilers on recent computers, the much simpler version of the code is actually slightly faster. I suspect, but haven't proven, that more accurate indirect-jump prediction is the main explanation. Reduced instruction-cache (I$) pressure might be a secondary factor.
author: Andrew Waterman <andrew@sifive.com> 2021-06-02 10:53:23 -0700
committer: GitHub <noreply@github.com> 2021-06-02 10:53:23 -0700
commit: bf4b1e09ed8e7a11ecff9891b12ce5d7f3375722 (patch)
tree: 182a3d99d2ffc56b2f3f5fc85e9d884d604fbafb /riscv/execute.cc
parent: cb7805a9689e2d5de23b959b177940d1b8813bd2 (diff)
download: spike-bf4b1e09ed8e7a11ecff9891b12ce5d7f3375722.zip
spike-bf4b1e09ed8e7a11ecff9891b12ce5d7f3375722.tar.gz
spike-bf4b1e09ed8e7a11ecff9891b12ce5d7f3375722.tar.bz2
1 file changed, 12 insertions, 43 deletions
diff --git a/riscv/execute.cc b/riscv/execute.cc
index ee8257f..62ae019 100644
--- a/riscv/execute.cc
+++ b/riscv/execute.cc
@@ -256,6 +256,7 @@ void processor_t::step(size_t n)
 
       if (unlikely(slow_path()))
       {
+        // Main simulation loop, slow path.
         while (instret < n)
         {
           if (unlikely(!state.serialized && state.single_step == state.STEP_STEPPED)) {
@@ -280,49 +281,17 @@ void processor_t::step(size_t n)
       }
       else while (instret < n)
       {
-        // This code uses a modified Duff's Device to improve the performance
-        // of executing instructions. While typical Duff's Devices are used
-        // for software pipelining, the switch statement below primarily
-        // benefits from separate call points for the fetch.func function call
-        // found in each execute_insn. This function call is an indirect jump
-        // that depends on the current instruction. By having an indirect jump
-        // dedicated for each icache entry, you improve the performance of the
-        // host's next address predictor. Each case in the switch statement
-        // allows for the program flow to contine to the next case if it
-        // corresponds to the next instruction in the program and instret is
-        // still less than n.
-        //
-        // According to Andrew Waterman's recollection, this optimization
-        // resulted in approximately a 2x performance increase.
-
-        // This figures out where to jump to in the switch statement
-        size_t idx = _mmu->icache_index(pc);
-
-        // This gets the cached decoded instruction from the MMU. If the MMU
-        // does not have the current pc cached, it will refill the MMU and
-        // return the correct entry. ic_entry->data.func is the C++ function
-        // corresponding to the instruction.
-        auto ic_entry = _mmu->access_icache(pc);
-
-        // This macro is included in "icache.h" included within the switch
-        // statement below. The indirect jump corresponding to the instruction
-        // is located within the execute_insn() function call.
-        #define ICACHE_ACCESS(i) { \
-          insn_fetch_t fetch = ic_entry->data; \
-          pc = execute_insn(this, pc, fetch); \
-          ic_entry = ic_entry->next; \
-          if (i == mmu_t::ICACHE_ENTRIES-1) break; \
-          if (unlikely(ic_entry->tag != pc)) break; \
-          if (unlikely(instret+1 == n)) break; \
-          instret++; \
-          state.pc = pc; \
-        }
-
-        // This switch statement implements the modified Duff's device as
-        // explained above.
-        switch (idx) {
-          // "icache.h" is generated by the gen_icache script
-          #include "icache.h"
+        // Main simulation loop, fast path.
+        for (auto ic_entry = _mmu->access_icache(pc); ; ) {
+          auto fetch = ic_entry->data;
+          pc = execute_insn(this, pc, fetch);
+          ic_entry = ic_entry->next;
+          if (unlikely(ic_entry->tag != pc))
+            break;
+          if (unlikely(instret + 1 == n))
+            break;
+          instret++;
+          state.pc = pc;
         }
 
         advance_pc();
author	Andrew Waterman <andrew@sifive.com>	2021-06-02 10:53:23 -0700
committer	GitHub <noreply@github.com>	2021-06-02 10:53:23 -0700
commit	bf4b1e09ed8e7a11ecff9891b12ce5d7f3375722 (patch)
tree	182a3d99d2ffc56b2f3f5fc85e9d884d604fbafb /riscv/execute.cc
parent	cb7805a9689e2d5de23b959b177940d1b8813bd2 (diff)
download	spike-bf4b1e09ed8e7a11ecff9891b12ce5d7f3375722.zip spike-bf4b1e09ed8e7a11ecff9891b12ce5d7f3375722.tar.gz spike-bf4b1e09ed8e7a11ecff9891b12ce5d7f3375722.tar.bz2