diff options
author | Andrew Waterman <andrew@sifive.com> | 2021-06-02 10:53:23 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-06-02 10:53:23 -0700 |
commit | bf4b1e09ed8e7a11ecff9891b12ce5d7f3375722 (patch) | |
tree | 182a3d99d2ffc56b2f3f5fc85e9d884d604fbafb /riscv/execute.cc | |
parent | cb7805a9689e2d5de23b959b177940d1b8813bd2 (diff) | |
download | spike-bf4b1e09ed8e7a11ecff9891b12ce5d7f3375722.zip spike-bf4b1e09ed8e7a11ecff9891b12ce5d7f3375722.tar.gz spike-bf4b1e09ed8e7a11ecff9891b12ce5d7f3375722.tar.bz2 |
Remove Duff's Device in main simulation loop (#721)

With recent compilers on recent computers, the much simpler version of
the code is actually slightly faster. I suspect, but haven't proven,
that more accurate indirect jump prediction is the main explanation.
Reduced I$ pressure might be a secondary factor.
Diffstat (limited to 'riscv/execute.cc')
-rw-r--r-- | riscv/execute.cc | 55 |
1 files changed, 12 insertions, 43 deletions
```diff
diff --git a/riscv/execute.cc b/riscv/execute.cc
index ee8257f..62ae019 100644
--- a/riscv/execute.cc
+++ b/riscv/execute.cc
@@ -256,6 +256,7 @@ void processor_t::step(size_t n)
 
       if (unlikely(slow_path()))
       {
+        // Main simulation loop, slow path.
         while (instret < n)
         {
           if (unlikely(!state.serialized && state.single_step == state.STEP_STEPPED)) {
@@ -280,49 +281,17 @@ void processor_t::step(size_t n)
       }
       else while (instret < n)
       {
-        // This code uses a modified Duff's Device to improve the performance
-        // of executing instructions. While typical Duff's Devices are used
-        // for software pipelining, the switch statement below primarily
-        // benefits from separate call points for the fetch.func function call
-        // found in each execute_insn. This function call is an indirect jump
-        // that depends on the current instruction. By having an indirect jump
-        // dedicated for each icache entry, you improve the performance of the
-        // host's next address predictor. Each case in the switch statement
-        // allows for the program flow to contine to the next case if it
-        // corresponds to the next instruction in the program and instret is
-        // still less than n.
-        //
-        // According to Andrew Waterman's recollection, this optimization
-        // resulted in approximately a 2x performance increase.
-
-        // This figures out where to jump to in the switch statement
-        size_t idx = _mmu->icache_index(pc);
-
-        // This gets the cached decoded instruction from the MMU. If the MMU
-        // does not have the current pc cached, it will refill the MMU and
-        // return the correct entry. ic_entry->data.func is the C++ function
-        // corresponding to the instruction.
-        auto ic_entry = _mmu->access_icache(pc);
-
-        // This macro is included in "icache.h" included within the switch
-        // statement below. The indirect jump corresponding to the instruction
-        // is located within the execute_insn() function call.
-        #define ICACHE_ACCESS(i) { \
-          insn_fetch_t fetch = ic_entry->data; \
-          pc = execute_insn(this, pc, fetch); \
-          ic_entry = ic_entry->next; \
-          if (i == mmu_t::ICACHE_ENTRIES-1) break; \
-          if (unlikely(ic_entry->tag != pc)) break; \
-          if (unlikely(instret+1 == n)) break; \
-          instret++; \
-          state.pc = pc; \
-        }
-
-        // This switch statement implements the modified Duff's device as
-        // explained above.
-        switch (idx) {
-          // "icache.h" is generated by the gen_icache script
-          #include "icache.h"
+        // Main simulation loop, fast path.
+        for (auto ic_entry = _mmu->access_icache(pc); ; ) {
+          auto fetch = ic_entry->data;
+          pc = execute_insn(this, pc, fetch);
+          ic_entry = ic_entry->next;
+          if (unlikely(ic_entry->tag != pc))
+            break;
+          if (unlikely(instret + 1 == n))
+            break;
+          instret++;
+          state.pc = pc;
         }
 
         advance_pc();
```