aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndy Wright <acwright@mit.edu>2016-12-01 15:04:34 -0500
committerAndrew Waterman <aswaterman@gmail.com>2016-12-01 12:04:34 -0800
commit39ee063f47d6467084f4442624cb28430ac0eebc (patch)
tree440ff48f64e157cf90dcd8e00e011a08726bd04c
parent679d5f5e927a0a59bbaaab33955ded79e860cded (diff)
downloadriscv-isa-sim-39ee063f47d6467084f4442624cb28430ac0eebc.zip
riscv-isa-sim-39ee063f47d6467084f4442624cb28430ac0eebc.tar.gz
riscv-isa-sim-39ee063f47d6467084f4442624cb28430ac0eebc.tar.bz2
Added comments about the modified Duff's Device in execute.cc (#77)
-rw-r--r--riscv/execute.cc37
1 files changed, 37 insertions, 0 deletions
diff --git a/riscv/execute.cc b/riscv/execute.cc
index 7b42262..36e7896 100644
--- a/riscv/execute.cc
+++ b/riscv/execute.cc
@@ -40,6 +40,9 @@ inline void processor_t::update_histogram(reg_t pc)
#endif
}
+// This is expected to be inlined by the compiler so each use of execute_insn
+// includes a duplicated body of the function to get separate fetch.func
+// function calls.
static reg_t execute_insn(processor_t* p, reg_t pc, insn_fetch_t fetch)
{
commit_log_stash_privilege(p->get_state());
@@ -121,9 +124,40 @@ void processor_t::step(size_t n)
}
else while (instret < n)
{
+ // This code uses a modified Duff's Device to improve the performance
+ // of executing instructions. While typical Duff's Devices are used
+ // for software pipelining, the switch statement below primarily
+ // benefits from separate call points for the fetch.func function call
+ // found in each execute_insn. This function call is an indirect jump
+ // that depends on the current instruction. By having an indirect jump
+ // dedicated for each icache entry, you improve the performance of the
+ // host's next address predictor. Each case in the switch statement
+ // allows for the program flow to continue to the next case if it
+ // corresponds to the next instruction in the program and instret is
+ // still less than n.
+ //
+ // According to Andrew Waterman's recollection, this optimization
+ // resulted in approximately a 2x performance increase.
+ //
+ // If there is support for compressed instructions, the mmu and the
+ // switch statement get more complicated. Each branch target is stored
+ // in the index corresponding to mmu->icache_index(), but consecutive
+ // non-branching instructions are stored in consecutive indices even if
+ // mmu->icache_index() specifies a different index (which is the case
+ // for 32-bit instructions in the presence of compressed instructions).
+
+ // This figures out where to jump to in the switch statement
size_t idx = _mmu->icache_index(pc);
+
+ // This gets the cached decoded instruction from the MMU. If the MMU
+ // does not have the current pc cached, it will refill the MMU and
+ // return the correct entry. ic_entry->data.func is the C++ function
+ // corresponding to the instruction.
auto ic_entry = _mmu->access_icache(pc);
+ // This macro is included in "icache.h" included within the switch
+ // statement below. The indirect jump corresponding to the instruction
+ // is located within the execute_insn() function call.
#define ICACHE_ACCESS(i) { \
insn_fetch_t fetch = ic_entry->data; \
ic_entry++; \
@@ -135,7 +169,10 @@ void processor_t::step(size_t n)
state.pc = pc; \
}
+ // This switch statement implements the modified Duff's device as
+ // explained above.
switch (idx) {
+ // "icache.h" is generated by the gen_icache script
#include "icache.h"
}