about summary refs log tree commit diff
diff options
context:
space:
mode:
author Andrew Waterman <waterman@eecs.berkeley.edu> 2014-01-13 16:42:02 -0800
committer Andrew Waterman <waterman@eecs.berkeley.edu> 2014-01-13 16:42:02 -0800
commit 77f281580773fe5217ae40bcf8a0f8bc05c28ded (patch)
tree 79885362aa51ce842ec912ba4523f36620b1dce8
parent 7f457c47b339cc7c79f56bb277ed8ed989e88ae1 (diff)
download spike-77f281580773fe5217ae40bcf8a0f8bc05c28ded.zip
spike-77f281580773fe5217ae40bcf8a0f8bc05c28ded.tar.gz
spike-77f281580773fe5217ae40bcf8a0f8bc05c28ded.tar.bz2
Improve performance for branchy code
We now use a heavily unrolled loop as the software instruction cache (I$), which allows the host machine's branch target prediction to associate target PCs with unique-ish host PCs.
-rw-r--r-- Makefile.in 2
-rw-r--r-- riscv/decode.h 9
-rw-r--r-- riscv/encoding.h 4
-rwxr-xr-x riscv/gen_icache 9
-rw-r--r-- riscv/htif.cc 8
-rw-r--r-- riscv/htif.h 2
-rw-r--r-- riscv/interactive.cc 4
-rw-r--r-- riscv/mmu.cc 4
-rw-r--r-- riscv/mmu.h 74
-rw-r--r-- riscv/processor.cc 87
-rw-r--r-- riscv/processor.h 4
-rw-r--r-- riscv/riscv.mk.in 4
-rw-r--r-- riscv/sim.cc 2
-rw-r--r-- riscv/sim.h 3
14 files changed, 131 insertions, 85 deletions
diff --git a/Makefile.in b/Makefile.in
index 19e6805..45e3a11 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -192,7 +192,7 @@ $$($(2)_objs) : %.o : %.cc $$($(2)_gen_hdrs)
$$($(2)_c_objs) : %.o : %.c $$($(2)_gen_hdrs)
$(COMPILE_C) -c $$<
-$(2)_junk += $$($(2)_objs) $$($(2)_c_objs) $$($(2)_deps) $$($(2)_c_deps)
+$(2)_junk += $$($(2)_objs) $$($(2)_c_objs) $$($(2)_deps) $$($(2)_c_deps) $$($(2)_gen_hdrs)
# Build a library for this subproject
diff --git a/riscv/decode.h b/riscv/decode.h
index 6c26a68..4abd9f9 100644
--- a/riscv/decode.h
+++ b/riscv/decode.h
@@ -80,8 +80,9 @@ public:
}
void write(size_t i, T value)
{
- if (!(zero_reg && i == 0))
- data[i] = value;
+ data[i] = value;
+ if (zero_reg)
+ data[0] = 0;
}
const T& operator [] (size_t i) const
{
@@ -164,12 +165,14 @@ private:
#define set_pc(x) \
do { if ((x) & 3 /* For now... */) \
throw trap_instruction_address_misaligned(); \
- npc = (x); \
+ npc = sext_xprlen(x); \
} while(0)
#define validate_csr(which, write) ({ \
int write_priv = ((which) >> 10) & 3; \
int read_priv = ((which) >> 8) & 3; \
+ if ((which) == CSR_FCSR || (which) == CSR_FFLAGS || (which) == CSR_FRM) \
+ require_fp; \
if (read_priv > 0 || (write_priv > 0 && (write))) require_supervisor; \
(which); })
diff --git a/riscv/encoding.h b/riscv/encoding.h
index 48a0108..711ef7b 100644
--- a/riscv/encoding.h
+++ b/riscv/encoding.h
@@ -288,7 +288,7 @@
#define MASK_SRET 0xffffffff
#define MATCH_FNMADD_S 0x4f
#define MASK_FNMADD_S 0x600007f
-#define MATCH_JAL 0x67
+#define MATCH_JAL 0x6f
#define MASK_JAL 0x7f
#define MATCH_LWU 0x6003
#define MASK_LWU 0x707f
@@ -392,7 +392,7 @@
#define MASK_DIVU 0xfe00707f
#define MATCH_AMOSWAP_W 0x800202f
#define MASK_AMOSWAP_W 0xf800707f
-#define MATCH_JALR 0x6f
+#define MATCH_JALR 0x67
#define MASK_JALR 0x707f
#define MATCH_FSD 0x3027
#define MASK_FSD 0x707f
diff --git a/riscv/gen_icache b/riscv/gen_icache
new file mode 100755
index 0000000..c581b55
--- /dev/null
+++ b/riscv/gen_icache
@@ -0,0 +1,9 @@
+#!/bin/sh
+echo \#define ICACHE_SIZE $1
+n=$(($1-1))
+echo \#define ICACHE_SWITCH \\
+for i in `seq 0 $n`
+do
+ echo case $i: ICACHE_ACCESS\($i\)\; \\
+done
+echo
diff --git a/riscv/htif.cc b/riscv/htif.cc
index af26faa..741a00f 100644
--- a/riscv/htif.cc
+++ b/riscv/htif.cc
@@ -91,14 +91,14 @@ void htif_isasim_t::tick_once()
old_val = coreid;
break;
case CSR_TOHOST & 0x1f:
- old_val = proc->state.tohost;
+ old_val = proc->get_state()->tohost;
if (write)
- proc->state.tohost = new_val;
+ proc->get_state()->tohost = new_val;
break;
case CSR_FROMHOST & 0x1f:
- old_val = proc->state.fromhost;
+ old_val = proc->get_state()->fromhost;
if (write && old_val == 0)
- proc->state.fromhost = new_val;
+ proc->set_fromhost(new_val);
break;
case CSR_RESET & 0x1f:
old_val = !proc->running();
diff --git a/riscv/htif.h b/riscv/htif.h
index 2a940ad..4e1025e 100644
--- a/riscv/htif.h
+++ b/riscv/htif.h
@@ -18,6 +18,7 @@ class htif_isasim_t : public htif_pthread_t
public:
htif_isasim_t(sim_t* _sim, const std::vector<std::string>& args);
bool tick();
+ bool done();
private:
sim_t* sim;
@@ -25,7 +26,6 @@ private:
uint8_t seqno;
void tick_once();
- bool done();
};
#endif
diff --git a/riscv/interactive.cc b/riscv/interactive.cc
index ad38ace..9014aa0 100644
--- a/riscv/interactive.cc
+++ b/riscv/interactive.cc
@@ -45,7 +45,7 @@ static std::string readline(int fd)
void sim_t::interactive()
{
- while (true)
+ while (!htif->done())
{
std::cerr << ": " << std::flush;
std::string s = readline(2);
@@ -103,7 +103,7 @@ void sim_t::interactive_run(const std::string& cmd, const std::vector<std::strin
size_t steps = args.size() ? atoll(args[0].c_str()) : -1;
ctrlc_pressed = false;
set_procs_debug(noisy);
- for (size_t i = 0; i < steps && !ctrlc_pressed; i++)
+ for (size_t i = 0; i < steps && !ctrlc_pressed && !htif->done(); i++)
step(1);
}
diff --git a/riscv/mmu.cc b/riscv/mmu.cc
index f8efd5a..4675f75 100644
--- a/riscv/mmu.cc
+++ b/riscv/mmu.cc
@@ -16,7 +16,7 @@ mmu_t::~mmu_t()
void mmu_t::flush_icache()
{
- for (size_t i = 0; i < ICACHE_ENTRIES; i++)
+ for (size_t i = 0; i < ICACHE_SIZE; i++)
icache[i].tag = -1;
}
@@ -32,7 +32,7 @@ void mmu_t::flush_tlb()
void* mmu_t::refill_tlb(reg_t addr, reg_t bytes, bool store, bool fetch)
{
reg_t idx = (addr >> PGSHIFT) % TLB_ENTRIES;
- reg_t expected_tag = addr & ~(PGSIZE-1);
+ reg_t expected_tag = addr >> PGSHIFT;
reg_t pte = walk(addr);
diff --git a/riscv/mmu.h b/riscv/mmu.h
index 551fa46..c09cfc4 100644
--- a/riscv/mmu.h
+++ b/riscv/mmu.h
@@ -4,6 +4,7 @@
#define _RISCV_MMU_H
#include "decode.h"
+#include "icache.h"
#include "trap.h"
#include "common.h"
#include "config.h"
@@ -21,6 +22,21 @@ const reg_t VPN_BITS = PTIDXBITS * LEVELS;
const reg_t PPN_BITS = 8*sizeof(reg_t) - PGSHIFT;
const reg_t VA_BITS = VPN_BITS + PGSHIFT;
+struct insn_fetch_t
+{
+ insn_func_t func;
+ union {
+ insn_t insn;
+ uint_fast32_t pad;
+ } insn;
+};
+
+struct icache_entry_t {
+ reg_t tag;
+ reg_t pad;
+ insn_fetch_t data;
+};
+
// this class implements a processor's port into the virtual memory system.
// an MMU and instruction cache are maintained for simulator performance.
class mmu_t
@@ -32,8 +48,6 @@ public:
// template for functions that load an aligned value from memory
#define load_func(type) \
type##_t load_##type(reg_t addr) __attribute__((always_inline)) { \
- if(unlikely(addr % sizeof(type##_t))) \
- throw trap_load_address_misaligned(addr); \
void* paddr = translate(addr, sizeof(type##_t), false, false); \
return *(type##_t*)paddr; \
}
@@ -53,8 +67,6 @@ public:
// template for functions that store an aligned value to memory
#define store_func(type) \
void store_##type(reg_t addr, type##_t val) { \
- if(unlikely(addr % sizeof(type##_t))) \
- throw trap_store_address_misaligned(addr); \
void* paddr = translate(addr, sizeof(type##_t), true, false); \
*(type##_t*)paddr = val; \
}
@@ -65,40 +77,34 @@ public:
store_func(uint32)
store_func(uint64)
- struct insn_fetch_t
- {
- insn_func_t func;
- union {
- insn_t insn;
- uint_fast32_t pad;
- } insn;
- };
-
// load instruction from memory at aligned address.
- inline insn_fetch_t load_insn(reg_t addr)
+ inline icache_entry_t access_icache(reg_t addr)
{
- reg_t offset = addr & (sizeof(insn_t) * (ICACHE_ENTRIES-1));
- offset *= sizeof(icache_entry_t) / sizeof(insn_t);
- icache_entry_t* entry = (icache_entry_t*)((char*)icache + offset);
- insn_fetch_t data = entry->data;
- if (likely(entry->tag == addr))
- return data;
+ reg_t idx = (addr / sizeof(insn_t)) % ICACHE_SIZE;
+ icache_entry_t entry = icache[idx];
+ if (likely(entry.tag == addr))
+ return entry;
void* iaddr = translate(addr, sizeof(insn_t), false, true);
insn_fetch_t fetch;
fetch.insn.pad = *(decltype(fetch.insn.insn.bits())*)iaddr;
fetch.func = proc->decode_insn(fetch.insn.insn);
- entry->tag = addr;
- entry->data = fetch;
+ icache[idx].tag = addr;
+ icache[idx].data = fetch;
reg_t paddr = (char*)iaddr - mem;
if (!tracer.empty() && tracer.interested_in_range(paddr, paddr + sizeof(insn_t), false, true))
{
- entry->tag = -1;
+ icache[idx].tag = -1;
tracer.trace(paddr, sizeof(insn_t), false, true);
}
- return entry->data;
+ return icache[idx];
+ }
+
+ inline insn_fetch_t load_insn(reg_t addr)
+ {
+ return access_icache(addr).data;
}
void set_processor(processor_t* p) { proc = p; flush_tlb(); }
@@ -115,13 +121,7 @@ private:
memtracer_list_t tracer;
// implement an instruction cache for simulator performance
- static const reg_t ICACHE_ENTRIES = 2048;
- struct icache_entry_t {
- reg_t tag;
- reg_t pad;
- insn_fetch_t data;
- };
- icache_entry_t icache[ICACHE_ENTRIES];
+ icache_entry_t icache[ICACHE_SIZE];
// implement a TLB for simulator performance
static const reg_t TLB_ENTRIES = 256;
@@ -141,11 +141,15 @@ private:
__attribute__((always_inline))
{
reg_t idx = (addr >> PGSHIFT) % TLB_ENTRIES;
- reg_t expected_tag = addr & ~(PGSIZE-1);
-
- reg_t* tlb_tag = fetch ? tlb_insn_tag : store ? tlb_store_tag :tlb_load_tag;
+ reg_t expected_tag = addr >> PGSHIFT;
+ reg_t* tags = fetch ? tlb_insn_tag : store ? tlb_store_tag :tlb_load_tag;
+ reg_t tag = tags[idx];
void* data = tlb_data[idx] + addr;
- if (likely(tlb_tag[idx] == expected_tag))
+
+ if (unlikely(addr & (bytes-1)))
+ store ? throw trap_store_address_misaligned(addr) : throw trap_load_address_misaligned(addr);
+
+ if (likely(tag == expected_tag))
return data;
return refill_tlb(addr, bytes, store, fetch);
diff --git a/riscv/processor.cc b/riscv/processor.cc
index b12a8e0..17b4181 100644
--- a/riscv/processor.cc
+++ b/riscv/processor.cc
@@ -5,7 +5,9 @@
#include "common.h"
#include "config.h"
#include "sim.h"
+#include "htif.h"
#include "disasm.h"
+#include "icache.h"
#include <cinttypes>
#include <cmath>
#include <cstdlib>
@@ -95,9 +97,10 @@ void processor_t::step(size_t n)
if(!run)
return;
- size_t i = 0;
- reg_t npc = state.pc;
mmu_t* _mmu = mmu;
+ auto count32 = decltype(state.compare)(state.count);
+ bool count_le_compare = count32 <= state.compare;
+ n = std::min(n, size_t(state.compare - count32) | 1);
try
{
@@ -106,9 +109,9 @@ void processor_t::step(size_t n)
// execute_insn fetches and executes one instruction
#define execute_insn(noisy) \
do { \
- mmu_t::insn_fetch_t fetch = _mmu->load_insn(npc); \
+ insn_fetch_t fetch = mmu->load_insn(state.pc); \
if(noisy) disasm(fetch.insn.insn); \
- npc = fetch.func(this, fetch.insn.insn, npc); \
+ state.pc = fetch.func(this, fetch.insn.insn, state.pc); \
} while(0)
@@ -118,50 +121,65 @@ void processor_t::step(size_t n)
#undef execute_insn
#define execute_insn(noisy) \
do { \
- mmu_t::insn_fetch_t fetch = _mmu->load_insn(npc); \
+ insn_fetch_t fetch = _mmu->load_insn(state.pc); \
if(noisy) disasm(fetch.insn.insn); \
bool in_spvr = state.sr & SR_S; \
- if (!in_spvr) fprintf(stderr, "\n0x%016" PRIx64 " (0x%08" PRIx32 ") ", npc, fetch.insn.insn.bits()); \
- /*if (!in_spvr) fprintf(stderr, "\n0x%016" PRIx64 " (0x%08" PRIx32 ") %s ", npc, fetch.insn.insn.bits(), disasmblr.disassemble(fetch.insn.insn).c_str());*/ \
- npc = fetch.func(this, fetch.insn.insn, npc); \
+ if (!in_spvr) fprintf(stderr, "\n0x%016" PRIx64 " (0x%08" PRIx32 ") ", state.pc, fetch.insn.insn.bits()); \
+ /*if (!in_spvr) fprintf(stderr, "\n0x%016" PRIx64 " (0x%08" PRIx32 ") %s ", state.pc, fetch.insn.insn.bits(), disasmblr.disassemble(fetch.insn.insn).c_str());*/ \
+ state.pc = fetch.func(this, fetch.insn.insn, state.pc); \
} while(0)
#endif
- if(debug) for( ; i < n; i++) // print out instructions as we go
- execute_insn(true);
- else
+ if (debug) // print out instructions as we go
{
- // unrolled for speed
- for( ; n > 3 && i < n-3; i+=4)
+ for (size_t i = 0; i < n; state.count++, i++)
+ execute_insn(true);
+ }
+ else while (n > 0)
+ {
+ size_t idx = (state.pc / sizeof(insn_t)) % ICACHE_SIZE;
+ auto ic_entry_init = &_mmu->icache[idx], ic_entry = ic_entry_init;
+
+ #define update_count() { \
+ size_t i = ic_entry - ic_entry_init; \
+ state.count += i; \
+ if (i >= n) break; \
+ n -= i; }
+
+ #define ICACHE_ACCESS(idx) { \
+ insn_t insn = ic_entry->data.insn.insn; \
+ insn_func_t func = ic_entry->data.func; \
+ if (unlikely(ic_entry->tag != state.pc)) break; \
+ ic_entry++; \
+ state.pc = func(this, insn, state.pc); }
+
+ switch (idx) while (true)
{
- execute_insn(false);
- execute_insn(false);
- execute_insn(false);
- execute_insn(false);
+ ICACHE_SWITCH;
+ update_count();
+ ic_entry_init = ic_entry = &_mmu->icache[0];
}
- for( ; i < n; i++)
- execute_insn(false);
- }
- state.pc = npc;
+ _mmu->access_icache(state.pc);
+ update_count();
+ }
}
catch(trap_t& t)
{
- take_trap(npc, t);
+ take_trap(t);
}
- // update timer and possibly register a timer interrupt
- uint32_t old_count = state.count;
- state.count += i;
- if(old_count < state.compare && uint64_t(old_count) + i >= state.compare)
+ bool count_ge_compare =
+ uint64_t(n) + decltype(state.compare)(state.count) >= state.compare;
+ if (count_le_compare && count_ge_compare)
set_interrupt(IRQ_TIMER, true);
}
-void processor_t::take_trap(reg_t pc, trap_t& t)
+void processor_t::take_trap(trap_t& t)
{
if (debug)
fprintf(stderr, "core %3d: exception %s, epc 0x%016" PRIx64 "\n",
- id, t.name(), pc);
+ id, t.name(), state.pc);
// switch to supervisor, set previous supervisor bit, disable interrupts
set_pcr(CSR_STATUS, (((state.sr & ~SR_EI) | SR_S) & ~SR_PS & ~SR_PEI) |
@@ -170,7 +188,7 @@ void processor_t::take_trap(reg_t pc, trap_t& t)
yield_load_reservation();
state.cause = t.cause();
- state.epc = pc;
+ state.epc = state.pc;
state.pc = state.evec;
t.side_effects(&state); // might set badvaddr etc.
@@ -255,14 +273,19 @@ reg_t processor_t::set_pcr(int which, reg_t val)
state.tohost = val;
break;
case CSR_FROMHOST:
- set_interrupt(IRQ_HOST, val != 0);
- state.fromhost = val;
+ set_fromhost(val);
break;
}
return old_pcr;
}
+void processor_t::set_fromhost(reg_t val)
+{
+ set_interrupt(IRQ_HOST, val != 0);
+ state.fromhost = val;
+}
+
reg_t processor_t::get_pcr(int which)
{
switch (which)
@@ -306,8 +329,10 @@ reg_t processor_t::get_pcr(int which)
case CSR_SUP1:
return state.pcr_k1;
case CSR_TOHOST:
+ sim->get_htif()->tick(); // not necessary, but faster
return state.tohost;
case CSR_FROMHOST:
+ sim->get_htif()->tick(); // not necessary, but faster
return state.fromhost;
default:
return -1;
diff --git a/riscv/processor.h b/riscv/processor.h
index e27aa82..9e52d3d 100644
--- a/riscv/processor.h
+++ b/riscv/processor.h
@@ -65,6 +65,7 @@ public:
void deliver_ipi(); // register an interprocessor interrupt
bool running() { return run; }
reg_t set_pcr(int which, reg_t val);
+ void set_fromhost(reg_t val);
void set_interrupt(int which, bool on);
reg_t get_pcr(int which);
mmu_t* get_mmu() { return mmu; }
@@ -91,13 +92,12 @@ private:
std::vector<insn_desc_t> opcode_store;
void take_interrupt(); // take a trap if any interrupts are pending
- void take_trap(reg_t pc, trap_t& t); // take an exception
+ void take_trap(trap_t& t); // take an exception
void disasm(insn_t insn); // disassemble and print an instruction
friend class sim_t;
friend class mmu_t;
friend class extension_t;
- friend class htif_isasim_t;
void build_opcode_map();
insn_func_t decode_insn(insn_t insn);
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index fd506c8..45c5ee7 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -43,10 +43,14 @@ riscv_srcs = \
riscv_test_srcs =
riscv_gen_hdrs = \
+ icache.h \
riscv_gen_srcs = \
$(addsuffix .cc, $(call get_insn_list,$(src_dir)/riscv/encoding.h))
+icache.h:
+ $(src_dir)/riscv/gen_icache 1024 > $@
+
$(riscv_gen_srcs): %.cc: insns/%.h insn_template.cc
sed 's/NAME/$(subst .cc,,$@)/' $(src_dir)/riscv/insn_template.cc | sed 's/OPCODE/$(call get_opcode,$(src_dir)/riscv/encoding.h,$(subst .cc,,$@))/' > $@
diff --git a/riscv/sim.cc b/riscv/sim.cc
index c800e87..59fe593 100644
--- a/riscv/sim.cc
+++ b/riscv/sim.cc
@@ -34,7 +34,7 @@ sim_t::sim_t(size_t nprocs, size_t mem_mb, const std::vector<std::string>& args)
while ((mem = (char*)calloc(1, memsz)) == NULL)
memsz = memsz*10/11/quantum*quantum;
- if (memsz != memsz)
+ if (memsz != memsz0)
fprintf(stderr, "warning: only got %lu bytes of target mem (wanted %lu)\n",
(unsigned long)memsz, (unsigned long)memsz0);
diff --git a/riscv/sim.h b/riscv/sim.h
index d643e6d..d437c1a 100644
--- a/riscv/sim.h
+++ b/riscv/sim.h
@@ -24,6 +24,7 @@ public:
void stop();
void set_debug(bool value);
void set_procs_debug(bool value);
+ htif_isasim_t* get_htif() { return htif.get(); }
// deliver an IPI to a specific processor
void send_ipi(reg_t who);
@@ -36,7 +37,7 @@ public:
reg_t get_scr(int which);
private:
- std::auto_ptr<htif_isasim_t> htif;
+ std::unique_ptr<htif_isasim_t> htif;
char* mem; // main memory
size_t memsz; // memory size in bytes
mmu_t* debug_mmu; // debug port into main memory