diff options
author | Paul Brook <paul@nowt.org> | 2022-04-24 23:01:39 +0100 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2022-09-01 20:16:33 +0200 |
commit | 6567ffb4f259d9937ff74f21e96cdac905440620 (patch) | |
tree | a60ddfdb920044798a5ceb51b9eb0255d4cb0b61 | |
parent | 6f218d6e994bd8b229d6522899b6ac6cd98bdb47 (diff) | |
download | qemu-6567ffb4f259d9937ff74f21e96cdac905440620.zip qemu-6567ffb4f259d9937ff74f21e96cdac905440620.tar.gz qemu-6567ffb4f259d9937ff74f21e96cdac905440620.tar.bz2 |
target/i386: Destructive FP helpers for AVX
Prepare the horizontal arithmetic vector helpers for AVX.
These currently use a dummy Reg typed variable to store the result then
assign the whole register. This will cause 128 bit operations to corrupt
the upper half of the register, so replace it with explicit temporaries
and element assignments.
Signed-off-by: Paul Brook <paul@nowt.org>
Message-Id: <20220424220204.2493824-18-paul@nowt.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-rw-r--r-- | target/i386/ops_sse.h | 101 |
1 files changed, 43 insertions, 58 deletions
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index c9737e1..c6dba95 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -22,7 +22,6 @@ #if SHIFT == 0 #define Reg MMXReg -#define SIZE 8 #define XMM_ONLY(...) #define B(n) MMX_B(n) #define W(n) MMX_W(n) @@ -31,7 +30,6 @@ #define SUFFIX _mmx #else #define Reg ZMMReg -#define SIZE 16 #define XMM_ONLY(...) __VA_ARGS__ #define B(n) ZMM_B(n) #define W(n) ZMM_W(n) @@ -43,22 +41,6 @@ #define LANE_WIDTH (SHIFT ? 16 : 8) #define PACK_WIDTH (LANE_WIDTH / 2) -/* - * Copy the relevant parts of a Reg value around. In the case where - * sizeof(Reg) > SIZE, these helpers operate only on the lower bytes of - * a 64 byte ZMMReg, so we must copy only those and keep the top bytes - * untouched in the guest-visible destination destination register. - * Note that the "lower bytes" are placed last in memory on big-endian - * hosts, which store the vector backwards in memory. In that case the - * copy *starts* at B(SIZE - 1) and ends at B(0), the opposite of - * the little-endian case. 
- */ -#if HOST_BIG_ENDIAN -#define MOVE(d, r) memcpy(&((d).B(SIZE - 1)), &(r).B(SIZE - 1), SIZE) -#else -#define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE) -#endif - #if SHIFT == 0 #define FPSRL(x, c) ((x) >> shift) #define FPSRAW(x, c) ((int16_t)(x) >> shift) @@ -945,45 +927,49 @@ void helper_insertq_i(CPUX86State *env, ZMMReg *d, int index, int length) d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), index, length); } -void glue(helper_haddps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) -{ - ZMMReg r; - - r.ZMM_S(0) = float32_add(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status); - r.ZMM_S(1) = float32_add(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status); - r.ZMM_S(2) = float32_add(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status); - r.ZMM_S(3) = float32_add(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status); - MOVE(*d, r); -} - -void glue(helper_haddpd, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) -{ - ZMMReg r; - - r.ZMM_D(0) = float64_add(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status); - r.ZMM_D(1) = float64_add(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status); - MOVE(*d, r); -} - -void glue(helper_hsubps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) -{ - ZMMReg r; - - r.ZMM_S(0) = float32_sub(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status); - r.ZMM_S(1) = float32_sub(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status); - r.ZMM_S(2) = float32_sub(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status); - r.ZMM_S(3) = float32_sub(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status); - MOVE(*d, r); -} - -void glue(helper_hsubpd, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) -{ - ZMMReg r; - - r.ZMM_D(0) = float64_sub(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status); - r.ZMM_D(1) = float64_sub(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status); - MOVE(*d, r); -} +#define SSE_HELPER_HPS(name, F) \ +void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ +{ \ + Reg *v = d; \ + float32 r[2 << SHIFT]; \ + int i, j, k; \ + for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \ + for (i = j = 0; j < 4; i++, j += 2) { \ + 
r[i + k] = F(v->ZMM_S(j + k), v->ZMM_S(j + k + 1), &env->sse_status); \ + } \ + for (j = 0; j < 4; i++, j += 2) { \ + r[i + k] = F(s->ZMM_S(j + k), s->ZMM_S(j + k + 1), &env->sse_status); \ + } \ + } \ + for (i = 0; i < 2 << SHIFT; i++) { \ + d->ZMM_S(i) = r[i]; \ + } \ +} + +SSE_HELPER_HPS(haddps, float32_add) +SSE_HELPER_HPS(hsubps, float32_sub) + +#define SSE_HELPER_HPD(name, F) \ +void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ +{ \ + Reg *v = d; \ + float64 r[1 << SHIFT]; \ + int i, j, k; \ + for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) { \ + for (i = j = 0; j < 2; i++, j += 2) { \ + r[i + k] = F(v->ZMM_D(j + k), v->ZMM_D(j + k + 1), &env->sse_status); \ + } \ + for (j = 0; j < 2; i++, j += 2) { \ + r[i + k] = F(s->ZMM_D(j + k), s->ZMM_D(j + k + 1), &env->sse_status); \ + } \ + } \ + for (i = 0; i < 1 << SHIFT; i++) { \ + d->ZMM_D(i) = r[i]; \ + } \ +} + +SSE_HELPER_HPD(haddpd, float64_add) +SSE_HELPER_HPD(hsubpd, float64_sub) void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { @@ -2331,4 +2317,3 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, #undef L #undef Q #undef SUFFIX -#undef SIZE |