/*
 * Test some fused multiply add corner cases.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <inttypes.h>
/* Element count of the array arr (arr must be an actual array, not a pointer). */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(*(arr)))
/*
 * Perform one "n * m + a" operation using the vfmadd231sd insn and
 * return the result.
 *
 * n, m, a:  the operands, as raw 64-bit patterns that are moved
 *           unchanged into xmm registers and treated by the insn as
 *           scalar doubles
 * ftz:      if true, MXCSR.FTZ (bit 15) is set while the operation
 *           runs; if false it is cleared
 * mxcsr_p:  on return *mxcsr_p is set to the bottom 6 bits of MXCSR
 *           (the Flag bits) as left by the operation
 *
 * The flag bits are cleared before the operation so that only flags
 * raised by this one insn are reported. The MXCSR value found on entry
 * is restored before returning.
 *
 * We print the operation and its results to stdout.
 */
static uint64_t do_fmadd(uint64_t n, uint64_t m, uint64_t a,
                         bool ftz, uint32_t *mxcsr_p)
{
    uint64_t r;
    uint32_t mxcsr = 0;
    uint32_t ftz_bit = ftz ? (1u << 15) : 0;
    uint32_t saved_mxcsr = 0;

    /*
     * __asm__/__volatile__ rather than asm/volatile so that this also
     * builds when the compiler is in a strict ISO mode (-std=c11).
     */
    __asm__ __volatile__(
        "stmxcsr %[saved_mxcsr]\n"
        "stmxcsr %[mxcsr]\n"
        /* 0xffff7fc0: clear the six Flag bits [5:0] and FTZ (bit 15) */
        "andl $0xffff7fc0, %[mxcsr]\n"
        "orl %[ftz_bit], %[mxcsr]\n"
        "ldmxcsr %[mxcsr]\n"
        "movq %[a], %%xmm0\n"
        "movq %[m], %%xmm1\n"
        "movq %[n], %%xmm2\n"
        /* xmm0 = xmm0 + xmm2 * xmm1 */
        "vfmadd231sd %%xmm1, %%xmm2, %%xmm0\n"
        "movq %%xmm0, %[r]\n"
        /* Capture the flags raised by the insn, then put MXCSR back */
        "stmxcsr %[mxcsr]\n"
        "ldmxcsr %[saved_mxcsr]\n"
        : [r] "=r" (r), [mxcsr] "=m" (mxcsr),
          [saved_mxcsr] "=m" (saved_mxcsr)
        : [n] "r" (n), [m] "r" (m), [a] "r" (a),
          [ftz_bit] "r" (ftz_bit)
        : "xmm0", "xmm1", "xmm2");

    *mxcsr_p = mxcsr & 0x3f;

    /* Report the mnemonic that was actually executed above */
    printf("vfmadd231sd 0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64
           " = 0x%" PRIx64 " MXCSR flags 0x%" PRIx32 "\n",
           n, m, a, r, *mxcsr_p);
    return r;
}
/*
 * One fused multiply-add test case: the operands and FTZ setting to
 * feed to the operation, plus the values it is expected to produce.
 * Field order matters to any positional initialisers of this type.
 */
struct testdata {
    uint64_t n;               /* input: first multiplicand */
    uint64_t m;               /* input: second multiplicand */
    uint64_t a;               /* input: addend */
    bool ftz;                 /* input: run with flush-to-zero requested */
    uint64_t expected_r;      /* expected result */
    uint32_t expected_mxcsr;  /* expected low 6 bits of MXCSR (Flag bits) */
};
typedef struct testdata testdata;
static testdata tests[] = {
{ 0, 0x7ff0000000000000, 0x7ff000000000aaaa, false, /* 0 * Inf + SNaN */
0x7ff800000000aaaa, 1 }, /* Should be QNaN and does raise Invalid */
{ 0, 0x7ff0000000000000, 0x7ff800000000aaaa, false, /* 0 * Inf + QNaN */
0x7ff800000000aaaa, 0 }, /* Should be QNaN and does *not* raise Invalid */
/*
* These inputs give a result which is tiny before rounding but which
* becomes non-tiny after rounding. x86 is a "detect tininess after
* rounding" architecture, so it should give a non-denormal result and
* not set the Underflow flag (only the Precision flag for an inexact
* result).
*/
{ 0x3fdfffffffffffff, 0x001fffffffffffff, 0x801fffffffffffff, false,
0x8010000000000000, 0x20 },
/*
* Flushing of denormal outputs to zero should also happen after
* rounding, so setting FTZ should not affect the result or the flags.
* QEMU currently does not emulate this correctly because we do the
* flush-to-zero check before rounding, so we incorrectly produce a
* zero result and set Underflow as well as Precision.
*/
#ifdef ENABLE_FAILING_TESTS
{ 0x3fdfffffffffffff, 0x001fffffffffffff, 0x801fffffffffffff, true,
0x8010000000000000, 0x20 }, /* Enabling FTZ shouldn't change flags */
#endif
};
int main(void)
{
bool passed = true;
for (int i = 0; i < ARRAY_SIZE(tests); i++) {
uint32_t mxcsr;
uint64_t r = do_fmadd(tests[i].n, tests[i].m, tests[i].a,
tests[i].ftz, &mxcsr);
if (r != tests[i].expected_r) {
printf("expected result 0x%" PRIx64 "\n", tests[i].expected_r);
passed = false;
}
if (mxcsr != tests[i].expected_mxcsr) {
printf("expected MXCSR flags 0x%x\n", tests[i].expected_mxcsr);
passed = false;
}
}
return passed ? 0 : 1;
}
|