aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorHaochen Jiang <haochen.jiang@intel.com>2025-03-05 10:35:11 +0800
committerHaochen Jiang <haochen.jiang@intel.com>2025-03-07 11:33:48 +0800
commita1eaeac63adc4e20b7e74290fdbe51725d40ddeb (patch)
tree1d0953f90c58b5d3c4f565b71034dcff03aa6dca /gcc
parentc207dcf393b864adc8eb41bbbcd630a6cfdc145a (diff)
downloadgcc-a1eaeac63adc4e20b7e74290fdbe51725d40ddeb.zip
gcc-a1eaeac63adc4e20b7e74290fdbe51725d40ddeb.tar.gz
gcc-a1eaeac63adc4e20b7e74290fdbe51725d40ddeb.tar.bz2
i386: Correct mask width for bf8->fp16 intrin on 256/512 bit
For the bf8 -> fp16 conversion, a 256-bit destination holds 16 fp16 elements (16 * 16 = 256), so the mask must be 16 bits wide (__mmask16), not the 8 bits (__mmask8) used by the current intrinsic. The 512-bit intrinsic has the same problem: its mask is declared as 16 bits (__mmask16), half of the 32 bits (__mmask32) it needs. This patch fixes both.

gcc/ChangeLog:
	* config/i386/avx10_2-512convertintrin.h (_mm512_mask_cvtbf8_ph): Correct mask width.
	(_mm512_maskz_cvtbf8_ph): Ditto.
	* config/i386/avx10_2convertintrin.h (_mm256_mask_cvtbf8_ph): Ditto.
	(_mm256_maskz_cvtbf8_ph): Ditto.

gcc/testsuite/ChangeLog:
	* gcc.target/i386/avx10_2-512-convert-1.c: Change function call.
	* gcc.target/i386/avx10_2-convert-1.c: Ditto.
Diffstat (limited to 'gcc')
-rw-r--r-- gcc/config/i386/avx10_2-512convertintrin.h | 4
-rw-r--r-- gcc/config/i386/avx10_2convertintrin.h | 4
-rw-r--r-- gcc/testsuite/gcc.target/i386/avx10_2-512-convert-1.c | 4
-rw-r--r-- gcc/testsuite/gcc.target/i386/avx10_2-convert-1.c | 4
4 files changed, 8 insertions, 8 deletions
diff --git a/gcc/config/i386/avx10_2-512convertintrin.h b/gcc/config/i386/avx10_2-512convertintrin.h
index 1079e0a..a44481e 100644
--- a/gcc/config/i386/avx10_2-512convertintrin.h
+++ b/gcc/config/i386/avx10_2-512convertintrin.h
@@ -550,7 +550,7 @@ _mm512_cvtbf8_ph (__m256i __A)
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_mask_cvtbf8_ph (__m512h __S, __mmask16 __U, __m256i __A)
+_mm512_mask_cvtbf8_ph (__m512h __S, __mmask32 __U, __m256i __A)
{
return (__m512h) _mm512_castsi512_ph ((__m512i) _mm512_mask_slli_epi16 (
(__m512i) __S, __U, (__m512i) _mm512_cvtepi8_epi16 (__A), 8));
@@ -558,7 +558,7 @@ _mm512_mask_cvtbf8_ph (__m512h __S, __mmask16 __U, __m256i __A)
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_maskz_cvtbf8_ph (__mmask16 __U, __m256i __A)
+_mm512_maskz_cvtbf8_ph (__mmask32 __U, __m256i __A)
{
return (__m512h) _mm512_castsi512_ph ((__m512i) _mm512_slli_epi16 (
(__m512i) _mm512_maskz_cvtepi8_epi16 (__U, __A), 8));
diff --git a/gcc/config/i386/avx10_2convertintrin.h b/gcc/config/i386/avx10_2convertintrin.h
index 3fc51b1..7c9c238 100644
--- a/gcc/config/i386/avx10_2convertintrin.h
+++ b/gcc/config/i386/avx10_2convertintrin.h
@@ -1004,7 +1004,7 @@ _mm256_cvtbf8_ph (__m128i __A)
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cvtbf8_ph (__m256h __S, __mmask8 __U, __m128i __A)
+_mm256_mask_cvtbf8_ph (__m256h __S, __mmask16 __U, __m128i __A)
{
return (__m256h) _mm256_castsi256_ph ((__m256i) _mm256_mask_slli_epi16 (
(__m256i) __S, __U, (__m256i) _mm256_cvtepi8_epi16 (__A), 8));
@@ -1012,7 +1012,7 @@ _mm256_mask_cvtbf8_ph (__m256h __S, __mmask8 __U, __m128i __A)
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_cvtbf8_ph (__mmask8 __U, __m128i __A)
+_mm256_maskz_cvtbf8_ph (__mmask16 __U, __m128i __A)
{
return (__m256h) _mm256_castsi256_ph ((__m256i) _mm256_slli_epi16 (
(__m256i) _mm256_maskz_cvtepi8_epi16 (__U, __A), 8));
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-convert-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-convert-1.c
index bda74b5..c1e44ef 100644
--- a/gcc/testsuite/gcc.target/i386/avx10_2-512-convert-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-convert-1.c
@@ -183,6 +183,6 @@ void extern
avx10_2_512_cvtbf8_fp16_test (void)
{
y = _mm512_cvtbf8_ph (z1);
- y = _mm512_mask_cvtbf8_ph (z, m16, z1);
- y = _mm512_maskz_cvtbf8_ph (m16, z1);
+ y = _mm512_mask_cvtbf8_ph (z, m32, z1);
+ y = _mm512_maskz_cvtbf8_ph (m32, z1);
}
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-convert-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-convert-1.c
index 57b5fce..729496f 100644
--- a/gcc/testsuite/gcc.target/i386/avx10_2-convert-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-convert-1.c
@@ -289,6 +289,6 @@ avx10_2_cvtbf8_fp16_test (void)
y = _mm_maskz_cvtbf8_ph (m8, z3);
y2 = _mm256_cvtbf8_ph (z3);
- y2 = _mm256_mask_cvtbf8_ph (z2, m8, z3);
- y2 = _mm256_maskz_cvtbf8_ph (m8, z3);
+ y2 = _mm256_mask_cvtbf8_ph (z2, m16, z3);
+ y2 = _mm256_maskz_cvtbf8_ph (m16, z3);
}