/* Macro library used to help during conversion of scalar math functions to
   vectorized SIMD equivalents on AMD GCN.

   Copyright (C) 2023-2025 Free Software Foundation, Inc.
   Contributed by Siemens.

   This file is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the
   Free Software Foundation; either version 3, or (at your option) any
   later version.

   This file is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

typedef union {
  v2sf t_v2sf;
  v4sf t_v4sf;
  v8sf t_v8sf;
  v16sf t_v16sf;
  v32sf t_v32sf;
  v64sf t_v64sf;
  v2df t_v2df;
  v4df t_v4df;
  v8df t_v8df;
  v16df t_v16df;
  v32df t_v32df;
  v64df t_v64df;
  v64qi t_v64qi;
  v64hi t_v64hi;
  v2si t_v2si;
  v4si t_v4si;
  v8si t_v8si;
  v16si t_v16si;
  v32si t_v32si;
  v64si t_v64si;
  v64usi t_v64usi;
  v2di t_v2di;
  v4di t_v4di;
  v8di t_v8di;
  v16di t_v16di;
  v32di t_v32di;
  v64di t_v64di;
} vector_union;

/* Cast between vectors with a different number of elements, or type.  */
#define VGPR_CAST(to_t, from) \
({ \
  to_t __res; \
  __asm__ ("" : "=v"(__res) : "0"(from)); \
  __res; \
})

#define PACK_SI_PAIR(low, high) \
({ \
  v64udi __res; \
  asm ("v_mov_b32\t%L0, %1\n\t" \
       "v_mov_b32\t%H0, %2" \
       : "=&v"(__res) : "v0"(low), "v"(high), "e"(-1L)); \
  __res; \
})

#define UNPACK_SI_LOW(to_t, pair) VGPR_CAST(to_t, pair)
#define UNPACK_SI_HIGH(to_t, pair) \
({ \
  to_t __res; \
  asm ("v_mov_b32\t%0, %H1" : "=v"(__res) : "v"(pair), "e"(-1L)); \
  __res; \
})

#define PACK_DI_PAIR(low, high) \
({ \
  v64uti __res; \
  asm ("v_mov_b32\t%L0, %L1\n\t" \
       "v_mov_b32\t%H0, %H1\n\t" \
       "v_mov_b32\t%J0, %L2\n\t" \
       "v_mov_b32\t%K0, %H2" \
       : "=&v"(__res) : "v0"(low), "v"(high), "e"(-1L)); \
  __res; \
})

#define UNPACK_DI_LOW(to_t, pair) VGPR_CAST(to_t, pair)
#define UNPACK_DI_HIGH(to_t, pair) \
({ \
  to_t __res; \
  asm ("v_mov_b32\t%L0, %J1\n\t" \
       "v_mov_b32\t%H0, %K1" : "=v"(__res) : "v"(pair), "e"(-1L)); \
  __res; \
})
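/* A usage sketch for the pack/unpack helpers above.  Illustrative only:
   the variables below are hypothetical and not part of this library.

     v64si __lo = ..., __hi = ...;
     v64udi __pair = PACK_SI_PAIR (__lo, __hi);
     v64si __lo2 = UNPACK_SI_LOW (v64si, __pair);
     v64si __hi2 = UNPACK_SI_HIGH (v64si, __pair);

   Each lane of __pair holds the corresponding __hi:__lo lanes as one
   64-bit value; the unpack macros recover the 32-bit halves.  */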
#define NO_COND __mask

/* Note - __mask is _not_ accounted for in VECTOR_MERGE!  */
#define VECTOR_MERGE(vec1, vec2, cond) \
({ \
  _Static_assert (__builtin_types_compatible_p (typeof (vec1), typeof (vec2))); \
  union { \
    typeof (vec1) val; \
    v64qi t_v64qi; \
    v64hi t_v64hi; \
    v64si t_v64si; \
    v64di t_v64di; \
  } __vec1, __vec2, __res; \
  __vec1.val = (vec1); \
  __vec2.val = (vec2); \
  __builtin_choose_expr ( \
    sizeof (vec1) == sizeof (v64si), \
    ({ \
      v64si __bitmask = __builtin_convertvector ((cond), v64si); \
      __res.t_v64si = (__vec1.t_v64si & __bitmask) \
                      | (__vec2.t_v64si & ~__bitmask); \
    }), \
    __builtin_choose_expr ( \
      sizeof (vec1) == sizeof (v64hi), \
      ({ \
        v64hi __bitmask = __builtin_convertvector ((cond), v64hi); \
        __res.t_v64hi = (__vec1.t_v64hi & __bitmask) \
                        | (__vec2.t_v64hi & ~__bitmask); \
      }), \
      __builtin_choose_expr ( \
        sizeof (vec1) == sizeof (v64qi), \
        ({ \
          v64qi __bitmask = __builtin_convertvector ((cond), v64qi); \
          __res.t_v64qi = (__vec1.t_v64qi & __bitmask) \
                          | (__vec2.t_v64qi & ~__bitmask); \
        }), \
        ({ \
          v64di __bitmask = __builtin_convertvector ((cond), v64di); \
          __res.t_v64di = (__vec1.t_v64di & __bitmask) \
                          | (__vec2.t_v64di & ~__bitmask); \
        })))); \
  __res.val; \
})

#define VECTOR_COND_MOVE(var, val, cond) \
do { \
  _Static_assert (__builtin_types_compatible_p (typeof (var), typeof (val))); \
  __auto_type __cond = __builtin_convertvector ((cond), typeof (__mask)); \
  var = VECTOR_MERGE ((val), var, __cond & __mask); \
} while (0)

#define VECTOR_IF(cond, cond_var) \
{ \
  __auto_type cond_var = (cond); \
  __auto_type __inv_cond __attribute__((unused)) = ~cond_var; \
  if (!ALL_ZEROES_P (cond_var)) \
    {

#define VECTOR_ELSEIF(cond, cond_var) \
    } \
  cond_var = __inv_cond & (cond); \
  __inv_cond &= ~(cond); \
  if (!ALL_ZEROES_P (cond_var)) \
    {

#define VECTOR_ELSE(cond_var) \
    } \
  cond_var = __inv_cond; \
  if (!ALL_ZEROES_P (cond_var)) \
    {

#define VECTOR_IF2(cond, cond_var, prev_cond_var) \
{ \
  __auto_type cond_var = (cond) & __builtin_convertvector (prev_cond_var, typeof (cond)); \
  __auto_type __inv_cond __attribute__((unused)) = ~cond_var; \
  if (!ALL_ZEROES_P (cond_var)) \
    {

#define VECTOR_ELSEIF2(cond, cond_var, prev_cond_var) \
    } \
  cond_var = (cond) & __inv_cond & __builtin_convertvector (prev_cond_var, typeof (cond)); \
  __inv_cond &= ~(cond); \
  if (!ALL_ZEROES_P (cond_var)) \
    {

#define VECTOR_ELSE2(cond_var, prev_cond_var) \
    } \
  cond_var = __inv_cond & __builtin_convertvector (prev_cond_var, typeof (__inv_cond)); \
  if (!ALL_ZEROES_P (cond_var)) \
    {

#define VECTOR_ENDIF \
    } \
}

#define VECTOR_INIT_AUX(x, type) \
({ \
  typeof (x) __e = (x); \
  type __tmp = { \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e }; \
  __tmp; \
})

#define VECTOR_INIT(x) \
  (_Generic ((x), int: VECTOR_INIT_AUX ((x), v64si), \
             unsigned: VECTOR_INIT_AUX ((x), v64usi), \
             char: VECTOR_INIT_AUX ((x), v64qi), \
             unsigned char: VECTOR_INIT_AUX ((x), v64uqi), \
             short: VECTOR_INIT_AUX ((x), v64hi), \
             unsigned short: VECTOR_INIT_AUX ((x), v64uhi), \
             long: VECTOR_INIT_AUX ((x), v64di), \
             unsigned long: VECTOR_INIT_AUX ((x), v64udi), \
             float: VECTOR_INIT_AUX ((x), v64sf), \
             double: VECTOR_INIT_AUX ((x), v64df)))

#if defined (__GCN3__) || defined (__GCN5__) \
    || defined (__CDNA1__) || defined (__CDNA2__) \
    || defined (__RDNA2__) || defined (__RDNA3__)
#define CDNA3_PLUS 0
#else
#define CDNA3_PLUS 1
#endif
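/* A usage sketch for the lane-masked control flow above.  Illustrative
   only: __x and __cond are hypothetical names, and a live __mask (see
   VECTOR_INIT_MASK below) must be in scope for VECTOR_COND_MOVE.  The
   scalar pattern "if (x < 0.0) x = -x;" becomes:

     VECTOR_IF (__x < VECTOR_INIT (0.0), __cond)
       VECTOR_COND_MOVE (__x, -__x, __cond);
     VECTOR_ENDIF

   Each lane keeps its own result; lanes where the condition is false
   retain the previous value of __x.  */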
#define VECTOR_INIT_MASK(COUNT) \
({ \
  MASKMODE __mask; \
  int count = (COUNT); \
  if (count == 64) \
    { \
      if (sizeof (MASKMODE) < 512 || CDNA3_PLUS) \
        asm ("v_mov%B0\t%0, -1" : "=v"(__mask) : "e"(-1L)); \
      else \
        asm ("v_mov_b32\t%L0, -1\n\t" \
             "v_mov_b32\t%H0, -1" : "=v"(__mask) : "e"(-1L)); \
    } \
  else \
    { \
      long bitmask = (count == 64 ? -1 : (1L << count) - 1); \
      /* Assumed tail (the original text was truncated here): clear all \
         lanes, then set the low COUNT lanes under the bitmask exec \
         mask, mirroring the count == 64 asm patterns above.  */ \
      if (sizeof (MASKMODE) < 512 || CDNA3_PLUS) \
        { \
          asm ("v_mov%B0\t%0, 0" : "=v"(__mask) : "e"(-1L)); \
          asm ("v_mov%B0\t%0, -1" : "+v"(__mask) : "e"(bitmask)); \
        } \
      else \
        { \
          asm ("v_mov_b32\t%L0, 0\n\t" \
               "v_mov_b32\t%H0, 0" : "=v"(__mask) : "e"(-1L)); \
          asm ("v_mov_b32\t%L0, -1\n\t" \
               "v_mov_b32\t%H0, -1" : "+v"(__mask) : "e"(bitmask)); \
        } \
    } \
  __mask; \
})
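/* A usage sketch for VECTOR_INIT_MASK.  Illustrative only: __n is a
   hypothetical name, and the including file must define MASKMODE (the
   lane-mask vector type) beforehand; v64si below is an assumption, not
   a requirement of the macro:

     #define MASKMODE v64si
     ...
     v64si __mask = VECTOR_INIT_MASK (__n);

   Lanes 0 .. __n-1 of __mask are set to all-ones; the rest are zero.  */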