diff options
author | Andrew Stubbs <ams@codesourcery.com> | 2022-02-24 17:16:13 +0000 |
---|---|---|
committer | Andrew Stubbs <ams@codesourcery.com> | 2022-05-24 16:18:14 +0100 |
commit | cde52d3a2d02d037da53e6974d5e39021030b346 (patch) | |
tree | 9b8c9566629ee5915934617814b4ec080f7632ce /gcc/config | |
parent | 8086230e7ac619c0b0eeb6e15df7975ac214725f (diff) | |
download | gcc-cde52d3a2d02d037da53e6974d5e39021030b346.zip gcc-cde52d3a2d02d037da53e6974d5e39021030b346.tar.gz gcc-cde52d3a2d02d037da53e6974d5e39021030b346.tar.bz2 |
amdgcn: Add gfx90a support
This adds architecture options and multilibs for the AMD GFX90a GPUs.
It also tidies up some of the ISA selection code, and corrects a few small
mistakes in the gfx908 naming.
gcc/ChangeLog:
* config.gcc (amdgcn): Accept --with-arch=gfx908 and gfx90a.
* config/gcn/gcn-opts.h (enum gcn_isa): New.
(TARGET_GCN3): Use enum gcn_isa.
(TARGET_GCN3_PLUS): Likewise.
(TARGET_GCN5): Likewise.
(TARGET_GCN5_PLUS): Likewise.
(TARGET_CDNA1): New.
(TARGET_CDNA1_PLUS): New.
(TARGET_CDNA2): New.
(TARGET_CDNA2_PLUS): New.
(TARGET_M0_LDS_LIMIT): New.
(TARGET_PACKED_WORK_ITEMS): New.
* config/gcn/gcn.cc (gcn_isa): Change to enum gcn_isa.
(gcn_option_override): Recognise CDNA ISA variants.
(gcn_omp_device_kind_arch_isa): Support gfx90a.
(gcn_expand_prologue): Make m0 init optional.
Add support for packed work items.
(output_file_start): Support gfx90a.
(gcn_hsa_declare_function_name): Support gfx90a metadata.
* config/gcn/gcn.h (TARGET_CPU_CPP_BUILTINS): Add __CDNA1__ and
__CDNA2__.
* config/gcn/gcn.md (<su>mulsi3_highpart): Use TARGET_GCN5_PLUS.
(<su>mulsi3_highpart_imm): Likewise.
(<su>mulsidi3): Likewise.
(<su>mulsidi3_imm): Likewise.
* config/gcn/gcn.opt (gpu_type): Add gfx90a.
* config/gcn/mkoffload.cc (EF_AMDGPU_MACH_AMDGCN_GFX90a): New.
(main): Support gfx90a.
* config/gcn/t-gcn-hsa: Add gfx90a multilib.
* config/gcn/t-omp-device: Add gfx90a isa.
libgomp/ChangeLog:
* plugin/plugin-gcn.c (EF_AMDGPU_MACH): Add
EF_AMDGPU_MACH_AMDGCN_GFX90a.
(gcn_gfx90a_s): New.
(isa_hsa_name): Support gfx90a.
(isa_code): Likewise.
Diffstat (limited to 'gcc/config')
-rw-r--r-- | gcc/config/gcn/gcn-opts.h | 28 | ||||
-rw-r--r-- | gcc/config/gcn/gcn.cc | 60 | ||||
-rw-r--r-- | gcc/config/gcn/gcn.h | 4 | ||||
-rw-r--r-- | gcc/config/gcn/gcn.md | 8 | ||||
-rw-r--r-- | gcc/config/gcn/gcn.opt | 3 | ||||
-rw-r--r-- | gcc/config/gcn/mkoffload.cc | 4 | ||||
-rw-r--r-- | gcc/config/gcn/t-gcn-hsa | 4 | ||||
-rw-r--r-- | gcc/config/gcn/t-omp-device | 2 |
8 files changed, 91 insertions, 22 deletions
diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h index c080524..b62dfb4 100644 --- a/gcc/config/gcn/gcn-opts.h +++ b/gcc/config/gcn/gcn-opts.h @@ -23,16 +23,30 @@ enum processor_type PROCESSOR_FIJI, // gfx803 PROCESSOR_VEGA10, // gfx900 PROCESSOR_VEGA20, // gfx906 - PROCESSOR_GFX908 // as yet unnamed + PROCESSOR_GFX908, + PROCESSOR_GFX90a }; /* Set in gcn_option_override. */ -extern int gcn_isa; - -#define TARGET_GCN3 (gcn_isa == 3) -#define TARGET_GCN3_PLUS (gcn_isa >= 3) -#define TARGET_GCN5 (gcn_isa == 5) -#define TARGET_GCN5_PLUS (gcn_isa >= 5) +extern enum gcn_isa { + ISA_UNKNOWN, + ISA_GCN3, + ISA_GCN5, + ISA_CDNA1, + ISA_CDNA2 +} gcn_isa; + +#define TARGET_GCN3 (gcn_isa == ISA_GCN3) +#define TARGET_GCN3_PLUS (gcn_isa >= ISA_GCN3) +#define TARGET_GCN5 (gcn_isa == ISA_GCN5) +#define TARGET_GCN5_PLUS (gcn_isa >= ISA_GCN5) +#define TARGET_CDNA1 (gcn_isa == ISA_CDNA1) +#define TARGET_CDNA1_PLUS (gcn_isa >= ISA_CDNA1) +#define TARGET_CDNA2 (gcn_isa == ISA_CDNA2) +#define TARGET_CDNA2_PLUS (gcn_isa >= ISA_CDNA2) + +#define TARGET_M0_LDS_LIMIT (TARGET_GCN3) +#define TARGET_PACKED_WORK_ITEMS (TARGET_CDNA2_PLUS) enum sram_ecc_type { diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index 39a7a96..5e75a1b 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -66,7 +66,7 @@ static bool ext_gcn_constants_init = 0; /* Holds the ISA variant, derived from the command line parameters. */ -int gcn_isa = 3; /* Default to GCN3. */ +enum gcn_isa gcn_isa = ISA_GCN3; /* Default to GCN3. */ /* Reserve this much space for LDS (for propagating variables from worker-single mode to worker-partitioned mode), per workgroup. Global @@ -129,7 +129,13 @@ gcn_option_override (void) if (!flag_pic) flag_pic = flag_pie; - gcn_isa = gcn_arch == PROCESSOR_FIJI ? 3 : 5; + gcn_isa = (gcn_arch == PROCESSOR_FIJI ? ISA_GCN3 + : gcn_arch == PROCESSOR_VEGA10 ? ISA_GCN5 + : gcn_arch == PROCESSOR_VEGA20 ? ISA_GCN5 + : gcn_arch == PROCESSOR_GFX908 ? 
ISA_CDNA1 + : gcn_arch == PROCESSOR_GFX90a ? ISA_CDNA2 + : ISA_UNKNOWN); + gcc_assert (gcn_isa != ISA_UNKNOWN); /* The default stack size needs to be small for offload kernels because there may be many, many threads. Also, a smaller stack gives a @@ -2642,6 +2648,8 @@ gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait, return gcn_arch == PROCESSOR_VEGA20; if (strcmp (name, "gfx908") == 0) return gcn_arch == PROCESSOR_GFX908; + if (strcmp (name, "gfx90a") == 0) + return gcn_arch == PROCESSOR_GFX90a; return 0; default: gcc_unreachable (); @@ -3081,13 +3089,35 @@ gcn_expand_prologue () /* Ensure that the scheduler doesn't do anything unexpected. */ emit_insn (gen_blockage ()); - /* m0 is initialized for the usual LDS DS and FLAT memory case. - The low-part is the address of the topmost addressable byte, which is - size-1. The high-part is an offset and should be zero. */ - emit_move_insn (gen_rtx_REG (SImode, M0_REG), - gen_int_mode (LDS_SIZE, SImode)); + if (TARGET_M0_LDS_LIMIT) + { + /* m0 is initialized for the usual LDS DS and FLAT memory case. + The low-part is the address of the topmost addressable byte, which is + size-1. The high-part is an offset and should be zero. */ + emit_move_insn (gen_rtx_REG (SImode, M0_REG), + gen_int_mode (LDS_SIZE, SImode)); + + emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG))); + } - emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG))); + if (TARGET_PACKED_WORK_ITEMS + && cfun && cfun->machine && !cfun->machine->normal_function) + { + /* v0 conatins the X, Y and Z dimensions all in one. + Expand them out for ABI compatibility. */ + /* TODO: implement and use zero_extract. 
*/ + rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1)); + emit_insn (gen_andv64si3 (v1, gen_rtx_REG (V64SImode, VGPR_REGNO (0)), + gen_rtx_CONST_INT (VOIDmode, 0x3FF << 10))); + emit_insn (gen_lshrv64si3 (v1, v1, gen_rtx_CONST_INT (VOIDmode, 10))); + emit_insn (gen_prologue_use (v1)); + + rtx v2 = gen_rtx_REG (V64SImode, VGPR_REGNO (2)); + emit_insn (gen_andv64si3 (v2, gen_rtx_REG (V64SImode, VGPR_REGNO (0)), + gen_rtx_CONST_INT (VOIDmode, 0x3FF << 20))); + emit_insn (gen_lshrv64si3 (v2, v2, gen_rtx_CONST_INT (VOIDmode, 20))); + emit_insn (gen_prologue_use (v2)); + } if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp) { @@ -5243,6 +5273,9 @@ output_file_start (void) case PROCESSOR_GFX908: cpu = "gfx908"; break; + case PROCESSOR_GFX90a: + cpu = "gfx90a"; + break; default: gcc_unreachable (); } @@ -5296,6 +5329,10 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) sgpr = MAX_NORMAL_SGPR_COUNT; } + /* The gfx90a accum_offset field can't represent 0 registers. */ + if (gcn_arch == PROCESSOR_GFX90a && vgpr < 4) + vgpr = 4; + fputs ("\t.rodata\n" "\t.p2align\t6\n" "\t.amdhsa_kernel\t", file); @@ -5364,6 +5401,11 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) one 64th the wave-front stack size. 
*/ stack_size_opt / 64, LDS_SIZE); + if (gcn_arch == PROCESSOR_GFX90a) + fprintf (file, + "\t .amdhsa_accum_offset\t%i\n" + "\t .amdhsa_tg_split\t0\n", + (vgpr+3)&~3); // I think this means the AGPRs come after the VGPRs fputs ("\t.end_amdhsa_kernel\n", file); #if 1 @@ -5392,6 +5434,8 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) LDS_SIZE, stack_size_opt / 64, sgpr, vgpr); + if (gcn_arch == PROCESSOR_GFX90a) + fprintf (file, " .agpr_count: 0\n"); // AGPRs are not used, yet fputs (" .end_amdgpu_metadata\n", file); #endif diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h index 9ae8919..a129760 100644 --- a/gcc/config/gcn/gcn.h +++ b/gcc/config/gcn/gcn.h @@ -24,6 +24,10 @@ builtin_define ("__GCN3__"); \ else if (TARGET_GCN5) \ builtin_define ("__GCN5__"); \ + else if (TARGET_CDNA1) \ + builtin_define ("__CDNA1__"); \ + else if (TARGET_CDNA2) \ + builtin_define ("__CDNA2__"); \ } \ while(0) diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index 21a7476..53e846e 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -1410,7 +1410,7 @@ "" { if (can_create_pseudo_p () - && !TARGET_GCN5 + && !TARGET_GCN5_PLUS && !gcn_inline_immediate_operand (operands[2], SImode)) operands[2] = force_reg (SImode, operands[2]); @@ -1451,7 +1451,7 @@ (match_operand:SI 1 "register_operand" "Sg,Sg,v")) (match_operand:DI 2 "gcn_32bit_immediate_operand" "A, B,A")) (const_int 32))))] - "TARGET_GCN5 || gcn_inline_immediate_operand (operands[2], SImode)" + "TARGET_GCN5_PLUS || gcn_inline_immediate_operand (operands[2], SImode)" "@ s_mul_hi<sgnsuffix>0\t%0, %1, %2 s_mul_hi<sgnsuffix>0\t%0, %1, %2 @@ -1469,7 +1469,7 @@ "" { if (can_create_pseudo_p () - && !TARGET_GCN5 + && !TARGET_GCN5_PLUS && !gcn_inline_immediate_operand (operands[2], SImode)) operands[2] = force_reg (SImode, operands[2]); @@ -1506,7 +1506,7 @@ (match_operand:SI 1 "register_operand" "Sg, Sg, v")) (match_operand:DI 2 "gcn_32bit_immediate_operand" "A, B, A")))] - "TARGET_GCN5 || 
gcn_inline_immediate_operand (operands[2], SImode)" + "TARGET_GCN5_PLUS || gcn_inline_immediate_operand (operands[2], SImode)" "#" "&& reload_completed" [(const_int 0)] diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt index 54da11f..9606aaf 100644 --- a/gcc/config/gcn/gcn.opt +++ b/gcc/config/gcn/gcn.opt @@ -37,6 +37,9 @@ Enum(gpu_type) String(gfx906) Value(PROCESSOR_VEGA20) EnumValue Enum(gpu_type) String(gfx908) Value(PROCESSOR_GFX908) +EnumValue +Enum(gpu_type) String(gfx90a) Value(PROCESSOR_GFX90a) + march= Target RejectNegative Joined ToLower Enum(gpu_type) Var(gcn_arch) Init(PROCESSOR_FIJI) Specify the name of the target GPU. diff --git a/gcc/config/gcn/mkoffload.cc b/gcc/config/gcn/mkoffload.cc index e98277c4..ed93ae8 100644 --- a/gcc/config/gcn/mkoffload.cc +++ b/gcc/config/gcn/mkoffload.cc @@ -55,6 +55,8 @@ #define EF_AMDGPU_MACH_AMDGCN_GFX906 0x2f #undef EF_AMDGPU_MACH_AMDGCN_GFX908 #define EF_AMDGPU_MACH_AMDGCN_GFX908 0x30 +#undef EF_AMDGPU_MACH_AMDGCN_GFX90a +#define EF_AMDGPU_MACH_AMDGCN_GFX90a 0x3f #define EF_AMDGPU_FEATURE_XNACK_V4 0x300 /* Mask. 
*/ #define EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 0x000 @@ -904,6 +906,8 @@ main (int argc, char **argv) elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX906; else if (strcmp (argv[i], "-march=gfx908") == 0) elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX908; + else if (strcmp (argv[i], "-march=gfx90a") == 0) + elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX90a; } if (!(fopenacc ^ fopenmp)) diff --git a/gcc/config/gcn/t-gcn-hsa b/gcc/config/gcn/t-gcn-hsa index 10e31f3..9e03ec8 100644 --- a/gcc/config/gcn/t-gcn-hsa +++ b/gcc/config/gcn/t-gcn-hsa @@ -42,8 +42,8 @@ ALL_HOST_OBJS += gcn-run.o gcn-run$(exeext): gcn-run.o +$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ $< -ldl -MULTILIB_OPTIONS = march=gfx900/march=gfx906/march=gfx908 -MULTILIB_DIRNAMES = gfx900 gfx906 gfx908 +MULTILIB_OPTIONS = march=gfx900/march=gfx906/march=gfx908/march=gfx90a +MULTILIB_DIRNAMES = gfx900 gfx906 gfx908 gfx90a gcn-tree.o: $(srcdir)/config/gcn/gcn-tree.cc $(COMPILE) $< diff --git a/gcc/config/gcn/t-omp-device b/gcc/config/gcn/t-omp-device index e1d9e0d..27d36db 100644 --- a/gcc/config/gcn/t-omp-device +++ b/gcc/config/gcn/t-omp-device @@ -1,4 +1,4 @@ omp-device-properties-gcn: $(srcdir)/config/gcn/gcn.cc echo kind: gpu > $@ echo arch: amdgcn gcn >> $@ - echo isa: fiji gfx900 gfx906 gfx908 >> $@ + echo isa: fiji gfx900 gfx906 gfx908 gfx90a >> $@ |