diff options
Diffstat (limited to 'gcc/config/gcn')
-rw-r--r-- | gcc/config/gcn/gcn-hsa.h | 8 | ||||
-rw-r--r-- | gcc/config/gcn/gcn-opts.h | 5 | ||||
-rw-r--r-- | gcc/config/gcn/gcn-run.c | 154 | ||||
-rw-r--r-- | gcc/config/gcn/gcn.c | 231 | ||||
-rw-r--r-- | gcc/config/gcn/gcn.h | 2 | ||||
-rw-r--r-- | gcc/config/gcn/gcn.opt | 4 |
6 files changed, 116 insertions, 288 deletions
diff --git a/gcc/config/gcn/gcn-hsa.h b/gcc/config/gcn/gcn-hsa.h index 2eaf414..4fd1365 100644 --- a/gcc/config/gcn/gcn-hsa.h +++ b/gcc/config/gcn/gcn-hsa.h @@ -18,8 +18,8 @@ #error elf.h included before elfos.h #endif -#define TEXT_SECTION_ASM_OP "\t.section\t.text" -#define BSS_SECTION_ASM_OP "\t.section\t.bss" +#define TEXT_SECTION_ASM_OP "\t.text" +#define BSS_SECTION_ASM_OP "\t.bss" #define GLOBAL_ASM_OP "\t.globl\t" #define DATA_SECTION_ASM_OP "\t.data\t" #define SET_ASM_OP "\t.set\t" @@ -76,10 +76,10 @@ extern unsigned int gcn_local_sym_hash (const char *name); #define GOMP_SELF_SPECS "" /* Use LLVM assembler and linker options. */ -#define ASM_SPEC "-triple=amdgcn--amdhsa -mattr=-code-object-v3 " \ +#define ASM_SPEC "-triple=amdgcn--amdhsa " \ "%:last_arg(%{march=*:-mcpu=%*}) " \ "-filetype=obj" -#define LINK_SPEC "--pie" +#define LINK_SPEC "--pie --export-dynamic" #define LIB_SPEC "-lc" /* Provides a _start symbol to keep the linker happy. */ diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h index 385d2be..8eefb7a 100644 --- a/gcc/config/gcn/gcn-opts.h +++ b/gcc/config/gcn/gcn-opts.h @@ -20,8 +20,9 @@ /* Which processor to generate code or schedule for. */ enum processor_type { - PROCESSOR_FIJI, - PROCESSOR_VEGA + PROCESSOR_FIJI, // gfx803 + PROCESSOR_VEGA10, // gfx900 + PROCESSOR_VEGA20 // gfx906 }; /* Set in gcn_option_override. */ diff --git a/gcc/config/gcn/gcn-run.c b/gcc/config/gcn/gcn-run.c index 1e952e9..8961ea1 100644 --- a/gcc/config/gcn/gcn-run.c +++ b/gcc/config/gcn/gcn-run.c @@ -55,7 +55,7 @@ #include "hsa.h" #ifndef HSA_RUNTIME_LIB -#define HSA_RUNTIME_LIB "libhsa-runtime64.so" +#define HSA_RUNTIME_LIB "libhsa-runtime64.so.1" #endif #ifndef VERSION_STRING @@ -429,20 +429,6 @@ load_image (const char *filename) &executable), "Initialize GCN executable"); - /* Hide relocations from the HSA runtime loader. - Keep a copy of the unmodified section headers to use later. */ - Elf64_Shdr *image_sections = - (Elf64_Shdr *) ((char *) image + image->e_shoff); - Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum); - memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum); - for (int i = image->e_shnum - 1; i >= 0; i--) - { - if (image_sections[i].sh_type == SHT_RELA - || image_sections[i].sh_type == SHT_REL) - /* Change section type to something harmless. */ - image_sections[i].sh_type = SHT_NOTE; - } - /* Add the HSACO to the executable. */ hsa_code_object_t co = { 0 }; XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co), @@ -457,23 +443,27 @@ load_image (const char *filename) /* Locate the "_init_array" function, and read the kernel's properties. */ hsa_executable_symbol_t symbol; - XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_init_array", - device, 0, &symbol), + XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, + "_init_array.kd", device, 0, + &symbol), "Find '_init_array' function"); XHSA (hsa_fns.hsa_executable_symbol_get_info_fn - (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &init_array_kernel), + (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &init_array_kernel), "Extract '_init_array' kernel object kernel object"); /* Locate the "_fini_array" function, and read the kernel's properties. */ - XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_fini_array", - device, 0, &symbol), + XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, + "_fini_array.kd", device, 0, + &symbol), "Find '_fini_array' function"); XHSA (hsa_fns.hsa_executable_symbol_get_info_fn - (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &fini_array_kernel), + (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &fini_array_kernel), "Extract '_fini_array' kernel object kernel object"); /* Locate the "main" function, and read the kernel's properties. */ - XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main", + XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main.kd", device, 0, &symbol), "Find 'main' function"); XHSA (hsa_fns.hsa_executable_symbol_get_info_fn @@ -491,126 +481,6 @@ load_image (const char *filename) (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_segment_size), "Extract private segment size"); - - /* Find main function in ELF, and calculate actual load offset. */ - Elf64_Addr load_offset; - XHSA (hsa_fns.hsa_executable_symbol_get_info_fn - (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, - &load_offset), - "Extract 'main' symbol address"); - for (int i = 0; i < image->e_shnum; i++) - if (sections[i].sh_type == SHT_SYMTAB) - { - Elf64_Shdr *strtab = §ions[sections[i].sh_link]; - char *strings = (char *) image + strtab->sh_offset; - - for (size_t offset = 0; - offset < sections[i].sh_size; - offset += sections[i].sh_entsize) - { - Elf64_Sym *sym = (Elf64_Sym *) ((char *) image - + sections[i].sh_offset + offset); - if (strcmp ("main", strings + sym->st_name) == 0) - { - load_offset -= sym->st_value; - goto found_main; - } - } - } - /* We only get here when main was not found. - This should never happen. */ - fprintf (stderr, "Error: main function not found.\n"); - abort (); -found_main:; - - /* Find dynamic symbol table. */ - Elf64_Shdr *dynsym = NULL; - for (int i = 0; i < image->e_shnum; i++) - if (sections[i].sh_type == SHT_DYNSYM) - { - dynsym = §ions[i]; - break; - } - - /* Fix up relocations. */ - for (int i = 0; i < image->e_shnum; i++) - { - if (sections[i].sh_type == SHT_RELA) - for (size_t offset = 0; - offset < sections[i].sh_size; - offset += sections[i].sh_entsize) - { - Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image - + sections[i].sh_offset - + offset); - Elf64_Sym *sym = - (dynsym - ? (Elf64_Sym *) ((char *) image - + dynsym->sh_offset - + (dynsym->sh_entsize - * ELF64_R_SYM (reloc->r_info))) : NULL); - - int64_t S = (sym ? sym->st_value : 0); - int64_t P = reloc->r_offset + load_offset; - int64_t A = reloc->r_addend; - int64_t B = load_offset; - int64_t V, size; - switch (ELF64_R_TYPE (reloc->r_info)) - { - case R_AMDGPU_ABS32_LO: - V = (S + A) & 0xFFFFFFFF; - size = 4; - break; - case R_AMDGPU_ABS32_HI: - V = (S + A) >> 32; - size = 4; - break; - case R_AMDGPU_ABS64: - V = S + A; - size = 8; - break; - case R_AMDGPU_REL32: - V = S + A - P; - size = 4; - break; - case R_AMDGPU_REL64: - /* FIXME - LLD seems to emit REL64 where the assembler has ABS64. - This is clearly wrong because it's not what the compiler - is expecting. Let's assume, for now, that it's a bug. - In any case, GCN kernels are always self contained and - therefore relative relocations will have been resolved - already, so this should be a safe workaround. */ - V = S + A /* - P */ ; - size = 8; - break; - case R_AMDGPU_ABS32: - V = S + A; - size = 4; - break; - /* TODO R_AMDGPU_GOTPCREL */ - /* TODO R_AMDGPU_GOTPCREL32_LO */ - /* TODO R_AMDGPU_GOTPCREL32_HI */ - case R_AMDGPU_REL32_LO: - V = (S + A - P) & 0xFFFFFFFF; - size = 4; - break; - case R_AMDGPU_REL32_HI: - V = (S + A - P) >> 32; - size = 4; - break; - case R_AMDGPU_RELATIVE64: - V = B + A; - size = 8; - break; - default: - fprintf (stderr, "Error: unsupported relocation type.\n"); - exit (1); - } - XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size), - "Fix up relocation"); - } - } } /* Allocate some device memory from the kernargs region. diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 39eb8fd..fff0e8c 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -83,7 +83,7 @@ int gcn_isa = 3; /* Default to GCN3. */ /* The number of registers usable by normal non-kernel functions. The SGPR count includes any special extra registers such as VCC. */ -#define MAX_NORMAL_SGPR_COUNT 64 +#define MAX_NORMAL_SGPR_COUNT 62 // i.e. 64 with VCC #define MAX_NORMAL_VGPR_COUNT 24 /* }}} */ @@ -127,7 +127,7 @@ gcn_option_override (void) if (!flag_pic) flag_pic = flag_pie; - gcn_isa = gcn_arch == PROCESSOR_VEGA ? 5 : 3; + gcn_isa = gcn_arch == PROCESSOR_FIJI ? 3 : 5; /* The default stack size needs to be small for offload kernels because there may be many, many threads. Also, a smaller stack gives a @@ -168,37 +168,31 @@ static const struct gcn_kernel_arg_type {"exec", NULL, DImode, EXEC_REG}, #define PRIVATE_SEGMENT_BUFFER_ARG 1 {"private_segment_buffer", - "enable_sgpr_private_segment_buffer", TImode, -1}, + ".amdhsa_user_sgpr_private_segment_buffer", TImode, -1}, #define DISPATCH_PTR_ARG 2 - {"dispatch_ptr", "enable_sgpr_dispatch_ptr", DImode, -1}, + {"dispatch_ptr", ".amdhsa_user_sgpr_dispatch_ptr", DImode, -1}, #define QUEUE_PTR_ARG 3 - {"queue_ptr", "enable_sgpr_queue_ptr", DImode, -1}, + {"queue_ptr", ".amdhsa_user_sgpr_queue_ptr", DImode, -1}, #define KERNARG_SEGMENT_PTR_ARG 4 - {"kernarg_segment_ptr", "enable_sgpr_kernarg_segment_ptr", DImode, -1}, - {"dispatch_id", "enable_sgpr_dispatch_id", DImode, -1}, + {"kernarg_segment_ptr", ".amdhsa_user_sgpr_kernarg_segment_ptr", DImode, -1}, + {"dispatch_id", ".amdhsa_user_sgpr_dispatch_id", DImode, -1}, #define FLAT_SCRATCH_INIT_ARG 6 - {"flat_scratch_init", "enable_sgpr_flat_scratch_init", DImode, -1}, + {"flat_scratch_init", ".amdhsa_user_sgpr_flat_scratch_init", DImode, -1}, #define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7 - {"private_segment_size", "enable_sgpr_private_segment_size", SImode, -1}, - {"grid_workgroup_count_X", - "enable_sgpr_grid_workgroup_count_x", SImode, -1}, - {"grid_workgroup_count_Y", - "enable_sgpr_grid_workgroup_count_y", SImode, -1}, - {"grid_workgroup_count_Z", - "enable_sgpr_grid_workgroup_count_z", SImode, -1}, -#define WORKGROUP_ID_X_ARG 11 - {"workgroup_id_X", "enable_sgpr_workgroup_id_x", SImode, -2}, - {"workgroup_id_Y", "enable_sgpr_workgroup_id_y", SImode, -2}, - {"workgroup_id_Z", "enable_sgpr_workgroup_id_z", SImode, -2}, - {"workgroup_info", "enable_sgpr_workgroup_info", SImode, -1}, -#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 15 + {"private_segment_size", ".amdhsa_user_sgpr_private_segment_size", SImode, -1}, +#define WORKGROUP_ID_X_ARG 8 + {"workgroup_id_X", ".amdhsa_system_sgpr_workgroup_id_x", SImode, -2}, + {"workgroup_id_Y", ".amdhsa_system_sgpr_workgroup_id_y", SImode, -2}, + {"workgroup_id_Z", ".amdhsa_system_sgpr_workgroup_id_z", SImode, -2}, + {"workgroup_info", ".amdhsa_system_sgpr_workgroup_info", SImode, -1}, +#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 12 {"private_segment_wave_offset", - "enable_sgpr_private_segment_wave_byte_offset", SImode, -2}, -#define WORK_ITEM_ID_X_ARG 16 + ".amdhsa_system_sgpr_private_segment_wavefront_offset", SImode, -2}, +#define WORK_ITEM_ID_X_ARG 13 {"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG}, -#define WORK_ITEM_ID_Y_ARG 17 +#define WORK_ITEM_ID_Y_ARG 14 {"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1}, -#define WORK_ITEM_ID_Z_ARG 18 +#define WORK_ITEM_ID_Z_ARG 15 {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2} }; @@ -2075,7 +2069,7 @@ gcn_conditional_register_usage (void) if (cfun->machine->normal_function) { /* Restrict the set of SGPRs and VGPRs used by non-kernel functions. */ - for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT - 2); + for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT); i <= LAST_SGPR_REG; i++) fixed_regs[i] = 1, call_used_regs[i] = 1; @@ -2574,9 +2568,9 @@ gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait, if (strcmp (name, "fiji") == 0) return gcn_arch == PROCESSOR_FIJI; if (strcmp (name, "gfx900") == 0) - return gcn_arch == PROCESSOR_VEGA; + return gcn_arch == PROCESSOR_VEGA10; if (strcmp (name, "gfx906") == 0) - return gcn_arch == PROCESSOR_VEGA; + return gcn_arch == PROCESSOR_VEGA20; return 0; default: gcc_unreachable (); @@ -4943,11 +4937,16 @@ gcn_fixup_accel_lto_options (tree fndecl) static void output_file_start (void) { - fprintf (asm_out_file, "\t.text\n"); - fprintf (asm_out_file, "\t.hsa_code_object_version 2,0\n"); - fprintf (asm_out_file, "\t.hsa_code_object_isa\n"); /* Autodetect. */ - fprintf (asm_out_file, "\t.section\t.AMDGPU.config\n"); - fprintf (asm_out_file, "\t.text\n"); + char *cpu; + switch (gcn_arch) + { + case PROCESSOR_FIJI: cpu = "gfx803"; break; + case PROCESSOR_VEGA10: cpu = "gfx900"; break; + case PROCESSOR_VEGA20: cpu = "gfx906"; break; + default: gcc_unreachable (); + } + + fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s\"\n", cpu); } /* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h. @@ -4963,7 +4962,8 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) { int sgpr, vgpr; bool xnack_enabled = false; - int extra_regs = 0; + + fputs ("\n\n", file); if (cfun && cfun->machine && cfun->machine->normal_function) { @@ -4986,76 +4986,20 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) break; vgpr++; - if (xnack_enabled) - extra_regs = 6; - if (df_regs_ever_live_p (FLAT_SCRATCH_LO_REG) - || df_regs_ever_live_p (FLAT_SCRATCH_HI_REG)) - extra_regs = 4; - else if (df_regs_ever_live_p (VCC_LO_REG) - || df_regs_ever_live_p (VCC_HI_REG)) - extra_regs = 2; - if (!leaf_function_p ()) { /* We can't know how many registers function calls might use. */ if (vgpr < MAX_NORMAL_VGPR_COUNT) vgpr = MAX_NORMAL_VGPR_COUNT; - if (sgpr + extra_regs < MAX_NORMAL_SGPR_COUNT) - sgpr = MAX_NORMAL_SGPR_COUNT - extra_regs; + if (sgpr < MAX_NORMAL_SGPR_COUNT) + sgpr = MAX_NORMAL_SGPR_COUNT; } - /* GFX8 allocates SGPRs in blocks of 8. - GFX9 uses blocks of 16. */ - int granulated_sgprs; - if (TARGET_GCN3) - granulated_sgprs = (sgpr + extra_regs + 7) / 8 - 1; - else if (TARGET_GCN5) - granulated_sgprs = 2 * ((sgpr + extra_regs + 15) / 16 - 1); - else - gcc_unreachable (); - - fputs ("\t.align\t256\n", file); - fputs ("\t.type\t", file); - assemble_name (file, name); - fputs (",@function\n\t.amdgpu_hsa_kernel\t", file); + fputs ("\t.rodata\n" + "\t.p2align\t6\n" + "\t.amdhsa_kernel\t", file); assemble_name (file, name); fputs ("\n", file); - assemble_name (file, name); - fputs (":\n", file); - fprintf (file, "\t.amd_kernel_code_t\n" - "\t\tkernel_code_version_major = 1\n" - "\t\tkernel_code_version_minor = 0\n" "\t\tmachine_kind = 1\n" - /* "\t\tmachine_version_major = 8\n" - "\t\tmachine_version_minor = 0\n" - "\t\tmachine_version_stepping = 1\n" */ - "\t\tkernel_code_entry_byte_offset = 256\n" - "\t\tkernel_code_prefetch_byte_size = 0\n" - "\t\tmax_scratch_backing_memory_byte_size = 0\n" - "\t\tcompute_pgm_rsrc1_vgprs = %i\n" - "\t\tcompute_pgm_rsrc1_sgprs = %i\n" - "\t\tcompute_pgm_rsrc1_priority = 0\n" - "\t\tcompute_pgm_rsrc1_float_mode = 192\n" - "\t\tcompute_pgm_rsrc1_priv = 0\n" - "\t\tcompute_pgm_rsrc1_dx10_clamp = 1\n" - "\t\tcompute_pgm_rsrc1_debug_mode = 0\n" - "\t\tcompute_pgm_rsrc1_ieee_mode = 1\n" - /* We enable scratch memory. */ - "\t\tcompute_pgm_rsrc2_scratch_en = 1\n" - "\t\tcompute_pgm_rsrc2_user_sgpr = %i\n" - "\t\tcompute_pgm_rsrc2_tgid_x_en = 1\n" - "\t\tcompute_pgm_rsrc2_tgid_y_en = 0\n" - "\t\tcompute_pgm_rsrc2_tgid_z_en = 0\n" - "\t\tcompute_pgm_rsrc2_tg_size_en = 0\n" - "\t\tcompute_pgm_rsrc2_tidig_comp_cnt = 0\n" - "\t\tcompute_pgm_rsrc2_excp_en_msb = 0\n" - "\t\tcompute_pgm_rsrc2_lds_size = 0\n" /* Set at runtime. */ - "\t\tcompute_pgm_rsrc2_excp_en = 0\n", - (vgpr - 1) / 4, - /* Must match wavefront_sgpr_count */ - granulated_sgprs, - /* The total number of SGPR user data registers requested. This - number must match the number of user data registers enabled. */ - cfun->machine->args.nsgprs); int reg = FIRST_SGPR_REG; for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++) { @@ -5073,7 +5017,8 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) if (gcn_kernel_arg_types[a].header_pseudo) { - fprintf (file, "\t\t%s = %i", + fprintf (file, "\t %s%s\t%i", + (cfun->machine->args.requested & (1 << a)) != 0 ? "" : ";", gcn_kernel_arg_types[a].header_pseudo, (cfun->machine->args.requested & (1 << a)) != 0); if (reg_first != -1) @@ -5091,54 +5036,71 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) } else if (gcn_kernel_arg_types[a].fixed_regno >= 0 && cfun->machine->args.requested & (1 << a)) - fprintf (file, "\t\t; %s = %i (%s)\n", + fprintf (file, "\t ; %s\t%i (%s)\n", gcn_kernel_arg_types[a].name, (cfun->machine->args.requested & (1 << a)) != 0, reg_names[gcn_kernel_arg_types[a].fixed_regno]); } - fprintf (file, "\t\tenable_vgpr_workitem_id = %i\n", + fprintf (file, "\t .amdhsa_system_vgpr_workitem_id\t%i\n", (cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG)) ? 2 : cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG) ? 1 : 0); - fprintf (file, "\t\tenable_ordered_append_gds = 0\n" - "\t\tprivate_element_size = 1\n" - "\t\tis_ptr64 = 1\n" - "\t\tis_dynamic_callstack = 0\n" - "\t\tis_debug_enabled = 0\n" - "\t\tis_xnack_enabled = %i\n" - "\t\tworkitem_private_segment_byte_size = %i\n" - "\t\tworkgroup_group_segment_byte_size = %u\n" - "\t\tgds_segment_byte_size = 0\n" - "\t\tkernarg_segment_byte_size = %i\n" - "\t\tworkgroup_fbarrier_count = 0\n" - "\t\twavefront_sgpr_count = %i\n" - "\t\tworkitem_vgpr_count = %i\n" - "\t\treserved_vgpr_first = 0\n" - "\t\treserved_vgpr_count = 0\n" - "\t\treserved_sgpr_first = 0\n" - "\t\treserved_sgpr_count = 0\n" - "\t\tdebug_wavefront_private_segment_offset_sgpr = 0\n" - "\t\tdebug_private_segment_buffer_sgpr = 0\n" - "\t\tkernarg_segment_alignment = %i\n" - "\t\tgroup_segment_alignment = 4\n" - "\t\tprivate_segment_alignment = %i\n" - "\t\twavefront_size = 6\n" - "\t\tcall_convention = 0\n" - "\t\truntime_loader_kernel_symbol = 0\n" - "\t.end_amd_kernel_code_t\n", xnack_enabled, + fprintf (file, + "\t .amdhsa_next_free_vgpr\t%i\n" + "\t .amdhsa_next_free_sgpr\t%i\n" + "\t .amdhsa_reserve_vcc\t1\n" + "\t .amdhsa_reserve_flat_scratch\t0\n" + "\t .amdhsa_reserve_xnack_mask\t%i\n" + "\t .amdhsa_private_segment_fixed_size\t%i\n" + "\t .amdhsa_group_segment_fixed_size\t%u\n" + "\t .amdhsa_float_denorm_mode_32\t3\n" + "\t .amdhsa_float_denorm_mode_16_64\t3\n", + vgpr, + sgpr, + xnack_enabled, /* workitem_private_segment_bytes_size needs to be one 64th the wave-front stack size. */ stack_size_opt / 64, - LDS_SIZE, cfun->machine->kernarg_segment_byte_size, - /* Number of scalar registers used by a wavefront. This - includes the special SGPRs for VCC, Flat Scratch (Base, - Size) and XNACK (for GFX8 (VI)+). It does not include the - 16 SGPR added if a trap handler is enabled. Must match - compute_pgm_rsrc1.sgprs. */ - sgpr + extra_regs, vgpr, + LDS_SIZE); + fputs ("\t.end_amdhsa_kernel\n", file); + +#if 1 + /* The following is YAML embedded in assembler; tabs are not allowed. */ + fputs (" .amdgpu_metadata\n" + " amdhsa.version:\n" + " - 1\n" + " - 0\n" + " amdhsa.kernels:\n" + " - .name: ", file); + assemble_name (file, name); + fputs ("\n .symbol: ", file); + assemble_name (file, name); + fprintf (file, + ".kd\n" + " .kernarg_segment_size: %i\n" + " .kernarg_segment_align: %i\n" + " .group_segment_fixed_size: %u\n" + " .private_segment_fixed_size: %i\n" + " .wavefront_size: 64\n" + " .sgpr_count: %i\n" + " .vgpr_count: %i\n" + " .max_flat_workgroup_size: 1024\n", + cfun->machine->kernarg_segment_byte_size, cfun->machine->kernarg_segment_alignment, - crtl->stack_alignment_needed / 8); + LDS_SIZE, + stack_size_opt / 64, + sgpr, vgpr); + fputs (" .end_amdgpu_metadata\n", file); +#endif + + fputs ("\t.text\n", file); + fputs ("\t.align\t256\n", file); + fputs ("\t.type\t", file); + assemble_name (file, name); + fputs (",@function\n", file); + assemble_name (file, name); + fputs (":\n", file); /* This comment is read by mkoffload. */ if (flag_openacc) @@ -5200,11 +5162,6 @@ gcn_target_asm_function_prologue (FILE *file) asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars); asm_fprintf (file, "\t; outgoing args size: %wd\n", offsets->outgoing_args_size); - - /* Enable denorms. */ - asm_fprintf (file, "\n\t; Set MODE[FP_DENORM]: allow single and double" - " input and output denorms\n"); - asm_fprintf (file, "\ts_setreg_imm32_b32\thwreg(1, 4, 4), 0xf\n\n"); } } diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h index 9993a99..f63e7df 100644 --- a/gcc/config/gcn/gcn.h +++ b/gcc/config/gcn/gcn.h @@ -525,7 +525,7 @@ enum gcn_address_spaces #ifndef USED_FOR_TARGET -#define GCN_KERNEL_ARG_TYPES 19 +#define GCN_KERNEL_ARG_TYPES 16 struct GTY(()) gcn_kernel_args { long requested; diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt index e1b9942..b1ea56e 100644 --- a/gcc/config/gcn/gcn.opt +++ b/gcc/config/gcn/gcn.opt @@ -29,10 +29,10 @@ EnumValue Enum(gpu_type) String(fiji) Value(PROCESSOR_FIJI) EnumValue -Enum(gpu_type) String(gfx900) Value(PROCESSOR_VEGA) +Enum(gpu_type) String(gfx900) Value(PROCESSOR_VEGA10) EnumValue -Enum(gpu_type) String(gfx906) Value(PROCESSOR_VEGA) +Enum(gpu_type) String(gfx906) Value(PROCESSOR_VEGA20) march= Target RejectNegative Joined ToLower Enum(gpu_type) Var(gcn_arch) Init(PROCESSOR_FIJI) |