aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--gcc/config/gcn/gcn-hsa.h8
-rw-r--r--gcc/config/gcn/gcn-opts.h5
-rw-r--r--gcc/config/gcn/gcn-run.c154
-rw-r--r--gcc/config/gcn/gcn.c231
-rw-r--r--gcc/config/gcn/gcn.h2
-rw-r--r--gcc/config/gcn/gcn.opt4
-rw-r--r--libgomp/plugin/plugin-gcn.c235
7 files changed, 121 insertions, 518 deletions
diff --git a/gcc/config/gcn/gcn-hsa.h b/gcc/config/gcn/gcn-hsa.h
index 2eaf414..4fd1365 100644
--- a/gcc/config/gcn/gcn-hsa.h
+++ b/gcc/config/gcn/gcn-hsa.h
@@ -18,8 +18,8 @@
#error elf.h included before elfos.h
#endif
-#define TEXT_SECTION_ASM_OP "\t.section\t.text"
-#define BSS_SECTION_ASM_OP "\t.section\t.bss"
+#define TEXT_SECTION_ASM_OP "\t.text"
+#define BSS_SECTION_ASM_OP "\t.bss"
#define GLOBAL_ASM_OP "\t.globl\t"
#define DATA_SECTION_ASM_OP "\t.data\t"
#define SET_ASM_OP "\t.set\t"
@@ -76,10 +76,10 @@ extern unsigned int gcn_local_sym_hash (const char *name);
#define GOMP_SELF_SPECS ""
/* Use LLVM assembler and linker options. */
-#define ASM_SPEC "-triple=amdgcn--amdhsa -mattr=-code-object-v3 " \
+#define ASM_SPEC "-triple=amdgcn--amdhsa " \
"%:last_arg(%{march=*:-mcpu=%*}) " \
"-filetype=obj"
-#define LINK_SPEC "--pie"
+#define LINK_SPEC "--pie --export-dynamic"
#define LIB_SPEC "-lc"
/* Provides a _start symbol to keep the linker happy. */
diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h
index 385d2be..8eefb7a 100644
--- a/gcc/config/gcn/gcn-opts.h
+++ b/gcc/config/gcn/gcn-opts.h
@@ -20,8 +20,9 @@
/* Which processor to generate code or schedule for. */
enum processor_type
{
- PROCESSOR_FIJI,
- PROCESSOR_VEGA
+ PROCESSOR_FIJI, // gfx803
+ PROCESSOR_VEGA10, // gfx900
+ PROCESSOR_VEGA20 // gfx906
};
/* Set in gcn_option_override. */
diff --git a/gcc/config/gcn/gcn-run.c b/gcc/config/gcn/gcn-run.c
index 1e952e9..8961ea1 100644
--- a/gcc/config/gcn/gcn-run.c
+++ b/gcc/config/gcn/gcn-run.c
@@ -55,7 +55,7 @@
#include "hsa.h"
#ifndef HSA_RUNTIME_LIB
-#define HSA_RUNTIME_LIB "libhsa-runtime64.so"
+#define HSA_RUNTIME_LIB "libhsa-runtime64.so.1"
#endif
#ifndef VERSION_STRING
@@ -429,20 +429,6 @@ load_image (const char *filename)
&executable),
"Initialize GCN executable");
- /* Hide relocations from the HSA runtime loader.
- Keep a copy of the unmodified section headers to use later. */
- Elf64_Shdr *image_sections =
- (Elf64_Shdr *) ((char *) image + image->e_shoff);
- Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum);
- memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum);
- for (int i = image->e_shnum - 1; i >= 0; i--)
- {
- if (image_sections[i].sh_type == SHT_RELA
- || image_sections[i].sh_type == SHT_REL)
- /* Change section type to something harmless. */
- image_sections[i].sh_type = SHT_NOTE;
- }
-
/* Add the HSACO to the executable. */
hsa_code_object_t co = { 0 };
XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co),
@@ -457,23 +443,27 @@ load_image (const char *filename)
/* Locate the "_init_array" function, and read the kernel's properties. */
hsa_executable_symbol_t symbol;
- XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_init_array",
- device, 0, &symbol),
+ XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL,
+ "_init_array.kd", device, 0,
+ &symbol),
"Find '_init_array' function");
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
- (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &init_array_kernel),
+ (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
+ &init_array_kernel),
"Extract '_init_array' kernel object kernel object");
/* Locate the "_fini_array" function, and read the kernel's properties. */
- XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_fini_array",
- device, 0, &symbol),
+ XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL,
+ "_fini_array.kd", device, 0,
+ &symbol),
"Find '_fini_array' function");
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
- (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &fini_array_kernel),
+ (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
+ &fini_array_kernel),
"Extract '_fini_array' kernel object kernel object");
/* Locate the "main" function, and read the kernel's properties. */
- XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main",
+ XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main.kd",
device, 0, &symbol),
"Find 'main' function");
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
@@ -491,126 +481,6 @@ load_image (const char *filename)
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&private_segment_size),
"Extract private segment size");
-
- /* Find main function in ELF, and calculate actual load offset. */
- Elf64_Addr load_offset;
- XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
- (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
- &load_offset),
- "Extract 'main' symbol address");
- for (int i = 0; i < image->e_shnum; i++)
- if (sections[i].sh_type == SHT_SYMTAB)
- {
- Elf64_Shdr *strtab = &sections[sections[i].sh_link];
- char *strings = (char *) image + strtab->sh_offset;
-
- for (size_t offset = 0;
- offset < sections[i].sh_size;
- offset += sections[i].sh_entsize)
- {
- Elf64_Sym *sym = (Elf64_Sym *) ((char *) image
- + sections[i].sh_offset + offset);
- if (strcmp ("main", strings + sym->st_name) == 0)
- {
- load_offset -= sym->st_value;
- goto found_main;
- }
- }
- }
- /* We only get here when main was not found.
- This should never happen. */
- fprintf (stderr, "Error: main function not found.\n");
- abort ();
-found_main:;
-
- /* Find dynamic symbol table. */
- Elf64_Shdr *dynsym = NULL;
- for (int i = 0; i < image->e_shnum; i++)
- if (sections[i].sh_type == SHT_DYNSYM)
- {
- dynsym = &sections[i];
- break;
- }
-
- /* Fix up relocations. */
- for (int i = 0; i < image->e_shnum; i++)
- {
- if (sections[i].sh_type == SHT_RELA)
- for (size_t offset = 0;
- offset < sections[i].sh_size;
- offset += sections[i].sh_entsize)
- {
- Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image
- + sections[i].sh_offset
- + offset);
- Elf64_Sym *sym =
- (dynsym
- ? (Elf64_Sym *) ((char *) image
- + dynsym->sh_offset
- + (dynsym->sh_entsize
- * ELF64_R_SYM (reloc->r_info))) : NULL);
-
- int64_t S = (sym ? sym->st_value : 0);
- int64_t P = reloc->r_offset + load_offset;
- int64_t A = reloc->r_addend;
- int64_t B = load_offset;
- int64_t V, size;
- switch (ELF64_R_TYPE (reloc->r_info))
- {
- case R_AMDGPU_ABS32_LO:
- V = (S + A) & 0xFFFFFFFF;
- size = 4;
- break;
- case R_AMDGPU_ABS32_HI:
- V = (S + A) >> 32;
- size = 4;
- break;
- case R_AMDGPU_ABS64:
- V = S + A;
- size = 8;
- break;
- case R_AMDGPU_REL32:
- V = S + A - P;
- size = 4;
- break;
- case R_AMDGPU_REL64:
- /* FIXME
- LLD seems to emit REL64 where the assembler has ABS64.
- This is clearly wrong because it's not what the compiler
- is expecting. Let's assume, for now, that it's a bug.
- In any case, GCN kernels are always self contained and
- therefore relative relocations will have been resolved
- already, so this should be a safe workaround. */
- V = S + A /* - P */ ;
- size = 8;
- break;
- case R_AMDGPU_ABS32:
- V = S + A;
- size = 4;
- break;
- /* TODO R_AMDGPU_GOTPCREL */
- /* TODO R_AMDGPU_GOTPCREL32_LO */
- /* TODO R_AMDGPU_GOTPCREL32_HI */
- case R_AMDGPU_REL32_LO:
- V = (S + A - P) & 0xFFFFFFFF;
- size = 4;
- break;
- case R_AMDGPU_REL32_HI:
- V = (S + A - P) >> 32;
- size = 4;
- break;
- case R_AMDGPU_RELATIVE64:
- V = B + A;
- size = 8;
- break;
- default:
- fprintf (stderr, "Error: unsupported relocation type.\n");
- exit (1);
- }
- XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size),
- "Fix up relocation");
- }
- }
}
/* Allocate some device memory from the kernargs region.
diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
index 39eb8fd..fff0e8c 100644
--- a/gcc/config/gcn/gcn.c
+++ b/gcc/config/gcn/gcn.c
@@ -83,7 +83,7 @@ int gcn_isa = 3; /* Default to GCN3. */
/* The number of registers usable by normal non-kernel functions.
The SGPR count includes any special extra registers such as VCC. */
-#define MAX_NORMAL_SGPR_COUNT 64
+#define MAX_NORMAL_SGPR_COUNT 62 // i.e. 64 with VCC
#define MAX_NORMAL_VGPR_COUNT 24
/* }}} */
@@ -127,7 +127,7 @@ gcn_option_override (void)
if (!flag_pic)
flag_pic = flag_pie;
- gcn_isa = gcn_arch == PROCESSOR_VEGA ? 5 : 3;
+ gcn_isa = gcn_arch == PROCESSOR_FIJI ? 3 : 5;
/* The default stack size needs to be small for offload kernels because
there may be many, many threads. Also, a smaller stack gives a
@@ -168,37 +168,31 @@ static const struct gcn_kernel_arg_type
{"exec", NULL, DImode, EXEC_REG},
#define PRIVATE_SEGMENT_BUFFER_ARG 1
{"private_segment_buffer",
- "enable_sgpr_private_segment_buffer", TImode, -1},
+ ".amdhsa_user_sgpr_private_segment_buffer", TImode, -1},
#define DISPATCH_PTR_ARG 2
- {"dispatch_ptr", "enable_sgpr_dispatch_ptr", DImode, -1},
+ {"dispatch_ptr", ".amdhsa_user_sgpr_dispatch_ptr", DImode, -1},
#define QUEUE_PTR_ARG 3
- {"queue_ptr", "enable_sgpr_queue_ptr", DImode, -1},
+ {"queue_ptr", ".amdhsa_user_sgpr_queue_ptr", DImode, -1},
#define KERNARG_SEGMENT_PTR_ARG 4
- {"kernarg_segment_ptr", "enable_sgpr_kernarg_segment_ptr", DImode, -1},
- {"dispatch_id", "enable_sgpr_dispatch_id", DImode, -1},
+ {"kernarg_segment_ptr", ".amdhsa_user_sgpr_kernarg_segment_ptr", DImode, -1},
+ {"dispatch_id", ".amdhsa_user_sgpr_dispatch_id", DImode, -1},
#define FLAT_SCRATCH_INIT_ARG 6
- {"flat_scratch_init", "enable_sgpr_flat_scratch_init", DImode, -1},
+ {"flat_scratch_init", ".amdhsa_user_sgpr_flat_scratch_init", DImode, -1},
#define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
- {"private_segment_size", "enable_sgpr_private_segment_size", SImode, -1},
- {"grid_workgroup_count_X",
- "enable_sgpr_grid_workgroup_count_x", SImode, -1},
- {"grid_workgroup_count_Y",
- "enable_sgpr_grid_workgroup_count_y", SImode, -1},
- {"grid_workgroup_count_Z",
- "enable_sgpr_grid_workgroup_count_z", SImode, -1},
-#define WORKGROUP_ID_X_ARG 11
- {"workgroup_id_X", "enable_sgpr_workgroup_id_x", SImode, -2},
- {"workgroup_id_Y", "enable_sgpr_workgroup_id_y", SImode, -2},
- {"workgroup_id_Z", "enable_sgpr_workgroup_id_z", SImode, -2},
- {"workgroup_info", "enable_sgpr_workgroup_info", SImode, -1},
-#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 15
+ {"private_segment_size", ".amdhsa_user_sgpr_private_segment_size", SImode, -1},
+#define WORKGROUP_ID_X_ARG 8
+ {"workgroup_id_X", ".amdhsa_system_sgpr_workgroup_id_x", SImode, -2},
+ {"workgroup_id_Y", ".amdhsa_system_sgpr_workgroup_id_y", SImode, -2},
+ {"workgroup_id_Z", ".amdhsa_system_sgpr_workgroup_id_z", SImode, -2},
+ {"workgroup_info", ".amdhsa_system_sgpr_workgroup_info", SImode, -1},
+#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 12
{"private_segment_wave_offset",
- "enable_sgpr_private_segment_wave_byte_offset", SImode, -2},
-#define WORK_ITEM_ID_X_ARG 16
+ ".amdhsa_system_sgpr_private_segment_wavefront_offset", SImode, -2},
+#define WORK_ITEM_ID_X_ARG 13
{"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
-#define WORK_ITEM_ID_Y_ARG 17
+#define WORK_ITEM_ID_Y_ARG 14
{"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
-#define WORK_ITEM_ID_Z_ARG 18
+#define WORK_ITEM_ID_Z_ARG 15
{"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
};
@@ -2075,7 +2069,7 @@ gcn_conditional_register_usage (void)
if (cfun->machine->normal_function)
{
/* Restrict the set of SGPRs and VGPRs used by non-kernel functions. */
- for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT - 2);
+ for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT);
i <= LAST_SGPR_REG; i++)
fixed_regs[i] = 1, call_used_regs[i] = 1;
@@ -2574,9 +2568,9 @@ gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
if (strcmp (name, "fiji") == 0)
return gcn_arch == PROCESSOR_FIJI;
if (strcmp (name, "gfx900") == 0)
- return gcn_arch == PROCESSOR_VEGA;
+ return gcn_arch == PROCESSOR_VEGA10;
if (strcmp (name, "gfx906") == 0)
- return gcn_arch == PROCESSOR_VEGA;
+ return gcn_arch == PROCESSOR_VEGA20;
return 0;
default:
gcc_unreachable ();
@@ -4943,11 +4937,16 @@ gcn_fixup_accel_lto_options (tree fndecl)
static void
output_file_start (void)
{
- fprintf (asm_out_file, "\t.text\n");
- fprintf (asm_out_file, "\t.hsa_code_object_version 2,0\n");
- fprintf (asm_out_file, "\t.hsa_code_object_isa\n"); /* Autodetect. */
- fprintf (asm_out_file, "\t.section\t.AMDGPU.config\n");
- fprintf (asm_out_file, "\t.text\n");
+ char *cpu;
+ switch (gcn_arch)
+ {
+ case PROCESSOR_FIJI: cpu = "gfx803"; break;
+ case PROCESSOR_VEGA10: cpu = "gfx900"; break;
+ case PROCESSOR_VEGA20: cpu = "gfx906"; break;
+ default: gcc_unreachable ();
+ }
+
+ fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s\"\n", cpu);
}
/* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
@@ -4963,7 +4962,8 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
{
int sgpr, vgpr;
bool xnack_enabled = false;
- int extra_regs = 0;
+
+ fputs ("\n\n", file);
if (cfun && cfun->machine && cfun->machine->normal_function)
{
@@ -4986,76 +4986,20 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
break;
vgpr++;
- if (xnack_enabled)
- extra_regs = 6;
- if (df_regs_ever_live_p (FLAT_SCRATCH_LO_REG)
- || df_regs_ever_live_p (FLAT_SCRATCH_HI_REG))
- extra_regs = 4;
- else if (df_regs_ever_live_p (VCC_LO_REG)
- || df_regs_ever_live_p (VCC_HI_REG))
- extra_regs = 2;
-
if (!leaf_function_p ())
{
/* We can't know how many registers function calls might use. */
if (vgpr < MAX_NORMAL_VGPR_COUNT)
vgpr = MAX_NORMAL_VGPR_COUNT;
- if (sgpr + extra_regs < MAX_NORMAL_SGPR_COUNT)
- sgpr = MAX_NORMAL_SGPR_COUNT - extra_regs;
+ if (sgpr < MAX_NORMAL_SGPR_COUNT)
+ sgpr = MAX_NORMAL_SGPR_COUNT;
}
- /* GFX8 allocates SGPRs in blocks of 8.
- GFX9 uses blocks of 16. */
- int granulated_sgprs;
- if (TARGET_GCN3)
- granulated_sgprs = (sgpr + extra_regs + 7) / 8 - 1;
- else if (TARGET_GCN5)
- granulated_sgprs = 2 * ((sgpr + extra_regs + 15) / 16 - 1);
- else
- gcc_unreachable ();
-
- fputs ("\t.align\t256\n", file);
- fputs ("\t.type\t", file);
- assemble_name (file, name);
- fputs (",@function\n\t.amdgpu_hsa_kernel\t", file);
+ fputs ("\t.rodata\n"
+ "\t.p2align\t6\n"
+ "\t.amdhsa_kernel\t", file);
assemble_name (file, name);
fputs ("\n", file);
- assemble_name (file, name);
- fputs (":\n", file);
- fprintf (file, "\t.amd_kernel_code_t\n"
- "\t\tkernel_code_version_major = 1\n"
- "\t\tkernel_code_version_minor = 0\n" "\t\tmachine_kind = 1\n"
- /* "\t\tmachine_version_major = 8\n"
- "\t\tmachine_version_minor = 0\n"
- "\t\tmachine_version_stepping = 1\n" */
- "\t\tkernel_code_entry_byte_offset = 256\n"
- "\t\tkernel_code_prefetch_byte_size = 0\n"
- "\t\tmax_scratch_backing_memory_byte_size = 0\n"
- "\t\tcompute_pgm_rsrc1_vgprs = %i\n"
- "\t\tcompute_pgm_rsrc1_sgprs = %i\n"
- "\t\tcompute_pgm_rsrc1_priority = 0\n"
- "\t\tcompute_pgm_rsrc1_float_mode = 192\n"
- "\t\tcompute_pgm_rsrc1_priv = 0\n"
- "\t\tcompute_pgm_rsrc1_dx10_clamp = 1\n"
- "\t\tcompute_pgm_rsrc1_debug_mode = 0\n"
- "\t\tcompute_pgm_rsrc1_ieee_mode = 1\n"
- /* We enable scratch memory. */
- "\t\tcompute_pgm_rsrc2_scratch_en = 1\n"
- "\t\tcompute_pgm_rsrc2_user_sgpr = %i\n"
- "\t\tcompute_pgm_rsrc2_tgid_x_en = 1\n"
- "\t\tcompute_pgm_rsrc2_tgid_y_en = 0\n"
- "\t\tcompute_pgm_rsrc2_tgid_z_en = 0\n"
- "\t\tcompute_pgm_rsrc2_tg_size_en = 0\n"
- "\t\tcompute_pgm_rsrc2_tidig_comp_cnt = 0\n"
- "\t\tcompute_pgm_rsrc2_excp_en_msb = 0\n"
- "\t\tcompute_pgm_rsrc2_lds_size = 0\n" /* Set at runtime. */
- "\t\tcompute_pgm_rsrc2_excp_en = 0\n",
- (vgpr - 1) / 4,
- /* Must match wavefront_sgpr_count */
- granulated_sgprs,
- /* The total number of SGPR user data registers requested. This
- number must match the number of user data registers enabled. */
- cfun->machine->args.nsgprs);
int reg = FIRST_SGPR_REG;
for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
{
@@ -5073,7 +5017,8 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
if (gcn_kernel_arg_types[a].header_pseudo)
{
- fprintf (file, "\t\t%s = %i",
+ fprintf (file, "\t %s%s\t%i",
+ (cfun->machine->args.requested & (1 << a)) != 0 ? "" : ";",
gcn_kernel_arg_types[a].header_pseudo,
(cfun->machine->args.requested & (1 << a)) != 0);
if (reg_first != -1)
@@ -5091,54 +5036,71 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
}
else if (gcn_kernel_arg_types[a].fixed_regno >= 0
&& cfun->machine->args.requested & (1 << a))
- fprintf (file, "\t\t; %s = %i (%s)\n",
+ fprintf (file, "\t ; %s\t%i (%s)\n",
gcn_kernel_arg_types[a].name,
(cfun->machine->args.requested & (1 << a)) != 0,
reg_names[gcn_kernel_arg_types[a].fixed_regno]);
}
- fprintf (file, "\t\tenable_vgpr_workitem_id = %i\n",
+ fprintf (file, "\t .amdhsa_system_vgpr_workitem_id\t%i\n",
(cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG))
? 2
: cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
? 1 : 0);
- fprintf (file, "\t\tenable_ordered_append_gds = 0\n"
- "\t\tprivate_element_size = 1\n"
- "\t\tis_ptr64 = 1\n"
- "\t\tis_dynamic_callstack = 0\n"
- "\t\tis_debug_enabled = 0\n"
- "\t\tis_xnack_enabled = %i\n"
- "\t\tworkitem_private_segment_byte_size = %i\n"
- "\t\tworkgroup_group_segment_byte_size = %u\n"
- "\t\tgds_segment_byte_size = 0\n"
- "\t\tkernarg_segment_byte_size = %i\n"
- "\t\tworkgroup_fbarrier_count = 0\n"
- "\t\twavefront_sgpr_count = %i\n"
- "\t\tworkitem_vgpr_count = %i\n"
- "\t\treserved_vgpr_first = 0\n"
- "\t\treserved_vgpr_count = 0\n"
- "\t\treserved_sgpr_first = 0\n"
- "\t\treserved_sgpr_count = 0\n"
- "\t\tdebug_wavefront_private_segment_offset_sgpr = 0\n"
- "\t\tdebug_private_segment_buffer_sgpr = 0\n"
- "\t\tkernarg_segment_alignment = %i\n"
- "\t\tgroup_segment_alignment = 4\n"
- "\t\tprivate_segment_alignment = %i\n"
- "\t\twavefront_size = 6\n"
- "\t\tcall_convention = 0\n"
- "\t\truntime_loader_kernel_symbol = 0\n"
- "\t.end_amd_kernel_code_t\n", xnack_enabled,
+ fprintf (file,
+ "\t .amdhsa_next_free_vgpr\t%i\n"
+ "\t .amdhsa_next_free_sgpr\t%i\n"
+ "\t .amdhsa_reserve_vcc\t1\n"
+ "\t .amdhsa_reserve_flat_scratch\t0\n"
+ "\t .amdhsa_reserve_xnack_mask\t%i\n"
+ "\t .amdhsa_private_segment_fixed_size\t%i\n"
+ "\t .amdhsa_group_segment_fixed_size\t%u\n"
+ "\t .amdhsa_float_denorm_mode_32\t3\n"
+ "\t .amdhsa_float_denorm_mode_16_64\t3\n",
+ vgpr,
+ sgpr,
+ xnack_enabled,
/* workitem_private_segment_bytes_size needs to be
one 64th the wave-front stack size. */
stack_size_opt / 64,
- LDS_SIZE, cfun->machine->kernarg_segment_byte_size,
- /* Number of scalar registers used by a wavefront. This
- includes the special SGPRs for VCC, Flat Scratch (Base,
- Size) and XNACK (for GFX8 (VI)+). It does not include the
- 16 SGPR added if a trap handler is enabled. Must match
- compute_pgm_rsrc1.sgprs. */
- sgpr + extra_regs, vgpr,
+ LDS_SIZE);
+ fputs ("\t.end_amdhsa_kernel\n", file);
+
+#if 1
+ /* The following is YAML embedded in assembler; tabs are not allowed. */
+ fputs (" .amdgpu_metadata\n"
+ " amdhsa.version:\n"
+ " - 1\n"
+ " - 0\n"
+ " amdhsa.kernels:\n"
+ " - .name: ", file);
+ assemble_name (file, name);
+ fputs ("\n .symbol: ", file);
+ assemble_name (file, name);
+ fprintf (file,
+ ".kd\n"
+ " .kernarg_segment_size: %i\n"
+ " .kernarg_segment_align: %i\n"
+ " .group_segment_fixed_size: %u\n"
+ " .private_segment_fixed_size: %i\n"
+ " .wavefront_size: 64\n"
+ " .sgpr_count: %i\n"
+ " .vgpr_count: %i\n"
+ " .max_flat_workgroup_size: 1024\n",
+ cfun->machine->kernarg_segment_byte_size,
cfun->machine->kernarg_segment_alignment,
- crtl->stack_alignment_needed / 8);
+ LDS_SIZE,
+ stack_size_opt / 64,
+ sgpr, vgpr);
+ fputs (" .end_amdgpu_metadata\n", file);
+#endif
+
+ fputs ("\t.text\n", file);
+ fputs ("\t.align\t256\n", file);
+ fputs ("\t.type\t", file);
+ assemble_name (file, name);
+ fputs (",@function\n", file);
+ assemble_name (file, name);
+ fputs (":\n", file);
/* This comment is read by mkoffload. */
if (flag_openacc)
@@ -5200,11 +5162,6 @@ gcn_target_asm_function_prologue (FILE *file)
asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
asm_fprintf (file, "\t; outgoing args size: %wd\n",
offsets->outgoing_args_size);
-
- /* Enable denorms. */
- asm_fprintf (file, "\n\t; Set MODE[FP_DENORM]: allow single and double"
- " input and output denorms\n");
- asm_fprintf (file, "\ts_setreg_imm32_b32\thwreg(1, 4, 4), 0xf\n\n");
}
}
diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h
index 9993a99..f63e7df 100644
--- a/gcc/config/gcn/gcn.h
+++ b/gcc/config/gcn/gcn.h
@@ -525,7 +525,7 @@ enum gcn_address_spaces
#ifndef USED_FOR_TARGET
-#define GCN_KERNEL_ARG_TYPES 19
+#define GCN_KERNEL_ARG_TYPES 16
struct GTY(()) gcn_kernel_args
{
long requested;
diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt
index e1b9942..b1ea56e 100644
--- a/gcc/config/gcn/gcn.opt
+++ b/gcc/config/gcn/gcn.opt
@@ -29,10 +29,10 @@ EnumValue
Enum(gpu_type) String(fiji) Value(PROCESSOR_FIJI)
EnumValue
-Enum(gpu_type) String(gfx900) Value(PROCESSOR_VEGA)
+Enum(gpu_type) String(gfx900) Value(PROCESSOR_VEGA10)
EnumValue
-Enum(gpu_type) String(gfx906) Value(PROCESSOR_VEGA)
+Enum(gpu_type) String(gfx906) Value(PROCESSOR_VEGA20)
march=
Target RejectNegative Joined ToLower Enum(gpu_type) Var(gcn_arch) Init(PROCESSOR_FIJI)
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 4c6a4c0..0be350b 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -1074,7 +1074,7 @@ init_environment_variables (void)
hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
if (hsa_runtime_lib == NULL)
- hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so";
+ hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so.1";
support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
@@ -1137,40 +1137,6 @@ get_executable_symbol_name (hsa_executable_symbol_t symbol)
return res;
}
-/* Helper function for find_executable_symbol. */
-
-static hsa_status_t
-find_executable_symbol_1 (hsa_executable_t executable,
- hsa_executable_symbol_t symbol,
- void *data)
-{
- hsa_executable_symbol_t *res = (hsa_executable_symbol_t *)data;
- *res = symbol;
- return HSA_STATUS_INFO_BREAK;
-}
-
-/* Find a global symbol in EXECUTABLE, save to *SYMBOL and return true. If not
- found, return false. */
-
-static bool
-find_executable_symbol (hsa_executable_t executable,
- hsa_executable_symbol_t *symbol)
-{
- hsa_status_t status;
-
- status
- = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
- find_executable_symbol_1,
- symbol);
- if (status != HSA_STATUS_INFO_BREAK)
- {
- hsa_error ("Could not find executable symbol", status);
- return false;
- }
-
- return true;
-}
-
/* Get the number of GPU Compute Units. */
static int
@@ -2007,13 +1973,15 @@ init_kernel_properties (struct kernel_info *kernel)
hsa_status_t status;
struct agent_info *agent = kernel->agent;
hsa_executable_symbol_t kernel_symbol;
+ char *buf = alloca (strlen (kernel->name) + 4);
+ sprintf (buf, "%s.kd", kernel->name);
status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
- kernel->name, agent->id,
+ buf, agent->id,
0, &kernel_symbol);
if (status != HSA_STATUS_SUCCESS)
{
hsa_warn ("Could not find symbol for kernel in the code object", status);
- fprintf (stderr, "not found name: '%s'\n", kernel->name);
+ fprintf (stderr, "not found name: '%s'\n", buf);
dump_executable_symbols (agent->executable);
goto failure;
}
@@ -2327,61 +2295,6 @@ init_basic_kernel_info (struct kernel_info *kernel,
return true;
}
-/* Find the load_offset for MODULE, save to *LOAD_OFFSET, and return true. If
- not found, return false. */
-
-static bool
-find_load_offset (Elf64_Addr *load_offset, struct agent_info *agent,
- struct module_info *module, Elf64_Ehdr *image,
- Elf64_Shdr *sections)
-{
- bool res = false;
-
- hsa_status_t status;
-
- hsa_executable_symbol_t symbol;
- if (!find_executable_symbol (agent->executable, &symbol))
- return false;
-
- status = hsa_fns.hsa_executable_symbol_get_info_fn
- (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, load_offset);
- if (status != HSA_STATUS_SUCCESS)
- {
- hsa_error ("Could not extract symbol address", status);
- return false;
- }
-
- char *symbol_name = get_executable_symbol_name (symbol);
- if (symbol_name == NULL)
- return false;
-
- /* Find the kernel function in ELF, and calculate actual load offset. */
- for (int i = 0; i < image->e_shnum; i++)
- if (sections[i].sh_type == SHT_SYMTAB)
- {
- Elf64_Shdr *strtab = &sections[sections[i].sh_link];
- char *strings = (char *)image + strtab->sh_offset;
-
- for (size_t offset = 0;
- offset < sections[i].sh_size;
- offset += sections[i].sh_entsize)
- {
- Elf64_Sym *sym = (Elf64_Sym*)((char*)image
- + sections[i].sh_offset
- + offset);
- if (strcmp (symbol_name, strings + sym->st_name) == 0)
- {
- *load_offset -= sym->st_value;
- res = true;
- break;
- }
- }
- }
-
- free (symbol_name);
- return res;
-}
-
/* Check that the GCN ISA of the given image matches the ISA of the agent. */
static bool
@@ -2421,7 +2334,6 @@ static bool
create_and_finalize_hsa_program (struct agent_info *agent)
{
hsa_status_t status;
- int reloc_count = 0;
bool res = true;
if (pthread_mutex_lock (&agent->prog_mutex))
{
@@ -2450,18 +2362,6 @@ create_and_finalize_hsa_program (struct agent_info *agent)
if (!isa_matches_agent (agent, image))
goto fail;
- /* Hide relocations from the HSA runtime loader.
- Keep a copy of the unmodified section headers to use later. */
- Elf64_Shdr *image_sections = (Elf64_Shdr *)((char *)image
- + image->e_shoff);
- for (int i = image->e_shnum - 1; i >= 0; i--)
- {
- if (image_sections[i].sh_type == SHT_RELA
- || image_sections[i].sh_type == SHT_REL)
- /* Change section type to something harmless. */
- image_sections[i].sh_type |= 0x80;
- }
-
hsa_code_object_t co = { 0 };
status = hsa_fns.hsa_code_object_deserialize_fn
(module->image_desc->gcn_image->image,
@@ -2517,131 +2417,6 @@ create_and_finalize_hsa_program (struct agent_info *agent)
goto fail;
}
- if (agent->module)
- {
- struct module_info *module = agent->module;
- Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
- Elf64_Shdr *sections = (Elf64_Shdr *)((char *)image + image->e_shoff);
-
- Elf64_Addr load_offset;
- if (!find_load_offset (&load_offset, agent, module, image, sections))
- goto fail;
-
- /* Record the physical load address range.
- We need this for data copies later. */
- Elf64_Phdr *segments = (Elf64_Phdr *)((char*)image + image->e_phoff);
- Elf64_Addr low = ~0, high = 0;
- for (int i = 0; i < image->e_phnum; i++)
- if (segments[i].p_memsz > 0)
- {
- if (segments[i].p_paddr < low)
- low = segments[i].p_paddr;
- if (segments[i].p_paddr > high)
- high = segments[i].p_paddr + segments[i].p_memsz - 1;
- }
- module->phys_address_start = low + load_offset;
- module->phys_address_end = high + load_offset;
-
- // Find dynamic symbol table
- Elf64_Shdr *dynsym = NULL;
- for (int i = 0; i < image->e_shnum; i++)
- if (sections[i].sh_type == SHT_DYNSYM)
- {
- dynsym = &sections[i];
- break;
- }
-
- /* Fix up relocations. */
- for (int i = 0; i < image->e_shnum; i++)
- {
- if (sections[i].sh_type == (SHT_RELA | 0x80))
- for (size_t offset = 0;
- offset < sections[i].sh_size;
- offset += sections[i].sh_entsize)
- {
- Elf64_Rela *reloc = (Elf64_Rela*)((char*)image
- + sections[i].sh_offset
- + offset);
- Elf64_Sym *sym =
- (dynsym
- ? (Elf64_Sym*)((char*)image
- + dynsym->sh_offset
- + (dynsym->sh_entsize
- * ELF64_R_SYM (reloc->r_info)))
- : NULL);
-
- int64_t S = (sym ? sym->st_value : 0);
- int64_t P = reloc->r_offset + load_offset;
- int64_t A = reloc->r_addend;
- int64_t B = load_offset;
- int64_t V, size;
- switch (ELF64_R_TYPE (reloc->r_info))
- {
- case R_AMDGPU_ABS32_LO:
- V = (S + A) & 0xFFFFFFFF;
- size = 4;
- break;
- case R_AMDGPU_ABS32_HI:
- V = (S + A) >> 32;
- size = 4;
- break;
- case R_AMDGPU_ABS64:
- V = S + A;
- size = 8;
- break;
- case R_AMDGPU_REL32:
- V = S + A - P;
- size = 4;
- break;
- case R_AMDGPU_REL64:
- /* FIXME
- LLD seems to emit REL64 where the the assembler has
- ABS64. This is clearly wrong because it's not what the
- compiler is expecting. Let's assume, for now, that
- it's a bug. In any case, GCN kernels are always self
- contained and therefore relative relocations will have
- been resolved already, so this should be a safe
- workaround. */
- V = S + A/* - P*/;
- size = 8;
- break;
- case R_AMDGPU_ABS32:
- V = S + A;
- size = 4;
- break;
- /* TODO R_AMDGPU_GOTPCREL */
- /* TODO R_AMDGPU_GOTPCREL32_LO */
- /* TODO R_AMDGPU_GOTPCREL32_HI */
- case R_AMDGPU_REL32_LO:
- V = (S + A - P) & 0xFFFFFFFF;
- size = 4;
- break;
- case R_AMDGPU_REL32_HI:
- V = (S + A - P) >> 32;
- size = 4;
- break;
- case R_AMDGPU_RELATIVE64:
- V = B + A;
- size = 8;
- break;
- default:
- fprintf (stderr, "Error: unsupported relocation type.\n");
- exit (1);
- }
- status = hsa_fns.hsa_memory_copy_fn ((void*)P, &V, size);
- if (status != HSA_STATUS_SUCCESS)
- {
- hsa_error ("Failed to fix up relocation", status);
- goto fail;
- }
- reloc_count++;
- }
- }
- }
-
- GCN_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
- agent->device_id, reloc_count);
-
final:
agent->prog_finalized = true;