1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
|
# RUN: %PYTHON %s | FileCheck %s
from unittest import result
from mlir.ir import (
Context,
FunctionType,
Location,
Module,
InsertionPoint,
IntegerType,
IndexType,
MemRefType,
F32Type,
Block,
ArrayAttr,
Attribute,
UnitAttr,
StringAttr,
DenseI32ArrayAttr,
ShapedType,
)
from mlir.dialects import openacc, func, arith, memref
from mlir.extras import types
def run(f):
print("\n// TEST:", f.__name__)
with Context(), Location.unknown():
f()
return f
@run
def testParallelMemcpy():
module = Module.create()
dynamic = ShapedType.get_dynamic_size()
memref_f32_1d_any = MemRefType.get([dynamic], types.f32())
with InsertionPoint(module.body):
function_type = FunctionType.get(
[memref_f32_1d_any, memref_f32_1d_any, types.i64()], []
)
f = func.FuncOp(
type=function_type,
name="memcpy_idiom",
)
f.attributes["sym_visibility"] = StringAttr.get("public")
with InsertionPoint(f.add_entry_block()):
c1024 = arith.ConstantOp(types.i32(), 1024)
c128 = arith.ConstantOp(types.i32(), 128)
arg0, arg1, arg2 = f.arguments
copied = openacc.copyin(
acc_var=arg0.type,
var=arg0,
var_type=types.f32(),
bounds=[],
async_operands=[],
implicit=False,
structured=True,
)
created = openacc.create_(
acc_var=arg1.type,
var=arg1,
var_type=types.f32(),
bounds=[],
async_operands=[],
implicit=False,
structured=True,
)
parallel_op = openacc.ParallelOp(
asyncOperands=[],
waitOperands=[],
numGangs=[c1024],
numWorkers=[],
vectorLength=[c128],
reductionOperands=[],
privateOperands=[],
firstprivateOperands=[],
dataClauseOperands=[],
)
# Set required device_type and segment attributes to satisfy verifier
acc_device_none = ArrayAttr.get([Attribute.parse("#acc.device_type<none>")])
parallel_op.numGangsDeviceType = acc_device_none
parallel_op.numGangsSegments = DenseI32ArrayAttr.get([1])
parallel_op.vectorLengthDeviceType = acc_device_none
parallel_block = Block.create_at_start(parent=parallel_op.region, arg_types=[])
with InsertionPoint(parallel_block):
c0 = arith.ConstantOp(types.i64(), 0)
c1 = arith.ConstantOp(types.i64(), 1)
loop_op = openacc.LoopOp(
results_=[],
lowerbound=[c0],
upperbound=[f.arguments[2]],
step=[c1],
gangOperands=[],
workerNumOperands=[],
vectorOperands=[],
tileOperands=[],
cacheOperands=[],
privateOperands=[],
reductionOperands=[],
firstprivateOperands=[],
)
# Set loop attributes: gang and independent on device_type<none>
acc_device_none = ArrayAttr.get([Attribute.parse("#acc.device_type<none>")])
loop_op.gang = acc_device_none
loop_op.independent = acc_device_none
loop_block = Block.create_at_start(
parent=loop_op.region, arg_types=[types.i64()]
)
with InsertionPoint(loop_block):
idx = arith.index_cast(out=IndexType.get(), in_=loop_block.arguments[0])
val = memref.load(memref=copied, indices=[idx])
memref.store(value=val, memref=created, indices=[idx])
openacc.YieldOp([])
openacc.YieldOp([])
deleted = openacc.delete(
acc_var=copied,
bounds=[],
async_operands=[],
implicit=False,
structured=True,
)
copied = openacc.copyout(
acc_var=created,
var=arg1,
var_type=types.f32(),
bounds=[],
async_operands=[],
implicit=False,
structured=True,
)
func.ReturnOp([])
print(module)
# CHECK: TEST: testParallelMemcpy
# CHECK-LABEL: func.func public @memcpy_idiom(
# CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: memref<?xf32>, %[[ARG2:.*]]: i64) {
# CHECK: %[[CONSTANT_0:.*]] = arith.constant 1024 : i32
# CHECK: %[[CONSTANT_1:.*]] = arith.constant 128 : i32
# CHECK: %[[COPYIN_0:.*]] = acc.copyin varPtr(%[[ARG0]] : memref<?xf32>) -> memref<?xf32>
# CHECK: %[[CREATE_0:.*]] = acc.create varPtr(%[[ARG1]] : memref<?xf32>) -> memref<?xf32>
# CHECK: acc.parallel num_gangs({%[[CONSTANT_0]] : i32}) vector_length(%[[CONSTANT_1]] : i32) {
# CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : i64
# CHECK: %[[CONSTANT_3:.*]] = arith.constant 1 : i64
# CHECK: acc.loop gang control(%[[VAL_0:.*]] : i64) = (%[[CONSTANT_2]] : i64) to (%[[ARG2]] : i64) step (%[[CONSTANT_3]] : i64) {
# CHECK: %[[INDEX_CAST_0:.*]] = arith.index_cast %[[VAL_0]] : i64 to index
# CHECK: %[[LOAD_0:.*]] = memref.load %[[COPYIN_0]]{{\[}}%[[INDEX_CAST_0]]] : memref<?xf32>
# CHECK: memref.store %[[LOAD_0]], %[[CREATE_0]]{{\[}}%[[INDEX_CAST_0]]] : memref<?xf32>
# CHECK: acc.yield
# CHECK: } attributes {independent = [#acc.device_type<none>]}
# CHECK: acc.yield
# CHECK: }
# CHECK: acc.delete accPtr(%[[COPYIN_0]] : memref<?xf32>)
# CHECK: acc.copyout accPtr(%[[CREATE_0]] : memref<?xf32>) to varPtr(%[[ARG1]] : memref<?xf32>)
# CHECK: return
# CHECK: }
|