# For manual usage, not as a part of lit tests. Used for generating the following tests:
# atomicrmw-sm60.ll, atomicrmw-sm70.ll, atomicrmw-sm90.ll

from string import Template
from itertools import product

# (SM architecture, PTX ISA version) pairs; one .ll file is generated per pair.
# The PTX version is written without the dot (e.g. 63 for PTX 6.3); the script
# divides it by 10.0 to build the ptxas-isa-<x.y> feature name in the RUN line.
TEST_SM_ARCH_PAIRS = [(60, 50), (70, 63), (90, 87)]

# LLVM syncscope name -> PTX scope name. The PTX name is only used to build the
# generated function's name; the LLVM name is what goes into syncscope("...").
SCOPE_LLVM_TO_PTX = {"": "sys", "block": "cta", "cluster": "cluster", "device": "gpu"}

# Memory orderings iterated over by Slice 2 of the generator.
ORDERINGS = ["monotonic", "acquire", "release", "acq_rel", "seq_cst"]

# atomicrmw operations generated for the integer types (i8, i16, i32, i64).
INTEGER_OPERATIONS = [
    "xchg",
    "add",
    "sub",
    "and",
    "nand",
    "or",
    "xor",
    "max",
    "min",
    "umax",
    "umin",
    "uinc_wrap",
    "udec_wrap",
    "usub_cond",
    "usub_sat",
]

# atomicrmw operations generated for float, double, half and bfloat.
FLOATING_POINT_OPERATIONS = ["fadd", "fsub", "fmin", "fmax", "fminimum", "fmaximum"]

# Address space number -> name used in the generated function's name.
ADDRSPACE_NUM_TO_ADDRSPACE = {0: "generic", 1: "global", 3: "shared"}

# IR for one test function: a single atomicrmw on %addr whose result is returned.
# ${addrspace_cast} is either empty or starts with a space (see
# get_addrspace_cast), which is why the define line writes "ptr${addrspace_cast}".
# NOTE: the atomicrmw line writes "ptr ${addrspace_cast}" and ends with a space
# after ${ordering}, so the generated IR has a doubled space and trailing
# whitespace on that line.
atomicrmw_func = Template(
    """define ${datatype} @${operation}_${ordering}_${datatype}_${addrspace}_${ptx_scope}(ptr${addrspace_cast} %addr, ${datatype} %val) {
        %retval = atomicrmw ${operation} ptr ${addrspace_cast} %addr, ${datatype} %val syncscope(\"${llvm_scope}\") ${ordering} 
        ret $datatype %retval
}
"""
)

# RUN header of a generated file: llc output is checked by FileCheck under the
# SM<sm> prefix and, when the ptxas-sm_<sm> and ptxas-isa-<ptxfp> lit features
# are both available, is also piped to %ptxas-verify.
run_statement = Template(
    """; RUN: llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | FileCheck %s --check-prefix=SM${sm}
; RUN: %if ptxas-sm_${sm} && ptxas-isa-${ptxfp} %{ llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | %ptxas-verify -arch=sm_${sm} %}
"""
)


def get_addrspace_cast(addrspace):
    """Return the address-space qualifier that follows ``ptr`` in the IR.

    For address space 0 the result is the empty string; for any other value
    it is ``" addrspace(<n>)"``, including the leading space.
    """
    if addrspace != 0:
        return " addrspace(" + str(addrspace) + ")"
    return ""


def _emit_atomicrmw(fp, operation, datatype, ordering, addrspace, llvm_scope):
    """Write one rendered ``atomicrmw_func`` definition to *fp*.

    ``print`` adds a newline after the rendered template, which leaves a blank
    line between consecutive functions in the output file.

    Args:
        fp: Writable text file receiving the IR.
        operation: atomicrmw operation name, e.g. ``"add"``.
        datatype: LLVM type of the operand and result, e.g. ``"i32"``.
        ordering: LLVM memory ordering keyword.
        addrspace: Numeric address space; a key of ``ADDRSPACE_NUM_TO_ADDRSPACE``.
        llvm_scope: LLVM syncscope name; a key of ``SCOPE_LLVM_TO_PTX``.

    Raises:
        KeyError: If *addrspace* or *llvm_scope* is not a known key.
    """
    print(
        atomicrmw_func.substitute(
            operation=operation,
            ordering=ordering,
            datatype=datatype,
            addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
            ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
            llvm_scope=llvm_scope,
            addrspace_cast=get_addrspace_cast(addrspace),
        ),
        file=fp,
    )


def _generate_test_file(sm, ptx):
    """Write ``atomicrmw-sm<sm>.ll`` (relative to the current directory).

    Args:
        sm: SM architecture number used in ``-mcpu=sm_<sm>`` and the file name.
        ptx: PTX version without the dot (e.g. 63); ``ptx / 10.0`` is used for
            the ``ptxas-isa-<x.y>`` feature in the RUN line.
    """
    # Slice 1 pins (addrspace, llvm_scope, ordering) to this combination.
    slice1_fixed = (1, "block", "acq_rel")

    with open("atomicrmw-sm{}.ll".format(sm), "w") as fp:
        print(run_statement.substitute(sm=sm, ptx=ptx, ptxfp=ptx / 10.0), file=fp)

        # Slice 1: Keep addrspace, llvm_scope, ordering fixed, generate all
        # possible operations and sizes.
        addrspace, llvm_scope, ordering = slice1_fixed

        # Integer operations (operation is the outer loop, type the inner one).
        for operation, datatype in product(
            INTEGER_OPERATIONS, ["i8", "i16", "i32", "i64"]
        ):
            _emit_atomicrmw(fp, operation, datatype, ordering, addrspace, llvm_scope)

        # Floating point operations (type is the outer loop, operation the
        # inner one).
        for datatype, operation in product(
            ["float", "double", "half", "bfloat"], FLOATING_POINT_OPERATIONS
        ):
            _emit_atomicrmw(fp, operation, datatype, ordering, addrspace, llvm_scope)

        # Slice 2: Keep addrspace, llvm_scope fixed, and generate all possible
        # orderings for operations add and nand.
        # add is natively supported for larger bitwidths, while nand is
        # emulated always.
        addrspace, llvm_scope = 1, "block"
        for operation, datatype, ordering in product(
            ["add", "nand"], ["i8", "i32"], ORDERINGS
        ):
            if (addrspace, llvm_scope, ordering) == slice1_fixed:
                # These are a part of Slice 1
                continue
            _emit_atomicrmw(fp, operation, datatype, ordering, addrspace, llvm_scope)


if __name__ == "__main__":
    for sm, ptx in TEST_SM_ARCH_PAIRS:
        _generate_test_file(sm, ptx)