237 files changed, 3636 insertions, 1753 deletions
diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 4f5625f..2c2c7f1 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.20.0)
 if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
   project(libclc VERSION 0.2.0 LANGUAGES CXX C)
 endif()
+set(LLVM_SUBPROJECT_TITLE "libclc")
 
 set(CMAKE_CXX_STANDARD 17)
 
@@ -28,7 +29,13 @@ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
   ptx-nvidiacl/lib/SOURCES;
   r600/lib/SOURCES;
   spirv/lib/SOURCES;
-  spirv64/lib/SOURCES
+  spirv64/lib/SOURCES;
+  # CLC internal libraries
+  clc/lib/generic/SOURCES;
+  clc/lib/clspv/SOURCES;
+  clc/lib/clspv64/SOURCES;
+  clc/lib/spirv/SOURCES;
+  clc/lib/spirv64/SOURCES;
 )
 
 set( LIBCLC_MIN_LLVM 3.9.0 )
@@ -73,10 +80,10 @@ else()
   endif()
 
   if( NOT EXISTS ${LIBCLC_CUSTOM_LLVM_TOOLS_BINARY_DIR} )
-    setup_host_tool( clang CLANG clang_exe clang_target )
-    setup_host_tool( llvm-as LLVM_AS llvm-as_exe llvm-as_target )
-    setup_host_tool( llvm-link LLVM_LINK llvm-link_exe llvm-link_target )
-    setup_host_tool( opt OPT opt_exe opt_target )
+    get_host_tool_path( clang CLANG clang_exe clang_target )
+    get_host_tool_path( llvm-as LLVM_AS llvm-as_exe llvm-as_target )
+    get_host_tool_path( llvm-link LLVM_LINK llvm-link_exe llvm-link_target )
+    get_host_tool_path( opt OPT opt_exe opt_target )
   endif()
 endif()
 
@@ -97,17 +104,19 @@ if( EXISTS ${LIBCLC_CUSTOM_LLVM_TOOLS_BINARY_DIR} )
 endif()
 
 foreach( tool IN ITEMS clang opt llvm-as llvm-link )
-  if( NOT EXISTS "${${tool}_exe}" AND NOT TARGET "${${tool}_target}" )
+  if( NOT EXISTS "${${tool}_exe}" AND "${${tool}_target}" STREQUAL "" )
     message( FATAL_ERROR "libclc toolchain incomplete - missing tool ${tool}!" )
   endif()
 endforeach()
 
 # llvm-spirv is an optional dependency, used to build spirv-* targets.
-find_program( LLVM_SPIRV llvm-spirv PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH )
-
-if( LLVM_SPIRV )
-  add_executable( libclc::llvm-spirv IMPORTED GLOBAL )
-  set_target_properties( libclc::llvm-spirv PROPERTIES IMPORTED_LOCATION ${LLVM_SPIRV} )
+# It may be provided in-tree or externally.
+if( TARGET llvm-spirv )
+  get_host_tool_path( llvm-spirv LLVM_SPIRV llvm-spirv_exe llvm-spirv_target )
+else()
+  find_program( LLVM_SPIRV llvm-spirv PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH )
+  set( llvm-spirv_exe "${LLVM_SPIRV}" )
+  set( llvm-spirv_target )
 endif()
 
 # List of all targets. Note that some are added dynamically below.
@@ -130,24 +139,31 @@ endif()
 
 # spirv-mesa3d and spirv64-mesa3d targets can only be built with the (optional)
 # llvm-spirv external tool.
-if( TARGET libclc::llvm-spirv )
+if( llvm-spirv_exe )
   list( APPEND LIBCLC_TARGETS_ALL  spirv-mesa3d- spirv64-mesa3d- )
 endif()
 
-if( LIBCLC_TARGETS_TO_BUILD STREQUAL "all" )
-  set( LIBCLC_TARGETS_TO_BUILD ${LIBCLC_TARGETS_ALL} )
-endif()
-
-list( SORT LIBCLC_TARGETS_TO_BUILD )
-
 # Verify that the user hasn't requested mesa3d targets without an available
 # llvm-spirv tool.
 if( "spirv-mesa3d-" IN_LIST LIBCLC_TARGETS_TO_BUILD OR "spirv64-mesa3d-" IN_LIST LIBCLC_TARGETS_TO_BUILD )
-  if( NOT TARGET libclc::llvm-spirv )
+  if( NOT llvm-spirv_exe )
     message( FATAL_ERROR "SPIR-V targets requested, but spirv-tools is not installed" )
   endif()
 endif()
 
+if( LIBCLC_TARGETS_TO_BUILD STREQUAL "all" )
+  set( LIBCLC_TARGETS_TO_BUILD ${LIBCLC_TARGETS_ALL} )
+else()
+  foreach(TARGET_TO_BUILD ${LIBCLC_TARGETS_TO_BUILD})
+    if (NOT "${TARGET_TO_BUILD}" IN_LIST LIBCLC_TARGETS_ALL)
+      message ( FATAL_ERROR "Unknown target in LIBCLC_TARGETS_TO_BUILD: \"${TARGET_TO_BUILD}\"\n"
+                            "Valid targets are: ${LIBCLC_TARGETS_ALL}\n")
+    endif()
+  endforeach()
+endif()
+
+list( SORT LIBCLC_TARGETS_TO_BUILD )
+
 # Construct LLVM version define
 set( LLVM_VERSION_DEFINE "-DHAVE_LLVM=0x${LLVM_VERSION_MAJOR}0${LLVM_VERSION_MINOR}" )
 
@@ -203,7 +219,7 @@ set( tahiti_aliases pitcairn verde oland hainan bonaire kabini kaveri hawaii
   gfx1010 gfx1011 gfx1012 gfx1013
   gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036
   gfx1100 gfx1101 gfx1102 gfx1103
-  gfx1150 gfx1151 gfx1152
+  gfx1150 gfx1151 gfx1152 gfx1153
   gfx1200 gfx1201
 )
 
@@ -218,8 +234,10 @@ if( ENABLE_RUNTIME_SUBNORMAL )
        TARGET ${file}
        INPUTS ${CMAKE_CURRENT_SOURCE_DIR}/generic/lib/${file}.ll
     )
-    install( FILES $<TARGET_PROPERTY:${file},TARGET_FILE> ARCHIVE
-      DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
+    install(
+      FILES $<TARGET_PROPERTY:${file},TARGET_FILE>
+      DESTINATION "${CMAKE_INSTALL_DATADIR}/clc"
+    )
   endforeach()
 endif()
 
@@ -230,12 +248,14 @@ add_custom_command(
   COMMAND ${Python3_EXECUTABLE} ${script_loc} > convert.cl
   DEPENDS ${script_loc} )
 add_custom_target( "generate_convert.cl" DEPENDS convert.cl )
+set_target_properties( "generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" )
 
 add_custom_command(
   OUTPUT clspv-convert.cl
   COMMAND ${Python3_EXECUTABLE} ${script_loc} --clspv > clspv-convert.cl
   DEPENDS ${script_loc} )
 add_custom_target( "clspv-generate_convert.cl" DEPENDS clspv-convert.cl )
+set_target_properties( "clspv-generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" )
 
 enable_testing()
 
@@ -264,49 +284,30 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
     set( DARCH ${ARCH} )
   endif()
 
-  # Enumerate SOURCES* files
-  set( source_list )
-  foreach( l ${dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} )
-    foreach( s "SOURCES" "SOURCES_${LLVM_MAJOR}.${LLVM_MINOR}" )
-      file( TO_CMAKE_PATH ${l}/lib/${s} file_loc )
-      file( TO_CMAKE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${file_loc} loc )
-      # Prepend the location to give higher priority to
-      # specialized implementation
-      if( EXISTS ${loc} )
-        set( source_list ${file_loc} ${source_list} )
-      endif()
-    endforeach()
-  endforeach()
-
-  # Add the generated convert.cl here to prevent adding the one listed in
-  # SOURCES
-  set( objects )   # A "set" of already-added input files
-  set( rel_files ) # Source directory input files, relative to the root dir
-  set( gen_files ) # Generated binary input files, relative to the binary dir
-  if( NOT ${ARCH} STREQUAL "spirv" AND NOT ${ARCH} STREQUAL "spirv64" )
-    if( NOT ENABLE_RUNTIME_SUBNORMAL AND NOT ${ARCH} STREQUAL "clspv" AND
-        NOT ${ARCH} STREQUAL "clspv64" )
-      list( APPEND gen_files convert.cl )
-      list( APPEND objects convert.cl )
-      list( APPEND rel_files generic/lib/subnormal_use_default.ll )
-    elseif(${ARCH} STREQUAL "clspv" OR ${ARCH} STREQUAL "clspv64")
-      list( APPEND gen_files clspv-convert.cl )
-      list( APPEND objects clspv-convert.cl )
+  set( clc_lib_files )
+  libclc_configure_lib_source(
+    clc_lib_files
+    CLC_INTERNAL
+    LIB_ROOT_DIR clc
+    DIRS ${dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS}
+  )
+
+  set( opencl_lib_files )
+  set( opencl_gen_files )
+
+  if( NOT ARCH STREQUAL spirv AND NOT ARCH STREQUAL spirv64 )
+    if( ARCH STREQUAL clspv OR ARCH STREQUAL clspv64 )
+      list( APPEND opencl_gen_files clspv-convert.cl )
+    elseif ( NOT ENABLE_RUNTIME_SUBNORMAL )
+      list( APPEND opencl_gen_files convert.cl )
+      list( APPEND opencl_lib_files generic/lib/subnormal_use_default.ll )
     endif()
   endif()
 
-  foreach( l ${source_list} )
-    file( READ ${l} file_list )
-    string( REPLACE "\n" ";" file_list ${file_list} )
-    get_filename_component( dir ${l} DIRECTORY )
-    foreach( f ${file_list} )
-      # Only add each file once, so that targets can 'specialize' builtins
-      if( NOT ${f} IN_LIST objects )
-        list( APPEND objects ${f} )
-        list( APPEND rel_files ${dir}/${f} )
-      endif()
-    endforeach()
-  endforeach()
+  libclc_configure_lib_source(
+    opencl_lib_files
+    DIRS ${dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS}
+  )
 
   foreach( d ${${t}_devices} )
     get_libclc_device_info(
@@ -317,136 +318,72 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
       CLANG_TRIPLE clang_triple
     )
 
-    set( mcpu )
-    if( NOT "${cpu}" STREQUAL "" )
-      set( mcpu "-mcpu=${cpu}" )
-    endif()
-
     message( STATUS "  device: ${d} ( ${${d}_aliases} )" )
 
     if ( ARCH STREQUAL spirv OR ARCH STREQUAL spirv64 )
-      set( build_flags -O0 -finline-hint-functions )
+      set( build_flags -O0 -finline-hint-functions -DCLC_SPIRV )
       set( opt_flags )
       set( spvflags --spirv-max-version=1.1 )
+      set( MACRO_ARCH SPIRV32 )
+      if( ARCH STREQUAL spirv64 )
+        set( MACRO_ARCH SPIRV64 )
+      endif()
     elseif( ARCH STREQUAL clspv OR ARCH STREQUAL clspv64 )
-      set( build_flags "-Wno-unknown-assumption")
+      set( build_flags "-Wno-unknown-assumption" -DCLC_CLSPV )
       set( opt_flags -O3 )
+      set( MACRO_ARCH CLSPV32 )
+      if( ARCH STREQUAL clspv64 )
+        set( MACRO_ARCH CLSPV64 )
+      endif()
     else()
       set( build_flags )
       set( opt_flags -O3 )
+      set( MACRO_ARCH ${ARCH} )
     endif()
 
     set( LIBCLC_ARCH_OBJFILE_DIR "${LIBCLC_OBJFILE_DIR}/${arch_suffix}" )
     file( MAKE_DIRECTORY ${LIBCLC_ARCH_OBJFILE_DIR} )
 
-    string( TOUPPER "CLC_${ARCH}" CLC_TARGET_DEFINE )
+    string( TOUPPER "CLC_${MACRO_ARCH}" CLC_TARGET_DEFINE )
 
     list( APPEND build_flags
       -D__CLC_INTERNAL
       -D${CLC_TARGET_DEFINE}
-      -I${CMAKE_CURRENT_SOURCE_DIR}/generic/include
+      # All libclc builtin libraries see CLC headers
+      -I${CMAKE_CURRENT_SOURCE_DIR}/clc/include
       # FIXME: Fix libclc to not require disabling this noisy warning
       -Wno-bitwise-conditional-parentheses
     )
 
-    set( bytecode_files "" )
-    foreach( file IN LISTS gen_files rel_files )
-      # We need to take each file and produce an absolute input file, as well
-      # as a unique architecture-specific output file. We deal with a mix of
-      # different input files, which makes this trickier.
-      if( ${file} IN_LIST gen_files )
-        # Generated files are given just as file names, which we must make
-        # absolute to the binary directory.
-        set( input_file ${CMAKE_CURRENT_BINARY_DIR}/${file} )
-        set( output_file "${LIBCLC_ARCH_OBJFILE_DIR}/${file}.bc" )
-      else()
-        # Other files are originally relative to each SOURCE file, which are
-        # then make relative to the libclc root directory. We must normalize
-        # the path (e.g., ironing out any ".."), then make it relative to the
-        # root directory again, and use that relative path component for the
-        # binary path.
-        get_filename_component( abs_path ${file} ABSOLUTE BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR} )
-        file( RELATIVE_PATH root_rel_path ${CMAKE_CURRENT_SOURCE_DIR} ${abs_path} )
-        set( input_file ${CMAKE_CURRENT_SOURCE_DIR}/${file} )
-        set( output_file "${LIBCLC_ARCH_OBJFILE_DIR}/${root_rel_path}.bc" )
-      endif()
-
-      get_filename_component( file_dir ${file} DIRECTORY )
-
-      compile_to_bc(
-        TRIPLE ${clang_triple}
-        INPUT ${input_file}
-        OUTPUT ${output_file}
-        EXTRA_OPTS "${mcpu}" -fno-builtin -nostdlib
-                   "${build_flags}" -I${CMAKE_CURRENT_SOURCE_DIR}/${file_dir}
-        DEPENDENCIES generate_convert.cl clspv-generate_convert.cl
-      )
-      list( APPEND bytecode_files ${output_file} )
-    endforeach()
+    if( NOT "${cpu}" STREQUAL "" )
+      list( APPEND build_flags -mcpu=${cpu} )
+    endif()
 
-    set( builtins_comp_lib_tgt builtins.comp.${arch_suffix} )
-    add_custom_target( ${builtins_comp_lib_tgt}
-      DEPENDS ${bytecode_files}
+    add_libclc_builtin_set(
+      CLC_INTERNAL
+      ARCH ${ARCH}
+      ARCH_SUFFIX clc-${arch_suffix}
+      TRIPLE ${clang_triple}
+      COMPILE_FLAGS ${build_flags}
+      OPT_FLAGS ${opt_flags}
+      LIB_FILES ${clc_lib_files}
     )
 
-    set( builtins_link_lib_tgt builtins.link.${arch_suffix} )
-    link_bc(
-      TARGET ${builtins_link_lib_tgt}
-      INPUTS ${bytecode_files}
-      DEPENDENCIES ${builtins_comp_lib_tgt}
+    list( APPEND build_flags
+      -I${CMAKE_CURRENT_SOURCE_DIR}/generic/include
     )
 
-    set( builtins_link_lib $<TARGET_PROPERTY:${builtins_link_lib_tgt},TARGET_FILE> )
-
-    if( ARCH STREQUAL spirv OR ARCH STREQUAL spirv64 )
-      set( spv_suffix ${arch_suffix}.spv )
-      add_custom_command( OUTPUT ${spv_suffix}
-        COMMAND libclc::llvm-spirv ${spvflags} -o ${spv_suffix} ${builtins_link_lib}
-        DEPENDS ${builtins_link_lib} ${builtins_link_lib_tgt}
-      )
-      add_custom_target( "prepare-${spv_suffix}" ALL DEPENDS "${spv_suffix}" )
-      install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${spv_suffix}
-         DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
-    else()
-      set( builtins_opt_lib_tgt builtins.opt.${arch_suffix} )
-
-      # Add opt target
-      add_custom_command( OUTPUT ${builtins_opt_lib_tgt}.bc
-        COMMAND ${opt_exe} ${opt_flags} -o ${builtins_opt_lib_tgt}.bc
-          ${builtins_link_lib}
-        DEPENDS ${opt_target} ${builtins_link_lib} ${builtins_link_lib_tgt}
-      )
-      add_custom_target( ${builtins_opt_lib_tgt}
-        ALL DEPENDS ${builtins_opt_lib_tgt}.bc
-      )
-      set_target_properties( ${builtins_opt_lib_tgt}
-        PROPERTIES TARGET_FILE ${builtins_opt_lib_tgt}.bc
-      )
-
-      set( builtins_opt_lib $<TARGET_PROPERTY:${builtins_opt_lib_tgt},TARGET_FILE> )
-
-      # Add prepare target
-      set( obj_suffix ${arch_suffix}.bc )
-      add_custom_command( OUTPUT ${obj_suffix}
-        COMMAND ${prepare_builtins_exe} -o ${obj_suffix} ${builtins_opt_lib}
-        DEPENDS ${builtins_opt_lib} ${builtins_opt_lib_tgt} ${prepare_builtins_target} )
-      add_custom_target( prepare-${obj_suffix} ALL DEPENDS ${obj_suffix} )
-
-      # nvptx-- targets don't include workitem builtins
-      if( NOT clang_triple MATCHES ".*ptx.*--$" )
-        add_test( NAME external-calls-${obj_suffix}
-          COMMAND ./check_external_calls.sh ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} ${LLVM_TOOLS_BINARY_DIR}
-          WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} )
-      endif()
-
-      install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
-      foreach( a ${${d}_aliases} )
-        set( alias_suffix "${a}-${clang_triple}.bc" )
-        add_custom_target( ${alias_suffix} ALL
-          COMMAND ${CMAKE_COMMAND} -E create_symlink ${obj_suffix} ${alias_suffix}
-          DEPENDS prepare-${obj_suffix} )
-        install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${alias_suffix} DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
-      endforeach( a )
-    endif()
+    add_libclc_builtin_set(
+      ARCH ${ARCH}
+      ARCH_SUFFIX ${arch_suffix}
+      TRIPLE ${clang_triple}
+      COMPILE_FLAGS ${build_flags}
+      OPT_FLAGS ${opt_flags}
+      LIB_FILES ${opencl_lib_files}
+      GEN_FILES ${opencl_gen_files}
+      ALIASES ${${d}_aliases}
+      # Link in the CLC builtins and internalize their symbols
+      INTERNAL_LINK_DEPENDENCIES $<TARGET_PROPERTY:builtins.link.clc-${arch_suffix},TARGET_FILE>
+    )
   endforeach( d )
 endforeach( t )
diff --git a/libclc/README.TXT b/libclc/README.TXT
deleted file mode 100644
index 57b5242b..0000000
--- a/libclc/README.TXT
+++ /dev/null
@@ -1,52 +0,0 @@
-libclc
-------
-
-libclc is an open source, BSD licensed implementation of the library
-requirements of the OpenCL C programming language, as specified by the
-OpenCL 1.1 Specification. The following sections of the specification
-impose library requirements:
-
-  * 6.1: Supported Data Types
-  * 6.2.3: Explicit Conversions
-  * 6.2.4.2: Reinterpreting Types Using as_type() and as_typen()
-  * 6.9: Preprocessor Directives and Macros
-  * 6.11: Built-in Functions
-  * 9.3: Double Precision Floating-Point
-  * 9.4: 64-bit Atomics
-  * 9.5: Writing to 3D image memory objects
-  * 9.6: Half Precision Floating-Point
-
-libclc is intended to be used with the Clang compiler's OpenCL frontend.
-
-libclc is designed to be portable and extensible. To this end, it provides
-generic implementations of most library requirements, allowing the target
-to override the generic implementation at the granularity of individual
-functions.
-
-libclc currently only supports the PTX target, but support for more
-targets is welcome.
-
-Compiling and installing with Make
-----------------------------------
-
-$ ./configure.py --with-llvm-config=/path/to/llvm-config && make
-$ make install
-
-Note you can use the DESTDIR Makefile variable to do staged installs.
-
-$ make install DESTDIR=/path/for/staged/install
-
-Compiling and installing with Ninja
------------------------------------
-
-$ ./configure.py -g ninja --with-llvm-config=/path/to/llvm-config && ninja
-$ ninja install
-
-Note you can use the DESTDIR environment variable to do staged installs.
-
-$ DESTDIR=/path/for/staged/install ninja install
-
-Website
--------
-
-https://libclc.llvm.org/
diff --git a/libclc/README.md b/libclc/README.md
new file mode 100644
index 0000000..34f329d
--- /dev/null
+++ b/libclc/README.md
@@ -0,0 +1,67 @@
+# libclc
+
+libclc is an open source implementation of the library
+requirements of the OpenCL C programming language, as specified by the
+OpenCL 1.1 Specification. The following sections of the specification
+impose library requirements:
+
+  * 6.1: Supported Data Types
+  * 6.2.3: Explicit Conversions
+  * 6.2.4.2: Reinterpreting Types Using as_type() and as_typen()
+  * 6.9: Preprocessor Directives and Macros
+  * 6.11: Built-in Functions
+  * 9.3: Double Precision Floating-Point
+  * 9.4: 64-bit Atomics
+  * 9.5: Writing to 3D image memory objects
+  * 9.6: Half Precision Floating-Point
+
+libclc is intended to be used with the Clang compiler's OpenCL frontend.
+
+libclc is designed to be portable and extensible. To this end, it provides
+generic implementations of most library requirements, allowing the target
+to override the generic implementation at the granularity of individual
+functions.
+
+libclc currently supports PTX, AMDGPU, SPIRV and CLSPV targets, but support for
+more targets is welcome.
+
+## Compiling and installing
+
+(in the following instructions you can use `make` or `ninja`)
+
+For an in-tree build, Clang must also be built at the same time:
+```
+$ cmake <path-to>/llvm-project/llvm/CMakeLists.txt -DLLVM_ENABLE_PROJECTS="libclc;clang" \
+    -DCMAKE_BUILD_TYPE=Release -G Ninja
+$ ninja
+```
+Then install:
+```
+$ ninja install
+```
+Note you can use the `DESTDIR` environment variable to do staged installs.
+```
+$ DESTDIR=/path/for/staged/install ninja install
+```
+To build out of tree, or in other words, against an existing LLVM build or install:
+```
+$ cmake <path-to>/llvm-project/libclc/CMakeLists.txt -DCMAKE_BUILD_TYPE=Release \
+  -G Ninja -DLLVM_DIR=$(<path-to>/llvm-config --cmakedir)
+$ ninja
+```
+Then install as before.
+
+In both cases this will include all supported targets. You can choose which
+targets are enabled by passing `-DLIBCLC_TARGETS_TO_BUILD` to CMake. The default
+is `all`.
+
+In both cases, the LLVM used must include the targets you want libclc support for
+(`AMDGPU` and `NVPTX` are enabled in LLVM by default). The exception is `SPIRV`, which
+does not need an LLVM target but does need the
+[llvm-spirv tool](https://github.com/KhronosGroup/SPIRV-LLVM-Translator) to be available.
+Either build this in-tree, or place it in the directory pointed to by
+`LLVM_TOOLS_BINARY_DIR`.
+
+## Website
+
+https://libclc.llvm.org/
diff --git a/libclc/amdgcn/lib/integer/popcount.cl b/libclc/amdgcn/lib/integer/popcount.cl
index ebd167d..3b493fb 100644
--- a/libclc/amdgcn/lib/integer/popcount.cl
+++ b/libclc/amdgcn/lib/integer/popcount.cl
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
-#include <utils.h>
+#include <clc/utils.h>
 #include <integer/popcount.h>
 
 #define __CLC_BODY "popcount.inc"
diff --git a/libclc/amdgcn/lib/math/fmax.cl b/libclc/amdgcn/lib/math/fmax.cl
index cb79616..4407d4a 100644
--- a/libclc/amdgcn/lib/math/fmax.cl
+++ b/libclc/amdgcn/lib/math/fmax.cl
@@ -1,6 +1,5 @@
 #include <clc/clc.h>
-
-#include "../../../generic/lib/clcmacro.h"
+#include <clc/clcmacro.h>
 
 _CLC_DEF _CLC_OVERLOAD float fmax(float x, float y)
 {
diff --git a/libclc/amdgcn/lib/math/fmin.cl b/libclc/amdgcn/lib/math/fmin.cl
index 35dea8b..4d02a47 100644
--- a/libclc/amdgcn/lib/math/fmin.cl
+++ b/libclc/amdgcn/lib/math/fmin.cl
@@ -1,6 +1,5 @@
 #include <clc/clc.h>
-
-#include "../../../generic/lib/clcmacro.h"
+#include <clc/clcmacro.h>
 
 _CLC_DEF _CLC_OVERLOAD float fmin(float x, float y)
 {
diff --git a/libclc/amdgcn/lib/math/ldexp.cl b/libclc/amdgcn/lib/math/ldexp.cl
index 9713e4d..d46d2dc 100644
--- a/libclc/amdgcn/lib/math/ldexp.cl
+++ b/libclc/amdgcn/lib/math/ldexp.cl
@@ -21,8 +21,7 @@
  */
 
 #include <clc/clc.h>
-
-#include "../../../generic/lib/clcmacro.h"
+#include <clc/clcmacro.h>
 
 #ifdef __HAS_LDEXPF__
 #define BUILTINF __builtin_amdgcn_ldexpf
diff --git a/libclc/amdgpu/lib/math/half_native_unary.inc b/libclc/amdgpu/lib/math/half_native_unary.inc
index 0f99ba5..bdc3806 100644
--- a/libclc/amdgpu/lib/math/half_native_unary.inc
+++ b/libclc/amdgpu/lib/math/half_native_unary.inc
@@ -1,4 +1,4 @@
-#include <utils.h>
+#include <clc/utils.h>
 
 #define __CLC_HALF_FUNC(x) __CLC_CONCAT(half_, x)
 #define __CLC_NATIVE_FUNC(x) __CLC_CONCAT(native_, x)
diff --git a/libclc/amdgpu/lib/math/nextafter.cl b/libclc/amdgpu/lib/math/nextafter.cl
index b290da0..6dc117b 100644
--- a/libclc/amdgpu/lib/math/nextafter.cl
+++ b/libclc/amdgpu/lib/math/nextafter.cl
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
-#include "../lib/clcmacro.h"
+#include <clc/clcmacro.h>
 #include <math/clc_nextafter.h>
 
 _CLC_DEFINE_BINARY_BUILTIN(float, nextafter, __clc_nextafter, float, float)
diff --git a/libclc/amdgpu/lib/math/sqrt.cl b/libclc/amdgpu/lib/math/sqrt.cl
index 5562600..17d77e5 100644
--- a/libclc/amdgpu/lib/math/sqrt.cl
+++ b/libclc/amdgpu/lib/math/sqrt.cl
@@ -20,9 +20,9 @@
  * THE SOFTWARE.
  */
 
-#include <clc/clc.h>
-#include "../../../generic/lib/clcmacro.h"
 #include "math/clc_sqrt.h"
+#include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 _CLC_DEFINE_UNARY_BUILTIN(float, sqrt, __clc_sqrt, float)
 
diff --git a/libclc/clc/include/clc/clc_as_type.h b/libclc/clc/include/clc/clc_as_type.h
new file mode 100644
index 0000000..9661395
--- /dev/null
+++ b/libclc/clc/include/clc/clc_as_type.h
@@ -0,0 +1,82 @@
+#ifndef __CLC_CLC_AS_TYPE_H__
+#define __CLC_CLC_AS_TYPE_H__
+
+#define __clc_as_char(x) __builtin_astype(x, char)
+#define __clc_as_uchar(x) __builtin_astype(x, uchar)
+#define __clc_as_short(x) __builtin_astype(x, short)
+#define __clc_as_ushort(x) __builtin_astype(x, ushort)
+#define __clc_as_int(x) __builtin_astype(x, int)
+#define __clc_as_uint(x) __builtin_astype(x, uint)
+#define __clc_as_long(x) __builtin_astype(x, long)
+#define __clc_as_ulong(x) __builtin_astype(x, ulong)
+#define __clc_as_float(x) __builtin_astype(x, float)
+
+#define __clc_as_char2(x) __builtin_astype(x, char2)
+#define __clc_as_uchar2(x) __builtin_astype(x, uchar2)
+#define __clc_as_short2(x) __builtin_astype(x, short2)
+#define __clc_as_ushort2(x) __builtin_astype(x, ushort2)
+#define __clc_as_int2(x) __builtin_astype(x, int2)
+#define __clc_as_uint2(x) __builtin_astype(x, uint2)
+#define __clc_as_long2(x) __builtin_astype(x, long2)
+#define __clc_as_ulong2(x) __builtin_astype(x, ulong2)
+#define __clc_as_float2(x) __builtin_astype(x, float2)
+
+#define __clc_as_char3(x) __builtin_astype(x, char3)
+#define __clc_as_uchar3(x) __builtin_astype(x, uchar3)
+#define __clc_as_short3(x) __builtin_astype(x, short3)
+#define __clc_as_ushort3(x) __builtin_astype(x, ushort3)
+#define __clc_as_int3(x) __builtin_astype(x, int3)
+#define __clc_as_uint3(x) __builtin_astype(x, uint3)
+#define __clc_as_long3(x) __builtin_astype(x, long3)
+#define __clc_as_ulong3(x) __builtin_astype(x, ulong3)
+#define __clc_as_float3(x) __builtin_astype(x, float3)
+
+#define __clc_as_char4(x) __builtin_astype(x, char4)
+#define __clc_as_uchar4(x) __builtin_astype(x, uchar4)
+#define __clc_as_short4(x) __builtin_astype(x, short4)
+#define __clc_as_ushort4(x) __builtin_astype(x, ushort4)
+#define __clc_as_int4(x) __builtin_astype(x, int4)
+#define __clc_as_uint4(x) __builtin_astype(x, uint4)
+#define __clc_as_long4(x) __builtin_astype(x, long4)
+#define __clc_as_ulong4(x) __builtin_astype(x, ulong4)
+#define __clc_as_float4(x) __builtin_astype(x, float4)
+
+#define __clc_as_char8(x) __builtin_astype(x, char8)
+#define __clc_as_uchar8(x) __builtin_astype(x, uchar8)
+#define __clc_as_short8(x) __builtin_astype(x, short8)
+#define __clc_as_ushort8(x) __builtin_astype(x, ushort8)
+#define __clc_as_int8(x) __builtin_astype(x, int8)
+#define __clc_as_uint8(x) __builtin_astype(x, uint8)
+#define __clc_as_long8(x) __builtin_astype(x, long8)
+#define __clc_as_ulong8(x) __builtin_astype(x, ulong8)
+#define __clc_as_float8(x) __builtin_astype(x, float8)
+
+#define __clc_as_char16(x) __builtin_astype(x, char16)
+#define __clc_as_uchar16(x) __builtin_astype(x, uchar16)
+#define __clc_as_short16(x) __builtin_astype(x, short16)
+#define __clc_as_ushort16(x) __builtin_astype(x, ushort16)
+#define __clc_as_int16(x) __builtin_astype(x, int16)
+#define __clc_as_uint16(x) __builtin_astype(x, uint16)
+#define __clc_as_long16(x) __builtin_astype(x, long16)
+#define __clc_as_ulong16(x) __builtin_astype(x, ulong16)
+#define __clc_as_float16(x) __builtin_astype(x, float16)
+
+#ifdef cl_khr_fp64
+#define __clc_as_double(x) __builtin_astype(x, double)
+#define __clc_as_double2(x) __builtin_astype(x, double2)
+#define __clc_as_double3(x) __builtin_astype(x, double3)
+#define __clc_as_double4(x) __builtin_astype(x, double4)
+#define __clc_as_double8(x) __builtin_astype(x, double8)
+#define __clc_as_double16(x) __builtin_astype(x, double16)
+#endif
+
+#ifdef cl_khr_fp16
+#define __clc_as_half(x) __builtin_astype(x, half)
+#define __clc_as_half2(x) __builtin_astype(x, half2)
+#define __clc_as_half3(x) __builtin_astype(x, half3)
+#define __clc_as_half4(x) __builtin_astype(x, half4)
+#define __clc_as_half8(x) __builtin_astype(x, half8)
+#define __clc_as_half16(x) __builtin_astype(x, half16)
+#endif
+
+#endif // __CLC_CLC_AS_TYPE_H__
diff --git a/libclc/generic/include/clc/clcfunc.h b/libclc/clc/include/clc/clcfunc.h
index 086d780..4698f09 100644
--- a/libclc/generic/include/clc/clcfunc.h
+++ b/libclc/clc/include/clc/clcfunc.h
@@ -1,13 +1,18 @@
+#ifndef __CLC_CLCFUNC_H_
+#define __CLC_CLCFUNC_H_
+
 #define _CLC_OVERLOAD __attribute__((overloadable))
 #define _CLC_DECL
 #define _CLC_INLINE __attribute__((always_inline)) inline
 
 // avoid inlines for SPIR-V related targets since we'll optimise later in the
 // chain
-#if defined(CLC_SPIRV) || defined(CLC_SPIRV64)
+#if defined(CLC_SPIRV)
 #define _CLC_DEF
-#elif defined(CLC_CLSPV) || defined(CLC_CLSPV64)
+#elif defined(CLC_CLSPV)
 #define _CLC_DEF __attribute__((noinline)) __attribute__((clspv_libclc_builtin))
 #else
 #define _CLC_DEF __attribute__((always_inline))
 #endif
+
+#endif // __CLC_CLCFUNC_H_
diff --git a/libclc/clc/include/clc/clcmacro.h b/libclc/clc/include/clc/clcmacro.h
new file mode 100644
index 0000000..2442392
--- /dev/null
+++ b/libclc/clc/include/clc/clcmacro.h
@@ -0,0 +1,219 @@
+#ifndef __CLC_CLCMACRO_H__
+#define __CLC_CLCMACRO_H__
+
+#include <clc/internal/clc.h>
+#include <clc/utils.h>
+// Defines 2/3/4/8/16-wide FUNCTION overloads built from narrower overloads.
+#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE)          \
+  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 a) {                              \
+    return (RET_TYPE##2)(FUNCTION(a.x), FUNCTION(a.y));                        \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 a) {                              \
+    return (RET_TYPE##3)(FUNCTION(a.x), FUNCTION(a.y), FUNCTION(a.z));         \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 a) {                              \
+    return (RET_TYPE##4)(FUNCTION(a.lo), FUNCTION(a.hi));                      \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 a) {                              \
+    return (RET_TYPE##8)(FUNCTION(a.lo), FUNCTION(a.hi));                      \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 a) {                            \
+    return (RET_TYPE##16)(FUNCTION(a.lo), FUNCTION(a.hi));                     \
+  }
+// Element-wise vector overloads of a binary FUNCTION (widths 2, 3, 4, 8, 16).
+#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,         \
+                              ARG2_TYPE)                                       \
+  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 a, ARG2_TYPE##2 b) {              \
+    return (RET_TYPE##2)(FUNCTION(a.x, b.x), FUNCTION(a.y, b.y));              \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 a, ARG2_TYPE##3 b) {              \
+    return (RET_TYPE##3)(FUNCTION(a.x, b.x), FUNCTION(a.y, b.y),               \
+                         FUNCTION(a.z, b.z));                                  \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 a, ARG2_TYPE##4 b) {              \
+    return (RET_TYPE##4)(FUNCTION(a.lo, b.lo), FUNCTION(a.hi, b.hi));          \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 a, ARG2_TYPE##8 b) {              \
+    return (RET_TYPE##8)(FUNCTION(a.lo, b.lo), FUNCTION(a.hi, b.hi));          \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 a, ARG2_TYPE##16 b) {           \
+    return (RET_TYPE##16)(FUNCTION(a.lo, b.lo), FUNCTION(a.hi, b.hi));         \
+  }
+// Overloads taking a scalar first argument and a vector second argument.
+#define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,          \
+                             ARG2_TYPE)                                        \
+  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE a, ARG2_TYPE##2 b) {                 \
+    return (RET_TYPE##2)(FUNCTION(a, b.x), FUNCTION(a, b.y));                  \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE a, ARG2_TYPE##3 b) {                 \
+    return (RET_TYPE##3)(FUNCTION(a, b.x), FUNCTION(a, b.y),                   \
+                         FUNCTION(a, b.z));                                    \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE a, ARG2_TYPE##4 b) {                 \
+    return (RET_TYPE##4)(FUNCTION(a, b.lo), FUNCTION(a, b.hi));                \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE a, ARG2_TYPE##8 b) {                 \
+    return (RET_TYPE##8)(FUNCTION(a, b.lo), FUNCTION(a, b.hi));                \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE a, ARG2_TYPE##16 b) {               \
+    return (RET_TYPE##16)(FUNCTION(a, b.lo), FUNCTION(a, b.hi));               \
+  }
+// Element-wise vector overloads of a three-argument FUNCTION (widths 2-16).
+#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,        \
+                               ARG2_TYPE, ARG3_TYPE)                           \
+  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 a, ARG2_TYPE##2 b,                \
+                                ARG3_TYPE##2 c) {                              \
+    return (RET_TYPE##2)(FUNCTION(a.x, b.x, c.x), FUNCTION(a.y, b.y, c.y));    \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 a, ARG2_TYPE##3 b,                \
+                                ARG3_TYPE##3 c) {                              \
+    return (RET_TYPE##3)(FUNCTION(a.x, b.x, c.x), FUNCTION(a.y, b.y, c.y),     \
+                         FUNCTION(a.z, b.z, c.z));                             \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 a, ARG2_TYPE##4 b,                \
+                                ARG3_TYPE##4 c) {                              \
+    return (RET_TYPE##4)(FUNCTION(a.lo, b.lo, c.lo),                           \
+                         FUNCTION(a.hi, b.hi, c.hi));                          \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 a, ARG2_TYPE##8 b,                \
+                                ARG3_TYPE##8 c) {                              \
+    return (RET_TYPE##8)(FUNCTION(a.lo, b.lo, c.lo),                           \
+                         FUNCTION(a.hi, b.hi, c.hi));                          \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 a, ARG2_TYPE##16 b,             \
+                                 ARG3_TYPE##16 c) {                            \
+    return (RET_TYPE##16)(FUNCTION(a.lo, b.lo, c.lo),                          \
+                          FUNCTION(a.hi, b.hi, c.hi));                         \
+  }
+// Overloads taking two scalar arguments followed by a vector third argument.
+#define _CLC_V_S_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,        \
+                               ARG2_TYPE, ARG3_TYPE)                           \
+  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE a, ARG2_TYPE b, ARG3_TYPE##2 c) {    \
+    return (RET_TYPE##2)(FUNCTION(a, b, c.x), FUNCTION(a, b, c.y));            \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE a, ARG2_TYPE b, ARG3_TYPE##3 c) {    \
+    return (RET_TYPE##3)(FUNCTION(a, b, c.x), FUNCTION(a, b, c.y),             \
+                         FUNCTION(a, b, c.z));                                 \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE a, ARG2_TYPE b, ARG3_TYPE##4 c) {    \
+    return (RET_TYPE##4)(FUNCTION(a, b, c.lo), FUNCTION(a, b, c.hi));          \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE a, ARG2_TYPE b, ARG3_TYPE##8 c) {    \
+    return (RET_TYPE##8)(FUNCTION(a, b, c.lo), FUNCTION(a, b, c.hi));          \
+  }                                                                            \
+                                                                               \
+  DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE a, ARG2_TYPE b, ARG3_TYPE##16 c) {  \
+    return (RET_TYPE##16)(FUNCTION(a, b, c.lo), FUNCTION(a, b, c.hi));         \
+  }
+// Vectorizes FUNCTION(x, ADDR_SPACE ptr): each lane uses its own y element.
+#define _CLC_V_V_VP_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,         \
+                              ADDR_SPACE, ARG2_TYPE)                           \
+  DECLSPEC __CLC_XCONCAT(RET_TYPE, 2)                                          \
+      FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 2) x,                                  \
+               ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 2) * y) {                   \
+    return (__CLC_XCONCAT(RET_TYPE, 2))(                                       \
+        FUNCTION(x.x, (ADDR_SPACE ARG2_TYPE *)y),                              \
+        FUNCTION(x.y,                                                          \
+                 (ADDR_SPACE ARG2_TYPE *)((ADDR_SPACE ARG2_TYPE *)y + 1)));    \
+  }                                                                            \
+  /* 3-wide: as for 2-wide, lane i is given element pointer y + i. */          \
+  DECLSPEC __CLC_XCONCAT(RET_TYPE, 3)                                          \
+      FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 3) x,                                  \
+               ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 3) * y) {                   \
+    return (__CLC_XCONCAT(RET_TYPE, 3))(                                       \
+        FUNCTION(x.x, (ADDR_SPACE ARG2_TYPE *)y),                              \
+        FUNCTION(x.y,                                                          \
+                 (ADDR_SPACE ARG2_TYPE *)((ADDR_SPACE ARG2_TYPE *)y + 1)),     \
+        FUNCTION(x.z,                                                          \
+                 (ADDR_SPACE ARG2_TYPE *)((ADDR_SPACE ARG2_TYPE *)y + 2)));    \
+  }                                                                            \
+  /* 4-wide: two 2-wide calls; the hi half starts at element 2. */             \
+  DECLSPEC __CLC_XCONCAT(RET_TYPE, 4)                                          \
+      FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 4) x,                                  \
+               ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 4) * y) {                   \
+    return (__CLC_XCONCAT(RET_TYPE, 4))(                                       \
+        FUNCTION(x.lo, (ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 2) *)y),           \
+        FUNCTION(x.hi, (ADDR_SPACE __CLC_XCONCAT(                              \
+                           ARG2_TYPE, 2) *)((ADDR_SPACE ARG2_TYPE *)y + 2)));  \
+  }                                                                            \
+  /* 8-wide: two 4-wide calls; the hi half starts at element 4. */             \
+  DECLSPEC __CLC_XCONCAT(RET_TYPE, 8)                                          \
+      FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 8) x,                                  \
+               ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 8) * y) {                   \
+    return (__CLC_XCONCAT(RET_TYPE, 8))(                                       \
+        FUNCTION(x.lo, (ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 4) *)y),           \
+        FUNCTION(x.hi, (ADDR_SPACE __CLC_XCONCAT(                              \
+                           ARG2_TYPE, 4) *)((ADDR_SPACE ARG2_TYPE *)y + 4)));  \
+  }                                                                            \
+  /* 16-wide: two 8-wide calls; the hi half starts at element 8. */            \
+  DECLSPEC __CLC_XCONCAT(RET_TYPE, 16)                                         \
+      FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 16) x,                                 \
+               ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 16) * y) {                  \
+    return (__CLC_XCONCAT(RET_TYPE, 16))(                                      \
+        FUNCTION(x.lo, (ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 8) *)y),           \
+        FUNCTION(x.hi, (ADDR_SPACE __CLC_XCONCAT(                              \
+                           ARG2_TYPE, 8) *)((ADDR_SPACE ARG2_TYPE *)y + 8)));  \
+  }
+// Scalar FUNCTION forwarding to BUILTIN, plus its element-wise vector forms.
+#define _CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE,     \
+                                   ARG2_TYPE)                                  \
+  _CLC_OVERLOAD _CLC_DEF RET_TYPE FUNCTION(ARG1_TYPE a, ARG2_TYPE b) {         \
+    return BUILTIN(a, b);                                                      \
+  }                                                                            \
+  _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE, \
+                        ARG2_TYPE)
+// NOTE(review): _CLC_BINARY_VECTORIZE_SCALAR_SECOND_ARG isn't defined here.
+#define _CLC_DEFINE_BINARY_BUILTIN_WITH_SCALAR_SECOND_ARG(                     \
+    RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE)                         \
+  _CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE,           \
+                             ARG2_TYPE)                                        \
+  _CLC_BINARY_VECTORIZE_SCALAR_SECOND_ARG(_CLC_OVERLOAD _CLC_DEF, RET_TYPE,    \
+                                          FUNCTION, ARG1_TYPE, ARG2_TYPE)
+// Scalar FUNCTION forwarding to BUILTIN, plus its vector forms.
+#define _CLC_DEFINE_UNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE)      \
+  _CLC_OVERLOAD _CLC_DEF RET_TYPE FUNCTION(ARG1_TYPE a) { return BUILTIN(a); } \
+  _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE)
+// fp16 helpers: compute in float and convert back; empty without cl_khr_fp16.
+#ifndef cl_khr_fp16
+
+#define _CLC_DEFINE_UNARY_BUILTIN_FP16(FUNCTION)
+#define _CLC_DEFINE_BINARY_BUILTIN_FP16(FUNCTION)
+
+#else
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#define _CLC_DEFINE_UNARY_BUILTIN_FP16(FUNCTION)                               \
+  _CLC_OVERLOAD _CLC_DEF half FUNCTION(half a) {                               \
+    return (half)FUNCTION((float)a);                                           \
+  }                                                                            \
+  _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, FUNCTION, half)
+
+#define _CLC_DEFINE_BINARY_BUILTIN_FP16(FUNCTION)                              \
+  _CLC_OVERLOAD _CLC_DEF half FUNCTION(half a, half b) {                       \
+    return (half)FUNCTION((float)a, (float)b);                                 \
+  }                                                                            \
+  _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, FUNCTION, half, half)
+
+#endif
+
+#endif // __CLC_CLCMACRO_H__
diff --git a/libclc/generic/include/clc/clctypes.h b/libclc/clc/include/clc/clctypes.h
index 76b816d..8ededd9 100644
--- a/libclc/generic/include/clc/clctypes.h
+++ b/libclc/clc/include/clc/clctypes.h
@@ -1,3 +1,6 @@
+#ifndef __CLC_CLCTYPES_H_
+#define __CLC_CLCTYPES_H_
+
 /* 6.1.1 Built-in Scalar Data Types */
 
 typedef unsigned char uchar;
@@ -8,12 +11,12 @@ typedef unsigned long ulong;
 typedef __SIZE_TYPE__ size_t;
 typedef __PTRDIFF_TYPE__ ptrdiff_t;
 
-#define __stdint_join3(a,b,c) a ## b ## c
+#define __stdint_join3(a, b, c) a##b##c
 
-#define  __intn_t(n) __stdint_join3(__INT, n, _TYPE__)
+#define __intn_t(n) __stdint_join3(__INT, n, _TYPE__)
 #define __uintn_t(n) __stdint_join3(unsigned __INT, n, _TYPE__)
 
-typedef  __intn_t(__INTPTR_WIDTH__)  intptr_t;
+typedef __intn_t(__INTPTR_WIDTH__) intptr_t;
 typedef __uintn_t(__INTPTR_WIDTH__) uintptr_t;
 
 #undef __uintn_t
@@ -93,3 +96,5 @@ typedef __attribute__((ext_vector_type(4))) half half4;
 typedef __attribute__((ext_vector_type(8))) half half8;
 typedef __attribute__((ext_vector_type(16))) half half16;
 #endif
+
+#endif // __CLC_CLCTYPES_H_
diff --git a/libclc/clc/include/clc/geometric/clc_dot.h b/libclc/clc/include/clc/geometric/clc_dot.h
new file mode 100644
index 0000000..a7fa4e1
--- /dev/null
+++ b/libclc/clc/include/clc/geometric/clc_dot.h
@@ -0,0 +1,7 @@
+#ifndef __CLC_GEOMETRIC_CLC_DOT_H__
+#define __CLC_GEOMETRIC_CLC_DOT_H__
+// Declarations of __clc_dot; the prototype itself lives in clc_dot.inc.
+#define __CLC_BODY <clc/geometric/clc_dot.inc>
+#include <clc/geometric/floatn.inc>
+
+#endif // __CLC_GEOMETRIC_CLC_DOT_H__
diff --git a/libclc/clc/include/clc/geometric/clc_dot.inc b/libclc/clc/include/clc/geometric/clc_dot.inc
new file mode 100644
index 0000000..016b564
--- /dev/null
+++ b/libclc/clc/include/clc/geometric/clc_dot.inc
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_FLOAT __clc_dot(__CLC_FLOATN p0, __CLC_FLOATN p1);
diff --git a/libclc/clc/include/clc/integer/clc_abs.h b/libclc/clc/include/clc/integer/clc_abs.h
new file mode 100644
index 0000000..31c62d3
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_abs.h
@@ -0,0 +1,14 @@
+#ifndef __CLC_INTEGER_CLC_ABS_H__
+#define __CLC_INTEGER_CLC_ABS_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible abs; alias it
+#define __clc_abs abs
+#else
+// Other targets: declare __clc_abs (clc_abs.inc) via integer/gentype.inc.
+#define __CLC_BODY <clc/integer/clc_abs.inc>
+#include <clc/integer/gentype.inc>
+
+#endif
+
+#endif // __CLC_INTEGER_CLC_ABS_H__
diff --git a/libclc/clc/include/clc/integer/clc_abs.inc b/libclc/clc/include/clc/integer/clc_abs.inc
new file mode 100644
index 0000000..3b9901f
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_abs.inc
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_U_GENTYPE __clc_abs(__CLC_GENTYPE x);
diff --git a/libclc/clc/include/clc/integer/clc_abs_diff.h b/libclc/clc/include/clc/integer/clc_abs_diff.h
new file mode 100644
index 0000000..9c33fcf
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_abs_diff.h
@@ -0,0 +1,14 @@
+#ifndef __CLC_INTEGER_CLC_ABS_DIFF_H__
+#define __CLC_INTEGER_CLC_ABS_DIFF_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible abs_diff
+#define __clc_abs_diff abs_diff
+#else
+// Other targets: declare __clc_abs_diff via integer/gentype.inc.
+#define __CLC_BODY <clc/integer/clc_abs_diff.inc>
+#include <clc/integer/gentype.inc>
+
+#endif
+
+#endif // __CLC_INTEGER_CLC_ABS_DIFF_H__
diff --git a/libclc/clc/include/clc/integer/clc_abs_diff.inc b/libclc/clc/include/clc/integer/clc_abs_diff.inc
new file mode 100644
index 0000000..b0ec98a
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_abs_diff.inc
@@ -0,0 +1,2 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_U_GENTYPE __clc_abs_diff(__CLC_GENTYPE x,
+                                                       __CLC_GENTYPE y);
diff --git a/libclc/generic/include/clc/integer/gentype.inc b/libclc/clc/include/clc/integer/gentype.inc
index cefed9c..2c8dd14 100644
--- a/libclc/generic/include/clc/integer/gentype.inc
+++ b/libclc/clc/include/clc/integer/gentype.inc
@@ -1,5 +1,5 @@
-//These 2 defines only change when switching between data sizes or base types to
-//keep this file manageable.
+// These 2 defines only change when switching between data sizes or base types
+// to keep this file manageable.
 #define __CLC_GENSIZE 8
 #define __CLC_SCALAR_GENTYPE char
 
diff --git a/libclc/clc/include/clc/internal/clc.h b/libclc/clc/include/clc/internal/clc.h
new file mode 100644
index 0000000..f448c6c
--- /dev/null
+++ b/libclc/clc/include/clc/internal/clc.h
@@ -0,0 +1,29 @@
+#ifndef __CLC_INTERNAL_CLC_H_
+#define __CLC_INTERNAL_CLC_H_
+/* Enables required extensions, then pulls in function attributes and types. */
+#ifndef cl_clang_storage_class_specifiers
+#error Implementation requires cl_clang_storage_class_specifiers extension!
+#endif
+
+#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+/* Function Attributes */
+#include <clc/clcfunc.h>
+
+/* 6.1 Supported Data Types */
+#include <clc/clctypes.h>
+
+/* 6.2.4.2 Reinterpreting Types Using __clc_as_type() and __clc_as_typen() */
+#include <clc/clc_as_type.h>
+/* Switch every extension back off once the headers above are included. */
+#pragma OPENCL EXTENSION all : disable
+
+#endif // __CLC_INTERNAL_CLC_H_
diff --git a/libclc/clc/include/clc/math/clc_ceil.h b/libclc/clc/include/clc/math/clc_ceil.h
new file mode 100644
index 0000000..6659068
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_ceil.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_MATH_CLC_CEIL_H__
+#define __CLC_MATH_CLC_CEIL_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible ceil
+#define __clc_ceil ceil
+#else
+
+// Declare __clc_ceil overloads bound via __asm labels to llvm.ceil.*
+#define __CLC_FUNCTION __clc_ceil
+#define __CLC_INTRINSIC "llvm.ceil"
+#include <clc/math/unary_intrin.inc>
+
+#undef __CLC_INTRINSIC
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_MATH_CLC_CEIL_H__
diff --git a/libclc/clc/include/clc/math/clc_fabs.h b/libclc/clc/include/clc/math/clc_fabs.h
new file mode 100644
index 0000000..93367b5
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_fabs.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_MATH_CLC_FABS_H__
+#define __CLC_MATH_CLC_FABS_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible fabs
+#define __clc_fabs fabs
+#else
+
+// Declare __clc_fabs overloads bound via __asm labels to llvm.fabs.*
+#define __CLC_FUNCTION __clc_fabs
+#define __CLC_INTRINSIC "llvm.fabs"
+#include <clc/math/unary_intrin.inc>
+
+#undef __CLC_INTRINSIC
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_MATH_CLC_FABS_H__
diff --git a/libclc/clc/include/clc/math/clc_floor.h b/libclc/clc/include/clc/math/clc_floor.h
new file mode 100644
index 0000000..9919872
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_floor.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_MATH_CLC_FLOOR_H__
+#define __CLC_MATH_CLC_FLOOR_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible floor
+#define __clc_floor floor
+#else
+
+// Declare __clc_floor overloads bound via __asm labels to llvm.floor.*
+#define __CLC_FUNCTION __clc_floor
+#define __CLC_INTRINSIC "llvm.floor"
+#include <clc/math/unary_intrin.inc>
+
+#undef __CLC_INTRINSIC
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_MATH_CLC_FLOOR_H__
diff --git a/libclc/clc/include/clc/math/clc_rint.h b/libclc/clc/include/clc/math/clc_rint.h
new file mode 100644
index 0000000..3761407
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_rint.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_MATH_CLC_RINT_H__
+#define __CLC_MATH_CLC_RINT_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible rint
+#define __clc_rint rint
+#else
+
+// Declare __clc_rint overloads bound via __asm labels to llvm.rint.*
+#define __CLC_FUNCTION __clc_rint
+#define __CLC_INTRINSIC "llvm.rint"
+#include <clc/math/unary_intrin.inc>
+
+#undef __CLC_INTRINSIC
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_MATH_CLC_RINT_H__
diff --git a/libclc/clc/include/clc/math/clc_trunc.h b/libclc/clc/include/clc/math/clc_trunc.h
new file mode 100644
index 0000000..c78c889
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_trunc.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_MATH_CLC_TRUNC_H__
+#define __CLC_MATH_CLC_TRUNC_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible trunc
+#define __clc_trunc trunc
+#else
+
+// Declare __clc_trunc overloads bound via __asm labels to llvm.trunc.*
+#define __CLC_FUNCTION __clc_trunc
+#define __CLC_INTRINSIC "llvm.trunc"
+#include <clc/math/unary_intrin.inc>
+
+#undef __CLC_INTRINSIC
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_MATH_CLC_TRUNC_H__
diff --git a/libclc/generic/include/clc/math/gentype.inc b/libclc/clc/include/clc/math/gentype.inc
index 966b426..966b426 100644
--- a/libclc/generic/include/clc/math/gentype.inc
+++ b/libclc/clc/include/clc/math/gentype.inc
diff --git a/libclc/generic/include/clc/math/unary_decl.inc b/libclc/clc/include/clc/math/unary_decl.inc
index 9858d90..9858d90 100644
--- a/libclc/generic/include/clc/math/unary_decl.inc
+++ b/libclc/clc/include/clc/math/unary_decl.inc
diff --git a/libclc/generic/include/math/unary_intrin.inc b/libclc/clc/include/clc/math/unary_intrin.inc
index 532bb1f..c331d3f 100644
--- a/libclc/generic/include/math/unary_intrin.inc
+++ b/libclc/clc/include/clc/math/unary_intrin.inc
@@ -3,7 +3,8 @@ _CLC_OVERLOAD float2 __CLC_FUNCTION(float2 f) __asm(__CLC_INTRINSIC ".v2f32");
 _CLC_OVERLOAD float3 __CLC_FUNCTION(float3 f) __asm(__CLC_INTRINSIC ".v3f32");
 _CLC_OVERLOAD float4 __CLC_FUNCTION(float4 f) __asm(__CLC_INTRINSIC ".v4f32");
 _CLC_OVERLOAD float8 __CLC_FUNCTION(float8 f) __asm(__CLC_INTRINSIC ".v8f32");
-_CLC_OVERLOAD float16 __CLC_FUNCTION(float16 f) __asm(__CLC_INTRINSIC ".v16f32");
+_CLC_OVERLOAD float16 __CLC_FUNCTION(float16 f) __asm(__CLC_INTRINSIC
+                                                      ".v16f32");
 
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
@@ -12,11 +13,12 @@ _CLC_OVERLOAD double2 __CLC_FUNCTION(double2 d) __asm(__CLC_INTRINSIC ".v2f64");
 _CLC_OVERLOAD double3 __CLC_FUNCTION(double3 d) __asm(__CLC_INTRINSIC ".v3f64");
 _CLC_OVERLOAD double4 __CLC_FUNCTION(double4 d) __asm(__CLC_INTRINSIC ".v4f64");
 _CLC_OVERLOAD double8 __CLC_FUNCTION(double8 d) __asm(__CLC_INTRINSIC ".v8f64");
-_CLC_OVERLOAD double16 __CLC_FUNCTION(double16 d) __asm(__CLC_INTRINSIC ".v16f64");
+_CLC_OVERLOAD double16 __CLC_FUNCTION(double16 d) __asm(__CLC_INTRINSIC
+                                                        ".v16f64");
 #endif
 
 #ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16: enable
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 _CLC_OVERLOAD half __CLC_FUNCTION(half d) __asm(__CLC_INTRINSIC ".f16");
 _CLC_OVERLOAD half2 __CLC_FUNCTION(half2 d) __asm(__CLC_INTRINSIC ".v2f16");
 _CLC_OVERLOAD half3 __CLC_FUNCTION(half3 d) __asm(__CLC_INTRINSIC ".v3f16");
diff --git a/libclc/clc/include/clc/relational/binary_decl.inc b/libclc/clc/include/clc/relational/binary_decl.inc
new file mode 100644
index 0000000..2e4b4fd
--- /dev/null
+++ b/libclc/clc/include/clc/relational/binary_decl.inc
@@ -0,0 +1,2 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_INTN __CLC_FUNCTION(__CLC_FLOATN a,
+                                                  __CLC_FLOATN b);
diff --git a/libclc/clc/include/clc/relational/clc_all.h b/libclc/clc/include/clc/relational/clc_all.h
new file mode 100644
index 0000000..bf06810
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_all.h
@@ -0,0 +1,31 @@
+#ifndef __CLC_RELATIONAL_CLC_ALL_H__
+#define __CLC_RELATIONAL_CLC_ALL_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible all
+#define __clc_all all
+#else
+// Declare __clc_all for scalar and 2/3/4/8/16-wide char, short, int, long.
+#include <clc/clcfunc.h>
+
+#define _CLC_ALL_DECL(TYPE) _CLC_OVERLOAD _CLC_DECL int __clc_all(TYPE v);
+
+#define _CLC_VECTOR_ALL_DECL(TYPE)                                             \
+  _CLC_ALL_DECL(TYPE)                                                          \
+  _CLC_ALL_DECL(TYPE##2)                                                       \
+  _CLC_ALL_DECL(TYPE##3)                                                       \
+  _CLC_ALL_DECL(TYPE##4)                                                       \
+  _CLC_ALL_DECL(TYPE##8)                                                       \
+  _CLC_ALL_DECL(TYPE##16)
+
+_CLC_VECTOR_ALL_DECL(char)
+_CLC_VECTOR_ALL_DECL(short)
+_CLC_VECTOR_ALL_DECL(int)
+_CLC_VECTOR_ALL_DECL(long)
+
+#undef _CLC_ALL_DECL
+#undef _CLC_VECTOR_ALL_DECL
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_ALL_H__
diff --git a/libclc/clc/include/clc/relational/clc_any.h b/libclc/clc/include/clc/relational/clc_any.h
new file mode 100644
index 0000000..f947b77
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_any.h
@@ -0,0 +1,31 @@
+#ifndef __CLC_RELATIONAL_CLC_ANY_H__
+#define __CLC_RELATIONAL_CLC_ANY_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible any
+#define __clc_any any
+#else
+// Declare __clc_any for scalar and 2/3/4/8/16-wide char, short, int, long.
+#include <clc/clcfunc.h>
+
+#define _CLC_ANY_DECL(TYPE) _CLC_OVERLOAD _CLC_DECL int __clc_any(TYPE v);
+
+#define _CLC_VECTOR_ANY_DECL(TYPE)                                             \
+  _CLC_ANY_DECL(TYPE)                                                          \
+  _CLC_ANY_DECL(TYPE##2)                                                       \
+  _CLC_ANY_DECL(TYPE##3)                                                       \
+  _CLC_ANY_DECL(TYPE##4)                                                       \
+  _CLC_ANY_DECL(TYPE##8)                                                       \
+  _CLC_ANY_DECL(TYPE##16)
+
+_CLC_VECTOR_ANY_DECL(char)
+_CLC_VECTOR_ANY_DECL(short)
+_CLC_VECTOR_ANY_DECL(int)
+_CLC_VECTOR_ANY_DECL(long)
+
+#undef _CLC_ANY_DECL
+#undef _CLC_VECTOR_ANY_DECL
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_ANY_H__
diff --git a/libclc/clc/include/clc/relational/clc_bitselect.h b/libclc/clc/include/clc/relational/clc_bitselect.h
new file mode 100644
index 0000000..53fae6a
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_bitselect.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __CLC_RELATIONAL_CLC_BITSELECT_H__
+#define __CLC_RELATIONAL_CLC_BITSELECT_H__
+// Declares __clc_bitselect via the math and integer gentype.inc includes.
+#define __CLC_BODY <clc/relational/clc_bitselect.inc>
+#include <clc/math/gentype.inc>
+#define __CLC_BODY <clc/relational/clc_bitselect.inc>
+#include <clc/integer/gentype.inc>
+// NOTE(review): the re-#define above assumes gentype.inc #undefs __CLC_BODY.
+#undef __CLC_BODY
+
+#endif // __CLC_RELATIONAL_CLC_BITSELECT_H__
diff --git a/libclc/clc/include/clc/relational/clc_bitselect.inc b/libclc/clc/include/clc/relational/clc_bitselect.inc
new file mode 100644
index 0000000..14d5bea
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_bitselect.inc
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+// Prototype of __clc_bitselect for the __CLC_GENTYPE currently in effect.
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_bitselect(__CLC_GENTYPE x,
+                                                      __CLC_GENTYPE y,
+                                                      __CLC_GENTYPE z);
diff --git a/libclc/clc/include/clc/relational/clc_isequal.h b/libclc/clc/include/clc/relational/clc_isequal.h
new file mode 100644
index 0000000..3a36ea2
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_isequal.h
@@ -0,0 +1,41 @@
+#ifndef __CLC_RELATIONAL_CLC_ISEQUAL_H__
+#define __CLC_RELATIONAL_CLC_ISEQUAL_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible isequal
+#define __clc_isequal isequal
+#else
+
+#include <clc/clcfunc.h>
+
+#define _CLC_ISEQUAL_DECL(TYPE, RETTYPE)                                       \
+  _CLC_OVERLOAD _CLC_DECL RETTYPE __clc_isequal(TYPE x, TYPE y);
+
+#define _CLC_VECTOR_ISEQUAL_DECL(TYPE, RETTYPE)                                \
+  _CLC_ISEQUAL_DECL(TYPE##2, RETTYPE##2)                                       \
+  _CLC_ISEQUAL_DECL(TYPE##3, RETTYPE##3)                                       \
+  _CLC_ISEQUAL_DECL(TYPE##4, RETTYPE##4)                                       \
+  _CLC_ISEQUAL_DECL(TYPE##8, RETTYPE##8)                                       \
+  _CLC_ISEQUAL_DECL(TYPE##16, RETTYPE##16)
+
+_CLC_ISEQUAL_DECL(float, int)
+_CLC_VECTOR_ISEQUAL_DECL(float, int)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+_CLC_ISEQUAL_DECL(double, int)
+_CLC_VECTOR_ISEQUAL_DECL(double, long)
+#endif
+
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+_CLC_ISEQUAL_DECL(half, int)
+_CLC_VECTOR_ISEQUAL_DECL(half, short)
+#endif
+
+#undef _CLC_ISEQUAL_DECL
+#undef _CLC_VECTOR_ISEQUAL_DECL
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_ISEQUAL_H__
diff --git a/libclc/clc/include/clc/relational/clc_isfinite.h b/libclc/clc/include/clc/relational/clc_isfinite.h
new file mode 100644
index 0000000..3ed276e
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_isfinite.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_RELATIONAL_CLC_ISFINITE_H__
+#define __CLC_RELATIONAL_CLC_ISFINITE_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible isfinite
+#define __clc_isfinite isfinite
+#else
+
+#define __CLC_FUNCTION __clc_isfinite
+#define __CLC_BODY <clc/relational/unary_decl.inc>
+
+#include <clc/relational/floatn.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_ISFINITE_H__
diff --git a/libclc/clc/include/clc/relational/clc_isgreater.h b/libclc/clc/include/clc/relational/clc_isgreater.h
new file mode 100644
index 0000000..b51d59a
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_isgreater.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_RELATIONAL_CLC_ISGREATER_H__
+#define __CLC_RELATIONAL_CLC_ISGREATER_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible isgreater
+#define __clc_isgreater isgreater
+#else
+
+#define __CLC_FUNCTION __clc_isgreater
+#define __CLC_BODY <clc/relational/binary_decl.inc>
+
+#include <clc/relational/floatn.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_ISGREATER_H__
diff --git a/libclc/clc/include/clc/relational/clc_isgreaterequal.h b/libclc/clc/include/clc/relational/clc_isgreaterequal.h
new file mode 100644
index 0000000..b7ffce1
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_isgreaterequal.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_RELATIONAL_CLC_ISGREATEREQUAL_H__
+#define __CLC_RELATIONAL_CLC_ISGREATEREQUAL_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible isgreaterequal
+#define __clc_isgreaterequal isgreaterequal
+#else
+
+#define __CLC_FUNCTION __clc_isgreaterequal
+#define __CLC_BODY <clc/relational/binary_decl.inc>
+
+#include <clc/relational/floatn.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_ISGREATEREQUAL_H__
diff --git a/libclc/clc/include/clc/relational/clc_isinf.h b/libclc/clc/include/clc/relational/clc_isinf.h
new file mode 100644
index 0000000..c33ef9b
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_isinf.h
@@ -0,0 +1,41 @@
+#ifndef __CLC_RELATIONAL_CLC_ISINF_H__
+#define __CLC_RELATIONAL_CLC_ISINF_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible isinf
+#define __clc_isinf isinf
+#else
+
+#include <clc/clcfunc.h>
+
+#define _CLC_ISINF_DECL(RET_TYPE, ARG_TYPE)                                    \
+  _CLC_OVERLOAD _CLC_DECL RET_TYPE __clc_isinf(ARG_TYPE);
+
+#define _CLC_VECTOR_ISINF_DECL(RET_TYPE, ARG_TYPE)                             \
+  _CLC_ISINF_DECL(RET_TYPE##2, ARG_TYPE##2)                                    \
+  _CLC_ISINF_DECL(RET_TYPE##3, ARG_TYPE##3)                                    \
+  _CLC_ISINF_DECL(RET_TYPE##4, ARG_TYPE##4)                                    \
+  _CLC_ISINF_DECL(RET_TYPE##8, ARG_TYPE##8)                                    \
+  _CLC_ISINF_DECL(RET_TYPE##16, ARG_TYPE##16)
+
+_CLC_ISINF_DECL(int, float)
+_CLC_VECTOR_ISINF_DECL(int, float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+_CLC_ISINF_DECL(int, double)
+_CLC_VECTOR_ISINF_DECL(long, double)
+#endif
+
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+_CLC_ISINF_DECL(int, half)
+_CLC_VECTOR_ISINF_DECL(short, half)
+#endif
+
+#undef _CLC_ISINF_DECL
+#undef _CLC_VECTOR_ISINF_DECL
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_ISINF_H__
diff --git a/libclc/clc/include/clc/relational/clc_isless.h b/libclc/clc/include/clc/relational/clc_isless.h
new file mode 100644
index 0000000..c6950aa
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_isless.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_RELATIONAL_CLC_ISLESS_H__
+#define __CLC_RELATIONAL_CLC_ISLESS_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible isless
+#define __clc_isless isless
+#else
+
+#define __CLC_FUNCTION __clc_isless
+#define __CLC_BODY <clc/relational/binary_decl.inc>
+
+#include <clc/relational/floatn.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_ISLESS_H__
diff --git a/libclc/clc/include/clc/relational/clc_islessequal.h b/libclc/clc/include/clc/relational/clc_islessequal.h
new file mode 100644
index 0000000..7efac16
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_islessequal.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_RELATIONAL_CLC_ISLESSEQUAL_H__
+#define __CLC_RELATIONAL_CLC_ISLESSEQUAL_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible islessequal
+#define __clc_islessequal islessequal
+#else
+
+#define __CLC_FUNCTION __clc_islessequal
+#define __CLC_BODY <clc/relational/binary_decl.inc>
+
+#include <clc/relational/floatn.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_ISLESSEQUAL_H__
diff --git a/libclc/clc/include/clc/relational/clc_islessgreater.h b/libclc/clc/include/clc/relational/clc_islessgreater.h
new file mode 100644
index 0000000..df3c5e5
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_islessgreater.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_RELATIONAL_CLC_ISLESSGREATER_H__
+#define __CLC_RELATIONAL_CLC_ISLESSGREATER_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible islessgreater
+#define __clc_islessgreater islessgreater
+#else
+
+#define __CLC_FUNCTION __clc_islessgreater
+#define __CLC_BODY <clc/relational/binary_decl.inc>
+
+#include <clc/relational/floatn.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_ISLESSGREATER_H__
diff --git a/libclc/clc/include/clc/relational/clc_isnan.h b/libclc/clc/include/clc/relational/clc_isnan.h
new file mode 100644
index 0000000..08351eb5
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_isnan.h
@@ -0,0 +1,41 @@
+#ifndef __CLC_RELATIONAL_CLC_ISNAN_H__
+#define __CLC_RELATIONAL_CLC_ISNAN_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible isnan
+#define __clc_isnan isnan
+#else
+
+#include <clc/clcfunc.h>
+
+#define _CLC_ISNAN_DECL(RET_TYPE, ARG_TYPE)                                    \
+  _CLC_OVERLOAD _CLC_DECL RET_TYPE __clc_isnan(ARG_TYPE);
+
+#define _CLC_VECTOR_ISNAN_DECL(RET_TYPE, ARG_TYPE)                             \
+  _CLC_ISNAN_DECL(RET_TYPE##2, ARG_TYPE##2)                                    \
+  _CLC_ISNAN_DECL(RET_TYPE##3, ARG_TYPE##3)                                    \
+  _CLC_ISNAN_DECL(RET_TYPE##4, ARG_TYPE##4)                                    \
+  _CLC_ISNAN_DECL(RET_TYPE##8, ARG_TYPE##8)                                    \
+  _CLC_ISNAN_DECL(RET_TYPE##16, ARG_TYPE##16)
+
+_CLC_ISNAN_DECL(int, float)
+_CLC_VECTOR_ISNAN_DECL(int, float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+_CLC_ISNAN_DECL(int, double)
+_CLC_VECTOR_ISNAN_DECL(long, double)
+#endif
+
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+_CLC_ISNAN_DECL(int, half)
+_CLC_VECTOR_ISNAN_DECL(short, half)
+#endif
+
+#undef _CLC_ISNAN_DECL
+#undef _CLC_VECTOR_ISNAN_DECL
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_ISNAN_H__
diff --git a/libclc/clc/include/clc/relational/clc_isnormal.h b/libclc/clc/include/clc/relational/clc_isnormal.h
new file mode 100644
index 0000000..48ee6b8
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_isnormal.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_RELATIONAL_CLC_ISNORMAL_H__
+#define __CLC_RELATIONAL_CLC_ISNORMAL_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible isnormal
+#define __clc_isnormal isnormal
+#else
+
+#define __CLC_FUNCTION __clc_isnormal
+#define __CLC_BODY <clc/relational/unary_decl.inc>
+
+#include <clc/relational/floatn.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_ISNORMAL_H__
diff --git a/libclc/clc/include/clc/relational/clc_isnotequal.h b/libclc/clc/include/clc/relational/clc_isnotequal.h
new file mode 100644
index 0000000..55c1bd9
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_isnotequal.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_RELATIONAL_CLC_ISNOTEQUAL_H__
+#define __CLC_RELATIONAL_CLC_ISNOTEQUAL_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible isnotequal
+#define __clc_isnotequal isnotequal
+#else
+
+#define __CLC_FUNCTION __clc_isnotequal
+#define __CLC_BODY <clc/relational/binary_decl.inc>
+
+#include <clc/relational/floatn.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_ISNOTEQUAL_H__
diff --git a/libclc/clc/include/clc/relational/clc_isordered.h b/libclc/clc/include/clc/relational/clc_isordered.h
new file mode 100644
index 0000000..5ce2bfe
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_isordered.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_RELATIONAL_CLC_ISORDERED_H__
+#define __CLC_RELATIONAL_CLC_ISORDERED_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible isordered
+#define __clc_isordered isordered
+#else
+
+#define __CLC_FUNCTION __clc_isordered
+#define __CLC_BODY <clc/relational/binary_decl.inc>
+
+#include <clc/relational/floatn.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_ISORDERED_H__
diff --git a/libclc/clc/include/clc/relational/clc_isunordered.h b/libclc/clc/include/clc/relational/clc_isunordered.h
new file mode 100644
index 0000000..305d2b4
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_isunordered.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_RELATIONAL_CLC_ISUNORDERED_H__
+#define __CLC_RELATIONAL_CLC_ISUNORDERED_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible isunordered
+#define __clc_isunordered isunordered
+#else
+
+#define __CLC_FUNCTION __clc_isunordered
+#define __CLC_BODY <clc/relational/binary_decl.inc>
+
+#include <clc/relational/floatn.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_ISUNORDERED_H__
diff --git a/libclc/clc/include/clc/relational/clc_select.h b/libclc/clc/include/clc/relational/clc_select.h
new file mode 100644
index 0000000..ddea7c5
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_select.h
@@ -0,0 +1,23 @@
+#ifndef __CLC_RELATIONAL_CLC_SELECT_H__
+#define __CLC_RELATIONAL_CLC_SELECT_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible select
+#define __clc_select select
+#else
+
+// clc_select.inc builds its signed/unsigned selector types with
+// __CLC_XCONCAT, which comes from the shared utils header. The macros are
+// deliberately not #undef'd afterwards: utils.h is include-guarded, so
+// removing them here would strip them from any translation unit that had
+// already included utils.h, with no way to get them back.
+#include <clc/utils.h>
+
+#define __CLC_BODY <clc/relational/clc_select.inc>
+#include <clc/math/gentype.inc>
+#define __CLC_BODY <clc/relational/clc_select.inc>
+#include <clc/integer/gentype.inc>
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_SELECT_H__
diff --git a/libclc/clc/include/clc/relational/clc_select.inc b/libclc/clc/include/clc/relational/clc_select.inc
new file mode 100644
index 0000000..abf0e0f
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_select.inc
@@ -0,0 +1,29 @@
+#ifdef __CLC_SCALAR
+#define __CLC_VECSIZE
+#endif
+
+#if __CLC_FPSIZE == 64
+#define __CLC_S_GENTYPE __CLC_XCONCAT(long, __CLC_VECSIZE)
+#define __CLC_U_GENTYPE __CLC_XCONCAT(ulong, __CLC_VECSIZE)
+#elif __CLC_FPSIZE == 32
+#define __CLC_S_GENTYPE __CLC_XCONCAT(int, __CLC_VECSIZE)
+#define __CLC_U_GENTYPE __CLC_XCONCAT(uint, __CLC_VECSIZE)
+#elif __CLC_FPSIZE == 16
+#define __CLC_S_GENTYPE __CLC_XCONCAT(short, __CLC_VECSIZE)
+#define __CLC_U_GENTYPE __CLC_XCONCAT(ushort, __CLC_VECSIZE)
+#endif
+
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_select(__CLC_GENTYPE x,
+                                                   __CLC_GENTYPE y,
+                                                   __CLC_S_GENTYPE z);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_select(__CLC_GENTYPE x,
+                                                   __CLC_GENTYPE y,
+                                                   __CLC_U_GENTYPE z);
+
+#ifdef __CLC_FPSIZE
+#undef __CLC_S_GENTYPE
+#undef __CLC_U_GENTYPE
+#endif
+#ifdef __CLC_SCALAR
+#undef __CLC_VECSIZE
+#endif
diff --git a/libclc/clc/include/clc/relational/clc_signbit.h b/libclc/clc/include/clc/relational/clc_signbit.h
new file mode 100644
index 0000000..45a7112
--- /dev/null
+++ b/libclc/clc/include/clc/relational/clc_signbit.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_RELATIONAL_CLC_SIGNBIT_H__
+#define __CLC_RELATIONAL_CLC_SIGNBIT_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible signbit
+#define __clc_signbit signbit
+#else
+
+#define __CLC_FUNCTION __clc_signbit
+#define __CLC_BODY <clc/relational/unary_decl.inc>
+
+#include <clc/relational/floatn.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_RELATIONAL_CLC_SIGNBIT_H__
diff --git a/libclc/generic/include/clc/relational/floatn.inc b/libclc/clc/include/clc/relational/floatn.inc
index fc0d6878..fc0d6878 100644
--- a/libclc/generic/include/clc/relational/floatn.inc
+++ b/libclc/clc/include/clc/relational/floatn.inc
diff --git a/libclc/clc/include/clc/relational/relational.h b/libclc/clc/include/clc/relational/relational.h
new file mode 100644
index 0000000..54241b6
--- /dev/null
+++ b/libclc/clc/include/clc/relational/relational.h
@@ -0,0 +1,145 @@
+#ifndef __CLC_RELATIONAL_RELATIONAL_H__
+#define __CLC_RELATIONAL_RELATIONAL_H__
+
+/*
+ * Contains relational macros that have to return 1 for scalar and -1 for vector
+ * when the result is true.
+ */
+
+#define _CLC_DEFINE_RELATIONAL_UNARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_NAME,  \
+                                            ARG_TYPE)                          \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) {                       \
+    return BUILTIN_NAME(x);                                                    \
+  }
+
+#define _CLC_DEFINE_RELATIONAL_UNARY_VEC2(RET_TYPE, FUNCTION, ARG_TYPE)        \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) {                       \
+    return (RET_TYPE)((RET_TYPE){FUNCTION(x.lo), FUNCTION(x.hi)} !=            \
+                      (RET_TYPE)0);                                            \
+  }
+
+#define _CLC_DEFINE_RELATIONAL_UNARY_VEC3(RET_TYPE, FUNCTION, ARG_TYPE)        \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) {                       \
+    return (RET_TYPE)((RET_TYPE){FUNCTION(x.s0), FUNCTION(x.s1),               \
+                                 FUNCTION(x.s2)} != (RET_TYPE)0);              \
+  }
+
+#define _CLC_DEFINE_RELATIONAL_UNARY_VEC4(RET_TYPE, FUNCTION, ARG_TYPE)        \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) {                       \
+    return (RET_TYPE)((RET_TYPE){FUNCTION(x.s0), FUNCTION(x.s1),               \
+                                 FUNCTION(x.s2),                               \
+                                 FUNCTION(x.s3)} != (RET_TYPE)0);              \
+  }
+
+#define _CLC_DEFINE_RELATIONAL_UNARY_VEC8(RET_TYPE, FUNCTION, ARG_TYPE)        \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) {                       \
+    return (                                                                   \
+        RET_TYPE)((RET_TYPE){FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2),   \
+                             FUNCTION(x.s3), FUNCTION(x.s4), FUNCTION(x.s5),   \
+                             FUNCTION(x.s6), FUNCTION(x.s7)} != (RET_TYPE)0);  \
+  }
+
+#define _CLC_DEFINE_RELATIONAL_UNARY_VEC16(RET_TYPE, FUNCTION, ARG_TYPE)       \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) {                       \
+    return (                                                                   \
+        RET_TYPE)((RET_TYPE){FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2),   \
+                             FUNCTION(x.s3), FUNCTION(x.s4), FUNCTION(x.s5),   \
+                             FUNCTION(x.s6), FUNCTION(x.s7), FUNCTION(x.s8),   \
+                             FUNCTION(x.s9), FUNCTION(x.sa), FUNCTION(x.sb),   \
+                             FUNCTION(x.sc), FUNCTION(x.sd), FUNCTION(x.se),   \
+                             FUNCTION(x.sf)} != (RET_TYPE)0);                  \
+  }
+
+#define _CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(RET_TYPE, FUNCTION, ARG_TYPE)     \
+  _CLC_DEFINE_RELATIONAL_UNARY_VEC2(RET_TYPE##2, FUNCTION, ARG_TYPE##2)        \
+  _CLC_DEFINE_RELATIONAL_UNARY_VEC3(RET_TYPE##3, FUNCTION, ARG_TYPE##3)        \
+  _CLC_DEFINE_RELATIONAL_UNARY_VEC4(RET_TYPE##4, FUNCTION, ARG_TYPE##4)        \
+  _CLC_DEFINE_RELATIONAL_UNARY_VEC8(RET_TYPE##8, FUNCTION, ARG_TYPE##8)        \
+  _CLC_DEFINE_RELATIONAL_UNARY_VEC16(RET_TYPE##16, FUNCTION, ARG_TYPE##16)
+
+#define _CLC_DEFINE_RELATIONAL_UNARY(RET_TYPE, FUNCTION, BUILTIN_FUNCTION,     \
+                                     ARG_TYPE)                                 \
+  _CLC_DEFINE_RELATIONAL_UNARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_FUNCTION,    \
+                                      ARG_TYPE)                                \
+  _CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(RET_TYPE, FUNCTION, ARG_TYPE)
+
+#define _CLC_DEFINE_RELATIONAL_BINARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_NAME, \
+                                             ARG0_TYPE, ARG1_TYPE)             \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) {         \
+    return BUILTIN_NAME(x, y);                                                 \
+  }
+
+// Defines the two-element vector overload of a binary relational FUNCTION.
+// The expansion is exactly that of _CLC_DEFINE_RELATIONAL_BINARY_VEC2, so
+// this name simply forwards to it rather than repeating the body.
+#define _CLC_DEFINE_RELATIONAL_BINARY_VEC(RET_TYPE, FUNCTION, ARG0_TYPE,       \
+                                          ARG1_TYPE)                           \
+  _CLC_DEFINE_RELATIONAL_BINARY_VEC2(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE)
+
+#define _CLC_DEFINE_RELATIONAL_BINARY_VEC2(RET_TYPE, FUNCTION, ARG0_TYPE,      \
+                                           ARG1_TYPE)                          \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) {         \
+    return (RET_TYPE)((RET_TYPE){FUNCTION(x.lo, y.lo),                         \
+                                 FUNCTION(x.hi, y.hi)} != (RET_TYPE)0);        \
+  }
+
+#define _CLC_DEFINE_RELATIONAL_BINARY_VEC3(RET_TYPE, FUNCTION, ARG0_TYPE,      \
+                                           ARG1_TYPE)                          \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) {         \
+    return (RET_TYPE)((RET_TYPE){FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1),   \
+                                 FUNCTION(x.s2, y.s2)} != (RET_TYPE)0);        \
+  }
+
+#define _CLC_DEFINE_RELATIONAL_BINARY_VEC4(RET_TYPE, FUNCTION, ARG0_TYPE,      \
+                                           ARG1_TYPE)                          \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) {         \
+    return (RET_TYPE)((RET_TYPE){FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1),   \
+                                 FUNCTION(x.s2, y.s2),                         \
+                                 FUNCTION(x.s3, y.s3)} != (RET_TYPE)0);        \
+  }
+
+#define _CLC_DEFINE_RELATIONAL_BINARY_VEC8(RET_TYPE, FUNCTION, ARG0_TYPE,      \
+                                           ARG1_TYPE)                          \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) {         \
+    return (RET_TYPE)((RET_TYPE){FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1),   \
+                                 FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3),   \
+                                 FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5),   \
+                                 FUNCTION(x.s6, y.s6),                         \
+                                 FUNCTION(x.s7, y.s7)} != (RET_TYPE)0);        \
+  }
+
+#define _CLC_DEFINE_RELATIONAL_BINARY_VEC16(RET_TYPE, FUNCTION, ARG0_TYPE,     \
+                                            ARG1_TYPE)                         \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) {         \
+    return (RET_TYPE)((RET_TYPE){FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1),   \
+                                 FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3),   \
+                                 FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5),   \
+                                 FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7),   \
+                                 FUNCTION(x.s8, y.s8), FUNCTION(x.s9, y.s9),   \
+                                 FUNCTION(x.sa, y.sa), FUNCTION(x.sb, y.sb),   \
+                                 FUNCTION(x.sc, y.sc), FUNCTION(x.sd, y.sd),   \
+                                 FUNCTION(x.se, y.se),                         \
+                                 FUNCTION(x.sf, y.sf)} != (RET_TYPE)0);        \
+  }
+
+#define _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(RET_TYPE, FUNCTION, ARG0_TYPE,   \
+                                              ARG1_TYPE)                       \
+  _CLC_DEFINE_RELATIONAL_BINARY_VEC2(RET_TYPE##2, FUNCTION, ARG0_TYPE##2,      \
+                                     ARG1_TYPE##2)                             \
+  _CLC_DEFINE_RELATIONAL_BINARY_VEC3(RET_TYPE##3, FUNCTION, ARG0_TYPE##3,      \
+                                     ARG1_TYPE##3)                             \
+  _CLC_DEFINE_RELATIONAL_BINARY_VEC4(RET_TYPE##4, FUNCTION, ARG0_TYPE##4,      \
+                                     ARG1_TYPE##4)                             \
+  _CLC_DEFINE_RELATIONAL_BINARY_VEC8(RET_TYPE##8, FUNCTION, ARG0_TYPE##8,      \
+                                     ARG1_TYPE##8)                             \
+  _CLC_DEFINE_RELATIONAL_BINARY_VEC16(RET_TYPE##16, FUNCTION, ARG0_TYPE##16,   \
+                                      ARG1_TYPE##16)
+
+#define _CLC_DEFINE_RELATIONAL_BINARY(RET_TYPE, FUNCTION, BUILTIN_FUNCTION,    \
+                                      ARG0_TYPE, ARG1_TYPE)                    \
+  _CLC_DEFINE_RELATIONAL_BINARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_FUNCTION,   \
+                                       ARG0_TYPE, ARG1_TYPE)                   \
+  _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(RET_TYPE, FUNCTION, ARG0_TYPE,         \
+                                        ARG1_TYPE)
+
+#endif // __CLC_RELATIONAL_RELATIONAL_H__
diff --git a/libclc/generic/include/clc/relational/unary_decl.inc b/libclc/clc/include/clc/relational/unary_decl.inc
index ab9b776..ab9b776 100644
--- a/libclc/generic/include/clc/relational/unary_decl.inc
+++ b/libclc/clc/include/clc/relational/unary_decl.inc
diff --git a/libclc/clc/include/clc/shared/clc_clamp.h b/libclc/clc/include/clc/shared/clc_clamp.h
new file mode 100644
index 0000000..a84184c
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_clamp.h
@@ -0,0 +1,20 @@
+#ifndef __CLC_SHARED_CLC_CLAMP_H__
+#define __CLC_SHARED_CLC_CLAMP_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible clamp
+#define __clc_clamp clamp
+#else
+
+#include <clc/clcfunc.h>
+#include <clc/clctypes.h>
+
+#define __CLC_BODY <clc/shared/clc_clamp.inc>
+#include <clc/integer/gentype.inc>
+
+#define __CLC_BODY <clc/shared/clc_clamp.inc>
+#include <clc/math/gentype.inc>
+
+#endif
+
+#endif // __CLC_SHARED_CLC_CLAMP_H__
diff --git a/libclc/clc/include/clc/shared/clc_clamp.inc b/libclc/clc/include/clc/shared/clc_clamp.inc
new file mode 100644
index 0000000..cf6b0b2
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_clamp.inc
@@ -0,0 +1,9 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_clamp(__CLC_GENTYPE x,
+                                                  __CLC_GENTYPE y,
+                                                  __CLC_GENTYPE z);
+
+#ifndef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_clamp(__CLC_GENTYPE x,
+                                                  __CLC_SCALAR_GENTYPE y,
+                                                  __CLC_SCALAR_GENTYPE z);
+#endif
diff --git a/libclc/clc/include/clc/shared/clc_max.h b/libclc/clc/include/clc/shared/clc_max.h
new file mode 100644
index 0000000..388f001
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_max.h
@@ -0,0 +1,22 @@
+#ifndef __CLC_SHARED_CLC_MAX_H__
+#define __CLC_SHARED_CLC_MAX_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible max
+#define __clc_max max
+#else
+
+// Same prerequisites clc_clamp.h pulls in, so the declarations generated
+// below do not depend on the includer having included them first.
+#include <clc/clcfunc.h>
+#include <clc/clctypes.h>
+
+#define __CLC_BODY <clc/shared/clc_max.inc>
+#include <clc/integer/gentype.inc>
+
+#define __CLC_BODY <clc/shared/clc_max.inc>
+#include <clc/math/gentype.inc>
+
+#endif
+
+#endif // __CLC_SHARED_CLC_MAX_H__
diff --git a/libclc/clc/include/clc/shared/clc_max.inc b/libclc/clc/include/clc/shared/clc_max.inc
new file mode 100644
index 0000000..bddb3fa
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_max.inc
@@ -0,0 +1,7 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_max(__CLC_GENTYPE a,
+                                                __CLC_GENTYPE b);
+
+#ifndef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_max(__CLC_GENTYPE a,
+                                                __CLC_SCALAR_GENTYPE b);
+#endif
diff --git a/libclc/clc/include/clc/shared/clc_min.h b/libclc/clc/include/clc/shared/clc_min.h
new file mode 100644
index 0000000..c8d920e
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_min.h
@@ -0,0 +1,22 @@
+#ifndef __CLC_SHARED_CLC_MIN_H__
+#define __CLC_SHARED_CLC_MIN_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible min
+#define __clc_min min
+#else
+
+// Same prerequisites clc_clamp.h pulls in, so the declarations generated
+// below do not depend on the includer having included them first.
+#include <clc/clcfunc.h>
+#include <clc/clctypes.h>
+
+#define __CLC_BODY <clc/shared/clc_min.inc>
+#include <clc/integer/gentype.inc>
+
+#define __CLC_BODY <clc/shared/clc_min.inc>
+#include <clc/math/gentype.inc>
+
+#endif
+
+#endif // __CLC_SHARED_CLC_MIN_H__
diff --git a/libclc/clc/include/clc/shared/clc_min.inc b/libclc/clc/include/clc/shared/clc_min.inc
new file mode 100644
index 0000000..3e1da96d
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_min.inc
@@ -0,0 +1,7 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_min(__CLC_GENTYPE a,
+                                                __CLC_GENTYPE b);
+
+#ifndef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_min(__CLC_GENTYPE a,
+                                                __CLC_SCALAR_GENTYPE b);
+#endif
diff --git a/libclc/generic/include/utils.h b/libclc/clc/include/clc/utils.h
index 018a7b3..b53b6a3 100644
--- a/libclc/generic/include/utils.h
+++ b/libclc/clc/include/clc/utils.h
@@ -1,10 +1,10 @@
-#ifndef __CLC_UTILS_H_
-#define __CLC_UTILS_H_
+#ifndef __CLC_UTILS_H__
+#define __CLC_UTILS_H__
 
-#define __CLC_CONCAT(x, y) x ## y
+#define __CLC_CONCAT(x, y) x##y
 #define __CLC_XCONCAT(x, y) __CLC_CONCAT(x, y)
 
 #define __CLC_STR(x) #x
 #define __CLC_XSTR(x) __CLC_STR(x)
 
-#endif
+#endif // __CLC_UTILS_H__
diff --git a/libclc/clc/lib/clspv/SOURCES b/libclc/clc/lib/clspv/SOURCES
new file mode 100644
index 0000000..75a3130
--- /dev/null
+++ b/libclc/clc/lib/clspv/SOURCES
@@ -0,0 +1 @@
+dummy.cl
diff --git a/libclc/clc/lib/clspv/dummy.cl b/libclc/clc/lib/clspv/dummy.cl
new file mode 100644
index 0000000..fab17ac
--- /dev/null
+++ b/libclc/clc/lib/clspv/dummy.cl
@@ -0,0 +1 @@
+// Empty file
diff --git a/libclc/clc/lib/clspv64 b/libclc/clc/lib/clspv64
new file mode 120000
index 0000000..ea01ba9
--- /dev/null
+++ b/libclc/clc/lib/clspv64
@@ -0,0 +1 @@
+clspv
+\ No newline at end of file
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
new file mode 100644
index 0000000..d7ffaaf
--- /dev/null
+++ b/libclc/clc/lib/generic/SOURCES
@@ -0,0 +1,24 @@
+geometric/clc_dot.cl
+integer/clc_abs.cl
+integer/clc_abs_diff.cl
+relational/clc_all.cl
+relational/clc_any.cl
+relational/clc_bitselect.cl
+relational/clc_isequal.cl
+relational/clc_isfinite.cl
+relational/clc_isgreater.cl
+relational/clc_isgreaterequal.cl
+relational/clc_isinf.cl
+relational/clc_isless.cl
+relational/clc_islessequal.cl
+relational/clc_islessgreater.cl
+relational/clc_isnan.cl
+relational/clc_isnormal.cl
+relational/clc_isnotequal.cl
+relational/clc_isordered.cl
+relational/clc_isunordered.cl
+relational/clc_select.cl
+relational/clc_signbit.cl
+shared/clc_clamp.cl
+shared/clc_max.cl
+shared/clc_min.cl
diff --git a/libclc/clc/lib/generic/geometric/clc_dot.cl b/libclc/clc/lib/generic/geometric/clc_dot.cl
new file mode 100644
index 0000000..bf0f19b
--- /dev/null
+++ b/libclc/clc/lib/generic/geometric/clc_dot.cl
@@ -0,0 +1,57 @@
+#include <clc/internal/clc.h>
+
+_CLC_OVERLOAD _CLC_DEF float __clc_dot(float p0, float p1) { return p0 * p1; }
+
+_CLC_OVERLOAD _CLC_DEF float __clc_dot(float2 p0, float2 p1) {
+  return p0.x * p1.x + p0.y * p1.y;
+}
+
+_CLC_OVERLOAD _CLC_DEF float __clc_dot(float3 p0, float3 p1) {
+  return p0.x * p1.x + p0.y * p1.y + p0.z * p1.z;
+}
+
+_CLC_OVERLOAD _CLC_DEF float __clc_dot(float4 p0, float4 p1) {
+  return p0.x * p1.x + p0.y * p1.y + p0.z * p1.z + p0.w * p1.w;
+}
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double __clc_dot(double p0, double p1) {
+  return p0 * p1;
+}
+
+_CLC_OVERLOAD _CLC_DEF double __clc_dot(double2 p0, double2 p1) {
+  return p0.x * p1.x + p0.y * p1.y;
+}
+
+_CLC_OVERLOAD _CLC_DEF double __clc_dot(double3 p0, double3 p1) {
+  return p0.x * p1.x + p0.y * p1.y + p0.z * p1.z;
+}
+
+_CLC_OVERLOAD _CLC_DEF double __clc_dot(double4 p0, double4 p1) {
+  return p0.x * p1.x + p0.y * p1.y + p0.z * p1.z + p0.w * p1.w;
+}
+
+#endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_OVERLOAD _CLC_DEF half __clc_dot(half p0, half p1) { return p0 * p1; }
+
+_CLC_OVERLOAD _CLC_DEF half __clc_dot(half2 p0, half2 p1) {
+  return p0.x * p1.x + p0.y * p1.y;
+}
+
+_CLC_OVERLOAD _CLC_DEF half __clc_dot(half3 p0, half3 p1) {
+  return p0.x * p1.x + p0.y * p1.y + p0.z * p1.z;
+}
+
+_CLC_OVERLOAD _CLC_DEF half __clc_dot(half4 p0, half4 p1) {
+  return p0.x * p1.x + p0.y * p1.y + p0.z * p1.z + p0.w * p1.w;
+}
+
+#endif
diff --git a/libclc/clc/lib/generic/integer/clc_abs.cl b/libclc/clc/lib/generic/integer/clc_abs.cl
new file mode 100644
index 0000000..31d004c
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_abs.cl
@@ -0,0 +1,4 @@
+#include <clc/internal/clc.h>
+// clc_abs.inc is the body; gentype.inc presumably expands it per integer type.
+#define __CLC_BODY <clc_abs.inc>
+#include <clc/integer/gentype.inc>
diff --git a/libclc/clc/lib/generic/integer/clc_abs.inc b/libclc/clc/lib/generic/integer/clc_abs.inc
new file mode 100644
index 0000000..dcdd77f
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_abs.inc
@@ -0,0 +1,4 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_U_GENTYPE __clc_abs(__CLC_GENTYPE x) {
+  return __builtin_astype((__CLC_GENTYPE)(x > (__CLC_GENTYPE)(0) ? x : -x),
+                          __CLC_U_GENTYPE); // |x|, bits viewed as unsigned
+} // NOTE(review): -x overflows for the minimum int/long value; confirm intent
diff --git a/libclc/clc/lib/generic/integer/clc_abs_diff.cl b/libclc/clc/lib/generic/integer/clc_abs_diff.cl
new file mode 100644
index 0000000..db2fc50
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_abs_diff.cl
@@ -0,0 +1,4 @@
+#include <clc/internal/clc.h>
+// clc_abs_diff.inc is the body; gentype.inc presumably expands it per type.
+#define __CLC_BODY <clc_abs_diff.inc>
+#include <clc/integer/gentype.inc>
diff --git a/libclc/clc/lib/generic/integer/clc_abs_diff.inc b/libclc/clc/lib/generic/integer/clc_abs_diff.inc
new file mode 100644
index 0000000..c0fe0fc
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_abs_diff.inc
@@ -0,0 +1,6 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_U_GENTYPE __clc_abs_diff(__CLC_GENTYPE x,
+                                                      __CLC_GENTYPE y) {
+  const __CLC_U_GENTYPE x_bits = __builtin_astype(x, __CLC_U_GENTYPE);
+  const __CLC_U_GENTYPE y_bits = __builtin_astype(y, __CLC_U_GENTYPE);
+  return y < x ? x_bits - y_bits : y_bits - x_bits; // larger minus smaller
+}
diff --git a/libclc/clc/lib/generic/relational/clc_all.cl b/libclc/clc/lib/generic/relational/clc_all.cl
new file mode 100644
index 0000000..e371126
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_all.cl
@@ -0,0 +1,28 @@
+#include <clc/internal/clc.h>
+// _CLC_ALL(v) yields the top bit of v; wider forms AND the per-lane top bits.
+#define _CLC_ALL(v) (((v) >> ((sizeof(v) * 8) - 1)) & 0x1)
+#define _CLC_ALL2(v) (_CLC_ALL((v).s0) & _CLC_ALL((v).s1))
+#define _CLC_ALL3(v) (_CLC_ALL2((v)) & _CLC_ALL((v).s2))
+#define _CLC_ALL4(v) (_CLC_ALL3((v)) & _CLC_ALL((v).s3))
+#define _CLC_ALL8(v)                                                           \
+  (_CLC_ALL4((v)) & _CLC_ALL((v).s4) & _CLC_ALL((v).s5) & _CLC_ALL((v).s6) &   \
+   _CLC_ALL((v).s7))
+#define _CLC_ALL16(v)                                                          \
+  (_CLC_ALL8((v)) & _CLC_ALL((v).s8) & _CLC_ALL((v).s9) & _CLC_ALL((v).sA) &   \
+   _CLC_ALL((v).sB) & _CLC_ALL((v).sC) & _CLC_ALL((v).sD) & _CLC_ALL((v).sE) & \
+   _CLC_ALL((v).sf))
+// Signature shared by every __clc_all overload; the user supplies the body.
+#define ALL_ID(TYPE) _CLC_OVERLOAD _CLC_DEF int __clc_all(TYPE v)
+// Defines __clc_all for TYPE and for its 2/3/4/8/16-wide vector forms.
+#define ALL_VECTORIZE(TYPE)                                                    \
+  ALL_ID(TYPE) { return _CLC_ALL(v); }                                         \
+  ALL_ID(TYPE##2) { return _CLC_ALL2(v); }                                     \
+  ALL_ID(TYPE##3) { return _CLC_ALL3(v); }                                     \
+  ALL_ID(TYPE##4) { return _CLC_ALL4(v); }                                     \
+  ALL_ID(TYPE##8) { return _CLC_ALL8(v); }                                     \
+  ALL_ID(TYPE##16) { return _CLC_ALL16(v); }
+
+ALL_VECTORIZE(char)
+ALL_VECTORIZE(short)
+ALL_VECTORIZE(int)
+ALL_VECTORIZE(long)
diff --git a/libclc/clc/lib/generic/relational/clc_any.cl b/libclc/clc/lib/generic/relational/clc_any.cl
new file mode 100644
index 0000000..e69f211
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_any.cl
@@ -0,0 +1,28 @@
+#include <clc/internal/clc.h>
+// _CLC_ANY(v) yields the top bit of v; wider forms OR the per-lane top bits.
+#define _CLC_ANY(v) (((v) >> ((sizeof(v) * 8) - 1)) & 0x1)
+#define _CLC_ANY2(v) (_CLC_ANY((v).s0) | _CLC_ANY((v).s1))
+#define _CLC_ANY3(v) (_CLC_ANY2((v)) | _CLC_ANY((v).s2))
+#define _CLC_ANY4(v) (_CLC_ANY3((v)) | _CLC_ANY((v).s3))
+#define _CLC_ANY8(v)                                                           \
+  (_CLC_ANY4((v)) | _CLC_ANY((v).s4) | _CLC_ANY((v).s5) | _CLC_ANY((v).s6) |   \
+   _CLC_ANY((v).s7))
+#define _CLC_ANY16(v)                                                          \
+  (_CLC_ANY8((v)) | _CLC_ANY((v).s8) | _CLC_ANY((v).s9) | _CLC_ANY((v).sA) |   \
+   _CLC_ANY((v).sB) | _CLC_ANY((v).sC) | _CLC_ANY((v).sD) | _CLC_ANY((v).sE) | \
+   _CLC_ANY((v).sf))
+// Signature shared by every __clc_any overload; the user supplies the body.
+#define ANY_ID(TYPE) _CLC_OVERLOAD _CLC_DEF int __clc_any(TYPE v)
+// Defines __clc_any for TYPE and for its 2/3/4/8/16-wide vector forms.
+#define ANY_VECTORIZE(TYPE)                                                    \
+  ANY_ID(TYPE) { return _CLC_ANY(v); }                                         \
+  ANY_ID(TYPE##2) { return _CLC_ANY2(v); }                                     \
+  ANY_ID(TYPE##3) { return _CLC_ANY3(v); }                                     \
+  ANY_ID(TYPE##4) { return _CLC_ANY4(v); }                                     \
+  ANY_ID(TYPE##8) { return _CLC_ANY8(v); }                                     \
+  ANY_ID(TYPE##16) { return _CLC_ANY16(v); }
+
+ANY_VECTORIZE(char)
+ANY_VECTORIZE(short)
+ANY_VECTORIZE(int)
+ANY_VECTORIZE(long)
diff --git a/libclc/clc/lib/generic/relational/clc_bitselect.cl b/libclc/clc/lib/generic/relational/clc_bitselect.cl
new file mode 100644
index 0000000..66b28af
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_bitselect.cl
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clcmacro.h>
+#include <clc/internal/clc.h>
+// Integer overloads: clc_bitselect.inc is the body handed to gentype.inc.
+#define __CLC_BODY <clc_bitselect.inc>
+#include <clc/integer/gentype.inc>
+#undef __CLC_BODY
+// Floating-point overloads: reinterpret as i_type, bitselect, convert back.
+#define FLOAT_BITSELECT(f_type, i_type, width)                                 \
+  _CLC_OVERLOAD _CLC_DEF f_type##width __clc_bitselect(                        \
+      f_type##width x, f_type##width y, f_type##width z) {                     \
+    return __clc_as_##f_type##width(__clc_bitselect(                           \
+        __clc_as_##i_type##width(x), __clc_as_##i_type##width(y),              \
+        __clc_as_##i_type##width(z)));                                         \
+  }
+
+FLOAT_BITSELECT(float, uint, )
+FLOAT_BITSELECT(float, uint, 2)
+FLOAT_BITSELECT(float, uint, 3)
+FLOAT_BITSELECT(float, uint, 4)
+FLOAT_BITSELECT(float, uint, 8)
+FLOAT_BITSELECT(float, uint, 16)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+FLOAT_BITSELECT(double, ulong, )
+FLOAT_BITSELECT(double, ulong, 2)
+FLOAT_BITSELECT(double, ulong, 3)
+FLOAT_BITSELECT(double, ulong, 4)
+FLOAT_BITSELECT(double, ulong, 8)
+FLOAT_BITSELECT(double, ulong, 16)
+
+#endif // cl_khr_fp64
diff --git a/libclc/clc/lib/generic/relational/clc_bitselect.inc b/libclc/clc/lib/generic/relational/clc_bitselect.inc
new file mode 100644
index 0000000..dc906ef
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_bitselect.inc
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_bitselect(__CLC_GENTYPE x,
+                                                     __CLC_GENTYPE y,
+                                                     __CLC_GENTYPE z) {
+  return x ^ ((x ^ y) & z); // bit from y where z is set, else from x
+}
diff --git a/libclc/clc/lib/generic/relational/clc_isequal.cl b/libclc/clc/lib/generic/relational/clc_isequal.cl
new file mode 100644
index 0000000..7664df7
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_isequal.cl
@@ -0,0 +1,44 @@
+#include <clc/internal/clc.h>
+// _CLC_DEFINE_ISEQUAL defines FUNCTION(x, y) returning x == y as RET_TYPE.
+#define _CLC_DEFINE_ISEQUAL(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE)          \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) {         \
+    return (x == y);                                                           \
+  }
+
+_CLC_DEFINE_ISEQUAL(int, __clc_isequal, float, float)
+_CLC_DEFINE_ISEQUAL(int2, __clc_isequal, float2, float2)
+_CLC_DEFINE_ISEQUAL(int3, __clc_isequal, float3, float3)
+_CLC_DEFINE_ISEQUAL(int4, __clc_isequal, float4, float4)
+_CLC_DEFINE_ISEQUAL(int8, __clc_isequal, float8, float8)
+_CLC_DEFINE_ISEQUAL(int16, __clc_isequal, float16, float16)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of __clc_isequal(double) returns an int, but the vector
+// versions return long.
+_CLC_DEFINE_ISEQUAL(int, __clc_isequal, double, double)
+_CLC_DEFINE_ISEQUAL(long2, __clc_isequal, double2, double2)
+_CLC_DEFINE_ISEQUAL(long3, __clc_isequal, double3, double3)
+_CLC_DEFINE_ISEQUAL(long4, __clc_isequal, double4, double4)
+_CLC_DEFINE_ISEQUAL(long8, __clc_isequal, double8, double8)
+_CLC_DEFINE_ISEQUAL(long16, __clc_isequal, double16, double16)
+
+#endif // cl_khr_fp64
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// The scalar version of __clc_isequal(half) returns an int, but the vector
+// versions return short.
+_CLC_DEFINE_ISEQUAL(int, __clc_isequal, half, half)
+_CLC_DEFINE_ISEQUAL(short2, __clc_isequal, half2, half2)
+_CLC_DEFINE_ISEQUAL(short3, __clc_isequal, half3, half3)
+_CLC_DEFINE_ISEQUAL(short4, __clc_isequal, half4, half4)
+_CLC_DEFINE_ISEQUAL(short8, __clc_isequal, half8, half8)
+_CLC_DEFINE_ISEQUAL(short16, __clc_isequal, half16, half16)
+
+#endif // cl_khr_fp16
+
+#undef _CLC_DEFINE_ISEQUAL
diff --git a/libclc/clc/lib/generic/relational/clc_isfinite.cl b/libclc/clc/lib/generic/relational/clc_isfinite.cl
new file mode 100644
index 0000000..c3def5d
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_isfinite.cl
@@ -0,0 +1,31 @@
+#include <clc/internal/clc.h>
+#include <clc/relational/relational.h>
+// __clc_isfinite: float overloads, plus double/half under their #ifdefs.
+_CLC_DEFINE_RELATIONAL_UNARY(int, __clc_isfinite, __builtin_isfinite, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of __clc_isfinite(double) returns an int, but the vector
+// versions return long.
+_CLC_DEF _CLC_OVERLOAD int __clc_isfinite(double x) {
+  return __builtin_isfinite(x);
+}
+
+_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, __clc_isfinite, double)
+
+#endif // cl_khr_fp64
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// The scalar version of __clc_isfinite(half) returns an int, but the vector
+// versions return short.
+_CLC_DEF _CLC_OVERLOAD int __clc_isfinite(half x) {
+  return __builtin_isfinite(x);
+}
+
+_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(short, __clc_isfinite, half)
+
+#endif // cl_khr_fp16
diff --git a/libclc/clc/lib/generic/relational/clc_isgreater.cl b/libclc/clc/lib/generic/relational/clc_isgreater.cl
new file mode 100644
index 0000000..39fb6b0
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_isgreater.cl
@@ -0,0 +1,39 @@
+#include <clc/internal/clc.h>
+#include <clc/relational/relational.h>
+// __clc_isgreater: float overloads, plus double/half under their #ifdefs.
+// Note: It would be nice to use __builtin_isgreater with vector inputs, but it
+// seems to only take scalar values as input, which will produce incorrect
+// output for vector input types.
+
+_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreater, __builtin_isgreater, float,
+                              float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of __clc_isgreater(double, double) returns an int, but the
+// vector versions return long.
+
+_CLC_DEF _CLC_OVERLOAD int __clc_isgreater(double x, double y) {
+  return __builtin_isgreater(x, y);
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isgreater, double, double)
+
+#endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// The scalar version of __clc_isgreater(half, half) returns an int, but the
+// vector versions return short.
+
+_CLC_DEF _CLC_OVERLOAD int __clc_isgreater(half x, half y) {
+  return __builtin_isgreater(x, y);
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isgreater, half, half)
+
+#endif // cl_khr_fp16
diff --git a/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl b/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl
new file mode 100644
index 0000000..ccf7c88
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl
@@ -0,0 +1,39 @@
+#include <clc/internal/clc.h>
+#include <clc/relational/relational.h>
+// __clc_isgreaterequal: float overloads, plus double/half under their #ifdefs.
+// Note: It would be nice to use __builtin_isgreaterequal with vector inputs,
+// but it seems to only take scalar values as input, which will produce
+// incorrect output for vector input types.
+
+_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreaterequal,
+                              __builtin_isgreaterequal, float, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of __clc_isgreaterequal(double, double) returns an int,
+// but the vector versions return long.
+
+_CLC_DEF _CLC_OVERLOAD int __clc_isgreaterequal(double x, double y) {
+  return __builtin_isgreaterequal(x, y);
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isgreaterequal, double,
+                                      double)
+
+#endif // cl_khr_fp64
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// The scalar version of __clc_isgreaterequal(half, half) returns an int, but
+// the vector versions return short.
+
+_CLC_DEF _CLC_OVERLOAD int __clc_isgreaterequal(half x, half y) {
+  return __builtin_isgreaterequal(x, y);
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isgreaterequal, half, half)
+
+#endif // cl_khr_fp16
diff --git a/libclc/clc/lib/generic/relational/clc_isinf.cl b/libclc/clc/lib/generic/relational/clc_isinf.cl
new file mode 100644
index 0000000..afe2912
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_isinf.cl
@@ -0,0 +1,26 @@
+#include <clc/internal/clc.h>
+#include <clc/relational/relational.h>
+// __clc_isinf: float overloads, plus double/half under their #ifdefs.
+_CLC_DEFINE_RELATIONAL_UNARY(int, __clc_isinf, __builtin_isinf, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of __clc_isinf(double) returns an int, but the vector
+// versions return long.
+_CLC_DEF _CLC_OVERLOAD int __clc_isinf(double x) { return __builtin_isinf(x); }
+
+_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, __clc_isinf, double)
+#endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// The scalar version of __clc_isinf(half) returns an int, but the vector
+// versions return short.
+_CLC_DEF _CLC_OVERLOAD int __clc_isinf(half x) { return __builtin_isinf(x); }
+
+_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(short, __clc_isinf, half)
+#endif // cl_khr_fp16
diff --git a/libclc/clc/lib/generic/relational/clc_isless.cl b/libclc/clc/lib/generic/relational/clc_isless.cl
new file mode 100644
index 0000000..1204a50
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_isless.cl
@@ -0,0 +1,37 @@
+#include <clc/internal/clc.h>
+#include <clc/relational/relational.h>
+// __clc_isless: float overloads, plus double/half under their #ifdefs.
+// Note: It would be nice to use __builtin_isless with vector inputs, but it
+// seems to only take scalar values as input, which will produce incorrect
+// output for vector input types.
+
+_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isless, __builtin_isless, float, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of __clc_isless(double, double) returns an int, but the
+// vector versions return long.
+
+_CLC_DEF _CLC_OVERLOAD int __clc_isless(double x, double y) {
+  return __builtin_isless(x, y);
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isless, double, double)
+
+#endif // cl_khr_fp64
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// The scalar version of __clc_isless(half, half) returns an int, but the vector
+// versions return short.
+
+_CLC_DEF _CLC_OVERLOAD int __clc_isless(half x, half y) {
+  return __builtin_isless(x, y);
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isless, half, half)
+
+#endif // cl_khr_fp16
diff --git a/libclc/clc/lib/generic/relational/clc_islessequal.cl b/libclc/clc/lib/generic/relational/clc_islessequal.cl
new file mode 100644
index 0000000..6fde763
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_islessequal.cl
@@ -0,0 +1,39 @@
+#include <clc/internal/clc.h>
+#include <clc/relational/relational.h>
+// __clc_islessequal: float overloads, plus double/half under their #ifdefs.
+// Note: It would be nice to use __builtin_islessequal with vector inputs, but
+// it seems to only take scalar values as input, which will produce incorrect
+// output for vector input types.
+
+_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessequal, __builtin_islessequal,
+                              float, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of __clc_islessequal(double, double) returns an int, but
+// the vector versions return long.
+
+_CLC_DEF _CLC_OVERLOAD int __clc_islessequal(double x, double y) {
+  return __builtin_islessequal(x, y);
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_islessequal, double, double)
+
+#endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// The scalar version of __clc_islessequal(half, half) returns an int, but the
+// vector versions return short.
+
+_CLC_DEF _CLC_OVERLOAD int __clc_islessequal(half x, half y) {
+  return __builtin_islessequal(x, y);
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_islessequal, half, half)
+
+#endif // cl_khr_fp16
diff --git a/libclc/clc/lib/generic/relational/clc_islessgreater.cl b/libclc/clc/lib/generic/relational/clc_islessgreater.cl
new file mode 100644
index 0000000..5106c9f
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_islessgreater.cl
@@ -0,0 +1,38 @@
+#include <clc/internal/clc.h>
+#include <clc/relational/relational.h>
+// __clc_islessgreater: float overloads, plus double/half under their #ifdefs.
+// Note: It would be nice to use __builtin_islessgreater with vector inputs, but
+// it seems to only take scalar values as input, which will produce incorrect
+// output for vector input types.
+
+_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessgreater, __builtin_islessgreater,
+                              float, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of __clc_islessgreater(double, double) returns an int, but
+// the vector versions return long.
+
+_CLC_DEF _CLC_OVERLOAD int __clc_islessgreater(double x, double y) {
+  return __builtin_islessgreater(x, y);
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_islessgreater, double, double)
+
+#endif // cl_khr_fp64
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// The scalar version of __clc_islessgreater(half, half) returns an int, but the
+// vector versions return short.
+
+_CLC_DEF _CLC_OVERLOAD int __clc_islessgreater(half x, half y) {
+  return __builtin_islessgreater(x, y);
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_islessgreater, half, half)
+
+#endif // cl_khr_fp16
diff --git a/libclc/clc/lib/generic/relational/clc_isnan.cl b/libclc/clc/lib/generic/relational/clc_isnan.cl
new file mode 100644
index 0000000..fb30cd5
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_isnan.cl
@@ -0,0 +1,28 @@
+#include <clc/internal/clc.h>
+#include <clc/relational/relational.h>
+// __clc_isnan: float overloads, plus double/half under their #ifdefs.
+_CLC_DEFINE_RELATIONAL_UNARY(int, __clc_isnan, __builtin_isnan, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of __clc_isnan(double) returns an int, but the vector
+// versions return long.
+_CLC_DEF _CLC_OVERLOAD int __clc_isnan(double x) { return __builtin_isnan(x); }
+
+_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, __clc_isnan, double)
+
+#endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// The scalar version of __clc_isnan(half) returns an int, but the vector
+// versions return short.
+_CLC_DEF _CLC_OVERLOAD int __clc_isnan(half x) { return __builtin_isnan(x); }
+
+_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(short, __clc_isnan, half)
+
+#endif // cl_khr_fp16
diff --git a/libclc/clc/lib/generic/relational/clc_isnormal.cl b/libclc/clc/lib/generic/relational/clc_isnormal.cl
new file mode 100644
index 0000000..e0da8cc
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_isnormal.cl
@@ -0,0 +1,31 @@
+#include <clc/internal/clc.h>
+#include <clc/relational/relational.h>
+// __clc_isnormal: float overloads, plus double/half under their #ifdefs.
+_CLC_DEFINE_RELATIONAL_UNARY(int, __clc_isnormal, __builtin_isnormal, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of __clc_isnormal(double) returns an int, but the vector
+// versions return long.
+_CLC_DEF _CLC_OVERLOAD int __clc_isnormal(double x) {
+  return __builtin_isnormal(x);
+}
+
+_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, __clc_isnormal, double)
+
+#endif // cl_khr_fp64
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// The scalar version of __clc_isnormal(half) returns an int, but the vector
+// versions return short.
+_CLC_DEF _CLC_OVERLOAD int __clc_isnormal(half x) {
+  return __builtin_isnormal(x);
+}
+
+_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(short, __clc_isnormal, half)
+
+#endif // cl_khr_fp16
diff --git a/libclc/clc/lib/generic/relational/clc_isnotequal.cl b/libclc/clc/lib/generic/relational/clc_isnotequal.cl
new file mode 100644
index 0000000..9f90713
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_isnotequal.cl
@@ -0,0 +1,33 @@
+#include <clc/internal/clc.h>
+#include <clc/relational/relational.h>
+// _CLC_DEFINE_ISNOTEQUAL defines FUNCTION(x, y) returning x != y as RET_TYPE.
+#define _CLC_DEFINE_ISNOTEQUAL(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE)       \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) {         \
+    return (x != y);                                                           \
+  }
+
+_CLC_DEFINE_ISNOTEQUAL(int, __clc_isnotequal, float, float)
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(int, __clc_isnotequal, float, float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of __clc_isnotequal(double, double) returns an int, but
+// the vector versions return long.
+
+_CLC_DEFINE_ISNOTEQUAL(int, __clc_isnotequal, double, double)
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isnotequal, double, double)
+
+#endif // cl_khr_fp64
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// The scalar version of __clc_isnotequal(half, half) returns an int, but the
+// vector versions return short.
+
+_CLC_DEFINE_ISNOTEQUAL(int, __clc_isnotequal, half, half)
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isnotequal, half, half)
+
+#endif // cl_khr_fp16
+
+#undef _CLC_DEFINE_ISNOTEQUAL
diff --git a/libclc/clc/lib/generic/relational/clc_isordered.cl b/libclc/clc/lib/generic/relational/clc_isordered.cl
new file mode 100644
index 0000000..6183d1d
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_isordered.cl
@@ -0,0 +1,34 @@
+#include <clc/internal/clc.h>
+#include <clc/relational/clc_isequal.h>
+#include <clc/relational/relational.h>
+// _CLC_DEFINE_ISORDERED: true when x and y each compare equal to themselves.
+#define _CLC_DEFINE_ISORDERED(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE)        \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) {         \
+    return __clc_isequal(x, x) && __clc_isequal(y, y);                         \
+  }
+
+_CLC_DEFINE_ISORDERED(int, __clc_isordered, float, float)
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(int, __clc_isordered, float, float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of __clc_isordered(double, double) returns an int, but the
+// vector versions return long.
+
+_CLC_DEFINE_ISORDERED(int, __clc_isordered, double, double)
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isordered, double, double)
+
+#endif // cl_khr_fp64
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// The scalar version of __clc_isordered(half, half) returns an int, but the
+// vector versions return short.
+
+_CLC_DEFINE_ISORDERED(int, __clc_isordered, half, half)
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isordered, half, half)
+
+#endif // cl_khr_fp16
+
+#undef _CLC_DEFINE_ISORDERED
diff --git a/libclc/clc/lib/generic/relational/clc_isunordered.cl b/libclc/clc/lib/generic/relational/clc_isunordered.cl
new file mode 100644
index 0000000..dbbec03
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_isunordered.cl
@@ -0,0 +1,38 @@
+#include <clc/internal/clc.h>
+#include <clc/relational/relational.h>
+// __clc_isunordered: float overloads, plus double/half under their #ifdefs.
+// Note: It would be nice to use __builtin_isunordered with vector inputs, but
+// it seems to only take scalar values as input, which will produce incorrect
+// output for vector input types.
+
+_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isunordered, __builtin_isunordered,
+                              float, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of __clc_isunordered(double, double) returns an int, but
+// the vector versions return long.
+
+_CLC_DEF _CLC_OVERLOAD int __clc_isunordered(double x, double y) {
+  return __builtin_isunordered(x, y);
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isunordered, double, double)
+
+#endif // cl_khr_fp64
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// The scalar version of __clc_isunordered(half, half) returns an int, but the
+// vector versions return short.
+
+_CLC_DEF _CLC_OVERLOAD int __clc_isunordered(half x, half y) {
+  return __builtin_isunordered(x, y);
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isunordered, half, half)
+
+#endif // cl_khr_fp16
diff --git a/libclc/clc/lib/generic/relational/clc_select.cl b/libclc/clc/lib/generic/relational/clc_select.cl
new file mode 100644
index 0000000..bb016ed
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_select.cl
@@ -0,0 +1,7 @@
+#include <clc/internal/clc.h>
+#include <clc/utils.h>
+// clc_select.inc is the body given to both math/ and integer/gentype.inc.
+#define __CLC_BODY <clc_select.inc>
+#include <clc/math/gentype.inc>
+#define __CLC_BODY <clc_select.inc>
+#include <clc/integer/gentype.inc>
diff --git a/libclc/clc/lib/generic/relational/clc_select.inc b/libclc/clc/lib/generic/relational/clc_select.inc
new file mode 100644
index 0000000..47db806
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_select.inc
@@ -0,0 +1,35 @@
+#ifdef __CLC_SCALAR
+#define __CLC_VECSIZE
+#endif // __CLC_SCALAR (scalar case: empty vector-size suffix)
+// For floating-point gentypes, pick same-width integer types for selector z.
+#if __CLC_FPSIZE == 64
+#define __CLC_S_GENTYPE __CLC_XCONCAT(long, __CLC_VECSIZE)
+#define __CLC_U_GENTYPE __CLC_XCONCAT(ulong, __CLC_VECSIZE)
+#elif __CLC_FPSIZE == 32
+#define __CLC_S_GENTYPE __CLC_XCONCAT(int, __CLC_VECSIZE)
+#define __CLC_U_GENTYPE __CLC_XCONCAT(uint, __CLC_VECSIZE)
+#elif __CLC_FPSIZE == 16
+#define __CLC_S_GENTYPE __CLC_XCONCAT(short, __CLC_VECSIZE)
+#define __CLC_U_GENTYPE __CLC_XCONCAT(ushort, __CLC_VECSIZE)
+#endif // __CLC_FPSIZE
+// Select with a signed selector: evaluates z ? y : x.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_select(__CLC_GENTYPE x,
+                                                  __CLC_GENTYPE y,
+                                                  __CLC_S_GENTYPE z) {
+  return z ? y : x;
+}
+// Same selection with an unsigned selector.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_select(__CLC_GENTYPE x,
+                                                  __CLC_GENTYPE y,
+                                                  __CLC_U_GENTYPE z) {
+  return z ? y : x;
+}
+// Undo the selector-type definitions made above for floating-point gentypes.
+#ifdef __CLC_FPSIZE
+#undef __CLC_S_GENTYPE
+#undef __CLC_U_GENTYPE
+#endif // __CLC_FPSIZE
+
+#ifdef __CLC_SCALAR
+#undef __CLC_VECSIZE
+#endif // __CLC_SCALAR
diff --git a/libclc/clc/lib/generic/relational/clc_signbit.cl b/libclc/clc/lib/generic/relational/clc_signbit.cl
new file mode 100644
index 0000000..b1b2943
--- /dev/null
+++ b/libclc/clc/lib/generic/relational/clc_signbit.cl
@@ -0,0 +1,33 @@
+#include <clc/internal/clc.h>
+#include <clc/relational/relational.h>
+// __clc_signbit: float overloads, plus double/half under their #ifdefs.
+_CLC_DEFINE_RELATIONAL_UNARY(int, __clc_signbit, __builtin_signbitf, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of __clc_signbit(double) returns an int, but the vector
+// versions return long.
+
+_CLC_DEF _CLC_OVERLOAD int __clc_signbit(double x) {
+  return __builtin_signbit(x);
+}
+
+_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, __clc_signbit, double)
+
+#endif // cl_khr_fp64
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// The scalar version of __clc_signbit(half) returns an int, but the vector
+// versions return short.
+
+_CLC_DEF _CLC_OVERLOAD int __clc_signbit(half x) {
+  return __builtin_signbit(x);
+}
+
+_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(short, __clc_signbit, half)
+
+#endif // cl_khr_fp16
diff --git a/libclc/clc/lib/generic/shared/clc_clamp.cl b/libclc/clc/lib/generic/shared/clc_clamp.cl
new file mode 100644
index 0000000..1d40da3
--- /dev/null
+++ b/libclc/clc/lib/generic/shared/clc_clamp.cl
@@ -0,0 +1,7 @@
+#include <clc/internal/clc.h>
+// clc_clamp.inc is the body given to both integer/ and math/gentype.inc.
+#define __CLC_BODY <clc_clamp.inc>
+#include <clc/integer/gentype.inc>
+
+#define __CLC_BODY <clc_clamp.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/clc/lib/generic/shared/clc_clamp.inc b/libclc/clc/lib/generic/shared/clc_clamp.inc
new file mode 100644
index 0000000..da67cd2
--- /dev/null
+++ b/libclc/clc/lib/generic/shared/clc_clamp.inc
@@ -0,0 +1,14 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_clamp(__CLC_GENTYPE x,
+                                                 __CLC_GENTYPE y,
+                                                 __CLC_GENTYPE z) {
+  return (x > z ? z : (x < y ? y : x)); // z if x > z, else y if x < y, else x
+}
+// Vector x with scalar bounds: y and z are cast to the vector type first.
+#ifndef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_clamp(__CLC_GENTYPE x,
+                                                 __CLC_SCALAR_GENTYPE y,
+                                                 __CLC_SCALAR_GENTYPE z) {
+  return (x > (__CLC_GENTYPE)z ? (__CLC_GENTYPE)z
+                               : (x < (__CLC_GENTYPE)y ? (__CLC_GENTYPE)y : x));
+}
+#endif // !__CLC_SCALAR
diff --git a/libclc/clc/lib/generic/shared/clc_max.cl b/libclc/clc/lib/generic/shared/clc_max.cl
new file mode 100644
index 0000000..e1050ed
--- /dev/null
+++ b/libclc/clc/lib/generic/shared/clc_max.cl
@@ -0,0 +1,7 @@
+#include <clc/internal/clc.h>
+// clc_max.inc is the body given to both integer/ and math/gentype.inc.
+#define __CLC_BODY <clc_max.inc>
+#include <clc/integer/gentype.inc>
+
+#define __CLC_BODY <clc_max.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/clc/lib/generic/shared/clc_max.inc b/libclc/clc/lib/generic/shared/clc_max.inc
new file mode 100644
index 0000000..f4234cb
--- /dev/null
+++ b/libclc/clc/lib/generic/shared/clc_max.inc
@@ -0,0 +1,11 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_max(__CLC_GENTYPE a,
+                                               __CLC_GENTYPE b) {
+  return (a > b ? a : b); // a only when strictly greater, otherwise b
+}
+// Vector a with scalar b: b is cast to the vector type before comparing.
+#ifndef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_max(__CLC_GENTYPE a,
+                                               __CLC_SCALAR_GENTYPE b) {
+  return (a > (__CLC_GENTYPE)b ? a : (__CLC_GENTYPE)b);
+}
+#endif // !__CLC_SCALAR
diff --git a/libclc/clc/lib/generic/shared/clc_min.cl b/libclc/clc/lib/generic/shared/clc_min.cl
new file mode 100644
index 0000000..12a26f5
--- /dev/null
+++ b/libclc/clc/lib/generic/shared/clc_min.cl
@@ -0,0 +1,7 @@
+#include <clc/internal/clc.h>
+
+#define __CLC_BODY <clc_min.inc>
+#include <clc/integer/gentype.inc>
+
+#define __CLC_BODY <clc_min.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/clc/lib/generic/shared/clc_min.inc b/libclc/clc/lib/generic/shared/clc_min.inc
new file mode 100644
index 0000000..e9c85dd
--- /dev/null
+++ b/libclc/clc/lib/generic/shared/clc_min.inc
@@ -0,0 +1,11 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_min(__CLC_GENTYPE a,
+                                               __CLC_GENTYPE b) {
+  return (b < a ? b : a); // a is returned whenever b < a does not hold
+}
+
+#ifndef __CLC_SCALAR // scalar-operand overload; omitted when __CLC_SCALAR is defined
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_min(__CLC_GENTYPE a,
+                                               __CLC_SCALAR_GENTYPE b) {
+  return ((__CLC_GENTYPE)b < a ? (__CLC_GENTYPE)b : a); // b is cast to a's type
+}
+#endif // __CLC_SCALAR
diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES
new file mode 100644
index 0000000..d8effd1
--- /dev/null
+++ b/libclc/clc/lib/spirv/SOURCES
@@ -0,0 +1,2 @@
+../generic/geometric/clc_dot.cl
+
diff --git a/libclc/clc/lib/spirv64/SOURCES b/libclc/clc/lib/spirv64/SOURCES
new file mode 100644
index 0000000..9200810
--- /dev/null
+++ b/libclc/clc/lib/spirv64/SOURCES
@@ -0,0 +1 @@
+../generic/geometric/clc_dot.cl
diff --git a/libclc/clspv/lib/math/fma.cl b/libclc/clspv/lib/math/fma.cl
index 4f28069..e6251db 100644
--- a/libclc/clspv/lib/math/fma.cl
+++ b/libclc/clspv/lib/math/fma.cl
@@ -24,9 +24,9 @@
 // (__clc_sw_fma), but avoids the use of ulong in favor of uint2. The logic has
 // been updated as appropriate.
 
-#include <clc/clc.h>
-#include "../../../generic/lib/clcmacro.h"
 #include "../../../generic/lib/math/math.h"
+#include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 struct fp {
   uint2 mantissa;
@@ -269,3 +269,14 @@ _CLC_DEF _CLC_OVERLOAD float fma(float a, float b, float c) {
                   ((uint)st_fma.mantissa.lo & 0x7fffff));
 }
 _CLC_TERNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, fma, float, float, float)
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEF _CLC_OVERLOAD half fma(half a, half b, half c) {
+  return (half)mad((float)a, (float)b, (float)c); // evaluated in float, then narrowed
+}
+_CLC_TERNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, half, fma, half, half, half)
+
+#endif // cl_khr_fp16
diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake
index 68b33ed..b520626 100644
--- a/libclc/cmake/modules/AddLibclc.cmake
+++ b/libclc/cmake/modules/AddLibclc.cmake
@@ -76,6 +76,8 @@ endfunction()
 # Links together one or more bytecode files
 #
 # Arguments:
+# * INTERNALIZE
+#     Set if -internalize flag should be passed when linking
 # * TARGET <string>
 #     Custom target to create
 # * INPUT <string> ...
@@ -84,7 +86,7 @@ endfunction()
 #     List of extra dependencies to inject
 function(link_bc)
   cmake_parse_arguments(ARG
-    ""
+    "INTERNALIZE"
     "TARGET"
     "INPUTS;DEPENDENCIES"
     ${ARGN}
@@ -97,7 +99,7 @@ function(link_bc)
     file( TO_CMAKE_PATH ${LIBCLC_ARCH_OBJFILE_DIR}/${ARG_TARGET}.rsp RSP_FILE )
     # Turn it into a space-separate list of input files
     list( JOIN ARG_INPUTS " " RSP_INPUT )
-    file( WRITE ${RSP_FILE} ${RSP_INPUT} )
+    file( GENERATE OUTPUT ${RSP_FILE} CONTENT ${RSP_INPUT} )
     # Ensure that if this file is removed, we re-run CMake
     set_property( DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
       ${RSP_FILE}
@@ -107,12 +109,15 @@ function(link_bc)
 
   add_custom_command(
     OUTPUT ${ARG_TARGET}.bc
-    COMMAND ${llvm-link_exe} -o ${ARG_TARGET}.bc ${LINK_INPUT_ARG}
+    COMMAND ${llvm-link_exe} $<$<BOOL:${ARG_INTERNALIZE}>:--internalize> -o ${ARG_TARGET}.bc ${LINK_INPUT_ARG}
     DEPENDS ${llvm-link_target} ${ARG_DEPENDENCIES} ${ARG_INPUTS} ${RSP_FILE}
   )
 
   add_custom_target( ${ARG_TARGET} ALL DEPENDS ${ARG_TARGET}.bc )
-  set_target_properties( ${ARG_TARGET} PROPERTIES TARGET_FILE ${ARG_TARGET}.bc )
+  set_target_properties( ${ARG_TARGET} PROPERTIES
+    TARGET_FILE ${CMAKE_CURRENT_BINARY_DIR}/${ARG_TARGET}.bc
+    FOLDER "libclc/Device IR/Linking"
+  )
 endfunction()
 
 # Decomposes and returns variables based on a libclc triple and architecture
@@ -175,3 +180,254 @@ function(get_libclc_device_info)
     set( ${ARG_CLANG_TRIPLE} ${ARG_TRIPLE} PARENT_SCOPE )
   endif()
 endfunction()
+
+# Compiles a list of library source files (provided by LIB_FILES/GEN_FILES) and
+# compiles them to LLVM bytecode (or SPIR-V), links them together and optimizes
+# them.
+#
+# For bytecode libraries, a list of ALIASES may optionally be provided to
+# produce additional symlinks.
+#
+# Arguments:
+#  * ARCH <string>
+#      libclc architecture being built
+#  * ARCH_SUFFIX <string>
+#      libclc architecture/triple suffix
+#  * TRIPLE <string>
+#      Triple used to compile
+#
+# Optional Arguments:
+# * CLC_INTERNAL
+#     Pass if compiling the internal CLC builtin libraries, which are not
+#     optimized and do not have aliases created.
+#  * LIB_FILES <string> ...
+#      List of files that should be built for this library
+#  * GEN_FILES <string> ...
+#      List of generated files (in build dir) that should be built for this library
+#  * COMPILE_FLAGS <string> ...
+#      Compilation options (for clang)
+#  * OPT_FLAGS <string> ...
+#      Optimization options (for opt)
+#  * ALIASES <string> ...
+#      List of aliases
+#  * INTERNAL_LINK_DEPENDENCIES <string> ...
+#      A list of extra bytecode files to link into the builtin library. Symbols
+#      from these link dependencies will be internalized during linking.
+function(add_libclc_builtin_set)
+  cmake_parse_arguments(ARG
+    "CLC_INTERNAL"
+    "ARCH;TRIPLE;ARCH_SUFFIX"
+    "LIB_FILES;GEN_FILES;COMPILE_FLAGS;OPT_FLAGS;ALIASES;INTERNAL_LINK_DEPENDENCIES"
+    ${ARGN}
+  )
+
+  if( NOT ARG_ARCH OR NOT ARG_ARCH_SUFFIX OR NOT ARG_TRIPLE )
+    message( FATAL_ERROR "Must provide ARCH, ARCH_SUFFIX, and TRIPLE" )
+  endif()
+
+  set( bytecode_files "" )
+  foreach( file IN LISTS ARG_GEN_FILES ARG_LIB_FILES )
+    # We need to take each file and produce an absolute input file, as well
+    # as a unique architecture-specific output file. We deal with a mix of
+    # different input files, which makes this trickier.
+    if( "${file}" IN_LIST ARG_GEN_FILES )
+      # Generated files are given just as file names, which we must make
+      # absolute to the binary directory.
+      set( input_file ${CMAKE_CURRENT_BINARY_DIR}/${file} )
+      set( output_file "${LIBCLC_ARCH_OBJFILE_DIR}/${file}.bc" )
+    else()
+      # Other files are originally relative to each SOURCE file, which are
+      # then made relative to the libclc root directory. We must normalize
+      # the path (e.g., ironing out any ".."), then make it relative to the
+      # root directory again, and use that relative path component for the
+      # binary path.
+      get_filename_component( abs_path ${file} ABSOLUTE BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR} )
+      file( RELATIVE_PATH root_rel_path ${CMAKE_CURRENT_SOURCE_DIR} ${abs_path} )
+      set( input_file ${CMAKE_CURRENT_SOURCE_DIR}/${file} )
+      set( output_file "${LIBCLC_ARCH_OBJFILE_DIR}/${root_rel_path}.bc" )
+    endif()
+
+    get_filename_component( file_dir ${file} DIRECTORY )
+
+    compile_to_bc(
+      TRIPLE ${ARG_TRIPLE}
+      INPUT ${input_file}
+      OUTPUT ${output_file}
+      EXTRA_OPTS -fno-builtin -nostdlib
+        "${ARG_COMPILE_FLAGS}" -I${CMAKE_CURRENT_SOURCE_DIR}/${file_dir}
+      DEPENDENCIES generate_convert.cl clspv-generate_convert.cl
+    )
+    list( APPEND bytecode_files ${output_file} )
+  endforeach()
+
+  if( NOT bytecode_files )
+    message(FATAL_ERROR "Cannot create an empty builtins library")
+  endif()
+
+  set( builtins_comp_lib_tgt builtins.comp.${ARG_ARCH_SUFFIX} )
+  add_custom_target( ${builtins_comp_lib_tgt}
+    DEPENDS ${bytecode_files}
+  )
+  set_target_properties( ${builtins_comp_lib_tgt} PROPERTIES FOLDER "libclc/Device IR/Comp" )
+
+  set( builtins_link_lib_tgt builtins.link.${ARG_ARCH_SUFFIX} )
+
+  if( NOT ARG_INTERNAL_LINK_DEPENDENCIES )
+    link_bc(
+      TARGET ${builtins_link_lib_tgt}
+      INPUTS ${bytecode_files}
+      DEPENDENCIES ${builtins_comp_lib_tgt}
+    )
+  else()
+    # If we have libraries to link while internalizing their symbols, we need
+    # two separate link steps; the --internalize flag applies to all link
+    # inputs but the first.
+    set( builtins_link_lib_tmp_tgt builtins.link.pre-deps.${ARG_ARCH_SUFFIX} )
+    link_bc(
+      TARGET ${builtins_link_lib_tmp_tgt}
+      INPUTS ${bytecode_files}
+      DEPENDENCIES ${builtins_comp_lib_tgt}
+    )
+    link_bc(
+      INTERNALIZE
+      TARGET ${builtins_link_lib_tgt}
+      INPUTS $<TARGET_PROPERTY:${builtins_link_lib_tmp_tgt},TARGET_FILE>
+        ${ARG_INTERNAL_LINK_DEPENDENCIES}
+      DEPENDENCIES ${builtins_link_lib_tmp_tgt}
+    )
+  endif()
+
+  # For the CLC internal builtins, exit here - we only optimize the targets'
+  # entry points once we've linked the CLC builtins into them
+  if( ARG_CLC_INTERNAL )
+    return()
+  endif()
+
+  set( builtins_link_lib $<TARGET_PROPERTY:${builtins_link_lib_tgt},TARGET_FILE> )
+
+  if( ARG_ARCH STREQUAL spirv OR ARG_ARCH STREQUAL spirv64 )
+    set( spv_suffix ${ARG_ARCH_SUFFIX}.spv )
+    add_custom_command( OUTPUT ${spv_suffix}
+      COMMAND ${llvm-spirv_exe} ${spvflags} -o ${spv_suffix} ${builtins_link_lib}
+      DEPENDS ${llvm-spirv_target} ${builtins_link_lib} ${builtins_link_lib_tgt}
+    )
+    add_custom_target( "prepare-${spv_suffix}" ALL DEPENDS "${spv_suffix}" )
+    set_target_properties( "prepare-${spv_suffix}" PROPERTIES FOLDER "libclc/Device IR/Prepare" )
+    install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${spv_suffix}
+       DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
+
+    return()
+  endif()
+
+  set( builtins_opt_lib_tgt builtins.opt.${ARG_ARCH_SUFFIX} )
+
+  # Add opt target
+  add_custom_command( OUTPUT ${builtins_opt_lib_tgt}.bc
+    COMMAND ${opt_exe} ${ARG_OPT_FLAGS} -o ${builtins_opt_lib_tgt}.bc
+      ${builtins_link_lib}
+    DEPENDS ${opt_target} ${builtins_link_lib} ${builtins_link_lib_tgt}
+  )
+  add_custom_target( ${builtins_opt_lib_tgt}
+    ALL DEPENDS ${builtins_opt_lib_tgt}.bc
+  )
+  set_target_properties( ${builtins_opt_lib_tgt} PROPERTIES
+    TARGET_FILE ${CMAKE_CURRENT_BINARY_DIR}/${builtins_opt_lib_tgt}.bc
+    FOLDER "libclc/Device IR/Opt"
+  )
+
+  set( builtins_opt_lib $<TARGET_PROPERTY:${builtins_opt_lib_tgt},TARGET_FILE> )
+
+  # Add prepare target
+  set( obj_suffix ${ARG_ARCH_SUFFIX}.bc )
+  add_custom_command( OUTPUT ${obj_suffix}
+    COMMAND ${prepare_builtins_exe} -o ${obj_suffix} ${builtins_opt_lib}
+    DEPENDS ${builtins_opt_lib} ${builtins_opt_lib_tgt} ${prepare_builtins_target} )
+  add_custom_target( prepare-${obj_suffix} ALL DEPENDS ${obj_suffix} )
+  set_target_properties( "prepare-${obj_suffix}" PROPERTIES FOLDER "libclc/Device IR/Prepare" )
+
+  # nvptx-- targets don't include workitem builtins
+  if( NOT ARG_TRIPLE MATCHES ".*ptx.*--$" )
+    add_test( NAME external-calls-${obj_suffix}
+      COMMAND ./check_external_calls.sh ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} ${LLVM_TOOLS_BINARY_DIR}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} )
+  endif()
+
+  install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
+  foreach( a ${ARG_ALIASES} )
+    set( alias_suffix "${a}-${ARG_TRIPLE}.bc" )
+    add_custom_command(
+      OUTPUT ${alias_suffix}
+      COMMAND ${CMAKE_COMMAND} -E create_symlink ${obj_suffix} ${alias_suffix}
+      DEPENDS prepare-${obj_suffix} )
+    add_custom_target( alias-${alias_suffix} ALL DEPENDS ${alias_suffix} )
+    set_target_properties( alias-${alias_suffix} PROPERTIES FOLDER "libclc/Device IR/Aliases" )
+    install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${alias_suffix}
+             DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
+  endforeach()
+endfunction()
+
+# Produces a list of libclc source files by walking over SOURCES files in a
+# given directory. Outputs the list of files in LIB_FILE_LIST.
+#
+# LIB_FILE_LIST may be pre-populated and is appended to.
+#
+# Arguments:
+# * CLC_INTERNAL
+#     Pass if compiling the internal CLC builtin libraries, which have a
+#     different directory structure.
+# * LIB_ROOT_DIR <string>
+#     Root directory containing target's lib files, relative to libclc root
+#     directory. If not provided, is set to '.'.
+# * DIRS <string> ...
+#     List of directories under LIB_ROOT_DIR to walk over searching for SOURCES
+#     files
+function(libclc_configure_lib_source LIB_FILE_LIST)
+  cmake_parse_arguments(ARG
+    "CLC_INTERNAL"
+    "LIB_ROOT_DIR"
+    "DIRS"
+    ${ARGN}
+  )
+
+  if( NOT ARG_LIB_ROOT_DIR )
+    set(ARG_LIB_ROOT_DIR  ".")
+  endif()
+
+  # Enumerate SOURCES* files
+  set( source_list )
+  foreach( l ${ARG_DIRS} )
+    foreach( s "SOURCES" "SOURCES_${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}" )
+      if( ARG_CLC_INTERNAL )
+        file( TO_CMAKE_PATH ${ARG_LIB_ROOT_DIR}/lib/${l}/${s} file_loc )
+      else()
+        file( TO_CMAKE_PATH ${ARG_LIB_ROOT_DIR}/${l}/lib/${s} file_loc )
+      endif()
+      file( TO_CMAKE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${file_loc} loc )
+      # Prepend the location to give higher priority to
+      # specialized implementation
+      if( EXISTS ${loc} )
+        set( source_list ${file_loc} ${source_list} )
+      endif()
+    endforeach()
+  endforeach()
+
+  ## Add the generated convert files here to prevent adding the ones listed in
+  ## SOURCES
+  set( rel_files ${${LIB_FILE_LIST}} ) # Source directory input files, relative to the root dir
+  set( objects ${${LIB_FILE_LIST}} )   # A "set" of already-added input files
+
+  foreach( l ${source_list} )
+    file( READ ${l} file_list )
+    string( REPLACE "\n" ";" file_list "${file_list}" ) # quoted: an empty SOURCES file stays one argument
+    get_filename_component( dir ${l} DIRECTORY )
+    foreach( f ${file_list} )
+      # Only add each file once, so that targets can 'specialize' builtins
+      if( NOT "${f}" IN_LIST objects )
+        list( APPEND objects ${f} )
+        list( APPEND rel_files ${dir}/${f} )
+      endif()
+    endforeach()
+  endforeach()
+
+  set( ${LIB_FILE_LIST} ${rel_files} PARENT_SCOPE )
+endfunction()
diff --git a/libclc/generic/include/clc/clc.h b/libclc/generic/include/clc/clc.h
index 171b06a..94fca68 100644
--- a/libclc/generic/include/clc/clc.h
+++ b/libclc/generic/include/clc/clc.h
@@ -1,3 +1,6 @@
+#ifndef __CLC_CLC_H__
+#define __CLC_CLC_H__
+
 #ifndef cl_clang_storage_class_specifiers
 #error Implementation requires cl_clang_storage_class_specifiers extension!
 #endif
@@ -286,3 +289,5 @@
 #include <clc/image/image.h>
 
 #pragma OPENCL EXTENSION all : disable
+
+#endif // __CLC_CLC_H__
diff --git a/libclc/generic/include/clc/clcmacros.h b/libclc/generic/include/clc/clcmacros.h
index 2282d36..041c1cf 100644
--- a/libclc/generic/include/clc/clcmacros.h
+++ b/libclc/generic/include/clc/clcmacros.h
@@ -1,3 +1,6 @@
+#ifndef __CLC_CLCMACROS_H__
+#define __CLC_CLCMACROS_H__
+
 /* 6.9 Preprocessor Directives and Macros
  * Some of these are handled by clang or passed by clover */
 #if __OPENCL_VERSION__ >= 110
@@ -9,10 +12,12 @@
 #define CLC_VERSION_1_2 120
 #endif
 
-#define NULL ((void*)0)
+#define NULL ((void *)0)
 
-#define __kernel_exec(X, typen) __kernel \
-                                __attribute__((work_group_size_hint(X, 1, 1))) \
-                                __attribute__((vec_type_hint(typen)))
+#define __kernel_exec(X, typen)                                                \
+  __kernel __attribute__((work_group_size_hint(X, 1, 1)))                      \
+  __attribute__((vec_type_hint(typen)))
 
 #define kernel_exec(X, typen) __kernel_exec(X, typen)
+
+#endif // __CLC_CLCMACROS_H__
diff --git a/libclc/generic/include/clc/convert.h b/libclc/generic/include/clc/convert.h
index f0ba796..8219df4 100644
--- a/libclc/generic/include/clc/convert.h
+++ b/libclc/generic/include/clc/convert.h
@@ -20,10 +20,19 @@
   _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, ulong, SUFFIX) \
   _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, float, SUFFIX)
 
-#ifdef cl_khr_fp64
+#if defined(cl_khr_fp64) && defined(cl_khr_fp16)
+#define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX)                            \
+  _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX)                                 \
+  _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, double, SUFFIX)                          \
+  _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, half, SUFFIX)
+#elif defined(cl_khr_fp64)
 #define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX) \
   _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) \
   _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, double, SUFFIX)
+#elif defined(cl_khr_fp16)
+#define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX)                            \
+  _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX)                                 \
+  _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, half, SUFFIX)
 #else
 #define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX) \
   _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX)
@@ -40,10 +49,19 @@
   _CLC_VECTOR_CONVERT_FROM(ulong, SUFFIX) \
   _CLC_VECTOR_CONVERT_FROM(float, SUFFIX)
 
-#ifdef cl_khr_fp64
+#if defined(cl_khr_fp64) && defined(cl_khr_fp16)
+#define _CLC_VECTOR_CONVERT_TO(SUFFIX)                                         \
+  _CLC_VECTOR_CONVERT_TO1(SUFFIX)                                              \
+  _CLC_VECTOR_CONVERT_FROM(double, SUFFIX)                                     \
+  _CLC_VECTOR_CONVERT_FROM(half, SUFFIX)
+#elif defined(cl_khr_fp64)
 #define _CLC_VECTOR_CONVERT_TO(SUFFIX) \
   _CLC_VECTOR_CONVERT_TO1(SUFFIX) \
   _CLC_VECTOR_CONVERT_FROM(double, SUFFIX)
+#elif defined(cl_khr_fp16)
+#define _CLC_VECTOR_CONVERT_TO(SUFFIX)                                         \
+  _CLC_VECTOR_CONVERT_TO1(SUFFIX)                                              \
+  _CLC_VECTOR_CONVERT_FROM(half, SUFFIX)
 #else
 #define _CLC_VECTOR_CONVERT_TO(SUFFIX) \
   _CLC_VECTOR_CONVERT_TO1(SUFFIX)
diff --git a/libclc/generic/include/clc/relational/any.h b/libclc/generic/include/clc/relational/any.h
index 4687ed26..3989796 100644
--- a/libclc/generic/include/clc/relational/any.h
+++ b/libclc/generic/include/clc/relational/any.h
@@ -14,3 +14,6 @@ _CLC_VECTOR_ANY_DECL(char)
 _CLC_VECTOR_ANY_DECL(short)
 _CLC_VECTOR_ANY_DECL(int)
 _CLC_VECTOR_ANY_DECL(long)
+
+#undef _CLC_ANY_DECL
+#undef _CLC_VECTOR_ANY_DECL
diff --git a/libclc/generic/include/clc/relational/binary_decl.inc b/libclc/generic/include/clc/relational/binary_decl.inc
deleted file mode 100644
index c9e4aee..0000000
--- a/libclc/generic/include/clc/relational/binary_decl.inc
+++ /dev/null
@@ -1 +0,0 @@
-_CLC_OVERLOAD _CLC_DECL __CLC_INTN __CLC_FUNCTION(__CLC_FLOATN a, __CLC_FLOATN b);
diff --git a/libclc/generic/include/config.h b/libclc/generic/include/config.h
index 2994199..7aa5967 100644
--- a/libclc/generic/include/config.h
+++ b/libclc/generic/include/config.h
@@ -20,6 +20,8 @@
  * THE SOFTWARE.
  */
 
+#include <clc/clcfunc.h>
+
 _CLC_DECL bool __clc_subnormals_disabled();
 _CLC_DECL bool __clc_fp16_subnormals_supported();
 _CLC_DECL bool __clc_fp32_subnormals_supported();
diff --git a/libclc/generic/include/math/clc_ldexp.h b/libclc/generic/include/math/clc_ldexp.h
index dbfc044..454b7ed 100644
--- a/libclc/generic/include/math/clc_ldexp.h
+++ b/libclc/generic/include/math/clc_ldexp.h
@@ -7,5 +7,5 @@ _CLC_DEF _CLC_OVERLOAD double __clc_ldexp(double, int);
 
 #ifdef cl_khr_fp16
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-_CLC_DEF _CLC_OVERLOAD float __clc_ldexp(half, int);
+_CLC_DEF _CLC_OVERLOAD half __clc_ldexp(half, int);
 #endif
diff --git a/libclc/generic/include/math/clc_sqrt.h b/libclc/generic/include/math/clc_sqrt.h
index 60e183f..90a7c575 100644
--- a/libclc/generic/include/math/clc_sqrt.h
+++ b/libclc/generic/include/math/clc_sqrt.h
@@ -1,3 +1,6 @@
+#include <clc/clcfunc.h>
+#include <clc/clctypes.h>
+
 #define __CLC_FUNCTION __clc_sqrt
 #define __CLC_BODY <clc/math/unary_decl.inc>
 #include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/atom_int32_binary.inc b/libclc/generic/lib/atom_int32_binary.inc
index 3af4c4b..5d3b33f 100644
--- a/libclc/generic/lib/atom_int32_binary.inc
+++ b/libclc/generic/lib/atom_int32_binary.inc
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
-#include "utils.h"
+#include <clc/utils.h>
 
 #define __CLC_ATOM_IMPL(AS, TYPE) \
 _CLC_OVERLOAD _CLC_DEF TYPE __CLC_XCONCAT(atom_, __CLC_ATOMIC_OP) (volatile AS TYPE *p, TYPE val) { \
diff --git a/libclc/generic/lib/clcmacro.h b/libclc/generic/lib/clcmacro.h
deleted file mode 100644
index f148dc3..0000000
--- a/libclc/generic/lib/clcmacro.h
+++ /dev/null
@@ -1,163 +0,0 @@
-#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \
-  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \
-    return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y)); \
-  } \
-\
-  DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x) { \
-    return (RET_TYPE##3)(FUNCTION(x.x), FUNCTION(x.y), FUNCTION(x.z)); \
-  } \
-\
-  DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x) { \
-    return (RET_TYPE##4)(FUNCTION(x.lo), FUNCTION(x.hi)); \
-  } \
-\
-  DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x) { \
-    return (RET_TYPE##8)(FUNCTION(x.lo), FUNCTION(x.hi)); \
-  } \
-\
-  DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x) { \
-    return (RET_TYPE##16)(FUNCTION(x.lo), FUNCTION(x.hi)); \
-  }
-
-#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \
-  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) { \
-    return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y)); \
-  } \
-\
-  DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y) { \
-    return (RET_TYPE##3)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y), \
-                         FUNCTION(x.z, y.z)); \
-  } \
-\
-  DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y) { \
-    return (RET_TYPE##4)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
-  } \
-\
-  DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y) { \
-    return (RET_TYPE##8)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
-  } \
-\
-  DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y) { \
-    return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
-  }
-
-#define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \
-  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) { \
-    return (RET_TYPE##2)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
-  } \
-\
-  DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE x, ARG2_TYPE##3 y) { \
-    return (RET_TYPE##3)(FUNCTION(x, y.x), FUNCTION(x, y.y), \
-                         FUNCTION(x, y.z)); \
-  } \
-\
-  DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE x, ARG2_TYPE##4 y) { \
-    return (RET_TYPE##4)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
-  } \
-\
-  DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE x, ARG2_TYPE##8 y) { \
-    return (RET_TYPE##8)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
-  } \
-\
-  DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE x, ARG2_TYPE##16 y) { \
-    return (RET_TYPE##16)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
-  } \
-\
-
-#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE, ARG3_TYPE) \
-  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, ARG3_TYPE##2 z) { \
-    return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y)); \
-  } \
-\
-  DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, ARG3_TYPE##3 z) { \
-    return (RET_TYPE##3)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y), \
-                         FUNCTION(x.z, y.z, z.z)); \
-  } \
-\
-  DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y, ARG3_TYPE##4 z) { \
-    return (RET_TYPE##4)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \
-  } \
-\
-  DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y, ARG3_TYPE##8 z) { \
-    return (RET_TYPE##8)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \
-  } \
-\
-  DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y, ARG3_TYPE##16 z) { \
-    return (RET_TYPE##16)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \
-  }
-
-#define _CLC_V_S_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE, ARG3_TYPE) \
-  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##2 z) { \
-    return (RET_TYPE##2)(FUNCTION(x, y, z.lo), FUNCTION(x, y, z.hi)); \
-  } \
-\
-  DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##3 z) { \
-    return (RET_TYPE##3)(FUNCTION(x, y, z.x), FUNCTION(x, y, z.y), \
-                         FUNCTION(x, y, z.z)); \
-  } \
-\
-  DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##4 z) { \
-    return (RET_TYPE##4)(FUNCTION(x, y, z.lo), FUNCTION(x, y, z.hi)); \
-  } \
-\
-  DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##8 z) { \
-    return (RET_TYPE##8)(FUNCTION(x, y, z.lo), FUNCTION(x, y, z.hi)); \
-  } \
-\
-  DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##16 z) { \
-    return (RET_TYPE##16)(FUNCTION(x, y, z.lo), FUNCTION(x, y, z.hi)); \
-  } \
-\
-
-#define _CLC_V_V_VP_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ADDR_SPACE, ARG2_TYPE) \
-  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ADDR_SPACE ARG2_TYPE##2 *y) { \
-    return (RET_TYPE##2)( \
-        FUNCTION(x.x, (ARG2_TYPE*)y), \
-        FUNCTION(x.y, (ADDR_SPACE ARG2_TYPE*)((ADDR_SPACE ARG2_TYPE*)y+1)) \
-    ); \
-  } \
-\
-  DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ADDR_SPACE ARG2_TYPE##3 *y) { \
-    return (RET_TYPE##3)( \
-        FUNCTION(x.x, (ARG2_TYPE*)y), \
-        FUNCTION(x.y, (ADDR_SPACE ARG2_TYPE*)((ADDR_SPACE ARG2_TYPE*)y+1)), \
-        FUNCTION(x.z, (ADDR_SPACE ARG2_TYPE*)((ADDR_SPACE ARG2_TYPE*)y+2)) \
-    ); \
-  } \
-\
-  DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ADDR_SPACE ARG2_TYPE##4 *y) { \
-    return (RET_TYPE##4)( \
-        FUNCTION(x.lo, (ARG2_TYPE##2*)y), \
-        FUNCTION(x.hi, (ADDR_SPACE ARG2_TYPE##2*)((ADDR_SPACE ARG2_TYPE*)y+2)) \
-    ); \
-  } \
-\
-  DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ADDR_SPACE ARG2_TYPE##8 *y) { \
-    return (RET_TYPE##8)( \
-        FUNCTION(x.lo, (ARG2_TYPE##4*)y), \
-        FUNCTION(x.hi, (ADDR_SPACE ARG2_TYPE##4*)((ADDR_SPACE ARG2_TYPE*)y+4)) \
-    ); \
-  } \
-\
-  DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ADDR_SPACE ARG2_TYPE##16 *y) { \
-    return (RET_TYPE##16)( \
-        FUNCTION(x.lo, (ARG2_TYPE##8*)y), \
-        FUNCTION(x.hi, (ADDR_SPACE ARG2_TYPE##8*)((ADDR_SPACE ARG2_TYPE*)y+8)) \
-    ); \
-  }
-
-#define _CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \
-  return BUILTIN(x, y); \
-} \
-_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE)
-
-#define _CLC_DEFINE_BINARY_BUILTIN_WITH_SCALAR_SECOND_ARG(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \
-_CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \
-_CLC_BINARY_VECTORIZE_SCALAR_SECOND_ARG(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE)
-
-#define _CLC_DEFINE_UNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x) { \
-  return BUILTIN(x); \
-} \
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE)
diff --git a/libclc/generic/lib/common/degrees.cl b/libclc/generic/lib/common/degrees.cl
index 5de56f8..cf49b19 100644
--- a/libclc/generic/lib/common/degrees.cl
+++ b/libclc/generic/lib/common/degrees.cl
@@ -21,8 +21,7 @@
  */
 
 #include <clc/clc.h>
-
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
 
 _CLC_OVERLOAD _CLC_DEF float degrees(float radians) {
   // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F
diff --git a/libclc/generic/lib/common/radians.cl b/libclc/generic/lib/common/radians.cl
index 3838dd6..645a305 100644
--- a/libclc/generic/lib/common/radians.cl
+++ b/libclc/generic/lib/common/radians.cl
@@ -21,8 +21,7 @@
  */
 
 #include <clc/clc.h>
-
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
 
 _CLC_OVERLOAD _CLC_DEF float radians(float degrees) {
   // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F
diff --git a/libclc/generic/lib/common/sign.cl b/libclc/generic/lib/common/sign.cl
index 25832e0..ad8f740 100644
--- a/libclc/generic/lib/common/sign.cl
+++ b/libclc/generic/lib/common/sign.cl
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
 
 #define SIGN(TYPE, F) \
 _CLC_DEF _CLC_OVERLOAD TYPE sign(TYPE x) { \
@@ -26,3 +26,12 @@ SIGN(double, )
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sign, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+SIGN(half,) // second macro argument is left empty
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, sign, half)
+
+#endif // cl_khr_fp16
diff --git a/libclc/generic/lib/common/smoothstep.cl b/libclc/generic/lib/common/smoothstep.cl
index 9f513eb..4cdecfc 100644
--- a/libclc/generic/lib/common/smoothstep.cl
+++ b/libclc/generic/lib/common/smoothstep.cl
@@ -21,8 +21,7 @@
  */
 
 #include <clc/clc.h>
-
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
 
 _CLC_OVERLOAD _CLC_DEF float smoothstep(float edge0, float edge1, float x) {
   float t = clamp((x - edge0) / (edge1 - edge0), 0.0f, 1.0f);
@@ -46,7 +45,7 @@ SMOOTH_STEP_DEF(double, double, SMOOTH_STEP_IMPL_D);
 
 _CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, smoothstep, double, double, double);
 
-#if !defined(CLC_SPIRV) && !defined(CLC_SPIRV64)
+#if !defined(CLC_SPIRV)
 SMOOTH_STEP_DEF(float, double, SMOOTH_STEP_IMPL_D);
 SMOOTH_STEP_DEF(double, float, SMOOTH_STEP_IMPL_D);
 
diff --git a/libclc/generic/lib/common/step.cl b/libclc/generic/lib/common/step.cl
index 5d7c487..3d9bc53 100644
--- a/libclc/generic/lib/common/step.cl
+++ b/libclc/generic/lib/common/step.cl
@@ -21,8 +21,7 @@
  */
 
 #include <clc/clc.h>
-
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
 
 _CLC_OVERLOAD _CLC_DEF float step(float edge, float x) {
   return x < edge ? 0.0f : 1.0f;
@@ -45,7 +44,7 @@ STEP_DEF(double, double);
 _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, step, double, double);
 _CLC_V_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, step, double, double);
 
-#if !defined(CLC_SPIRV) && !defined(CLC_SPIRV64)
+#if !defined(CLC_SPIRV)
 STEP_DEF(float, double);
 STEP_DEF(double, float);
 
diff --git a/libclc/generic/lib/gen_convert.py b/libclc/generic/lib/gen_convert.py
index 21fc8eb..d2f69e6 100644
--- a/libclc/generic/lib/gen_convert.py
+++ b/libclc/generic/lib/gen_convert.py
@@ -46,21 +46,21 @@ types = [
     "uint",
     "long",
     "ulong",
+    "half",
     "float",
     "double",
 ]
 int_types = ["char", "uchar", "short", "ushort", "int", "uint", "long", "ulong"]
 unsigned_types = ["uchar", "ushort", "uint", "ulong"]
-float_types = ["float", "double"]
+float_types = ["half", "float", "double"]
 int64_types = ["long", "ulong"]
 float64_types = ["double"]
+float16_types = ["half"]
 vector_sizes = ["", "2", "3", "4", "8", "16"]
 half_sizes = [("2", ""), ("4", "2"), ("8", "4"), ("16", "8")]
 
 saturation = ["", "_sat"]
 rounding_modes = ["_rtz", "_rte", "_rtp", "_rtn"]
-float_prefix = {"float": "FLT_", "double": "DBL_"}
-float_suffix = {"float": "f", "double": ""}
 
 bool_type = {
     "char": "char",
@@ -71,6 +71,7 @@ bool_type = {
     "uint": "int",
     "long": "long",
     "ulong": "long",
+    "half": "short",
     "float": "int",
     "double": "long",
 }
@@ -95,6 +96,7 @@ sizeof_type = {
     "uint": 4,
     "long": 8,
     "ulong": 8,
+    "half": 2,
     "float": 4,
     "double": 8,
 }
@@ -108,6 +110,7 @@ limit_max = {
     "uint": "UINT_MAX",
     "long": "LONG_MAX",
     "ulong": "ULONG_MAX",
+    "half": "0x1.ffcp+15",
 }
 
 limit_min = {
@@ -119,24 +122,36 @@ limit_min = {
     "uint": "0",
     "long": "LONG_MIN",
     "ulong": "0",
+    "half": "-0x1.ffcp+15",
 }
 
 
 def conditional_guard(src, dst):
     int64_count = 0
     float64_count = 0
+    float16_count = 0
     if src in int64_types:
         int64_count = int64_count + 1
     elif src in float64_types:
         float64_count = float64_count + 1
+    elif src in float16_types:
+        float16_count = float16_count + 1
     if dst in int64_types:
         int64_count = int64_count + 1
     elif dst in float64_types:
         float64_count = float64_count + 1
-    if float64_count > 0:
+    elif dst in float16_types:
+        float16_count = float16_count + 1
+    if float64_count > 0 and float16_count > 0:
+        print("#if defined(cl_khr_fp16) && defined(cl_khr_fp64)")
+        return True
+    elif float64_count > 0:
         # In embedded profile, if cl_khr_fp64 is supported cles_khr_int64 has to be
         print("#ifdef cl_khr_fp64")
         return True
+    elif float16_count > 0:
+        print("#if defined cl_khr_fp16")
+        return True
     elif int64_count > 0:
         print("#if defined cles_khr_int64 || !defined(__EMBEDDED_PROFILE__)")
         return True
@@ -175,6 +190,10 @@ print(
 
 #include <clc/clc.h>
 
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
@@ -222,41 +241,21 @@ print(
 def generate_default_conversion(src, dst, mode):
     close_conditional = conditional_guard(src, dst)
 
-    # scalar conversions
-    print(
-        """_CLC_DEF _CLC_OVERLOAD
-{DST} convert_{DST}{M}({SRC} x)
-{{
-  return ({DST})x;
+    for size in vector_sizes:
+        if not size:
+            print(
+                f"""_CLC_DEF _CLC_OVERLOAD {dst} convert_{dst}{mode}({src} x) {{
+  return ({dst})x;
 }}
-""".format(
-            SRC=src, DST=dst, M=mode
-        )
-    )
-
-    # vector conversions, done through decomposition to components
-    for size, half_size in half_sizes:
-        print(
-            """_CLC_DEF _CLC_OVERLOAD
-{DST}{N} convert_{DST}{N}{M}({SRC}{N} x)
-{{
-  return ({DST}{N})(convert_{DST}{H}(x.lo), convert_{DST}{H}(x.hi));
+"""
+            )
+        else:
+            print(
+                f"""_CLC_DEF _CLC_OVERLOAD {dst}{size} convert_{dst}{size}{mode}({src}{size} x) {{
+  return __builtin_convertvector(x, {dst}{size});
 }}
-""".format(
-                SRC=src, DST=dst, N=size, H=half_size, M=mode
+"""
             )
-        )
-
-    # 3-component vector conversions
-    print(
-        """_CLC_DEF _CLC_OVERLOAD
-{DST}3 convert_{DST}3{M}({SRC}3 x)
-{{
-  return ({DST}3)(convert_{DST}2(x.s01), convert_{DST}(x.s2));
-}}""".format(
-            SRC=src, DST=dst, M=mode
-        )
-    )
 
     if close_conditional:
         print("#endif")
@@ -498,22 +497,42 @@ def generate_float_conversion(src, dst, size, mode, sat):
                         )
                     )
                 print(
-                    "  return select(r, nextafter(r, sign(r) * ({DST}{N})-INFINITY), c);".format(
+                    "  {DST}{N} sel = select(r, nextafter(r, sign(r) * ({DST}{N})-INFINITY), c);".format(
                         DST=dst, N=size, BOOL=bool_type[dst], SRC=src
                     )
                 )
             else:
                 print(
-                    "  return select(r, nextafter(r, sign(r) * ({DST}{N})-INFINITY), convert_{BOOL}{N}(abs_y > abs_x));".format(
+                    "  {DST}{N} sel = select(r, nextafter(r, sign(r) * ({DST}{N})-INFINITY), convert_{BOOL}{N}(abs_y > abs_x));".format(
                         DST=dst, N=size, BOOL=bool_type[dst]
                     )
                 )
+            if dst == "half" and src in int_types and sizeof_type[src] >= 2:
+                dst_max = limit_max[dst]
+                # short is a signed 16-bit type, so its maximum value rounded toward zero is 0x1.ffcp+14 (0x1p+15 == 32768 > 0x7fff == 32767)
+                if src == "short":
+                    dst_max = "0x1.ffcp+14"
+                print(
+                    "  return clamp(sel, ({DST}{N}){DST_MIN}, ({DST}{N}){DST_MAX});".format(
+                        DST=dst, N=size, DST_MIN=limit_min[dst], DST_MAX=dst_max
+                    )
+                )
+            else:
+                print("  return sel;")
         if mode == "_rtp":
             print(
-                "  return select(r, nextafter(r, ({DST}{N})INFINITY), convert_{BOOL}{N}(y < x));".format(
+                "  {DST}{N} sel = select(r, nextafter(r, ({DST}{N})INFINITY), convert_{BOOL}{N}(y < x));".format(
                     DST=dst, N=size, BOOL=bool_type[dst]
                 )
             )
+            if dst == "half" and src in int_types and sizeof_type[src] >= 2:
+                print(
+                    "  return max(sel, ({DST}{N}){DST_MIN});".format(
+                        DST=dst, N=size, DST_MIN=limit_min[dst]
+                    )
+                )
+            else:
+                print("  return sel;")
         if mode == "_rtn":
             if clspv:
                 print(
@@ -528,16 +547,28 @@ def generate_float_conversion(src, dst, size, mode, sat):
                         )
                     )
                 print(
-                    "  return select(r, nextafter(r, ({DST}{N})-INFINITY), c);".format(
+                    "  {DST}{N} sel = select(r, nextafter(r, ({DST}{N})-INFINITY), c);".format(
                         DST=dst, N=size, BOOL=bool_type[dst], SRC=src
                     )
                 )
             else:
                 print(
-                    "  return select(r, nextafter(r, ({DST}{N})-INFINITY), convert_{BOOL}{N}(y > x));".format(
+                    "  {DST}{N} sel = select(r, nextafter(r, ({DST}{N})-INFINITY), convert_{BOOL}{N}(y > x));".format(
                         DST=dst, N=size, BOOL=bool_type[dst]
                     )
                 )
+            if dst == "half" and src in int_types and sizeof_type[src] >= 2:
+                dst_max = limit_max[dst]
+                # short is a signed 16-bit type, so its maximum value rounded toward negative infinity is 0x1.ffcp+14 (0x1p+15 == 32768 > 0x7fff == 32767)
+                if src == "short":
+                    dst_max = "0x1.ffcp+14"
+                print(
+                    "  return min(sel, ({DST}{N}){DST_MAX});".format(
+                        DST=dst, N=size, DST_MAX=dst_max
+                    )
+                )
+            else:
+                print("  return sel;")
 
     # Footer
     print("}")
diff --git a/libclc/generic/lib/geometric/dot.cl b/libclc/generic/lib/geometric/dot.cl
index e58bc26..e790d02 100644
--- a/libclc/generic/lib/geometric/dot.cl
+++ b/libclc/generic/lib/geometric/dot.cl
@@ -1,19 +1,20 @@
 #include <clc/clc.h>
+#include <clc/geometric/clc_dot.h>
 
 _CLC_OVERLOAD _CLC_DEF float dot(float p0, float p1) {
-  return p0*p1;
+  return __clc_dot(p0, p1);
 }
 
 _CLC_OVERLOAD _CLC_DEF float dot(float2 p0, float2 p1) {
-  return p0.x*p1.x + p0.y*p1.y;
+  return __clc_dot(p0, p1);
 }
 
 _CLC_OVERLOAD _CLC_DEF float dot(float3 p0, float3 p1) {
-  return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z;
+  return __clc_dot(p0, p1);
 }
 
 _CLC_OVERLOAD _CLC_DEF float dot(float4 p0, float4 p1) {
-  return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z + p0.w*p1.w;
+  return __clc_dot(p0, p1);
 }
 
 #ifdef cl_khr_fp64
@@ -21,19 +22,19 @@ _CLC_OVERLOAD _CLC_DEF float dot(float4 p0, float4 p1) {
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
 _CLC_OVERLOAD _CLC_DEF double dot(double p0, double p1) {
-  return p0*p1;
+  return __clc_dot(p0, p1);
 }
 
 _CLC_OVERLOAD _CLC_DEF double dot(double2 p0, double2 p1) {
-  return p0.x*p1.x + p0.y*p1.y;
+  return __clc_dot(p0, p1);
 }
 
 _CLC_OVERLOAD _CLC_DEF double dot(double3 p0, double3 p1) {
-  return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z;
+  return __clc_dot(p0, p1);
 }
 
 _CLC_OVERLOAD _CLC_DEF double dot(double4 p0, double4 p1) {
-  return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z + p0.w*p1.w;
+  return __clc_dot(p0, p1);
 }
 
 #endif
@@ -42,20 +43,18 @@ _CLC_OVERLOAD _CLC_DEF double dot(double4 p0, double4 p1) {
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-_CLC_OVERLOAD _CLC_DEF half dot(half p0, half p1) {
-  return p0*p1;
-}
+_CLC_OVERLOAD _CLC_DEF half dot(half p0, half p1) { return __clc_dot(p0, p1); }
 
 _CLC_OVERLOAD _CLC_DEF half dot(half2 p0, half2 p1) {
-  return p0.x*p1.x + p0.y*p1.y;
+  return __clc_dot(p0, p1);
 }
 
 _CLC_OVERLOAD _CLC_DEF half dot(half3 p0, half3 p1) {
-  return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z;
+  return __clc_dot(p0, p1);
 }
 
 _CLC_OVERLOAD _CLC_DEF half dot(half4 p0, half4 p1) {
-  return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z + p0.w*p1.w;
+  return __clc_dot(p0, p1);
 }
 
 #endif
diff --git a/libclc/generic/lib/integer/abs.cl b/libclc/generic/lib/integer/abs.cl
index faff8d0..fda23c8 100644
--- a/libclc/generic/lib/integer/abs.cl
+++ b/libclc/generic/lib/integer/abs.cl
@@ -1,4 +1,5 @@
 #include <clc/clc.h>
+#include <clc/integer/clc_abs.h>
 
 #define __CLC_BODY <abs.inc>
 #include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/integer/abs.inc b/libclc/generic/lib/integer/abs.inc
index cfe7bfe..443d0dc 100644
--- a/libclc/generic/lib/integer/abs.inc
+++ b/libclc/generic/lib/integer/abs.inc
@@ -1,3 +1,3 @@
 _CLC_OVERLOAD _CLC_DEF __CLC_U_GENTYPE abs(__CLC_GENTYPE x) {
-  return __builtin_astype((__CLC_GENTYPE)(x > (__CLC_GENTYPE)(0) ? x : -x), __CLC_U_GENTYPE);
+  return __clc_abs(x);
 }
diff --git a/libclc/generic/lib/integer/abs_diff.cl b/libclc/generic/lib/integer/abs_diff.cl
index 3d75105..6cd9efc 100644
--- a/libclc/generic/lib/integer/abs_diff.cl
+++ b/libclc/generic/lib/integer/abs_diff.cl
@@ -1,4 +1,5 @@
 #include <clc/clc.h>
+#include <clc/integer/clc_abs_diff.h>
 
 #define __CLC_BODY <abs_diff.inc>
 #include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/integer/abs_diff.inc b/libclc/generic/lib/integer/abs_diff.inc
index 2d3c492..da87bb1 100644
--- a/libclc/generic/lib/integer/abs_diff.inc
+++ b/libclc/generic/lib/integer/abs_diff.inc
@@ -1,5 +1,3 @@
 _CLC_OVERLOAD _CLC_DEF __CLC_U_GENTYPE abs_diff(__CLC_GENTYPE x, __CLC_GENTYPE y) {
-  __CLC_U_GENTYPE ux = __builtin_astype(x, __CLC_U_GENTYPE);
-  __CLC_U_GENTYPE uy = __builtin_astype(y, __CLC_U_GENTYPE);
-  return x > y ? ux - uy : uy - ux;
+  return __clc_abs_diff(x, y);
 }
diff --git a/libclc/generic/lib/integer/add_sat.cl b/libclc/generic/lib/integer/add_sat.cl
index 252dce9..11a4a33 100644
--- a/libclc/generic/lib/integer/add_sat.cl
+++ b/libclc/generic/lib/integer/add_sat.cl
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
 
 // From add_sat.ll
 _CLC_DECL char   __clc_add_sat_s8(char, char);
diff --git a/libclc/generic/lib/integer/clz.cl b/libclc/generic/lib/integer/clz.cl
index e2080b5..904d027 100644
--- a/libclc/generic/lib/integer/clz.cl
+++ b/libclc/generic/lib/integer/clz.cl
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
 
 _CLC_OVERLOAD _CLC_DEF char clz(char x) {
   return clz((ushort)(uchar)x) - 8;
diff --git a/libclc/generic/lib/integer/mad_sat.cl b/libclc/generic/lib/integer/mad_sat.cl
index 1708b29..2372eaa 100644
--- a/libclc/generic/lib/integer/mad_sat.cl
+++ b/libclc/generic/lib/integer/mad_sat.cl
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
 
 _CLC_OVERLOAD _CLC_DEF char mad_sat(char x, char y, char z) {
   return clamp((short)mad24((short)x, (short)y, (short)z), (short)CHAR_MIN, (short) CHAR_MAX);
diff --git a/libclc/generic/lib/integer/sub_sat.cl b/libclc/generic/lib/integer/sub_sat.cl
index 2fbc316..e6beef7 100644
--- a/libclc/generic/lib/integer/sub_sat.cl
+++ b/libclc/generic/lib/integer/sub_sat.cl
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
 
 _CLC_OVERLOAD _CLC_DEF char sub_sat(char x, char y) {
   short r = x - y;
diff --git a/libclc/generic/lib/math/acos.cl b/libclc/generic/lib/math/acos.cl
index 87db014..aeb7287 100644
--- a/libclc/generic/lib/math/acos.cl
+++ b/libclc/generic/lib/math/acos.cl
@@ -20,9 +20,9 @@
  * THE SOFTWARE.
  */
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float acos(float x) {
     // Computes arccos(x).
@@ -171,3 +171,5 @@ _CLC_OVERLOAD _CLC_DEF double acos(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, acos, double);
 
 #endif // cl_khr_fp64
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(acos)
diff --git a/libclc/generic/lib/math/acosh.cl b/libclc/generic/lib/math/acosh.cl
index 59da511..4656f14 100644
--- a/libclc/generic/lib/math/acosh.cl
+++ b/libclc/generic/lib/math/acosh.cl
@@ -21,10 +21,10 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "ep_log.h"
 #include "math.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF  float acosh(float x) {
     uint ux = as_uint(x);
@@ -125,3 +125,5 @@ _CLC_OVERLOAD _CLC_DEF double acosh(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, acosh, double)
 
 #endif
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(acosh)
diff --git a/libclc/generic/lib/math/acospi.cl b/libclc/generic/lib/math/acospi.cl
index c91fc41..83a47eb 100644
--- a/libclc/generic/lib/math/acospi.cl
+++ b/libclc/generic/lib/math/acospi.cl
@@ -21,9 +21,9 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float acospi(float x) {
     // Computes arccos(x).
@@ -170,3 +170,5 @@ _CLC_OVERLOAD _CLC_DEF double acospi(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, acospi, double)
 
 #endif
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(acospi)
diff --git a/libclc/generic/lib/math/asin.cl b/libclc/generic/lib/math/asin.cl
index 43ce905..443dec8 100644
--- a/libclc/generic/lib/math/asin.cl
+++ b/libclc/generic/lib/math/asin.cl
@@ -21,9 +21,9 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float asin(float x) {
     // Computes arcsin(x).
diff --git a/libclc/generic/lib/math/asinh.cl b/libclc/generic/lib/math/asinh.cl
index cfddb31c..f7637ad 100644
--- a/libclc/generic/lib/math/asinh.cl
+++ b/libclc/generic/lib/math/asinh.cl
@@ -21,10 +21,10 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
 #include "ep_log.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float asinh(float x) {
     uint ux = as_uint(x);
@@ -291,3 +291,5 @@ _CLC_OVERLOAD _CLC_DEF double asinh(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, asinh, double)
 
 #endif
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(asinh)
diff --git a/libclc/generic/lib/math/asinpi.cl b/libclc/generic/lib/math/asinpi.cl
index 511d74e..18dc530 100644
--- a/libclc/generic/lib/math/asinpi.cl
+++ b/libclc/generic/lib/math/asinpi.cl
@@ -21,9 +21,9 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float asinpi(float x) {
     // Computes arcsin(x).
diff --git a/libclc/generic/lib/math/atan.cl b/libclc/generic/lib/math/atan.cl
index fa3633c..28eaaf7 100644
--- a/libclc/generic/lib/math/atan.cl
+++ b/libclc/generic/lib/math/atan.cl
@@ -20,10 +20,10 @@
  * THE SOFTWARE.
  */
 
-#include "math.h"
-#include "../clcmacro.h"
-
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
+
+#include "math.h"
 
 _CLC_OVERLOAD _CLC_DEF float atan(float x)
 {
@@ -181,3 +181,6 @@ _CLC_OVERLOAD _CLC_DEF double atan(double x)
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atan, double);
 
 #endif // cl_khr_fp64
+
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(atan)
diff --git a/libclc/generic/lib/math/atan2.cl b/libclc/generic/lib/math/atan2.cl
index a2f104f..98b457a 100644
--- a/libclc/generic/lib/math/atan2.cl
+++ b/libclc/generic/lib/math/atan2.cl
@@ -21,10 +21,10 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
 #include "tables.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float atan2(float y, float x)
 {
@@ -235,3 +235,5 @@ _CLC_OVERLOAD _CLC_DEF double atan2(double y, double x)
 _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atan2, double, double);
 
 #endif
+
+_CLC_DEFINE_BINARY_BUILTIN_FP16(atan2)
diff --git a/libclc/generic/lib/math/atan2pi.cl b/libclc/generic/lib/math/atan2pi.cl
index a15b14f..ad41b11 100644
--- a/libclc/generic/lib/math/atan2pi.cl
+++ b/libclc/generic/lib/math/atan2pi.cl
@@ -21,10 +21,10 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
 #include "tables.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF  float atan2pi(float y, float x) {
     const float pi = 0x1.921fb6p+1f;
@@ -219,3 +219,5 @@ _CLC_OVERLOAD _CLC_DEF double atan2pi(double y, double x) {
 _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atan2pi, double, double)
 
 #endif
+
+_CLC_DEFINE_BINARY_BUILTIN_FP16(atan2pi)
diff --git a/libclc/generic/lib/math/atanh.cl b/libclc/generic/lib/math/atanh.cl
index 4af2f45..f2298a2 100644
--- a/libclc/generic/lib/math/atanh.cl
+++ b/libclc/generic/lib/math/atanh.cl
@@ -21,9 +21,9 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float atanh(float x) {
     uint ux = as_uint(x);
@@ -111,3 +111,5 @@ _CLC_OVERLOAD _CLC_DEF double atanh(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atanh, double)
 
 #endif
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(atanh)
diff --git a/libclc/generic/lib/math/atanpi.cl b/libclc/generic/lib/math/atanpi.cl
index 2e2f032..9e6b3ec 100644
--- a/libclc/generic/lib/math/atanpi.cl
+++ b/libclc/generic/lib/math/atanpi.cl
@@ -21,9 +21,9 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float atanpi(float x) {
     const float pi = 3.1415926535897932f;
@@ -180,3 +180,5 @@ _CLC_OVERLOAD _CLC_DEF double atanpi(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atanpi, double)
 
 #endif
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(atanpi)
diff --git a/libclc/generic/lib/math/cbrt.cl b/libclc/generic/lib/math/cbrt.cl
index 5ff9367..8462f5f 100644
--- a/libclc/generic/lib/math/cbrt.cl
+++ b/libclc/generic/lib/math/cbrt.cl
@@ -21,10 +21,10 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
 #include "tables.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float cbrt(float x) {
 
@@ -149,3 +149,5 @@ _CLC_OVERLOAD _CLC_DEF double cbrt(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cbrt, double)
 
 #endif
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(cbrt)
diff --git a/libclc/generic/lib/math/ceil.cl b/libclc/generic/lib/math/ceil.cl
index 9f7154c..e02789e 100644
--- a/libclc/generic/lib/math/ceil.cl
+++ b/libclc/generic/lib/math/ceil.cl
@@ -1,10 +1,6 @@
 #include <clc/clc.h>
-#include "../clcmacro.h"
-
-// Map the llvm intrinsic to an OpenCL function.
-#define __CLC_FUNCTION __clc_ceil
-#define __CLC_INTRINSIC "llvm.ceil"
-#include "math/unary_intrin.inc"
+#include <clc/clcmacro.h>
+#include <clc/math/clc_ceil.h>
 
 #undef __CLC_FUNCTION
 #define __CLC_FUNCTION ceil
diff --git a/libclc/generic/lib/math/clc_exp10.cl b/libclc/generic/lib/math/clc_exp10.cl
index c6a9476..6ea8743 100644
--- a/libclc/generic/lib/math/clc_exp10.cl
+++ b/libclc/generic/lib/math/clc_exp10.cl
@@ -21,11 +21,12 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
+#include <clc/relational/clc_isnan.h>
 
 #include "config.h"
 #include "math.h"
 #include "tables.h"
-#include "../clcmacro.h"
 
 //    Algorithm:
 //
@@ -62,7 +63,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_exp10(float x)
     const float R_LOG10_2_BY_64_TL = 0x1.04d426p-18f; // log2/(64 * log10) tail : 0.00000388665057
     const float R_LN10 = 0x1.26bb1cp+1f;
 
-    int return_nan = isnan(x);
+    int return_nan = __clc_isnan(x);
     int return_inf = x > X_MAX;
     int return_zero = x < X_MIN;
 
@@ -138,7 +139,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x)
     z2 = ldexp(z2, m);
     z2 = small_value ? z3: z2;
 
-    z2 = isnan(x) ? x : z2;
+    z2 = __clc_isnan(x) ? x : z2;
 
     z2 = x > X_MAX ? as_double(PINFBITPATT_DP64) : z2;
     z2 = x < X_MIN ? 0.0 : z2;
diff --git a/libclc/generic/lib/math/clc_fma.cl b/libclc/generic/lib/math/clc_fma.cl
index dee90e9..15de4c8 100644
--- a/libclc/generic/lib/math/clc_fma.cl
+++ b/libclc/generic/lib/math/clc_fma.cl
@@ -21,138 +21,147 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
+#include <clc/integer/clc_abs.h>
+#include <clc/relational/clc_isinf.h>
+#include <clc/relational/clc_isnan.h>
+#include <clc/shared/clc_max.h>
 
 #include "config.h"
 #include "math.h"
-#include "../clcmacro.h"
 
 struct fp {
-	ulong mantissa;
-	int exponent;
-	uint sign;
+  ulong mantissa;
+  int exponent;
+  uint sign;
 };
 
-_CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c)
-{
-	/* special cases */
-	if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b))
-		return mad(a, b, c);
+_CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) {
+  /* special cases */
+  if (__clc_isnan(a) || __clc_isnan(b) || __clc_isnan(c) || __clc_isinf(a) ||
+      __clc_isinf(b))
+    return mad(a, b, c);
 
-	/* If only c is inf, and both a,b are regular numbers, the result is c*/
-	if (isinf(c))
-		return c;
+  /* If only c is inf, and both a, b are regular numbers, the result is c */
+  if (__clc_isinf(c))
+    return c;
 
-	a = __clc_flush_denormal_if_not_supported(a);
-	b = __clc_flush_denormal_if_not_supported(b);
-	c = __clc_flush_denormal_if_not_supported(c);
+  a = __clc_flush_denormal_if_not_supported(a);
+  b = __clc_flush_denormal_if_not_supported(b);
+  c = __clc_flush_denormal_if_not_supported(c);
 
-	if (c == 0)
-		return a * b;
+  if (c == 0)
+    return a * b;
 
-	struct fp st_a, st_b, st_c;
+  struct fp st_a, st_b, st_c;
 
-	st_a.exponent = a == .0f ? 0 : ((as_uint(a) & 0x7f800000) >> 23) - 127;
-	st_b.exponent = b == .0f ? 0 : ((as_uint(b) & 0x7f800000) >> 23) - 127;
-	st_c.exponent = c == .0f ? 0 : ((as_uint(c) & 0x7f800000) >> 23) - 127;
+  st_a.exponent = a == .0f ? 0 : ((as_uint(a) & 0x7f800000) >> 23) - 127;
+  st_b.exponent = b == .0f ? 0 : ((as_uint(b) & 0x7f800000) >> 23) - 127;
+  st_c.exponent = c == .0f ? 0 : ((as_uint(c) & 0x7f800000) >> 23) - 127;
 
-	st_a.mantissa = a == .0f ? 0 : (as_uint(a) & 0x7fffff) | 0x800000;
-	st_b.mantissa = b == .0f ? 0 : (as_uint(b) & 0x7fffff) | 0x800000;
-	st_c.mantissa = c == .0f ? 0 : (as_uint(c) & 0x7fffff) | 0x800000;
+  st_a.mantissa = a == .0f ? 0 : (as_uint(a) & 0x7fffff) | 0x800000;
+  st_b.mantissa = b == .0f ? 0 : (as_uint(b) & 0x7fffff) | 0x800000;
+  st_c.mantissa = c == .0f ? 0 : (as_uint(c) & 0x7fffff) | 0x800000;
 
-	st_a.sign = as_uint(a) & 0x80000000;
-	st_b.sign = as_uint(b) & 0x80000000;
-	st_c.sign = as_uint(c) & 0x80000000;
+  st_a.sign = as_uint(a) & 0x80000000;
+  st_b.sign = as_uint(b) & 0x80000000;
+  st_c.sign = as_uint(c) & 0x80000000;
 
-	// Multiplication.
-	// Move the product to the highest bits to maximize precision
-	// mantissa is 24 bits => product is 48 bits, 2bits non-fraction.
-	// Add one bit for future addition overflow,
-	// add another bit to detect subtraction underflow
-	struct fp st_mul;
-	st_mul.sign = st_a.sign ^ st_b.sign;
-	st_mul.mantissa = (st_a.mantissa * st_b.mantissa) << 14ul;
-	st_mul.exponent = st_mul.mantissa ? st_a.exponent + st_b.exponent : 0;
+  // Multiplication.
+  // Move the product to the highest bits to maximize precision
+  // mantissa is 24 bits => product is 48 bits, 2 bits non-fraction.
+  // Add one bit for future addition overflow,
+  // add another bit to detect subtraction underflow
+  struct fp st_mul;
+  st_mul.sign = st_a.sign ^ st_b.sign;
+  st_mul.mantissa = (st_a.mantissa * st_b.mantissa) << 14ul;
+  st_mul.exponent = st_mul.mantissa ? st_a.exponent + st_b.exponent : 0;
 
-	// FIXME: Detecting a == 0 || b == 0 above crashed GCN isel
-	if (st_mul.exponent == 0 && st_mul.mantissa == 0)
-		return c;
+  // FIXME: Detecting a == 0 || b == 0 above crashed GCN isel
+  if (st_mul.exponent == 0 && st_mul.mantissa == 0)
+    return c;
 
 // Mantissa is 23 fractional bits, shift it the same way as product mantissa
 #define C_ADJUST 37ul
 
-	// both exponents are bias adjusted
-	int exp_diff = st_mul.exponent - st_c.exponent;
-
-	st_c.mantissa <<= C_ADJUST;
-	ulong cutoff_bits = 0;
-	ulong cutoff_mask = (1ul << abs(exp_diff)) - 1ul;
-	if (exp_diff > 0) {
-		cutoff_bits = exp_diff >= 64 ? st_c.mantissa : (st_c.mantissa & cutoff_mask);
-		st_c.mantissa = exp_diff >= 64 ? 0 : (st_c.mantissa >> exp_diff);
-	} else {
-		cutoff_bits = -exp_diff >= 64 ? st_mul.mantissa : (st_mul.mantissa & cutoff_mask);
-		st_mul.mantissa = -exp_diff >= 64 ? 0 : (st_mul.mantissa >> -exp_diff);
-	}
-
-	struct fp st_fma;
-	st_fma.sign = st_mul.sign;
-	st_fma.exponent = max(st_mul.exponent, st_c.exponent);
-	if (st_c.sign == st_mul.sign) {
-		st_fma.mantissa = st_mul.mantissa + st_c.mantissa;
-	} else {
-		// cutoff bits borrow one
-		st_fma.mantissa = st_mul.mantissa - st_c.mantissa - (cutoff_bits && (st_mul.exponent > st_c.exponent) ? 1 : 0);
-	}
-
-	// underflow: st_c.sign != st_mul.sign, and magnitude switches the sign
-	if (st_fma.mantissa > LONG_MAX) {
-		st_fma.mantissa = 0 - st_fma.mantissa;
-		st_fma.sign = st_mul.sign ^ 0x80000000;
-	}
-
-	// detect overflow/underflow
-	int overflow_bits = 3 - clz(st_fma.mantissa);
-
-	// adjust exponent
-	st_fma.exponent += overflow_bits;
-
-	// handle underflow
-	if (overflow_bits < 0) {
-		st_fma.mantissa <<= -overflow_bits;
-		overflow_bits = 0;
-	}
-
-	// rounding
-	ulong trunc_mask = (1ul << (C_ADJUST + overflow_bits)) - 1;
-	ulong trunc_bits = (st_fma.mantissa & trunc_mask) | (cutoff_bits != 0);
-	ulong last_bit = st_fma.mantissa & (1ul << (C_ADJUST + overflow_bits));
-	ulong grs_bits = (0x4ul << (C_ADJUST - 3 + overflow_bits));
-
-	// round to nearest even
-	if ((trunc_bits > grs_bits) ||
-	    (trunc_bits == grs_bits && last_bit != 0))
-		st_fma.mantissa += (1ul << (C_ADJUST + overflow_bits));
-
-	// Shift mantissa back to bit 23
-	st_fma.mantissa = (st_fma.mantissa >> (C_ADJUST + overflow_bits));
-
-	// Detect rounding overflow
-	if (st_fma.mantissa > 0xffffff) {
-		++st_fma.exponent;
-		st_fma.mantissa >>= 1;
-	}
-
-	if (st_fma.mantissa == 0)
-		return .0f;
-
-	// Flating point range limit
-	if (st_fma.exponent > 127)
-		return as_float(as_uint(INFINITY) | st_fma.sign);
-
-	// Flush denormals
-	if (st_fma.exponent <= -127)
-		return as_float(st_fma.sign);
-
-	return as_float(st_fma.sign | ((st_fma.exponent + 127) << 23) | ((uint)st_fma.mantissa & 0x7fffff));
+  // both exponents are bias adjusted
+  int exp_diff = st_mul.exponent - st_c.exponent;
+
+  st_c.mantissa <<= C_ADJUST;
+  ulong cutoff_bits = 0;
+  ulong cutoff_mask = (1ul << __clc_abs(exp_diff)) - 1ul;
+  if (exp_diff > 0) {
+    cutoff_bits =
+        exp_diff >= 64 ? st_c.mantissa : (st_c.mantissa & cutoff_mask);
+    st_c.mantissa = exp_diff >= 64 ? 0 : (st_c.mantissa >> exp_diff);
+  } else {
+    cutoff_bits =
+        -exp_diff >= 64 ? st_mul.mantissa : (st_mul.mantissa & cutoff_mask);
+    st_mul.mantissa = -exp_diff >= 64 ? 0 : (st_mul.mantissa >> -exp_diff);
+  }
+
+  struct fp st_fma;
+  st_fma.sign = st_mul.sign;
+  st_fma.exponent = __clc_max(st_mul.exponent, st_c.exponent);
+  if (st_c.sign == st_mul.sign) {
+    st_fma.mantissa = st_mul.mantissa + st_c.mantissa;
+  } else {
+    // cutoff bits borrow one
+    st_fma.mantissa =
+        st_mul.mantissa - st_c.mantissa -
+        (cutoff_bits && (st_mul.exponent > st_c.exponent) ? 1 : 0);
+  }
+
+  // underflow: st_c.sign != st_mul.sign, and magnitude switches the sign
+  if (st_fma.mantissa > LONG_MAX) {
+    st_fma.mantissa = 0 - st_fma.mantissa;
+    st_fma.sign = st_mul.sign ^ 0x80000000;
+  }
+
+  // detect overflow/underflow
+  int overflow_bits = 3 - clz(st_fma.mantissa);
+
+  // adjust exponent
+  st_fma.exponent += overflow_bits;
+
+  // handle underflow
+  if (overflow_bits < 0) {
+    st_fma.mantissa <<= -overflow_bits;
+    overflow_bits = 0;
+  }
+
+  // rounding
+  ulong trunc_mask = (1ul << (C_ADJUST + overflow_bits)) - 1;
+  ulong trunc_bits = (st_fma.mantissa & trunc_mask) | (cutoff_bits != 0);
+  ulong last_bit = st_fma.mantissa & (1ul << (C_ADJUST + overflow_bits));
+  ulong grs_bits = (0x4ul << (C_ADJUST - 3 + overflow_bits));
+
+  // round to nearest even
+  if ((trunc_bits > grs_bits) || (trunc_bits == grs_bits && last_bit != 0))
+    st_fma.mantissa += (1ul << (C_ADJUST + overflow_bits));
+
+  // Shift mantissa back to bit 23
+  st_fma.mantissa = (st_fma.mantissa >> (C_ADJUST + overflow_bits));
+
+  // Detect rounding overflow
+  if (st_fma.mantissa > 0xffffff) {
+    ++st_fma.exponent;
+    st_fma.mantissa >>= 1;
+  }
+
+  if (st_fma.mantissa == 0)
+    return .0f;
+
+  // Floating point range limit
+  if (st_fma.exponent > 127)
+    return as_float(as_uint(INFINITY) | st_fma.sign);
+
+  // Flush denormals
+  if (st_fma.exponent <= -127)
+    return as_float(st_fma.sign);
+
+  return as_float(st_fma.sign | ((st_fma.exponent + 127) << 23) |
+                  ((uint)st_fma.mantissa & 0x7fffff));
 }
-_CLC_TERNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_sw_fma, float, float, float)
+_CLC_TERNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_sw_fma, float,
+                       float, float)
diff --git a/libclc/generic/lib/math/clc_fmod.cl b/libclc/generic/lib/math/clc_fmod.cl
index ea9f0e4..5d10137 100644
--- a/libclc/generic/lib/math/clc_fmod.cl
+++ b/libclc/generic/lib/math/clc_fmod.cl
@@ -21,9 +21,12 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
+#include <clc/math/clc_floor.h>
+#include <clc/math/clc_trunc.h>
+#include <clc/shared/clc_max.h>
 
 #include <math/clc_remainder.h>
-#include "../clcmacro.h"
 #include "config.h"
 #include "math.h"
 
@@ -103,7 +106,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y)
     // less than the mantissa of y, ntimes will be one too large
     // but it doesn't matter - it just means that we'll go round
     // the loop below one extra time.
-    int ntimes = max(0, (xexp1 - yexp1) / 53);
+    int ntimes = __clc_max(0, (xexp1 - yexp1) / 53);
     double w =  ldexp(dy, ntimes * 53);
     w = ntimes == 0 ? dy : w;
     double scale = ntimes == 0 ? 1.0 : 0x1.0p-53;
@@ -119,7 +122,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y)
 
     for (i = 0; i < ntimes; i++) {
         // Compute integral multiplier
-        t = trunc(dx / w);
+        t = __clc_trunc(dx / w);
 
         // Compute w * t in quad precision
         p = w * t;
@@ -138,7 +141,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y)
 
     // One more time
     // Variable todd says whether the integer t is odd or not
-    t = floor(dx / w);
+    t = __clc_floor(dx / w);
     long lt = (long)t;
     int todd = lt & 1;
 
diff --git a/libclc/generic/lib/math/clc_hypot.cl b/libclc/generic/lib/math/clc_hypot.cl
index 35532a9..a17e661 100644
--- a/libclc/generic/lib/math/clc_hypot.cl
+++ b/libclc/generic/lib/math/clc_hypot.cl
@@ -21,78 +21,84 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
+#include <clc/integer/clc_abs.h>
+#include <clc/relational/clc_isnan.h>
+#include <clc/shared/clc_clamp.h>
 #include <math/clc_hypot.h>
 
 #include "config.h"
 #include "math.h"
-#include "../clcmacro.h"
-
-// Returns sqrt(x*x + y*y) with no overflow or underflow unless the result warrants it
-_CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y)
-{
-    uint ux = as_uint(x);
-    uint aux = ux & EXSIGNBIT_SP32;
-    uint uy = as_uint(y);
-    uint auy = uy & EXSIGNBIT_SP32;
-    float retval;
-    int c = aux > auy;
-    ux = c ? aux : auy;
-    uy = c ? auy : aux;
-
-    int xexp = clamp((int)(ux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32, -126, 126);
-    float fx_exp = as_float((xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
-    float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
-    float fx = as_float(ux) * fi_exp;
-    float fy = as_float(uy) * fi_exp;
-    retval = sqrt(mad(fx, fx, fy*fy)) * fx_exp;
-
-    retval = ux > PINFBITPATT_SP32 | uy == 0 ? as_float(ux) : retval;
-    retval = ux == PINFBITPATT_SP32 | uy == PINFBITPATT_SP32 ? as_float(PINFBITPATT_SP32) : retval;
-    return retval;
+
+// Returns sqrt(x*x + y*y) with no overflow or underflow unless the result
+// warrants it
+_CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) {
+  uint ux = as_uint(x);
+  uint aux = ux & EXSIGNBIT_SP32;
+  uint uy = as_uint(y);
+  uint auy = uy & EXSIGNBIT_SP32;
+  float retval;
+  int c = aux > auy;
+  ux = c ? aux : auy;
+  uy = c ? auy : aux;
+
+  int xexp =
+      __clc_clamp((int)(ux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32, -126, 126);
+  float fx_exp = as_float((xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
+  float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
+  float fx = as_float(ux) * fi_exp;
+  float fy = as_float(uy) * fi_exp;
+  retval = sqrt(mad(fx, fx, fy * fy)) * fx_exp;
+
+  retval = ux > PINFBITPATT_SP32 | uy == 0 ? as_float(ux) : retval;
+  retval = ux == PINFBITPATT_SP32 | uy == PINFBITPATT_SP32
+               ? as_float(PINFBITPATT_SP32)
+               : retval;
+  return retval;
 }
 _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_hypot, float, float)
 
 #ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y)
-{
-    ulong ux = as_ulong(x) & ~SIGNBIT_DP64;
-    int xexp = ux >> EXPSHIFTBITS_DP64;
-    x = as_double(ux);
+_CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y) {
+  ulong ux = as_ulong(x) & ~SIGNBIT_DP64;
+  int xexp = ux >> EXPSHIFTBITS_DP64;
+  x = as_double(ux);
 
-    ulong uy = as_ulong(y) & ~SIGNBIT_DP64;
-    int yexp = uy >> EXPSHIFTBITS_DP64;
-    y = as_double(uy);
+  ulong uy = as_ulong(y) & ~SIGNBIT_DP64;
+  int yexp = uy >> EXPSHIFTBITS_DP64;
+  y = as_double(uy);
 
-    int c = xexp > EXPBIAS_DP64 + 500 | yexp > EXPBIAS_DP64 + 500;
-    double preadjust = c ? 0x1.0p-600 : 1.0;
-    double postadjust = c ? 0x1.0p+600 : 1.0;
+  int c = xexp > EXPBIAS_DP64 + 500 | yexp > EXPBIAS_DP64 + 500;
+  double preadjust = c ? 0x1.0p-600 : 1.0;
+  double postadjust = c ? 0x1.0p+600 : 1.0;
 
-    c = xexp < EXPBIAS_DP64 - 500 | yexp < EXPBIAS_DP64 - 500;
-    preadjust = c ? 0x1.0p+600 : preadjust;
-    postadjust = c ? 0x1.0p-600 : postadjust;
+  c = xexp < EXPBIAS_DP64 - 500 | yexp < EXPBIAS_DP64 - 500;
+  preadjust = c ? 0x1.0p+600 : preadjust;
+  postadjust = c ? 0x1.0p-600 : postadjust;
 
-    double ax = x * preadjust;
-    double ay = y * preadjust;
+  double ax = x * preadjust;
+  double ay = y * preadjust;
 
-    // The post adjust may overflow, but this can't be avoided in any case
-    double r = sqrt(fma(ax, ax, ay*ay)) * postadjust;
+  // The post adjust may overflow, but this can't be avoided in any case
+  double r = sqrt(fma(ax, ax, ay * ay)) * postadjust;
 
-    // If the difference in exponents between x and y is large
-    double s = x + y;
-    c = abs(xexp - yexp) > MANTLENGTH_DP64 + 1;
-    r = c ? s : r;
+  // If the difference in exponents between x and y is large, use x + y
+  double s = x + y;
+  c = __clc_abs(xexp - yexp) > MANTLENGTH_DP64 + 1;
+  r = c ? s : r;
 
-    // Check for NaN
-    //c = x != x | y != y;
-    c = isnan(x) | isnan(y);
-    r = c ? as_double(QNANBITPATT_DP64) : r;
+  // Check for NaN
+  // c = x != x | y != y;
+  c = __clc_isnan(x) | __clc_isnan(y);
+  r = c ? as_double(QNANBITPATT_DP64) : r;
 
-    // If either is Inf, we must return Inf
-    c = x == as_double(PINFBITPATT_DP64) | y == as_double(PINFBITPATT_DP64);
-    r = c ? as_double(PINFBITPATT_DP64) : r;
+  // If either is Inf, we must return Inf
+  c = x == as_double(PINFBITPATT_DP64) | y == as_double(PINFBITPATT_DP64);
+  r = c ? as_double(PINFBITPATT_DP64) : r;
 
-    return r;
+  return r;
 }
 
-_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_hypot, double, double)
+_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_hypot, double,
+                      double)
 #endif
diff --git a/libclc/generic/lib/math/clc_ldexp.cl b/libclc/generic/lib/math/clc_ldexp.cl
index 61e34a5..6d37215 100644
--- a/libclc/generic/lib/math/clc_ldexp.cl
+++ b/libclc/generic/lib/math/clc_ldexp.cl
@@ -20,76 +20,80 @@
  * THE SOFTWARE.
  */
 
-#include <clc/clc.h>
 #include "config.h"
-#include "../clcmacro.h"
 #include "math.h"
+#include <clc/clc.h>
+#include <clc/clcmacro.h>
+#include <clc/relational/clc_isinf.h>
+#include <clc/relational/clc_isnan.h>
+#include <clc/shared/clc_clamp.h>
 
 _CLC_DEF _CLC_OVERLOAD float __clc_ldexp(float x, int n) {
 
-	if (!__clc_fp32_subnormals_supported()) {
-
-		// This treats subnormals as zeros
-		int i = as_int(x);
-		int e = (i >> 23) & 0xff;
-		int m = i & 0x007fffff;
-		int s = i & 0x80000000;
-		int v = add_sat(e, n);
-		v = clamp(v, 0, 0xff);
-		int mr = e == 0 | v == 0 | v == 0xff ? 0 : m;
-		int c = e == 0xff;
-		mr = c ? m : mr;
-		int er = c ? e : v;
-		er = e ? er : e;
-		return as_float( s | (er << 23) | mr );
-	}
-
-	/* supports denormal values */
-	const int multiplier = 24;
-	float val_f;
-	uint val_ui;
-	uint sign;
-	int exponent;
-	val_ui = as_uint(x);
-	sign = val_ui & 0x80000000;
-	val_ui = val_ui & 0x7fffffff;/* remove the sign bit */
-	int val_x = val_ui;
-
-	exponent = val_ui >> 23; /* get the exponent */
-	int dexp = exponent;
-
-	/* denormal support */
-	int fbh = 127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0f)) >> 23);
-	int dexponent = 25 - fbh;
-	uint dval_ui = (( (val_ui << fbh) & 0x007fffff) | (dexponent << 23));
-	int ex = dexponent + n - multiplier;
-	dexponent = ex;
-	uint val = sign | (ex << 23) | (dval_ui & 0x007fffff);
-	int ex1 = dexponent + multiplier;
-	ex1 = -ex1 +25;
-	dval_ui = (((dval_ui & 0x007fffff )| 0x800000) >> ex1);
-	dval_ui = dexponent > 0 ? val :dval_ui;
-	dval_ui = dexponent > 254 ? 0x7f800000 :dval_ui;  /*overflow*/
-	dval_ui = dexponent < -multiplier ? 0 : dval_ui;  /*underflow*/
-	dval_ui = dval_ui | sign;
-	val_f = as_float(dval_ui);
-
-	exponent += n;
-
-	val = sign | (exponent << 23) | (val_ui & 0x007fffff);
-	ex1 = exponent + multiplier;
-	ex1 = -ex1 +25;
-	val_ui = (((val_ui & 0x007fffff )| 0x800000) >> ex1);
-	val_ui = exponent > 0 ? val :val_ui;
-	val_ui = exponent > 254 ? 0x7f800000 :val_ui;  /*overflow*/
-	val_ui = exponent < -multiplier ? 0 : val_ui;  /*underflow*/
-	val_ui = val_ui | sign;
-
-	val_ui = dexp == 0? dval_ui : val_ui;
-	val_f = as_float(val_ui);
-
-	val_f = isnan(x) | isinf(x) | val_x == 0 ? x : val_f;
-	return val_f;
+  if (!__clc_fp32_subnormals_supported()) {
+
+    // This treats subnormals as zeros
+    int i = as_int(x);
+    int e = (i >> 23) & 0xff;
+    int m = i & 0x007fffff;
+    int s = i & 0x80000000;
+    int v = add_sat(e, n);
+    v = __clc_clamp(v, 0, 0xff);
+    int mr = e == 0 | v == 0 | v == 0xff ? 0 : m;
+    int c = e == 0xff;
+    mr = c ? m : mr;
+    int er = c ? e : v;
+    er = e ? er : e;
+    return as_float(s | (er << 23) | mr);
+  }
+
+  /* supports denormal values */
+  const int multiplier = 24;
+  float val_f;
+  uint val_ui;
+  uint sign;
+  int exponent;
+  val_ui = as_uint(x);
+  sign = val_ui & 0x80000000;
+  val_ui = val_ui & 0x7fffffff; /* remove the sign bit */
+  int val_x = val_ui;
+
+  exponent = val_ui >> 23; /* get the exponent */
+  int dexp = exponent;
+
+  /* denormal support */
+  int fbh =
+      127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0f)) >> 23);
+  int dexponent = 25 - fbh;
+  uint dval_ui = (((val_ui << fbh) & 0x007fffff) | (dexponent << 23));
+  int ex = dexponent + n - multiplier;
+  dexponent = ex;
+  uint val = sign | (ex << 23) | (dval_ui & 0x007fffff);
+  int ex1 = dexponent + multiplier;
+  ex1 = -ex1 + 25;
+  dval_ui = (((dval_ui & 0x007fffff) | 0x800000) >> ex1);
+  dval_ui = dexponent > 0 ? val : dval_ui;
+  dval_ui = dexponent > 254 ? 0x7f800000 : dval_ui; /*overflow*/
+  dval_ui = dexponent < -multiplier ? 0 : dval_ui;  /*underflow*/
+  dval_ui = dval_ui | sign;
+  val_f = as_float(dval_ui);
+
+  exponent += n;
+
+  val = sign | (exponent << 23) | (val_ui & 0x007fffff);
+  ex1 = exponent + multiplier;
+  ex1 = -ex1 + 25;
+  val_ui = (((val_ui & 0x007fffff) | 0x800000) >> ex1);
+  val_ui = exponent > 0 ? val : val_ui;
+  val_ui = exponent > 254 ? 0x7f800000 : val_ui; /*overflow*/
+  val_ui = exponent < -multiplier ? 0 : val_ui;  /*underflow*/
+  val_ui = val_ui | sign;
+
+  val_ui = dexp == 0 ? dval_ui : val_ui;
+  val_f = as_float(val_ui);
+
+  val_f = __clc_isnan(x) | __clc_isinf(x) | val_x == 0 ? x : val_f;
+  return val_f;
 }
 
 #ifdef cl_khr_fp64
@@ -97,32 +101,44 @@ _CLC_DEF _CLC_OVERLOAD float __clc_ldexp(float x, int n) {
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
 _CLC_DEF _CLC_OVERLOAD double __clc_ldexp(double x, int n) {
-	long l = as_ulong(x);
-	int e = (l >> 52) & 0x7ff;
-	long s = l & 0x8000000000000000;
+  long l = as_ulong(x);
+  int e = (l >> 52) & 0x7ff;
+  long s = l & 0x8000000000000000;
 
-	ulong ux = as_ulong(x * 0x1.0p+53);
-	int de = ((int)(ux >> 52) & 0x7ff) - 53;
-	int c = e == 0;
-	e = c ? de: e;
+  ulong ux = as_ulong(x * 0x1.0p+53);
+  int de = ((int)(ux >> 52) & 0x7ff) - 53;
+  int c = e == 0;
+  e = c ? de : e;
 
-	ux = c ? ux : l;
+  ux = c ? ux : l;
 
-	int v = e + n;
-	v = clamp(v, -0x7ff, 0x7ff);
+  int v = e + n;
+  v = __clc_clamp(v, -0x7ff, 0x7ff);
 
-	ux &= ~EXPBITS_DP64;
+  ux &= ~EXPBITS_DP64;
 
-	double mr = as_double(ux | ((ulong)(v+53) << 52));
-	mr = mr * 0x1.0p-53;
+  double mr = as_double(ux | ((ulong)(v + 53) << 52));
+  mr = mr * 0x1.0p-53;
 
-	mr = v > 0  ? as_double(ux | ((ulong)v << 52)) : mr;
+  mr = v > 0 ? as_double(ux | ((ulong)v << 52)) : mr;
 
-	mr = v == 0x7ff ? as_double(s | PINFBITPATT_DP64)  : mr;
-	mr = v < -53 ? as_double(s) : mr;
+  mr = v == 0x7ff ? as_double(s | PINFBITPATT_DP64) : mr;
+  mr = v < -53 ? as_double(s) : mr;
 
-	mr  = ((n == 0) | isinf(x) | (x == 0) ) ? x : mr;
-	return mr;
+  mr = ((n == 0) | __clc_isinf(x) | (x == 0)) ? x : mr;
+  return mr;
 }
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+// half ldexp: widen x to float, call the float overload, narrow the result.
+_CLC_OVERLOAD _CLC_DEF half __clc_ldexp(half x, int n) {
+  return (half)__clc_ldexp((float)x, n);
+}
+
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __clc_ldexp, half, int)
+
+#endif
diff --git a/libclc/generic/lib/math/clc_nextafter.cl b/libclc/generic/lib/math/clc_nextafter.cl
index d32ef70..623eb11 100644
--- a/libclc/generic/lib/math/clc_nextafter.cl
+++ b/libclc/generic/lib/math/clc_nextafter.cl
@@ -1,41 +1,44 @@
 #include <clc/clc.h>
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
+#include <clc/relational/clc_isnan.h>
 
 // This file provides OpenCL C implementations of nextafter for
 // targets that don't support the clang builtin.
 
 #define AS_TYPE(x) as_##x
 
-#define NEXTAFTER(FLOAT_TYPE, UINT_TYPE, INT_TYPE)                      \
-_CLC_OVERLOAD _CLC_DEF FLOAT_TYPE __clc_nextafter(FLOAT_TYPE x, FLOAT_TYPE y) { \
-  const UINT_TYPE sign_bit                                        \
-   = (UINT_TYPE)1 << (sizeof(INT_TYPE) * 8 - 1);                  \
-  const UINT_TYPE sign_bit_mask = sign_bit - 1;                   \
-  INT_TYPE ix = AS_TYPE(INT_TYPE)(x);                             \
-  INT_TYPE ax = ix & sign_bit_mask;                               \
-  INT_TYPE mx = sign_bit - ix;                                    \
-  mx = ix < 0 ? mx : ix;                                          \
-  INT_TYPE iy = AS_TYPE(INT_TYPE)(y);                             \
-  INT_TYPE ay = iy & sign_bit_mask;                               \
-  INT_TYPE my = sign_bit - iy;                                    \
-  my = iy < 0 ? my : iy;                                          \
-  INT_TYPE t = mx + (mx < my ? 1 : -1);                           \
-  INT_TYPE r = sign_bit - t;                                      \
-  r = t < 0 ? r : t;                                              \
-  r = isnan(x) ? ix : r;                                          \
-  r = isnan(y) ? iy : r;                                          \
-  r = ((ax | ay) == 0 | ix == iy) ? iy : r;                       \
-  return AS_TYPE(FLOAT_TYPE)(r);                                  \
-}
+#define NEXTAFTER(FLOAT_TYPE, UINT_TYPE, INT_TYPE)                             \
+  _CLC_OVERLOAD _CLC_DEF FLOAT_TYPE __clc_nextafter(FLOAT_TYPE x,              \
+                                                    FLOAT_TYPE y) {            \
+    const UINT_TYPE sign_bit = (UINT_TYPE)1 << (sizeof(INT_TYPE) * 8 - 1);     \
+    const UINT_TYPE sign_bit_mask = sign_bit - 1;                              \
+    INT_TYPE ix = AS_TYPE(INT_TYPE)(x);                                        \
+    INT_TYPE ax = ix & sign_bit_mask;                                          \
+    INT_TYPE mx = sign_bit - ix;                                               \
+    mx = ix < 0 ? mx : ix;                                                     \
+    INT_TYPE iy = AS_TYPE(INT_TYPE)(y);                                        \
+    INT_TYPE ay = iy & sign_bit_mask;                                          \
+    INT_TYPE my = sign_bit - iy;                                               \
+    my = iy < 0 ? my : iy;                                                     \
+    INT_TYPE t = mx + (mx < my ? 1 : -1);                                      \
+    INT_TYPE r = sign_bit - t;                                                 \
+    r = t < 0 ? r : t;                                                         \
+    r = __clc_isnan(x) ? ix : r;                                               \
+    r = __clc_isnan(y) ? iy : r;                                               \
+    r = ((ax | ay) == 0 | ix == iy) ? iy : r;                                  \
+    return AS_TYPE(FLOAT_TYPE)(r);                                             \
+  }
 
 NEXTAFTER(float, uint, int)
-_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_nextafter, float, float)
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_nextafter, float,
+                      float)
 
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
 NEXTAFTER(double, ulong, long)
-_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __clc_nextafter, double, double)
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __clc_nextafter, double,
+                      double)
 #endif
 
 #ifdef cl_khr_fp16
diff --git a/libclc/generic/lib/math/clc_pow.cl b/libclc/generic/lib/math/clc_pow.cl
index 02063a2..2e2dade 100644
--- a/libclc/generic/lib/math/clc_pow.cl
+++ b/libclc/generic/lib/math/clc_pow.cl
@@ -21,11 +21,12 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
+#include <clc/math/clc_fabs.h>
 
 #include "config.h"
 #include "math.h"
 #include "tables.h"
-#include "../clcmacro.h"
 
 /*
  compute pow using log and exp
@@ -80,7 +81,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pow(float x, float y)
      *  First handle case that x is close to 1
      */
     float r = 1.0f - as_float(ax);
-    int near1 = fabs(r) < 0x1.0p-4f;
+    int near1 = __clc_fabs(r) < 0x1.0p-4f;
     float r2 = r*r;
 
     /* Coefficients are just 1/3, 1/4, 1/5 and 1/6 */
diff --git a/libclc/generic/lib/math/clc_pown.cl b/libclc/generic/lib/math/clc_pown.cl
index 0b7ac32..031bf9b 100644
--- a/libclc/generic/lib/math/clc_pown.cl
+++ b/libclc/generic/lib/math/clc_pown.cl
@@ -21,11 +21,12 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
+#include <clc/math/clc_fabs.h>
 
 #include "config.h"
 #include "math.h"
 #include "tables.h"
-#include "../clcmacro.h"
 
 // compute pow using log and exp
 // x^y = exp(y * log(x))
@@ -78,7 +79,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pown(float x, int ny)
     // Extra precise log calculation
     // First handle case that x is close to 1
     float r = 1.0f - as_float(ax);
-    int near1 = fabs(r) < 0x1.0p-4f;
+    int near1 = __clc_fabs(r) < 0x1.0p-4f;
     float r2 = r*r;
 
     // Coefficients are just 1/3, 1/4, 1/5 and 1/6
@@ -368,3 +369,15 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny)
 }
 _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_pown, double, int)
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+// half pown: widen x to float, call the float overload, narrow the result.
+_CLC_OVERLOAD _CLC_DEF half __clc_pown(half x, int y) {
+    return (half)__clc_pown((float)x, y);
+}
+
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __clc_pown, half, int)
+
+#endif
diff --git a/libclc/generic/lib/math/clc_powr.cl b/libclc/generic/lib/math/clc_powr.cl
index ef97d3c..c431f52 100644
--- a/libclc/generic/lib/math/clc_powr.cl
+++ b/libclc/generic/lib/math/clc_powr.cl
@@ -21,11 +21,12 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
+#include <clc/math/clc_fabs.h>
 
 #include "config.h"
 #include "math.h"
 #include "tables.h"
-#include "../clcmacro.h"
 
 // compute pow using log and exp
 // x^y = exp(y * log(x))
@@ -76,7 +77,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_powr(float x, float y)
     // Extra precise log calculation
     // First handle case that x is close to 1
     float r = 1.0f - as_float(ax);
-    int near1 = fabs(r) < 0x1.0p-4f;
+    int near1 = __clc_fabs(r) < 0x1.0p-4f;
     float r2 = r*r;
 
     // Coefficients are just 1/3, 1/4, 1/5 and 1/6
diff --git a/libclc/generic/lib/math/clc_remainder.cl b/libclc/generic/lib/math/clc_remainder.cl
index ba50ee3..8a0ce88 100644
--- a/libclc/generic/lib/math/clc_remainder.cl
+++ b/libclc/generic/lib/math/clc_remainder.cl
@@ -21,9 +21,12 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
+#include <clc/math/clc_floor.h>
+#include <clc/math/clc_trunc.h>
+#include <clc/shared/clc_max.h>
 
 #include <math/clc_remainder.h>
-#include "../clcmacro.h"
 #include "config.h"
 #include "math.h"
 
@@ -113,7 +116,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y)
     // less than the mantissa of y, ntimes will be one too large
     // but it doesn't matter - it just means that we'll go round
     // the loop below one extra time.
-    int ntimes = max(0, (xexp1 - yexp1) / 53);
+    int ntimes = __clc_max(0, (xexp1 - yexp1) / 53);
     double w =  ldexp(dy, ntimes * 53);
     w = ntimes == 0 ? dy : w;
     double scale = ntimes == 0 ? 1.0 : 0x1.0p-53;
@@ -129,7 +132,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y)
 
     for (i = 0; i < ntimes; i++) {
         // Compute integral multiplier
-        t = trunc(dx / w);
+        t = __clc_trunc(dx / w);
 
         // Compute w * t in quad precision
         p = w * t;
@@ -148,7 +151,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y)
 
     // One more time
     // Variable todd says whether the integer t is odd or not
-    t = floor(dx / w);
+    t = __clc_floor(dx / w);
     long lt = (long)t;
     int todd = lt & 1;
 
diff --git a/libclc/generic/lib/math/clc_remquo.cl b/libclc/generic/lib/math/clc_remquo.cl
index 3b9159a..8d2e5f9 100644
--- a/libclc/generic/lib/math/clc_remquo.cl
+++ b/libclc/generic/lib/math/clc_remquo.cl
@@ -21,236 +21,268 @@
  */
 
 #include <clc/clc.h>
-
+#include <clc/clcmacro.h>
+#include <clc/math/clc_floor.h>
+#include <clc/math/clc_trunc.h>
+#include <clc/shared/clc_max.h>
 #include <math/clc_remainder.h>
-#include "../clcmacro.h"
+
 #include "config.h"
 #include "math.h"
 
-_CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y, __private int *quo)
-{
-    x = __clc_flush_denormal_if_not_supported(x);
-    y = __clc_flush_denormal_if_not_supported(y);
-    int ux = as_int(x);
-    int ax = ux & EXSIGNBIT_SP32;
-    float xa = as_float(ax);
-    int sx = ux ^ ax;
-    int ex = ax >> EXPSHIFTBITS_SP32;
-
-    int uy = as_int(y);
-    int ay = uy & EXSIGNBIT_SP32;
-    float ya = as_float(ay);
-    int sy = uy ^ ay;
-    int ey = ay >> EXPSHIFTBITS_SP32;
-
-    float xr = as_float(0x3f800000 | (ax & 0x007fffff));
-    float yr = as_float(0x3f800000 | (ay & 0x007fffff));
-    int c;
-    int k = ex - ey;
-
-    uint q = 0;
-
-    while (k > 0) {
-        c = xr >= yr;
-        q = (q << 1) | c;
-        xr -= c ? yr : 0.0f;
-        xr += xr;
-	--k;
-    }
-
-    c = xr > yr;
+_CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y,
+                                          __private int *quo) {
+  x = __clc_flush_denormal_if_not_supported(x);
+  y = __clc_flush_denormal_if_not_supported(y);
+  int ux = as_int(x);
+  int ax = ux & EXSIGNBIT_SP32;
+  float xa = as_float(ax);
+  int sx = ux ^ ax;
+  int ex = ax >> EXPSHIFTBITS_SP32;
+
+  int uy = as_int(y);
+  int ay = uy & EXSIGNBIT_SP32;
+  float ya = as_float(ay);
+  int sy = uy ^ ay;
+  int ey = ay >> EXPSHIFTBITS_SP32;
+
+  float xr = as_float(0x3f800000 | (ax & 0x007fffff));
+  float yr = as_float(0x3f800000 | (ay & 0x007fffff));
+  int c;
+  int k = ex - ey;
+
+  uint q = 0;
+
+  while (k > 0) {
+    c = xr >= yr;
     q = (q << 1) | c;
     xr -= c ? yr : 0.0f;
+    xr += xr;
+    --k;
+  }
 
-    int lt = ex < ey;
+  c = xr > yr;
+  q = (q << 1) | c;
+  xr -= c ? yr : 0.0f;
 
-    q = lt ? 0 : q;
-    xr = lt ? xa : xr;
-    yr = lt ? ya : yr;
+  int lt = ex < ey;
 
-    c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & ((q & 0x1) == 0x1));
-    xr -= c ? yr : 0.0f;
-    q += c;
+  q = lt ? 0 : q;
+  xr = lt ? xa : xr;
+  yr = lt ? ya : yr;
 
-    float s = as_float(ey << EXPSHIFTBITS_SP32);
-    xr *= lt ? 1.0f : s;
+  c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & ((q & 0x1) == 0x1));
+  xr -= c ? yr : 0.0f;
+  q += c;
 
-    int qsgn = sx == sy ? 1 : -1;
-    int quot = (q & 0x7f) * qsgn;
+  float s = as_float(ey << EXPSHIFTBITS_SP32);
+  xr *= lt ? 1.0f : s;
 
-    c = ax == ay;
-    quot = c ? qsgn : quot;
-    xr = c ? 0.0f : xr;
+  int qsgn = sx == sy ? 1 : -1;
+  int quot = (q & 0x7f) * qsgn;
 
-    xr = as_float(sx ^ as_int(xr));
+  c = ax == ay;
+  quot = c ? qsgn : quot;
+  xr = c ? 0.0f : xr;
 
-    c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | ay == 0;
-    quot = c ? 0 : quot;
-    xr = c ? as_float(QNANBITPATT_SP32) : xr;
+  xr = as_float(sx ^ as_int(xr));
 
-    *quo = quot;
+  c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 |
+      ay == 0;
+  quot = c ? 0 : quot;
+  xr = c ? as_float(QNANBITPATT_SP32) : xr;
 
-    return xr;
-}
-// remquo singature is special, we don't have macro for this
-#define __VEC_REMQUO(TYPE, VEC_SIZE, HALF_VEC_SIZE) \
-_CLC_DEF _CLC_OVERLOAD TYPE##VEC_SIZE __clc_remquo(TYPE##VEC_SIZE x, TYPE##VEC_SIZE y, __private int##VEC_SIZE *quo) \
-{ \
-	int##HALF_VEC_SIZE lo, hi; \
-	TYPE##VEC_SIZE ret; \
-	ret.lo = __clc_remquo(x.lo, y.lo, &lo); \
-	ret.hi = __clc_remquo(x.hi, y.hi, &hi); \
-	(*quo).lo = lo; \
-	(*quo).hi = hi; \
-	return ret; \
+  *quo = quot;
+
+  return xr;
 }
-__VEC_REMQUO(float, 2,)
-__VEC_REMQUO(float, 3, 2)
+// remquo signature is special, we don't have macro for this
+#define __VEC_REMQUO(TYPE, VEC_SIZE, HALF_VEC_SIZE)                            \
+  _CLC_DEF _CLC_OVERLOAD TYPE##VEC_SIZE __clc_remquo(                          \
+      TYPE##VEC_SIZE x, TYPE##VEC_SIZE y, __private int##VEC_SIZE *quo) {      \
+    int##HALF_VEC_SIZE lo, hi;                                                 \
+    TYPE##VEC_SIZE ret;                                                        \
+    ret.lo = __clc_remquo(x.lo, y.lo, &lo);                                    \
+    ret.hi = __clc_remquo(x.hi, y.hi, &hi);                                    \
+    (*quo).lo = lo;                                                            \
+    (*quo).hi = hi;                                                            \
+    return ret;                                                                \
+  }
+
+#define __VEC3_REMQUO(TYPE)                                                    \
+  _CLC_DEF _CLC_OVERLOAD TYPE##3 __clc_remquo(TYPE##3 x, TYPE##3 y,            \
+                                              __private int##3 * quo) {        \
+    int2 lo;                                                                   \
+    int hi;                                                                    \
+    TYPE##3 ret;                                                               \
+    ret.s01 = __clc_remquo(x.s01, y.s01, &lo);                                 \
+    ret.s2 = __clc_remquo(x.s2, y.s2, &hi);                                    \
+    (*quo).s01 = lo;                                                           \
+    (*quo).s2 = hi;                                                            \
+    return ret;                                                                \
+  }
+__VEC_REMQUO(float, 2, )
+__VEC3_REMQUO(float)
 __VEC_REMQUO(float, 4, 2)
 __VEC_REMQUO(float, 8, 4)
 __VEC_REMQUO(float, 16, 8)
 
 #ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y, __private int *pquo)
-{
-    ulong ux = as_ulong(x);
-    ulong ax = ux & ~SIGNBIT_DP64;
-    ulong xsgn = ux ^ ax;
-    double dx = as_double(ax);
-    int xexp = convert_int(ax >> EXPSHIFTBITS_DP64);
-    int xexp1 = 11 - (int) clz(ax & MANTBITS_DP64);
-    xexp1 = xexp < 1 ? xexp1 : xexp;
-
-    ulong uy = as_ulong(y);
-    ulong ay = uy & ~SIGNBIT_DP64;
-    double dy = as_double(ay);
-    int yexp = convert_int(ay >> EXPSHIFTBITS_DP64);
-    int yexp1 = 11 - (int) clz(ay & MANTBITS_DP64);
-    yexp1 = yexp < 1 ? yexp1 : yexp;
-
-    int qsgn = ((ux ^ uy) & SIGNBIT_DP64) == 0UL ? 1 : -1;
-
-    // First assume |x| > |y|
-
-    // Set ntimes to the number of times we need to do a
-    // partial remainder. If the exponent of x is an exact multiple
-    // of 53 larger than the exponent of y, and the mantissa of x is
-    // less than the mantissa of y, ntimes will be one too large
-    // but it doesn't matter - it just means that we'll go round
-    // the loop below one extra time.
-    int ntimes = max(0, (xexp1 - yexp1) / 53);
-    double w =  ldexp(dy, ntimes * 53);
-    w = ntimes == 0 ? dy : w;
-    double scale = ntimes == 0 ? 1.0 : 0x1.0p-53;
-
-    // Each time round the loop we compute a partial remainder.
-    // This is done by subtracting a large multiple of w
-    // from x each time, where w is a scaled up version of y.
-    // The subtraction must be performed exactly in quad
-    // precision, though the result at each stage can
-    // fit exactly in a double precision number.
-    int i;
-    double t, v, p, pp;
-
-    for (i = 0; i < ntimes; i++) {
-        // Compute integral multiplier
-        t = trunc(dx / w);
-
-        // Compute w * t in quad precision
-        p = w * t;
-        pp = fma(w, t, -p);
-
-        // Subtract w * t from dx
-        v = dx - p;
-        dx = v + (((dx - v) - p) - pp);
-
-        // If t was one too large, dx will be negative. Add back one w.
-        dx += dx < 0.0 ? w : 0.0;
-
-        // Scale w down by 2^(-53) for the next iteration
-        w *= scale;
-    }
-
-    // One more time
-    // Variable todd says whether the integer t is odd or not
-    t = floor(dx / w);
-    long lt = (long)t;
-    int todd = lt & 1;
-
+_CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y,
+                                           __private int *pquo) {
+  ulong ux = as_ulong(x);
+  ulong ax = ux & ~SIGNBIT_DP64;
+  ulong xsgn = ux ^ ax;
+  double dx = as_double(ax);
+  int xexp = convert_int(ax >> EXPSHIFTBITS_DP64);
+  int xexp1 = 11 - (int)clz(ax & MANTBITS_DP64);
+  xexp1 = xexp < 1 ? xexp1 : xexp;
+
+  ulong uy = as_ulong(y);
+  ulong ay = uy & ~SIGNBIT_DP64;
+  double dy = as_double(ay);
+  int yexp = convert_int(ay >> EXPSHIFTBITS_DP64);
+  int yexp1 = 11 - (int)clz(ay & MANTBITS_DP64);
+  yexp1 = yexp < 1 ? yexp1 : yexp;
+
+  int qsgn = ((ux ^ uy) & SIGNBIT_DP64) == 0UL ? 1 : -1;
+
+  // First assume |x| > |y|
+
+  // Set ntimes to the number of times we need to do a
+  // partial remainder. If the exponent of x is an exact multiple
+  // of 53 larger than the exponent of y, and the mantissa of x is
+  // less than the mantissa of y, ntimes will be one too large
+  // but it doesn't matter - it just means that we'll go round
+  // the loop below one extra time.
+  int ntimes = __clc_max(0, (xexp1 - yexp1) / 53);
+  double w = ldexp(dy, ntimes * 53);
+  w = ntimes == 0 ? dy : w;
+  double scale = ntimes == 0 ? 1.0 : 0x1.0p-53;
+
+  // Each time round the loop we compute a partial remainder.
+  // This is done by subtracting a large multiple of w
+  // from x each time, where w is a scaled up version of y.
+  // The subtraction must be performed exactly in quad
+  // precision, though the result at each stage can
+  // fit exactly in a double precision number.
+  int i;
+  double t, v, p, pp;
+
+  for (i = 0; i < ntimes; i++) {
+    // Compute integral multiplier
+    t = __clc_trunc(dx / w);
+
+    // Compute w * t in quad precision
     p = w * t;
     pp = fma(w, t, -p);
+
+    // Subtract w * t from dx
     v = dx - p;
     dx = v + (((dx - v) - p) - pp);
-    i = dx < 0.0;
-    todd ^= i;
-    dx += i ? w : 0.0;
-
-    lt -= i;
-
-    // At this point, dx lies in the range [0,dy)
-
-    // For the remainder function, we need to adjust dx
-    // so that it lies in the range (-y/2, y/2] by carefully
-    // subtracting w (== dy == y) if necessary. The rigmarole
-    // with todd is to get the correct sign of the result
-    // when x/y lies exactly half way between two integers,
-    // when we need to choose the even integer.
-
-    int al = (2.0*dx > w) | (todd & (2.0*dx == w));
-    double dxl = dx - (al ? w : 0.0);
-
-    int ag = (dx > 0.5*w) | (todd & (dx == 0.5*w));
-    double dxg = dx - (ag ? w : 0.0);
-
-    dx = dy < 0x1.0p+1022 ? dxl : dxg;
-    lt += dy < 0x1.0p+1022 ? al : ag;
-    int quo = ((int)lt & 0x7f) * qsgn;
-
-    double ret = as_double(xsgn ^ as_ulong(dx));
-    dx = as_double(ax);
-
-    // Now handle |x| == |y|
-    int c = dx == dy;
-    t = as_double(xsgn);
-    quo = c ? qsgn : quo;
-    ret = c ? t : ret;
-
-    // Next, handle |x| < |y|
-    c = dx < dy;
-    quo = c ? 0 : quo;
-    ret = c ? x : ret;
-
-    c &= (yexp < 1023 & 2.0*dx > dy) | (dx > 0.5*dy);
-    quo = c ? qsgn : quo;
-    // we could use a conversion here instead since qsgn = +-1
-    p = qsgn == 1 ? -1.0 : 1.0;
-    t = fma(y, p, x);
-    ret = c ? t : ret;
-
-    // We don't need anything special for |x| == 0
-
-    // |y| is 0
-    c = dy == 0.0;
-    quo = c ? 0 : quo;
-    ret = c ? as_double(QNANBITPATT_DP64) : ret;
-
-    // y is +-Inf, NaN
-    c = yexp > BIASEDEMAX_DP64;
-    quo = c ? 0 : quo;
-    t = y == y ? x : y;
-    ret = c ? t : ret;
-
-    // x is +=Inf, NaN
-    c = xexp > BIASEDEMAX_DP64;
-    quo = c ? 0 : quo;
-    ret = c ? as_double(QNANBITPATT_DP64) : ret;
-
-    *pquo = quo;
-    return ret;
+
+    // If t was one too large, dx will be negative. Add back one w.
+    dx += dx < 0.0 ? w : 0.0;
+
+    // Scale w down by 2^(-53) for the next iteration
+    w *= scale;
+  }
+
+  // One more time
+  // Variable todd says whether the integer t is odd or not
+  t = __clc_floor(dx / w);
+  long lt = (long)t;
+  int todd = lt & 1;
+
+  p = w * t;
+  pp = fma(w, t, -p);
+  v = dx - p;
+  dx = v + (((dx - v) - p) - pp);
+  i = dx < 0.0;
+  todd ^= i;
+  dx += i ? w : 0.0;
+
+  lt -= i;
+
+  // At this point, dx lies in the range [0,dy)
+
+  // For the remainder function, we need to adjust dx
+  // so that it lies in the range (-y/2, y/2] by carefully
+  // subtracting w (== dy == y) if necessary. The rigmarole
+  // with todd is to get the correct sign of the result
+  // when x/y lies exactly half way between two integers,
+  // when we need to choose the even integer.
+
+  int al = (2.0 * dx > w) | (todd & (2.0 * dx == w));
+  double dxl = dx - (al ? w : 0.0);
+
+  int ag = (dx > 0.5 * w) | (todd & (dx == 0.5 * w));
+  double dxg = dx - (ag ? w : 0.0);
+
+  dx = dy < 0x1.0p+1022 ? dxl : dxg;
+  lt += dy < 0x1.0p+1022 ? al : ag;
+  int quo = ((int)lt & 0x7f) * qsgn;
+
+  double ret = as_double(xsgn ^ as_ulong(dx));
+  dx = as_double(ax);
+
+  // Now handle |x| == |y|
+  int c = dx == dy;
+  t = as_double(xsgn);
+  quo = c ? qsgn : quo;
+  ret = c ? t : ret;
+
+  // Next, handle |x| < |y|
+  c = dx < dy;
+  quo = c ? 0 : quo;
+  ret = c ? x : ret;
+
+  c &= (yexp<1023 & 2.0 * dx> dy) | (dx > 0.5 * dy);
+  quo = c ? qsgn : quo;
+  // we could use a conversion here instead since qsgn = +-1
+  p = qsgn == 1 ? -1.0 : 1.0;
+  t = fma(y, p, x);
+  ret = c ? t : ret;
+
+  // We don't need anything special for |x| == 0
+
+  // |y| is 0
+  c = dy == 0.0;
+  quo = c ? 0 : quo;
+  ret = c ? as_double(QNANBITPATT_DP64) : ret;
+
+  // y is +-Inf, NaN
+  c = yexp > BIASEDEMAX_DP64;
+  quo = c ? 0 : quo;
+  t = y == y ? x : y;
+  ret = c ? t : ret;
+
+  // x is +-Inf, NaN
+  c = xexp > BIASEDEMAX_DP64;
+  quo = c ? 0 : quo;
+  ret = c ? as_double(QNANBITPATT_DP64) : ret;
+
+  *pquo = quo;
+  return ret;
 }
-__VEC_REMQUO(double, 2,)
-__VEC_REMQUO(double, 3, 2)
+__VEC_REMQUO(double, 2, )
+__VEC3_REMQUO(double)
 __VEC_REMQUO(double, 4, 2)
 __VEC_REMQUO(double, 8, 4)
 __VEC_REMQUO(double, 16, 8)
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_OVERLOAD _CLC_DEF half __clc_remquo(half x, half y, __private int *pquo) {
+  return (half)__clc_remquo((float)x, (float)y, pquo);
+}
+__VEC_REMQUO(half, 2, )
+__VEC3_REMQUO(half)
+__VEC_REMQUO(half, 4, 2)
+__VEC_REMQUO(half, 8, 4)
+__VEC_REMQUO(half, 16, 8)
+
+#endif
diff --git a/libclc/generic/lib/math/clc_rootn.cl b/libclc/generic/lib/math/clc_rootn.cl
index 0a2c98d..eee9c9f 100644
--- a/libclc/generic/lib/math/clc_rootn.cl
+++ b/libclc/generic/lib/math/clc_rootn.cl
@@ -21,11 +21,12 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
+#include <clc/math/clc_fabs.h>
 
 #include "config.h"
 #include "math.h"
 #include "tables.h"
-#include "../clcmacro.h"
 
 // compute pow using log and exp
 // x^y = exp(y * log(x))
@@ -78,7 +79,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny)
     // Extra precise log calculation
     // First handle case that x is close to 1
     float r = 1.0f - as_float(ax);
-    int near1 = fabs(r) < 0x1.0p-4f;
+    int near1 = __clc_fabs(r) < 0x1.0p-4f;
     float r2 = r*r;
 
     // Coefficients are just 1/3, 1/4, 1/5 and 1/6
@@ -368,3 +369,15 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny)
 }
 _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_rootn, double, int)
 #endif
+
+#ifdef cl_khr_fp16
+// Half overload: evaluate via the float __clc_rootn and narrow the result.
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_OVERLOAD _CLC_DEF half __clc_rootn(half x, int y) {
+    return (half)__clc_rootn((float)x, y);
+}
+
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __clc_rootn, half, int);
+
+#endif // cl_khr_fp16
diff --git a/libclc/generic/lib/math/clc_sqrt.cl b/libclc/generic/lib/math/clc_sqrt.cl
index 14a48aa..92c7f6e 100644
--- a/libclc/generic/lib/math/clc_sqrt.cl
+++ b/libclc/generic/lib/math/clc_sqrt.cl
@@ -25,7 +25,7 @@
 // Map the llvm sqrt intrinsic to an OpenCL function.
 #define __CLC_FUNCTION __clc_llvm_intr_sqrt
 #define __CLC_INTRINSIC "llvm.sqrt"
-#include <math/unary_intrin.inc>
+#include <clc/math/unary_intrin.inc>
 #undef __CLC_FUNCTION
 #undef __CLC_INTRINSIC
 
diff --git a/libclc/generic/lib/math/clc_sw_binary.inc b/libclc/generic/lib/math/clc_sw_binary.inc
index 7741475c..b701d78 100644
--- a/libclc/generic/lib/math/clc_sw_binary.inc
+++ b/libclc/generic/lib/math/clc_sw_binary.inc
@@ -1,12 +1,26 @@
-#include <utils.h>
+#include <clc/utils.h>
 
 #define __CLC_SW_FUNC(x) __CLC_CONCAT(__clc_, x)
 
-// TODO: Enable half precision when the sw routine is implemented
 #if __CLC_FPSIZE > 16
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNC(__CLC_GENTYPE x, __CLC_GENTYPE y) {
   return __CLC_SW_FUNC(__CLC_FUNC)(x, y);
 }
+#elif __CLC_FPSIZE == 16 // half: call the __clc_ routine on floats, convert back
+#ifdef __CLC_SCALAR // scalar half: unsuffixed convert_float / convert_half
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNC(__CLC_GENTYPE x,
+                                                __CLC_GENTYPE y) {
+  return convert_half(
+      __CLC_SW_FUNC(__CLC_FUNC)(convert_float(x), convert_float(y)));
+}
+#else // vector half: __CLC_VECSIZE is appended to the convert_* names
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNC(__CLC_GENTYPE x,
+                                                __CLC_GENTYPE y) {
+  return __CLC_XCONCAT(convert_half, __CLC_VECSIZE)(__CLC_SW_FUNC(__CLC_FUNC)(
+      __CLC_XCONCAT(convert_float, __CLC_VECSIZE)(x),
+      __CLC_XCONCAT(convert_float, __CLC_VECSIZE)(y)));
+}
+#endif // __CLC_SCALAR
 #endif
 
 #undef __CLC_SW_FUNC
diff --git a/libclc/generic/lib/math/clc_sw_unary.inc b/libclc/generic/lib/math/clc_sw_unary.inc
index cd148b0..0cf242d 100644
--- a/libclc/generic/lib/math/clc_sw_unary.inc
+++ b/libclc/generic/lib/math/clc_sw_unary.inc
@@ -1,12 +1,22 @@
-#include <utils.h>
+#include <clc/utils.h>
 
 #define __CLC_SW_FUNC(x) __CLC_CONCAT(__clc_, x)
 
-// TODO: Enable half precision when the sw routine is implemented
 #if __CLC_FPSIZE > 16
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNC(__CLC_GENTYPE x) {
   return __CLC_SW_FUNC(__CLC_FUNC)(x);
 }
+#elif __CLC_FPSIZE == 16 // half: call the __clc_ routine on a float, convert back
+#ifdef __CLC_SCALAR // scalar half: unsuffixed convert_float / convert_half
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNC(__CLC_GENTYPE x) {
+  return convert_half(__CLC_SW_FUNC(__CLC_FUNC)(convert_float(x)));
+}
+#else // vector half: __CLC_VECSIZE is appended to the convert_* names
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNC(__CLC_GENTYPE x) {
+  return __CLC_XCONCAT(convert_half, __CLC_VECSIZE)(__CLC_SW_FUNC(__CLC_FUNC)(
+      __CLC_XCONCAT(convert_float, __CLC_VECSIZE)(x)));
+}
+#endif // __CLC_SCALAR
 #endif
 
 #undef __CLC_SW_FUNC
diff --git a/libclc/generic/lib/math/clc_tan.cl b/libclc/generic/lib/math/clc_tan.cl
index ebba36a..4daaee5 100644
--- a/libclc/generic/lib/math/clc_tan.cl
+++ b/libclc/generic/lib/math/clc_tan.cl
@@ -20,52 +20,55 @@
  * THE SOFTWARE.
  */
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
+#include <clc/math/clc_fabs.h>
+#include <clc/relational/clc_isinf.h>
+#include <clc/relational/clc_isnan.h>
 
 #include "math.h"
 #include "sincos_helpers.h"
-#include "../clcmacro.h"
 #include "tables.h"
 
-_CLC_DEF _CLC_OVERLOAD float __clc_tan(float x)
-{
-    int ix = as_int(x);
-    int ax = ix & 0x7fffffff;
-    float dx = as_float(ax);
+_CLC_DEF _CLC_OVERLOAD float __clc_tan(float x) {
+  int ix = as_int(x); // raw bits of x
+  int ax = ix & 0x7fffffff; // sign bit cleared: bits of |x|
+  float dx = as_float(ax); // |x|, the value handed to the argument reduction
 
-    float r0, r1;
-    int regn = __clc_argReductionS(&r0, &r1, dx);
+  float r0, r1;
+  int regn = __clc_argReductionS(&r0, &r1, dx);
 
-    float t = __clc_tanf_piby4(r0 + r1, regn);
-    t = as_float(as_int(t) ^ (ix ^ ax));
+  float t = __clc_tanf_piby4(r0 + r1, regn);
+  t = as_float(as_int(t) ^ (ix ^ ax)); // ix ^ ax is x's sign bit; XOR flips t's sign when it is set
 
-    t = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : t;
-    //Take care of subnormals
-    t = (x == 0.0f) ? x : t;
-    return t;
+  t = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : t; // |x| bits at or above PINFBITPATT_SP32 yield the QNaN pattern
+  // Take care of subnormals
+  t = (x == 0.0f) ? x : t;
+  return t;
 }
 _CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_tan, float);
 
 #ifdef cl_khr_fp64
 #include "sincosD_piby4.h"
 
-_CLC_DEF _CLC_OVERLOAD double __clc_tan(double x)
-{
-    double y = fabs(x);
+_CLC_DEF _CLC_OVERLOAD double __clc_tan(double x) {
+  double y = __clc_fabs(x); // reduce |x|; the sign is reapplied through t.hi below
 
-    double r, rr;
-    int regn;
+  double r, rr;
+  int regn;
 
-    if (y < 0x1.0p+30)
-        __clc_remainder_piby2_medium(y, &r, &rr, &regn);
-    else
-        __clc_remainder_piby2_large(y, &r, &rr, &regn);
+  if (y < 0x1.0p+30) // |x| below 2^30 takes the "medium" reduction, otherwise the "large" one
+    __clc_remainder_piby2_medium(y, &r, &rr, &regn);
+  else
+    __clc_remainder_piby2_large(y, &r, &rr, &regn);
 
-    double2 tt = __clc_tan_piby4(r, rr);
+  double2 tt = __clc_tan_piby4(r, rr);
 
-    int2 t = as_int2(regn & 1 ? tt.y : tt.x);
-    t.hi ^= (x < 0.0) << 31;
+  int2 t = as_int2(regn & 1 ? tt.y : tt.x); // odd regn selects tt.y, even selects tt.x
+  t.hi ^= (x < 0.0) << 31; // toggle bit 31 of t.hi when x < 0 (presumably the double's sign bit; assumes little-endian layout)
 
-    return isnan(x) || isinf(x) ? as_double(QNANBITPATT_DP64) : as_double(t);
+  return __clc_isnan(x) || __clc_isinf(x) ? as_double(QNANBITPATT_DP64)
+                                          : as_double(t);
 }
 _CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_tan, double);
+
 #endif
diff --git a/libclc/generic/lib/math/clc_tanpi.cl b/libclc/generic/lib/math/clc_tanpi.cl
index d57c3ce..65d1984 100644
--- a/libclc/generic/lib/math/clc_tanpi.cl
+++ b/libclc/generic/lib/math/clc_tanpi.cl
@@ -20,10 +20,10 @@
  * THE SOFTWARE.
  */
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
 #include "sincos_helpers.h"
-#include "../clcmacro.h"
 #include "tables.h"
 
 _CLC_DEF _CLC_OVERLOAD float __clc_tanpi(float x)
diff --git a/libclc/generic/lib/math/copysign.cl b/libclc/generic/lib/math/copysign.cl
index df65e9d..08045be 100644
--- a/libclc/generic/lib/math/copysign.cl
+++ b/libclc/generic/lib/math/copysign.cl
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
 
 _CLC_DEFINE_BINARY_BUILTIN(float, copysign, __builtin_copysignf, float, float)
 
diff --git a/libclc/generic/lib/math/cos.cl b/libclc/generic/lib/math/cos.cl
index 157447f..4219289 100644
--- a/libclc/generic/lib/math/cos.cl
+++ b/libclc/generic/lib/math/cos.cl
@@ -21,10 +21,10 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
 #include "sincos_helpers.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float cos(float x)
 {
@@ -75,3 +75,5 @@ _CLC_OVERLOAD _CLC_DEF double cos(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cos, double);
 
 #endif
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(cos)
diff --git a/libclc/generic/lib/math/cosh.cl b/libclc/generic/lib/math/cosh.cl
index 1a67275..1f58d7a 100644
--- a/libclc/generic/lib/math/cosh.cl
+++ b/libclc/generic/lib/math/cosh.cl
@@ -21,10 +21,10 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
 #include "tables.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float cosh(float x) {
 
@@ -190,3 +190,5 @@ _CLC_OVERLOAD _CLC_DEF double cosh(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cosh, double)
 
 #endif
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(cosh)
diff --git a/libclc/generic/lib/math/cospi.cl b/libclc/generic/lib/math/cospi.cl
index 108b637..0e69f78 100644
--- a/libclc/generic/lib/math/cospi.cl
+++ b/libclc/generic/lib/math/cospi.cl
@@ -21,11 +21,11 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
 #include "sincos_helpers.h"
 #include "sincospiF_piby4.h"
-#include "../clcmacro.h"
 #ifdef cl_khr_fp64
 #include "sincosD_piby4.h"
 #endif
@@ -134,3 +134,5 @@ _CLC_OVERLOAD _CLC_DEF double cospi(double x) {
 }
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cospi, double);
 #endif
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(cospi)
diff --git a/libclc/generic/lib/math/erf.cl b/libclc/generic/lib/math/erf.cl
index 2c395ce..ae8b6ab 100644
--- a/libclc/generic/lib/math/erf.cl
+++ b/libclc/generic/lib/math/erf.cl
@@ -21,9 +21,9 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
-#include "../clcmacro.h"
 
 /*
  * ====================================================
diff --git a/libclc/generic/lib/math/erfc.cl b/libclc/generic/lib/math/erfc.cl
index cd35ea8..c4d34ea 100644
--- a/libclc/generic/lib/math/erfc.cl
+++ b/libclc/generic/lib/math/erfc.cl
@@ -21,9 +21,9 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
-#include "../clcmacro.h"
 
 /*
  * ====================================================
diff --git a/libclc/generic/lib/math/exp.cl b/libclc/generic/lib/math/exp.cl
index 37f693c..1e37d76 100644
--- a/libclc/generic/lib/math/exp.cl
+++ b/libclc/generic/lib/math/exp.cl
@@ -21,9 +21,9 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float exp(float x) {
 
@@ -88,3 +88,5 @@ _CLC_OVERLOAD _CLC_DEF double exp(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, exp, double)
 
 #endif
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(exp)
diff --git a/libclc/generic/lib/math/exp2.cl b/libclc/generic/lib/math/exp2.cl
index 1ddccbd..8d71831 100644
--- a/libclc/generic/lib/math/exp2.cl
+++ b/libclc/generic/lib/math/exp2.cl
@@ -21,9 +21,9 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float exp2(float x) {
 
diff --git a/libclc/generic/lib/math/expm1.cl b/libclc/generic/lib/math/expm1.cl
index 9a3a907..fbb9f0d 100644
--- a/libclc/generic/lib/math/expm1.cl
+++ b/libclc/generic/lib/math/expm1.cl
@@ -1,8 +1,8 @@
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
 #include "tables.h"
-#include "../clcmacro.h"
 
 /* Refer to the exp routine for the underlying algorithm */
 
@@ -140,3 +140,5 @@ _CLC_OVERLOAD _CLC_DEF double expm1(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, expm1, double)
 
 #endif
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(expm1)
diff --git a/libclc/generic/lib/math/fabs.cl b/libclc/generic/lib/math/fabs.cl
index 0a70370..9644369 100644
--- a/libclc/generic/lib/math/fabs.cl
+++ b/libclc/generic/lib/math/fabs.cl
@@ -1,10 +1,6 @@
 #include <clc/clc.h>
-#include "../clcmacro.h"
-
-// Map the llvm intrinsic to an OpenCL function.
-#define __CLC_FUNCTION __clc_fabs
-#define __CLC_INTRINSIC "llvm.fabs"
-#include "math/unary_intrin.inc"
+#include <clc/clcmacro.h>
+#include <clc/math/clc_fabs.h>
 
 #undef __CLC_FUNCTION
 #define __CLC_FUNCTION fabs
diff --git a/libclc/generic/lib/math/fdim.inc b/libclc/generic/lib/math/fdim.inc
index 9aa3496..98cbef6 100644
--- a/libclc/generic/lib/math/fdim.inc
+++ b/libclc/generic/lib/math/fdim.inc
@@ -69,3 +69,28 @@ __CLC_FDIM_VEC(16)
 #undef __CLC_FDIM_VEC
 #endif
 #endif
+
+#if __CLC_FPSIZE == 16 // half-precision fdim
+#ifdef __CLC_SCALAR // the scalar pass also emits the half2..half16 overloads below
+#define QNANBITPATT_FP16 ((short)0x7e00) // half quiet-NaN bit pattern
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fdim(__CLC_GENTYPE x,
+                                          private __CLC_GENTYPE y) {
+  short n = -(isnan(x) | isnan(y)) & QNANBITPATT_FP16; // NaN bits if x or y is NaN, else 0
+  short r = -(x > y) & as_short(x - y); // bits of x - y if x > y, else 0
+  return as_half((short)(n | r)); // OR the two masked bit patterns together
+}
+#define __CLC_FDIM_VEC(width)                                                  \
+  _CLC_OVERLOAD _CLC_DEF half##width fdim(half##width x, half##width y) {      \
+    /* See comment in float implementation for explanation. */                 \
+    short##width n = ~((x == x) & (y == y)) & QNANBITPATT_FP16;                \
+    short##width r = (x > y) & as_short##width(x - y);                         \
+    return as_half##width(n | r);                                              \
+  }
+__CLC_FDIM_VEC(2)
+__CLC_FDIM_VEC(3)
+__CLC_FDIM_VEC(4)
+__CLC_FDIM_VEC(8)
+__CLC_FDIM_VEC(16)
+#undef __CLC_FDIM_VEC
+#endif // __CLC_SCALAR
+#endif // __CLC_FPSIZE == 16
diff --git a/libclc/generic/lib/math/floor.cl b/libclc/generic/lib/math/floor.cl
index de215e4..f5c36b7 100644
--- a/libclc/generic/lib/math/floor.cl
+++ b/libclc/generic/lib/math/floor.cl
@@ -1,10 +1,6 @@
 #include <clc/clc.h>
-#include "../clcmacro.h"
-
-// Map the llvm intrinsic to an OpenCL function.
-#define __CLC_FUNCTION __clc_floor
-#define __CLC_INTRINSIC "llvm.floor"
-#include "math/unary_intrin.inc"
+#include <clc/clcmacro.h>
+#include <clc/math/clc_floor.h>
 
 #undef __CLC_FUNCTION
 #define __CLC_FUNCTION floor
diff --git a/libclc/generic/lib/math/fmax.cl b/libclc/generic/lib/math/fmax.cl
index 5c269ce..c42fe4f 100644
--- a/libclc/generic/lib/math/fmax.cl
+++ b/libclc/generic/lib/math/fmax.cl
@@ -1,6 +1,5 @@
 #include <clc/clc.h>
-
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
 
 _CLC_DEFINE_BINARY_BUILTIN(float, fmax, __builtin_fmaxf, float, float);
 
diff --git a/libclc/generic/lib/math/fmin.cl b/libclc/generic/lib/math/fmin.cl
index 45c112d..55575d0 100644
--- a/libclc/generic/lib/math/fmin.cl
+++ b/libclc/generic/lib/math/fmin.cl
@@ -1,6 +1,5 @@
 #include <clc/clc.h>
-
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
 
 _CLC_DEFINE_BINARY_BUILTIN(float, fmin, __builtin_fminf, float, float);
 
diff --git a/libclc/generic/lib/math/frexp.cl b/libclc/generic/lib/math/frexp.cl
index cd2c717..75a9158 100644
--- a/libclc/generic/lib/math/frexp.cl
+++ b/libclc/generic/lib/math/frexp.cl
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
-#include <utils.h>
+#include <clc/utils.h>
 
 #define __CLC_BODY <frexp.inc>
 #define __CLC_ADDRESS_SPACE private
diff --git a/libclc/generic/lib/math/frexp.inc b/libclc/generic/lib/math/frexp.inc
index b61cc35..0d938d2 100644
--- a/libclc/generic/lib/math/frexp.inc
+++ b/libclc/generic/lib/math/frexp.inc
@@ -21,6 +21,8 @@
  * THE SOFTWARE.
  */
 
+#include <clc/clcmacro.h>
+
 #define __CLC_AS_GENTYPE __CLC_XCONCAT(as_, __CLC_GENTYPE)
 #define __CLC_AS_INTN __CLC_XCONCAT(as_, __CLC_INTN)
 
@@ -40,6 +42,17 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(__CLC_GENTYPE x, __CLC_ADDRESS_SPACE
 }
 #endif
 
+#if __CLC_FPSIZE == 16 // half-precision frexp
+#ifdef __CLC_SCALAR // scalar definition; vector forms come from the VECTORIZE macro below
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(__CLC_GENTYPE x,
+                                           __CLC_ADDRESS_SPACE __CLC_INTN *ep) {
+  return (__CLC_GENTYPE)frexp((float)x, ep); // widen x, forward ep, narrow the result
+}
+_CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, __CLC_GENTYPE, frexp,
+                      __CLC_GENTYPE, __CLC_ADDRESS_SPACE, __CLC_INTN);
+#endif // __CLC_SCALAR
+#endif // __CLC_FPSIZE == 16
+
 #if __CLC_FPSIZE == 64
 #ifdef __CLC_SCALAR
 #define __CLC_AS_LONGN as_long
diff --git a/libclc/generic/lib/math/half_binary.inc b/libclc/generic/lib/math/half_binary.inc
index f831b53..2dc48e5 100644
--- a/libclc/generic/lib/math/half_binary.inc
+++ b/libclc/generic/lib/math/half_binary.inc
@@ -1,4 +1,4 @@
-#include <utils.h>
+#include <clc/utils.h>
 
 #define __CLC_HALF_FUNC(x) __CLC_CONCAT(half_, x)
 
diff --git a/libclc/generic/lib/math/half_unary.inc b/libclc/generic/lib/math/half_unary.inc
index a68f91a..aac668a 100644
--- a/libclc/generic/lib/math/half_unary.inc
+++ b/libclc/generic/lib/math/half_unary.inc
@@ -1,4 +1,4 @@
-#include <utils.h>
+#include <clc/utils.h>
 
 #define __CLC_HALF_FUNC(x) __CLC_CONCAT(half_, x)
 
diff --git a/libclc/generic/lib/math/ilogb.cl b/libclc/generic/lib/math/ilogb.cl
index 050239c..f16b440 100644
--- a/libclc/generic/lib/math/ilogb.cl
+++ b/libclc/generic/lib/math/ilogb.cl
@@ -21,9 +21,9 @@
  * THE SOFTWARE.
  */
 
-#include <clc/clc.h>
-#include "../clcmacro.h"
 #include "math.h"
+#include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 _CLC_OVERLOAD _CLC_DEF int ilogb(float x) {
     uint ux = as_uint(x);
@@ -71,3 +71,15 @@ _CLC_OVERLOAD _CLC_DEF int ilogb(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, ilogb, double);
 
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+// Half overload: widen the argument to float and reuse the float ilogb.
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_OVERLOAD _CLC_DEF int ilogb(half x) {
+    return ilogb((float)x);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, ilogb, half);
+
+#endif // cl_khr_fp16
diff --git a/libclc/generic/lib/math/ldexp.cl b/libclc/generic/lib/math/ldexp.cl
index 190a4d5..a999c63 100644
--- a/libclc/generic/lib/math/ldexp.cl
+++ b/libclc/generic/lib/math/ldexp.cl
@@ -20,11 +20,11 @@
  * THE SOFTWARE.
  */
 
-#include <clc/clc.h>
 #include "config.h"
-#include "../clcmacro.h"
 #include "math.h"
 #include "math/clc_ldexp.h"
+#include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 _CLC_DEFINE_BINARY_BUILTIN(float, ldexp, __clc_ldexp, float, int)
 
diff --git a/libclc/generic/lib/math/lgamma.cl b/libclc/generic/lib/math/lgamma.cl
index 26cd20e..ca7b961 100644
--- a/libclc/generic/lib/math/lgamma.cl
+++ b/libclc/generic/lib/math/lgamma.cl
@@ -22,7 +22,7 @@
  */
 
 #include <clc/clc.h>
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
 
 _CLC_OVERLOAD _CLC_DEF float lgamma(float x) {
     int s;
@@ -41,4 +41,6 @@ _CLC_OVERLOAD _CLC_DEF double lgamma(double x) {
 
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, lgamma, double)
 
-#endif
-\ No newline at end of file
+#endif
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(lgamma)
diff --git a/libclc/generic/lib/math/lgamma_r.cl b/libclc/generic/lib/math/lgamma_r.cl
index ff44738..bd68a76 100644
--- a/libclc/generic/lib/math/lgamma_r.cl
+++ b/libclc/generic/lib/math/lgamma_r.cl
@@ -22,8 +22,8 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
-#include "../clcmacro.h"
 #include "math.h"
 
 /*
@@ -486,6 +486,17 @@ _CLC_OVERLOAD _CLC_DEF double lgamma_r(double x, private int *ip) {
 _CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, lgamma_r, double, private, int)
 #endif
 
+#ifdef cl_khr_fp16
+// Half overload: widen x to float, forward iptr, narrow the result to half.
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_OVERLOAD _CLC_DEF half lgamma_r(half x, private int *iptr) {
+    return (half)lgamma_r((float)x, iptr);
+}
+
+_CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, lgamma_r, half, private, int);
+
+#endif // cl_khr_fp16
 
 #define __CLC_ADDRSPACE global
 #define __CLC_BODY <lgamma_r.inc>
diff --git a/libclc/generic/lib/math/lgamma_r.inc b/libclc/generic/lib/math/lgamma_r.inc
index 0e19ba8..8aa17fb 100644
--- a/libclc/generic/lib/math/lgamma_r.inc
+++ b/libclc/generic/lib/math/lgamma_r.inc
@@ -21,12 +21,9 @@
  * THE SOFTWARE.
  */
 
-// TODO: Enable half precision when the base version is implemented.
-#if __CLC_FPSIZE > 16
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE lgamma_r(__CLC_GENTYPE x, __CLC_ADDRSPACE __CLC_INTN *iptr) {
     __CLC_INTN private_iptr;
     __CLC_GENTYPE ret = lgamma_r(x, &private_iptr);
     *iptr = private_iptr;
     return ret;
 }
-#endif
diff --git a/libclc/generic/lib/math/log.cl b/libclc/generic/lib/math/log.cl
index ec1faa1..336c801 100644
--- a/libclc/generic/lib/math/log.cl
+++ b/libclc/generic/lib/math/log.cl
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
 
 /*
  *log(x) = log2(x) * (1/log2(e))
diff --git a/libclc/generic/lib/math/log10.cl b/libclc/generic/lib/math/log10.cl
index 35a53a1..3abb14a 100644
--- a/libclc/generic/lib/math/log10.cl
+++ b/libclc/generic/lib/math/log10.cl
@@ -20,14 +20,18 @@
  * THE SOFTWARE.
  */
 
-#include <clc/clc.h>
-#include "../clcmacro.h"
 #include "tables.h"
+#include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #endif // cl_khr_fp64
 
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // cl_khr_fp16
+
 #define COMPILING_LOG10
 #include "log_base.h"
 #undef COMPILING_LOG10
@@ -37,3 +41,7 @@ _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, log10, float);
 #ifdef cl_khr_fp64
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log10, double);
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, log10, half);
+#endif // cl_khr_fp16
diff --git a/libclc/generic/lib/math/log1p.cl b/libclc/generic/lib/math/log1p.cl
index be25c64..a371995 100644
--- a/libclc/generic/lib/math/log1p.cl
+++ b/libclc/generic/lib/math/log1p.cl
@@ -21,10 +21,10 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
 #include "tables.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float log1p(float x)
 {
@@ -175,3 +175,5 @@ _CLC_OVERLOAD _CLC_DEF double log1p(double x)
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log1p, double);
 
 #endif // cl_khr_fp64
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(log1p)
diff --git a/libclc/generic/lib/math/log2.cl b/libclc/generic/lib/math/log2.cl
index 8776a80..a6f9692 100644
--- a/libclc/generic/lib/math/log2.cl
+++ b/libclc/generic/lib/math/log2.cl
@@ -20,14 +20,18 @@
  * THE SOFTWARE.
  */
 
-#include <clc/clc.h>
-#include "../clcmacro.h"
 #include "tables.h"
+#include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #endif // cl_khr_fp64
 
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // cl_khr_fp16
+
 #define COMPILING_LOG2
 #include "log_base.h"
 #undef COMPILING_LOG2
@@ -37,3 +41,7 @@ _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, log2, float);
 #ifdef cl_khr_fp64
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log2, double);
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, log2, half);
+#endif // cl_khr_fp16
diff --git a/libclc/generic/lib/math/log_base.h b/libclc/generic/lib/math/log_base.h
index 4e20329..b8110ca 100644
--- a/libclc/generic/lib/math/log_base.h
+++ b/libclc/generic/lib/math/log_base.h
@@ -295,3 +295,22 @@ log(double x)
 }
 
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+// Half overload of log2/log10/log (chosen by COMPILING_*), computed in float.
+_CLC_OVERLOAD _CLC_DEF half
+#if defined(COMPILING_LOG2)
+log2(half x) {
+  return (half)log2((float)x);
+}
+#elif defined(COMPILING_LOG10)
+log10(half x) {
+  return (half)log10((float)x);
+}
+#else
+log(half x) {
+  return (half)log((float)x);
+}
+#endif // COMPILING_LOG2 / COMPILING_LOG10
+
+#endif // cl_khr_fp16
diff --git a/libclc/generic/lib/math/logb.cl b/libclc/generic/lib/math/logb.cl
index 31e5161..7a7111d 100644
--- a/libclc/generic/lib/math/logb.cl
+++ b/libclc/generic/lib/math/logb.cl
@@ -1,6 +1,6 @@
-#include <clc/clc.h>
 #include "math.h"
-#include "../clcmacro.h"
+#include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 _CLC_OVERLOAD _CLC_DEF float logb(float x) {
     int ax = as_int(x) & EXSIGNBIT_SP32;
@@ -29,3 +29,5 @@ _CLC_OVERLOAD _CLC_DEF double logb(double x) {
 
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, logb, double)
 #endif
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(logb)
diff --git a/libclc/generic/lib/math/math.h b/libclc/generic/lib/math/math.h
index 351e37d..d5ef087 100644
--- a/libclc/generic/lib/math/math.h
+++ b/libclc/generic/lib/math/math.h
@@ -40,7 +40,7 @@
 
 #if (defined __AMDGCN__ || defined __R600__) && !defined __HAS_FMAF__
 #define HAVE_HW_FMA32() (0)
-#elif defined CLC_SPIRV || defined CLC_SPIRV64
+#elif defined(CLC_SPIRV)
 bool __attribute__((noinline)) __clc_runtime_has_hw_fma32(void);
 #define HAVE_HW_FMA32() __clc_runtime_has_hw_fma32()
 #else
diff --git a/libclc/generic/lib/math/maxmag.cl b/libclc/generic/lib/math/maxmag.cl
index 7b5902d..12d22ae 100644
--- a/libclc/generic/lib/math/maxmag.cl
+++ b/libclc/generic/lib/math/maxmag.cl
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
-#include <utils.h>
+#include <clc/utils.h>
 
 #define __CLC_BODY <maxmag.inc>
 #include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/minmag.cl b/libclc/generic/lib/math/minmag.cl
index 0d898820..e9c9c82 100644
--- a/libclc/generic/lib/math/minmag.cl
+++ b/libclc/generic/lib/math/minmag.cl
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
-#include <utils.h>
+#include <clc/utils.h>
 
 #define __CLC_BODY <minmag.inc>
 #include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/nan.cl b/libclc/generic/lib/math/nan.cl
index 03752ab..8f89e8e 100644
--- a/libclc/generic/lib/math/nan.cl
+++ b/libclc/generic/lib/math/nan.cl
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
-#include "utils.h"
+#include <clc/utils.h>
 
 #define __CLC_AS_GENTYPE __CLC_XCONCAT(as_, __CLC_GENTYPE)
 #define __CLC_BODY <nan.inc>
diff --git a/libclc/generic/lib/math/native_unary_intrinsic.inc b/libclc/generic/lib/math/native_unary_intrinsic.inc
index 5640141e..c0a3efd 100644
--- a/libclc/generic/lib/math/native_unary_intrinsic.inc
+++ b/libclc/generic/lib/math/native_unary_intrinsic.inc
@@ -20,14 +20,14 @@
  * THE SOFTWARE.
  */
 
-#include <utils.h>
+#include <clc/utils.h>
 
 #ifdef __CLC_SCALAR
 #define __CLC_FUNCTION __CLC_XCONCAT(__clc_native_, __CLC_NATIVE_INTRINSIC)
 #define __CLC_INTRINSIC "llvm." __CLC_XSTR(__CLC_NATIVE_INTRINSIC)
 
 #undef cl_khr_fp64
-#include <math/unary_intrin.inc>
+#include <clc/math/unary_intrin.inc>
 
 #endif
 
diff --git a/libclc/generic/lib/math/pown.inc b/libclc/generic/lib/math/pown.inc
index 2add2c7..84729d9 100644
--- a/libclc/generic/lib/math/pown.inc
+++ b/libclc/generic/lib/math/pown.inc
@@ -1,6 +1,3 @@
-// TODO: Enable half precision when the sw routine is implemented
-#if __CLC_FPSIZE > 16
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE pown(__CLC_GENTYPE x, __CLC_INTN y) {
   return __clc_pown(x, y);
 }
-#endif
diff --git a/libclc/generic/lib/math/remquo.inc b/libclc/generic/lib/math/remquo.inc
index c33b5dd..c1de78a 100644
--- a/libclc/generic/lib/math/remquo.inc
+++ b/libclc/generic/lib/math/remquo.inc
@@ -1,9 +1,6 @@
-// TODO: Enable half precision when the sw routine is implemented
-#if __CLC_FPSIZE > 16
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE remquo(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_ADDRESS_SPACE __CLC_INTN *q) {
   __CLC_INTN local_q;
   __CLC_GENTYPE ret = __clc_remquo(x, y, &local_q);
   *q = local_q;
   return ret;
 }
-#endif
diff --git a/libclc/generic/lib/math/rint.cl b/libclc/generic/lib/math/rint.cl
index 5d9f4b1..185bbbb 100644
--- a/libclc/generic/lib/math/rint.cl
+++ b/libclc/generic/lib/math/rint.cl
@@ -1,9 +1,5 @@
 #include <clc/clc.h>
-
-// Map the llvm intrinsic to an OpenCL function.
-#define __CLC_FUNCTION __clc_rint
-#define __CLC_INTRINSIC "llvm.rint"
-#include "math/unary_intrin.inc"
+#include <clc/math/clc_rint.h>
 
 #undef __CLC_FUNCTION
 #define __CLC_FUNCTION rint
diff --git a/libclc/generic/lib/math/rootn.inc b/libclc/generic/lib/math/rootn.inc
index f788649..3f5b00c 100644
--- a/libclc/generic/lib/math/rootn.inc
+++ b/libclc/generic/lib/math/rootn.inc
@@ -1,6 +1,3 @@
-// TODO: Enable half precision when the sw routine is implemented
-#if __CLC_FPSIZE > 16
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE rootn(__CLC_GENTYPE x, __CLC_INTN y) {
   return __clc_rootn(x, y);
 }
-#endif
diff --git a/libclc/generic/lib/math/round.cl b/libclc/generic/lib/math/round.cl
index 17c72c9..285328a 100644
--- a/libclc/generic/lib/math/round.cl
+++ b/libclc/generic/lib/math/round.cl
@@ -3,7 +3,7 @@
 // Map the llvm intrinsic to an OpenCL function.
 #define __CLC_FUNCTION __clc_round
 #define __CLC_INTRINSIC "llvm.round"
-#include "math/unary_intrin.inc"
+#include <clc/math/unary_intrin.inc>
 
 #undef __CLC_FUNCTION
 #define __CLC_FUNCTION round
diff --git a/libclc/generic/lib/math/rsqrt.cl b/libclc/generic/lib/math/rsqrt.cl
index 131ffc1..b38d4a1 100644
--- a/libclc/generic/lib/math/rsqrt.cl
+++ b/libclc/generic/lib/math/rsqrt.cl
@@ -1,6 +1,5 @@
 #include <clc/clc.h>
-
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
 
 _CLC_OVERLOAD _CLC_DEF float rsqrt(float x)
 {
diff --git a/libclc/generic/lib/math/sin.cl b/libclc/generic/lib/math/sin.cl
index 3a40749..30638a5 100644
--- a/libclc/generic/lib/math/sin.cl
+++ b/libclc/generic/lib/math/sin.cl
@@ -21,10 +21,10 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
 #include "sincos_helpers.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float sin(float x)
 {
@@ -77,3 +77,5 @@ _CLC_OVERLOAD _CLC_DEF double sin(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sin, double);
 
 #endif
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(sin)
diff --git a/libclc/generic/lib/math/sincos.inc b/libclc/generic/lib/math/sincos.inc
index 2318ffb..e97f0f9 100644
--- a/libclc/generic/lib/math/sincos.inc
+++ b/libclc/generic/lib/math/sincos.inc
@@ -1,5 +1,3 @@
-// TODO: Enable half precision when sin/cos is implemented
-#if __CLC_FPSIZE > 16
 #define __CLC_DECLARE_SINCOS(ADDRSPACE, TYPE) \
   _CLC_OVERLOAD _CLC_DEF TYPE sincos (TYPE x, ADDRSPACE TYPE * cosval) { \
     *cosval = cos(x); \
@@ -11,4 +9,3 @@ __CLC_DECLARE_SINCOS(local, __CLC_GENTYPE)
 __CLC_DECLARE_SINCOS(private, __CLC_GENTYPE)
 
 #undef __CLC_DECLARE_SINCOS
-#endif
diff --git a/libclc/generic/lib/math/sincos_helpers.cl b/libclc/generic/lib/math/sincos_helpers.cl
index 3c466bc..0adecf6 100644
--- a/libclc/generic/lib/math/sincos_helpers.cl
+++ b/libclc/generic/lib/math/sincos_helpers.cl
@@ -21,6 +21,7 @@
  */
 
 #include <clc/clc.h>
+#include <clc/shared/clc_max.h>
 
 #include "math.h"
 #include "tables.h"
@@ -372,7 +373,7 @@ _CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, int *
 
     long ux = as_long(x);
     int e = (int)(ux >> 52) -  1023;
-    int i = max(23, (e >> 3) + 17);
+    int i = __clc_max(23, (e >> 3) + 17);
     int j = 150 - i;
     int j16 = j & ~0xf;
     double fract_temp;
diff --git a/libclc/generic/lib/math/sinh.cl b/libclc/generic/lib/math/sinh.cl
index 9159b89..3de0792 100644
--- a/libclc/generic/lib/math/sinh.cl
+++ b/libclc/generic/lib/math/sinh.cl
@@ -21,10 +21,10 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
 #include "tables.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float sinh(float x)
 {
@@ -189,3 +189,5 @@ _CLC_OVERLOAD _CLC_DEF double sinh(double x)
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sinh, double)
 
 #endif
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(sinh)
diff --git a/libclc/generic/lib/math/sinpi.cl b/libclc/generic/lib/math/sinpi.cl
index dbb995f..520bba5 100644
--- a/libclc/generic/lib/math/sinpi.cl
+++ b/libclc/generic/lib/math/sinpi.cl
@@ -21,10 +21,10 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
 #include "sincospiF_piby4.h"
-#include "../clcmacro.h"
 #ifdef cl_khr_fp64
 #include "sincosD_piby4.h"
 #endif
@@ -129,3 +129,5 @@ _CLC_OVERLOAD _CLC_DEF double sinpi(double x)
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sinpi, double)
 
 #endif
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(sinpi)
diff --git a/libclc/generic/lib/math/tables.h b/libclc/generic/lib/math/tables.h
index 8045242..ea5221e 100644
--- a/libclc/generic/lib/math/tables.h
+++ b/libclc/generic/lib/math/tables.h
@@ -20,6 +20,8 @@
  * THE SOFTWARE.
  */
 
+#include <clc/clctypes.h>
+
 #define TABLE_SPACE __constant
 
 #define TABLE_MANGLE(NAME) __clc_##NAME
diff --git a/libclc/generic/lib/math/tanh.cl b/libclc/generic/lib/math/tanh.cl
index e9c4079..e558bb9 100644
--- a/libclc/generic/lib/math/tanh.cl
+++ b/libclc/generic/lib/math/tanh.cl
@@ -21,9 +21,9 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float tanh(float x)
 {
@@ -144,3 +144,5 @@ _CLC_OVERLOAD _CLC_DEF double tanh(double x)
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, tanh, double);
 
 #endif // cl_khr_fp64
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(tanh)
diff --git a/libclc/generic/lib/math/tgamma.cl b/libclc/generic/lib/math/tgamma.cl
index 29c069f..314ffda 100644
--- a/libclc/generic/lib/math/tgamma.cl
+++ b/libclc/generic/lib/math/tgamma.cl
@@ -22,9 +22,9 @@
  */
 
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
 #include "math.h"
-#include "../clcmacro.h"
 
 _CLC_OVERLOAD _CLC_DEF float tgamma(float x) {
     const float pi = 3.1415926535897932384626433832795f;
diff --git a/libclc/generic/lib/math/trunc.cl b/libclc/generic/lib/math/trunc.cl
index 62c7b18..00c2a4a 100644
--- a/libclc/generic/lib/math/trunc.cl
+++ b/libclc/generic/lib/math/trunc.cl
@@ -1,9 +1,5 @@
 #include <clc/clc.h>
-
-// Map the llvm intrinsic to an OpenCL function.
-#define __CLC_FUNCTION __clc_trunc
-#define __CLC_INTRINSIC "llvm.trunc"
-#include "math/unary_intrin.inc"
+#include <clc/math/clc_trunc.h>
 
 #undef __CLC_FUNCTION
 #define __CLC_FUNCTION trunc
diff --git a/libclc/generic/lib/math/unary_builtin.inc b/libclc/generic/lib/math/unary_builtin.inc
index 4e7ca5b..6405c3f 100644
--- a/libclc/generic/lib/math/unary_builtin.inc
+++ b/libclc/generic/lib/math/unary_builtin.inc
@@ -1,5 +1,5 @@
-#include "../clcmacro.h"
-#include "utils.h"
+#include <clc/clcmacro.h>
+#include <clc/utils.h>
 
 #ifndef __CLC_BUILTIN
 #define __CLC_BUILTIN __CLC_XCONCAT(__clc_, __CLC_FUNCTION)
diff --git a/libclc/generic/lib/relational/all.cl b/libclc/generic/lib/relational/all.cl
index 607d7a9..e4af0fc 100644
--- a/libclc/generic/lib/relational/all.cl
+++ b/libclc/generic/lib/relational/all.cl
@@ -1,27 +1,15 @@
 #include <clc/clc.h>
+#include <clc/relational/clc_all.h>
 
-#define _CLC_ALL(v) (((v) >> ((sizeof(v) * 8) - 1)) & 0x1)
-#define _CLC_ALL2(v) (_CLC_ALL((v).s0) & _CLC_ALL((v).s1))
-#define _CLC_ALL3(v) (_CLC_ALL2((v)) & _CLC_ALL((v).s2))
-#define _CLC_ALL4(v) (_CLC_ALL3((v)) & _CLC_ALL((v).s3))
-#define _CLC_ALL8(v) (_CLC_ALL4((v)) & _CLC_ALL((v).s4) & _CLC_ALL((v).s5) \
-                                     & _CLC_ALL((v).s6) & _CLC_ALL((v).s7))
-#define _CLC_ALL16(v) (_CLC_ALL8((v)) & _CLC_ALL((v).s8) & _CLC_ALL((v).s9) \
-                                      & _CLC_ALL((v).sA) & _CLC_ALL((v).sB) \
-                                      & _CLC_ALL((v).sC) & _CLC_ALL((v).sD) \
-                                      & _CLC_ALL((v).sE) & _CLC_ALL((v).sf))
+#define ALL_ID(TYPE) _CLC_OVERLOAD _CLC_DEF int all(TYPE v)
 
-
-#define ALL_ID(TYPE) \
-  _CLC_OVERLOAD _CLC_DEF int all(TYPE v)
-
-#define ALL_VECTORIZE(TYPE) \
-  ALL_ID(TYPE) { return _CLC_ALL(v); } \
-  ALL_ID(TYPE##2) { return _CLC_ALL2(v); } \
-  ALL_ID(TYPE##3) { return _CLC_ALL3(v); } \
-  ALL_ID(TYPE##4) { return _CLC_ALL4(v); } \
-  ALL_ID(TYPE##8) { return _CLC_ALL8(v); } \
-  ALL_ID(TYPE##16) { return _CLC_ALL16(v); }
+#define ALL_VECTORIZE(TYPE)                                                    \
+  ALL_ID(TYPE) { return __clc_all(v); }                                        \
+  ALL_ID(TYPE##2) { return __clc_all(v); }                                     \
+  ALL_ID(TYPE##3) { return __clc_all(v); }                                     \
+  ALL_ID(TYPE##4) { return __clc_all(v); }                                     \
+  ALL_ID(TYPE##8) { return __clc_all(v); }                                     \
+  ALL_ID(TYPE##16) { return __clc_all(v); }
 
 ALL_VECTORIZE(char)
 ALL_VECTORIZE(short)
diff --git a/libclc/generic/lib/relational/any.cl b/libclc/generic/lib/relational/any.cl
index 4d37210..3d975bd 100644
--- a/libclc/generic/lib/relational/any.cl
+++ b/libclc/generic/lib/relational/any.cl
@@ -1,30 +1,17 @@
 #include <clc/clc.h>
+#include <clc/relational/clc_any.h>
 
-#define _CLC_ANY(v) (((v) >> ((sizeof(v) * 8) - 1)) & 0x1)
-#define _CLC_ANY2(v) (_CLC_ANY((v).s0) | _CLC_ANY((v).s1))
-#define _CLC_ANY3(v) (_CLC_ANY2((v)) | _CLC_ANY((v).s2))
-#define _CLC_ANY4(v) (_CLC_ANY3((v)) | _CLC_ANY((v).s3))
-#define _CLC_ANY8(v) (_CLC_ANY4((v)) | _CLC_ANY((v).s4) | _CLC_ANY((v).s5) \
-                                     | _CLC_ANY((v).s6) | _CLC_ANY((v).s7))
-#define _CLC_ANY16(v) (_CLC_ANY8((v)) | _CLC_ANY((v).s8) | _CLC_ANY((v).s9) \
-                                      | _CLC_ANY((v).sA) | _CLC_ANY((v).sB) \
-                                      | _CLC_ANY((v).sC) | _CLC_ANY((v).sD) \
-                                      | _CLC_ANY((v).sE) | _CLC_ANY((v).sf))
+#define ANY_ID(TYPE) _CLC_OVERLOAD _CLC_DEF int any(TYPE v)
 
-
-#define ANY_ID(TYPE) \
-  _CLC_OVERLOAD _CLC_DEF int any(TYPE v)
-
-#define ANY_VECTORIZE(TYPE) \
-  ANY_ID(TYPE) { return _CLC_ANY(v); } \
-  ANY_ID(TYPE##2) { return _CLC_ANY2(v); } \
-  ANY_ID(TYPE##3) { return _CLC_ANY3(v); } \
-  ANY_ID(TYPE##4) { return _CLC_ANY4(v); } \
-  ANY_ID(TYPE##8) { return _CLC_ANY8(v); } \
-  ANY_ID(TYPE##16) { return _CLC_ANY16(v); }
+#define ANY_VECTORIZE(TYPE)                                                    \
+  ANY_ID(TYPE) { return __clc_any(v); }                                        \
+  ANY_ID(TYPE##2) { return __clc_any(v); }                                     \
+  ANY_ID(TYPE##3) { return __clc_any(v); }                                     \
+  ANY_ID(TYPE##4) { return __clc_any(v); }                                     \
+  ANY_ID(TYPE##8) { return __clc_any(v); }                                     \
+  ANY_ID(TYPE##16) { return __clc_any(v); }
 
 ANY_VECTORIZE(char)
 ANY_VECTORIZE(short)
 ANY_VECTORIZE(int)
 ANY_VECTORIZE(long)
-
diff --git a/libclc/generic/lib/relational/binary_def.inc b/libclc/generic/lib/relational/binary_def.inc
new file mode 100644
index 0000000..e1ee9de
--- /dev/null
+++ b/libclc/generic/lib/relational/binary_def.inc
@@ -0,0 +1,7 @@
+#include <clc/utils.h>
+// Expects FUNCTION from the includer; __CLC_FUNCTION(x) names x's __clc_ twin.
+#define __CLC_FUNCTION(x) __CLC_CONCAT(__clc_, x)
+// Forwards (a, b) to __clc_<FUNCTION>; __CLC_INTN/__CLC_FLOATN are set outside.
+_CLC_OVERLOAD _CLC_DEF __CLC_INTN FUNCTION(__CLC_FLOATN a, __CLC_FLOATN b) {
+  return __CLC_FUNCTION(FUNCTION)(a, b);
+}
diff --git a/libclc/generic/lib/relational/bitselect.cl b/libclc/generic/lib/relational/bitselect.cl
index af4e70c..a470447 100644
--- a/libclc/generic/lib/relational/bitselect.cl
+++ b/libclc/generic/lib/relational/bitselect.cl
@@ -21,17 +21,18 @@
  */
 
 #include <clc/clc.h>
-
-#include "../clcmacro.h"
+#include <clc/clcmacro.h>
+#include <clc/relational/clc_bitselect.h>
 
 #define __CLC_BODY <bitselect.inc>
 #include <clc/integer/gentype.inc>
 #undef __CLC_BODY
 
-#define FLOAT_BITSELECT(f_type, i_type, width) \
-  _CLC_OVERLOAD _CLC_DEF f_type##width bitselect(f_type##width x, f_type##width y, f_type##width z) { \
-  return as_##f_type##width(bitselect(as_##i_type##width(x), as_##i_type##width(y), as_##i_type##width(z))); \
-}
+#define FLOAT_BITSELECT(f_type, i_type, width)                                 \
+  _CLC_OVERLOAD _CLC_DEF f_type##width bitselect(                              \
+      f_type##width x, f_type##width y, f_type##width z) {                     \
+    return __clc_bitselect(x, y, z);                                           \
+  }
 
 FLOAT_BITSELECT(float, uint, )
 FLOAT_BITSELECT(float, uint, 2)
diff --git a/libclc/generic/lib/relational/isequal.cl b/libclc/generic/lib/relational/isequal.cl
index 3f14f94..4ed545f 100644
--- a/libclc/generic/lib/relational/isequal.cl
+++ b/libclc/generic/lib/relational/isequal.cl
@@ -1,44 +1,7 @@
 #include <clc/clc.h>
+#include <clc/relational/clc_isequal.h>
 
-#define _CLC_DEFINE_ISEQUAL(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \
-  return (x == y); \
-} \
+#define FUNCTION isequal
+#define __CLC_BODY "binary_def.inc"
 
-_CLC_DEFINE_ISEQUAL(int, isequal, float, float)
-_CLC_DEFINE_ISEQUAL(int2, isequal, float2, float2)
-_CLC_DEFINE_ISEQUAL(int3, isequal, float3, float3)
-_CLC_DEFINE_ISEQUAL(int4, isequal, float4, float4)
-_CLC_DEFINE_ISEQUAL(int8, isequal, float8, float8)
-_CLC_DEFINE_ISEQUAL(int16, isequal, float16, float16)
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-// The scalar version of isequal(double) returns an int, but the vector versions
-// return long.
-_CLC_DEFINE_ISEQUAL(int, isequal, double, double)
-_CLC_DEFINE_ISEQUAL(long2, isequal, double2, double2)
-_CLC_DEFINE_ISEQUAL(long3, isequal, double3, double3)
-_CLC_DEFINE_ISEQUAL(long4, isequal, double4, double4)
-_CLC_DEFINE_ISEQUAL(long8, isequal, double8, double8)
-_CLC_DEFINE_ISEQUAL(long16, isequal, double16, double16)
-
-#endif
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// The scalar version of isequal(half) returns an int, but the vector versions
-// return short.
-_CLC_DEFINE_ISEQUAL(int, isequal, half, half)
-_CLC_DEFINE_ISEQUAL(short2, isequal, half2, half2)
-_CLC_DEFINE_ISEQUAL(short3, isequal, half3, half3)
-_CLC_DEFINE_ISEQUAL(short4, isequal, half4, half4)
-_CLC_DEFINE_ISEQUAL(short8, isequal, half8, half8)
-_CLC_DEFINE_ISEQUAL(short16, isequal, half16, half16)
-
-#endif
-
-#undef _CLC_DEFINE_ISEQUAL
+#include <clc/relational/floatn.inc>
diff --git a/libclc/generic/lib/relational/isfinite.cl b/libclc/generic/lib/relational/isfinite.cl
index 15b92fa..d73bf6e 100644
--- a/libclc/generic/lib/relational/isfinite.cl
+++ b/libclc/generic/lib/relational/isfinite.cl
@@ -1,31 +1,7 @@
 #include <clc/clc.h>
-#include "relational.h"
+#include <clc/relational/clc_isfinite.h>
 
-_CLC_DEFINE_RELATIONAL_UNARY(int, isfinite, __builtin_isfinite, float)
+#define FUNCTION isfinite
+#define __CLC_BODY "unary_def.inc"
 
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-// The scalar version of isfinite(double) returns an int, but the vector versions
-// return long.
-_CLC_DEF _CLC_OVERLOAD int isfinite(double x) {
-  return __builtin_isfinite(x);
-}
-
-_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, isfinite, double)
-
-#endif
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// The scalar version of isfinite(half) returns an int, but the vector versions
-// return short.
-_CLC_DEF _CLC_OVERLOAD int isfinite(half x) {
-  return __builtin_isfinite(x);
-}
-
-_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(short, isfinite, half)
-
-#endif
+#include <clc/relational/floatn.inc>
diff --git a/libclc/generic/lib/relational/isgreater.cl b/libclc/generic/lib/relational/isgreater.cl
index 167d6f2..c4f7b43 100644
--- a/libclc/generic/lib/relational/isgreater.cl
+++ b/libclc/generic/lib/relational/isgreater.cl
@@ -1,37 +1,7 @@
 #include <clc/clc.h>
-#include "relational.h"
+#include <clc/relational/clc_isgreater.h>
 
-//Note: It would be nice to use __builtin_isgreater with vector inputs, but it seems to only take scalar values as
-//      input, which will produce incorrect output for vector input types.
+#define FUNCTION isgreater
+#define __CLC_BODY "binary_def.inc"
 
-_CLC_DEFINE_RELATIONAL_BINARY(int, isgreater, __builtin_isgreater, float, float)
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-// The scalar version of isgreater(double, double) returns an int, but the vector versions
-// return long.
-
-_CLC_DEF _CLC_OVERLOAD int isgreater(double x, double y){
-	return __builtin_isgreater(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isgreater, double, double)
-
-#endif
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// The scalar version of isgreater(half, half) returns an int, but the vector versions
-// return short.
-
-_CLC_DEF _CLC_OVERLOAD int isgreater(half x, half y){
-	return __builtin_isgreater(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, isgreater, half, half)
-
-#endif
+#include <clc/relational/floatn.inc>
diff --git a/libclc/generic/lib/relational/isgreaterequal.cl b/libclc/generic/lib/relational/isgreaterequal.cl
index 128a1d0..28473393 100644
--- a/libclc/generic/lib/relational/isgreaterequal.cl
+++ b/libclc/generic/lib/relational/isgreaterequal.cl
@@ -1,36 +1,7 @@
 #include <clc/clc.h>
-#include "relational.h"
+#include <clc/relational/clc_isgreaterequal.h>
 
-//Note: It would be nice to use __builtin_isgreaterequal with vector inputs, but it seems to only take scalar values as
-//      input, which will produce incorrect output for vector input types.
+#define FUNCTION isgreaterequal
+#define __CLC_BODY "binary_def.inc"
 
-_CLC_DEFINE_RELATIONAL_BINARY(int, isgreaterequal, __builtin_isgreaterequal, float, float)
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-// The scalar version of isgreaterequal(double, double) returns an int, but the vector versions
-// return long.
-
-_CLC_DEF _CLC_OVERLOAD int isgreaterequal(double x, double y){
-	return __builtin_isgreaterequal(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isgreaterequal, double, double)
-
-#endif
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// The scalar version of isgreaterequal(half, half) returns an int, but the vector versions
-// return short.
-
-_CLC_DEF _CLC_OVERLOAD int isgreaterequal(half x, half y){
-	return __builtin_isgreaterequal(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, isgreaterequal, half, half)
-
-#endif
+#include <clc/relational/floatn.inc>
diff --git a/libclc/generic/lib/relational/isinf.cl b/libclc/generic/lib/relational/isinf.cl
index 96aae4a..f681665 100644
--- a/libclc/generic/lib/relational/isinf.cl
+++ b/libclc/generic/lib/relational/isinf.cl
@@ -1,30 +1,7 @@
 #include <clc/clc.h>
-#include "relational.h"
+#include <clc/relational/clc_isinf.h>
 
-_CLC_DEFINE_RELATIONAL_UNARY(int, isinf, __builtin_isinf, float)
+#define FUNCTION isinf
+#define __CLC_BODY "unary_def.inc"
 
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-// The scalar version of isinf(double) returns an int, but the vector versions
-// return long.
-_CLC_DEF _CLC_OVERLOAD int isinf(double x) {
-  return __builtin_isinf(x);
-}
-
-_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, isinf, double)
-#endif
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// The scalar version of isinf(half) returns an int, but the vector versions
-// return short.
-_CLC_DEF _CLC_OVERLOAD int isinf(half x) {
-  return __builtin_isinf(x);
-}
-
-_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(short, isinf, half)
-#endif
+#include <clc/relational/floatn.inc>
diff --git a/libclc/generic/lib/relational/isless.cl b/libclc/generic/lib/relational/isless.cl
index 1dbf767..ea79ce4 100644
--- a/libclc/generic/lib/relational/isless.cl
+++ b/libclc/generic/lib/relational/isless.cl
@@ -1,36 +1,7 @@
 #include <clc/clc.h>
-#include "relational.h"
+#include <clc/relational/clc_isless.h>
 
-//Note: It would be nice to use __builtin_isless with vector inputs, but it seems to only take scalar values as
-//      input, which will produce incorrect output for vector input types.
+#define FUNCTION isless
+#define __CLC_BODY "binary_def.inc"
 
-_CLC_DEFINE_RELATIONAL_BINARY(int, isless, __builtin_isless, float, float)
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-// The scalar version of isless(double, double) returns an int, but the vector versions
-// return long.
-
-_CLC_DEF _CLC_OVERLOAD int isless(double x, double y){
-	return __builtin_isless(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isless, double, double)
-
-#endif
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// The scalar version of isless(half, half) returns an int, but the vector versions
-// return short.
-
-_CLC_DEF _CLC_OVERLOAD int isless(half x, half y){
-	return __builtin_isless(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, isless, half, half)
-
-#endif
+#include <clc/relational/floatn.inc>
diff --git a/libclc/generic/lib/relational/islessequal.cl b/libclc/generic/lib/relational/islessequal.cl
index db64bea..9b09577 100644
--- a/libclc/generic/lib/relational/islessequal.cl
+++ b/libclc/generic/lib/relational/islessequal.cl
@@ -1,36 +1,7 @@
 #include <clc/clc.h>
-#include "relational.h"
+#include <clc/relational/clc_islessequal.h>
 
-//Note: It would be nice to use __builtin_islessequal with vector inputs, but it seems to only take scalar values as
-//      input, which will produce incorrect output for vector input types.
+#define FUNCTION islessequal
+#define __CLC_BODY "binary_def.inc"
 
-_CLC_DEFINE_RELATIONAL_BINARY(int, islessequal, __builtin_islessequal, float, float)
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-// The scalar version of islessequal(double, double) returns an int, but the vector versions
-// return long.
-
-_CLC_DEF _CLC_OVERLOAD int islessequal(double x, double y){
-	return __builtin_islessequal(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, islessequal, double, double)
-
-#endif
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// The scalar version of islessequal(half, half) returns an int, but the vector versions
-// return short.
-
-_CLC_DEF _CLC_OVERLOAD int islessequal(half x, half y){
-	return __builtin_islessequal(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, islessequal, half, half)
-
-#endif
+#include <clc/relational/floatn.inc>
diff --git a/libclc/generic/lib/relational/islessgreater.cl b/libclc/generic/lib/relational/islessgreater.cl
index 9e9b11e..08f7c95 100644
--- a/libclc/generic/lib/relational/islessgreater.cl
+++ b/libclc/generic/lib/relational/islessgreater.cl
@@ -1,36 +1,7 @@
 #include <clc/clc.h>
-#include "relational.h"
+#include <clc/relational/clc_islessgreater.h>
 
-//Note: It would be nice to use __builtin_islessgreater with vector inputs, but it seems to only take scalar values as
-//      input, which will produce incorrect output for vector input types.
+#define FUNCTION islessgreater
+#define __CLC_BODY "binary_def.inc"
 
-_CLC_DEFINE_RELATIONAL_BINARY(int, islessgreater, __builtin_islessgreater, float, float)
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-// The scalar version of islessgreater(double, double) returns an int, but the vector versions
-// return long.
-
-_CLC_DEF _CLC_OVERLOAD int islessgreater(double x, double y){
-	return __builtin_islessgreater(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, islessgreater, double, double)
-
-#endif
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// The scalar version of islessgreater(half, half) returns an int, but the vector versions
-// return short.
-
-_CLC_DEF _CLC_OVERLOAD int islessgreater(half x, half y){
-	return __builtin_islessgreater(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, islessgreater, half, half)
-
-#endif
+#include <clc/relational/floatn.inc>
diff --git a/libclc/generic/lib/relational/isnan.cl b/libclc/generic/lib/relational/isnan.cl
index 3d31047..c613437 100644
--- a/libclc/generic/lib/relational/isnan.cl
+++ b/libclc/generic/lib/relational/isnan.cl
@@ -1,32 +1,7 @@
 #include <clc/clc.h>
-#include "relational.h"
+#include <clc/relational/clc_isnan.h>
 
-_CLC_DEFINE_RELATIONAL_UNARY(int, isnan, __builtin_isnan, float)
+#define FUNCTION isnan
+#define __CLC_BODY "unary_def.inc"
 
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-// The scalar version of isnan(double) returns an int, but the vector versions
-// return long.
-_CLC_DEF _CLC_OVERLOAD int isnan(double x) {
-  return __builtin_isnan(x);
-}
-
-_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, isnan, double)
-
-#endif
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// The scalar version of isnan(half) returns an int, but the vector versions
-// return short.
-_CLC_DEF _CLC_OVERLOAD int isnan(half x) {
-  return __builtin_isnan(x);
-}
-
-_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(short, isnan, half)
-
-#endif
+#include <clc/relational/floatn.inc>
diff --git a/libclc/generic/lib/relational/isnormal.cl b/libclc/generic/lib/relational/isnormal.cl
index a3dbf66..de2bd6ad 100644
--- a/libclc/generic/lib/relational/isnormal.cl
+++ b/libclc/generic/lib/relational/isnormal.cl
@@ -1,31 +1,7 @@
 #include <clc/clc.h>
-#include "relational.h"
+#include <clc/relational/clc_isnormal.h>
 
-_CLC_DEFINE_RELATIONAL_UNARY(int, isnormal, __builtin_isnormal, float)
+#define FUNCTION isnormal
+#define __CLC_BODY "unary_def.inc"
 
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-// The scalar version of isnormal(double) returns an int, but the vector versions
-// return long.
-_CLC_DEF _CLC_OVERLOAD int isnormal(double x) {
-  return __builtin_isnormal(x);
-}
-
-_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, isnormal, double)
-
-#endif
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// The scalar version of isnormal(half) returns an int, but the vector versions
-// return short.
-_CLC_DEF _CLC_OVERLOAD int isnormal(half x) {
-  return __builtin_isnormal(x);
-}
-
-_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(short, isnormal, half)
-
-#endif
+#include <clc/relational/floatn.inc>
diff --git a/libclc/generic/lib/relational/isnotequal.cl b/libclc/generic/lib/relational/isnotequal.cl
index afd293d..c04752b 100644
--- a/libclc/generic/lib/relational/isnotequal.cl
+++ b/libclc/generic/lib/relational/isnotequal.cl
@@ -1,33 +1,7 @@
 #include <clc/clc.h>
-#include "relational.h"
+#include <clc/relational/clc_isnotequal.h>
 
-#define _CLC_DEFINE_ISNOTEQUAL(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \
-  return (x != y); \
-} \
+#define FUNCTION isnotequal
+#define __CLC_BODY "binary_def.inc"
 
-_CLC_DEFINE_ISNOTEQUAL(int, isnotequal, float, float)
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(int, isnotequal, float, float)
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-// The scalar version of isnotequal(double, double) returns an int, but the vector versions
-// return long.
-
-_CLC_DEFINE_ISNOTEQUAL(int, isnotequal, double, double)
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isnotequal, double, double)
-
-#endif
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// The scalar version of isnotequal(half, half) returns an int, but the vector versions
-// return short.
-
-_CLC_DEFINE_ISNOTEQUAL(int, isnotequal, half, half)
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, isnotequal, half, half)
-
-#endif
-
-#undef _CLC_DEFINE_ISNOTEQUAL
+#include <clc/relational/floatn.inc>
diff --git a/libclc/generic/lib/relational/isordered.cl b/libclc/generic/lib/relational/isordered.cl
index cedd05f6..347fc2d 100644
--- a/libclc/generic/lib/relational/isordered.cl
+++ b/libclc/generic/lib/relational/isordered.cl
@@ -1,33 +1,7 @@
 #include <clc/clc.h>
-#include "relational.h"
+#include <clc/relational/clc_isordered.h>
 
-#define _CLC_DEFINE_ISORDERED(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \
-  return isequal(x, x) && isequal(y, y); \
-} \
+#define FUNCTION isordered
+#define __CLC_BODY "binary_def.inc"
 
-_CLC_DEFINE_ISORDERED(int, isordered, float, float)
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(int, isordered, float, float)
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-// The scalar version of isordered(double, double) returns an int, but the vector versions
-// return long.
-
-_CLC_DEFINE_ISORDERED(int, isordered, double, double)
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isordered, double, double)
-
-#endif
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// The scalar version of isordered(half, half) returns an int, but the vector versions
-// return short.
-
-_CLC_DEFINE_ISORDERED(int, isordered, half, half)
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, isordered, half, half)
-
-#endif
-
-#undef _CLC_DEFINE_ISORDERED
+#include <clc/relational/floatn.inc>
diff --git a/libclc/generic/lib/relational/isunordered.cl b/libclc/generic/lib/relational/isunordered.cl
index 90939807ff..46db603 100644
--- a/libclc/generic/lib/relational/isunordered.cl
+++ b/libclc/generic/lib/relational/isunordered.cl
@@ -1,36 +1,7 @@
 #include <clc/clc.h>
-#include "relational.h"
+#include <clc/relational/clc_isunordered.h>
 
-//Note: It would be nice to use __builtin_isunordered with vector inputs, but it seems to only take scalar values as
-//      input, which will produce incorrect output for vector input types.
+#define FUNCTION isunordered
+#define __CLC_BODY "binary_def.inc"
 
-_CLC_DEFINE_RELATIONAL_BINARY(int, isunordered, __builtin_isunordered, float, float)
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-// The scalar version of isunordered(double, double) returns an int, but the vector versions
-// return long.
-
-_CLC_DEF _CLC_OVERLOAD int isunordered(double x, double y){
-	return __builtin_isunordered(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isunordered, double, double)
-
-#endif
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// The scalar version of isunordered(half, half) returns an int, but the vector versions
-// return short.
-
-_CLC_DEF _CLC_OVERLOAD int isunordered(half x, half y){
-	return __builtin_isunordered(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, isunordered, half, half)
-
-#endif
+#include <clc/relational/floatn.inc>
diff --git a/libclc/generic/lib/relational/relational.h b/libclc/generic/lib/relational/relational.h
deleted file mode 100644
index e492750..0000000
--- a/libclc/generic/lib/relational/relational.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Contains relational macros that have to return 1 for scalar and -1 for vector
- * when the result is true.
- */
-
-#define _CLC_DEFINE_RELATIONAL_UNARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_NAME, ARG_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x){ \
-	return BUILTIN_NAME(x); \
-}
-
-#define _CLC_DEFINE_RELATIONAL_UNARY_VEC2(RET_TYPE, FUNCTION, ARG_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
-  return (RET_TYPE)( (RET_TYPE){FUNCTION(x.lo), FUNCTION(x.hi)} != (RET_TYPE)0); \
-}
-
-#define _CLC_DEFINE_RELATIONAL_UNARY_VEC3(RET_TYPE, FUNCTION, ARG_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
-  return (RET_TYPE)( (RET_TYPE){FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2)} != (RET_TYPE)0); \
-}
-
-#define _CLC_DEFINE_RELATIONAL_UNARY_VEC4(RET_TYPE, FUNCTION, ARG_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
-  return (RET_TYPE)( \
-	(RET_TYPE){ \
-		FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3) \
-	} != (RET_TYPE)0); \
-}
-
-#define _CLC_DEFINE_RELATIONAL_UNARY_VEC8(RET_TYPE, FUNCTION, ARG_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
-  return (RET_TYPE)( \
-	(RET_TYPE){ \
-		FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3), \
-		FUNCTION(x.s4), FUNCTION(x.s5), FUNCTION(x.s6), FUNCTION(x.s7) \
-	} != (RET_TYPE)0); \
-}
-
-#define _CLC_DEFINE_RELATIONAL_UNARY_VEC16(RET_TYPE, FUNCTION, ARG_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
-  return (RET_TYPE)( \
-	(RET_TYPE){ \
-		FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3), \
-		FUNCTION(x.s4), FUNCTION(x.s5), FUNCTION(x.s6), FUNCTION(x.s7), \
-		FUNCTION(x.s8), FUNCTION(x.s9), FUNCTION(x.sa), FUNCTION(x.sb), \
-		FUNCTION(x.sc), FUNCTION(x.sd), FUNCTION(x.se), FUNCTION(x.sf) \
-	} != (RET_TYPE)0); \
-}
-
-#define _CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(RET_TYPE, FUNCTION, ARG_TYPE) \
-_CLC_DEFINE_RELATIONAL_UNARY_VEC2(RET_TYPE##2, FUNCTION, ARG_TYPE##2) \
-_CLC_DEFINE_RELATIONAL_UNARY_VEC3(RET_TYPE##3, FUNCTION, ARG_TYPE##3) \
-_CLC_DEFINE_RELATIONAL_UNARY_VEC4(RET_TYPE##4, FUNCTION, ARG_TYPE##4) \
-_CLC_DEFINE_RELATIONAL_UNARY_VEC8(RET_TYPE##8, FUNCTION, ARG_TYPE##8) \
-_CLC_DEFINE_RELATIONAL_UNARY_VEC16(RET_TYPE##16, FUNCTION, ARG_TYPE##16)
-
-#define _CLC_DEFINE_RELATIONAL_UNARY(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, ARG_TYPE) \
-_CLC_DEFINE_RELATIONAL_UNARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, ARG_TYPE) \
-_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(RET_TYPE, FUNCTION, ARG_TYPE) \
-
-#define _CLC_DEFINE_RELATIONAL_BINARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_NAME, ARG0_TYPE, ARG1_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y){ \
-	return BUILTIN_NAME(x, y); \
-}
-
-#define _CLC_DEFINE_RELATIONAL_BINARY_VEC(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \
-  return (RET_TYPE)( (RET_TYPE){FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)} != (RET_TYPE)0); \
-}
-
-#define _CLC_DEFINE_RELATIONAL_BINARY_VEC2(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \
-  return (RET_TYPE)( (RET_TYPE){FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)} != (RET_TYPE)0); \
-}
-
-#define _CLC_DEFINE_RELATIONAL_BINARY_VEC3(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \
-  return (RET_TYPE)( (RET_TYPE){FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), FUNCTION(x.s2, y.s2)} != (RET_TYPE)0); \
-}
-
-#define _CLC_DEFINE_RELATIONAL_BINARY_VEC4(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \
-  return (RET_TYPE)( \
-	(RET_TYPE){ \
-		FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3) \
-	} != (RET_TYPE)0); \
-}
-
-#define _CLC_DEFINE_RELATIONAL_BINARY_VEC8(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \
-  return (RET_TYPE)( \
-	(RET_TYPE){ \
-		FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3), \
-		FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7) \
-	} != (RET_TYPE)0); \
-}
-
-#define _CLC_DEFINE_RELATIONAL_BINARY_VEC16(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) \
-_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \
-  return (RET_TYPE)( \
-	(RET_TYPE){ \
-		FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3), \
-		FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7), \
-		FUNCTION(x.s8, y.s8), FUNCTION(x.s9, y.s9), FUNCTION(x.sa, y.sa), FUNCTION(x.sb, y.sb), \
-		FUNCTION(x.sc, y.sc), FUNCTION(x.sd, y.sd), FUNCTION(x.se, y.se), FUNCTION(x.sf, y.sf) \
-	} != (RET_TYPE)0); \
-}
-
-#define _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) \
-_CLC_DEFINE_RELATIONAL_BINARY_VEC2(RET_TYPE##2, FUNCTION, ARG0_TYPE##2, ARG1_TYPE##2) \
-_CLC_DEFINE_RELATIONAL_BINARY_VEC3(RET_TYPE##3, FUNCTION, ARG0_TYPE##3, ARG1_TYPE##3) \
-_CLC_DEFINE_RELATIONAL_BINARY_VEC4(RET_TYPE##4, FUNCTION, ARG0_TYPE##4, ARG1_TYPE##4) \
-_CLC_DEFINE_RELATIONAL_BINARY_VEC8(RET_TYPE##8, FUNCTION, ARG0_TYPE##8, ARG1_TYPE##8) \
-_CLC_DEFINE_RELATIONAL_BINARY_VEC16(RET_TYPE##16, FUNCTION, ARG0_TYPE##16, ARG1_TYPE##16)
-
-#define _CLC_DEFINE_RELATIONAL_BINARY(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, ARG0_TYPE, ARG1_TYPE) \
-_CLC_DEFINE_RELATIONAL_BINARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, ARG0_TYPE, ARG1_TYPE) \
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE)
diff --git a/libclc/generic/lib/relational/select.cl b/libclc/generic/lib/relational/select.cl
index dc2e273..094f4f9 100644
--- a/libclc/generic/lib/relational/select.cl
+++ b/libclc/generic/lib/relational/select.cl
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
-#include <utils.h>
+#include <clc/utils.h>
 
 #define __CLC_BODY <select.inc>
 #include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/relational/signbit.cl b/libclc/generic/lib/relational/signbit.cl
index a7378d7..1cf993e 100644
--- a/libclc/generic/lib/relational/signbit.cl
+++ b/libclc/generic/lib/relational/signbit.cl
@@ -1,33 +1,7 @@
 #include <clc/clc.h>
-#include "relational.h"
+#include <clc/relational/clc_signbit.h>
 
-_CLC_DEFINE_RELATIONAL_UNARY(int, signbit, __builtin_signbitf, float)
+#define FUNCTION signbit
+#define __CLC_BODY "unary_def.inc"
 
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-// The scalar version of signbit(double) returns an int, but the vector versions
-// return long.
-
-_CLC_DEF _CLC_OVERLOAD int signbit(double x){
-	return __builtin_signbit(x);
-}
-
-_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, signbit, double)
-
-#endif
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// The scalar version of signbit(half) returns an int, but the vector versions
-// return short.
-
-_CLC_DEF _CLC_OVERLOAD int signbit(half x){
-	return __builtin_signbit(x);
-}
-
-_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(short, signbit, half)
-
-#endif
+#include <clc/relational/floatn.inc>
diff --git a/libclc/generic/lib/relational/unary_def.inc b/libclc/generic/lib/relational/unary_def.inc
new file mode 100644
index 0000000..0bec358
--- /dev/null
+++ b/libclc/generic/lib/relational/unary_def.inc
@@ -0,0 +1,7 @@
+#include <clc/utils.h>
+
+#define __CLC_FUNCTION(x) __CLC_CONCAT(__clc_, x)
+
+_CLC_OVERLOAD _CLC_DEF __CLC_INTN FUNCTION(__CLC_FLOATN a) {
+  return __CLC_FUNCTION(FUNCTION)(a);
+}
diff --git a/libclc/generic/lib/shared/clamp.cl b/libclc/generic/lib/shared/clamp.cl
index b946220..f470fc8 100644
--- a/libclc/generic/lib/shared/clamp.cl
+++ b/libclc/generic/lib/shared/clamp.cl
@@ -1,4 +1,5 @@
 #include <clc/clc.h>
+#include <clc/shared/clc_clamp.h>
 
 #define __CLC_BODY <clamp.inc>
 #include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/shared/clamp.inc b/libclc/generic/lib/shared/clamp.inc
index c918f9c..7e02cb2 100644
--- a/libclc/generic/lib/shared/clamp.inc
+++ b/libclc/generic/lib/shared/clamp.inc
@@ -1,9 +1,9 @@
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE clamp(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z) {
-  return (x > z ? z : (x < y ? y : x));
+  return __clc_clamp(x, y, z);
 }
 
 #ifndef __CLC_SCALAR
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE clamp(__CLC_GENTYPE x, __CLC_SCALAR_GENTYPE y, __CLC_SCALAR_GENTYPE z) {
-  return (x > (__CLC_GENTYPE)z ? (__CLC_GENTYPE)z : (x < (__CLC_GENTYPE)y ? (__CLC_GENTYPE)y : x));
+  return __clc_clamp(x, y, z);
 }
 #endif
diff --git a/libclc/generic/lib/shared/max.cl b/libclc/generic/lib/shared/max.cl
index eb573cd..2266d59 100644
--- a/libclc/generic/lib/shared/max.cl
+++ b/libclc/generic/lib/shared/max.cl
@@ -1,4 +1,5 @@
 #include <clc/clc.h>
+#include <clc/shared/clc_max.h>
 
 #define __CLC_BODY <max.inc>
 #include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/shared/max.inc b/libclc/generic/lib/shared/max.inc
index 75a24c0..ec433a8 100644
--- a/libclc/generic/lib/shared/max.inc
+++ b/libclc/generic/lib/shared/max.inc
@@ -1,9 +1,10 @@
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, __CLC_GENTYPE b) {
-  return (a > b ? a : b);
+  return __clc_max(a, b);
 }
 
 #ifndef __CLC_SCALAR
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b) {
-  return (a > (__CLC_GENTYPE)b ? a : (__CLC_GENTYPE)b);
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a,
+                                         __CLC_SCALAR_GENTYPE b) {
+  return __clc_max(a, b);
 }
 #endif
diff --git a/libclc/generic/lib/shared/min.cl b/libclc/generic/lib/shared/min.cl
index 19a7d79..f5c4d57 100644
--- a/libclc/generic/lib/shared/min.cl
+++ b/libclc/generic/lib/shared/min.cl
@@ -1,4 +1,5 @@
 #include <clc/clc.h>
+#include <clc/shared/clc_min.h>
 
 #define __CLC_BODY <min.inc>
 #include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/shared/min.inc b/libclc/generic/lib/shared/min.inc
index e15e055..6a00944 100644
--- a/libclc/generic/lib/shared/min.inc
+++ b/libclc/generic/lib/shared/min.inc
@@ -1,9 +1,10 @@
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_GENTYPE b) {
-  return (b < a ? b : a);
+  return __clc_min(a, b);
 }
 
 #ifndef __CLC_SCALAR
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b) {
-  return (b < (__CLC_GENTYPE)a ? (__CLC_GENTYPE)b : a);
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a,
+                                         __CLC_SCALAR_GENTYPE b) {
+  return __clc_min(a, b);
 }
 #endif
diff --git a/libclc/ptx/lib/math/nextafter.cl b/libclc/ptx/lib/math/nextafter.cl
index 5b4521d..809eeca 100644
--- a/libclc/ptx/lib/math/nextafter.cl
+++ b/libclc/ptx/lib/math/nextafter.cl
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
-#include "../lib/clcmacro.h"
+#include <clc/clcmacro.h>
 #include <math/clc_nextafter.h>
 
 _CLC_DEFINE_BINARY_BUILTIN(float, nextafter, __clc_nextafter, float, float)
diff --git a/libclc/r600/lib/math/fmax.cl b/libclc/r600/lib/math/fmax.cl
index e4b9e4c..a43530fc 100644
--- a/libclc/r600/lib/math/fmax.cl
+++ b/libclc/r600/lib/math/fmax.cl
@@ -1,6 +1,6 @@
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
-#include "../../../generic/lib/clcmacro.h"
 #include "../../../generic/lib/math/math.h"
 
 _CLC_DEF _CLC_OVERLOAD float fmax(float x, float y)
diff --git a/libclc/r600/lib/math/fmin.cl b/libclc/r600/lib/math/fmin.cl
index 09f1e4c..a43655d 100644
--- a/libclc/r600/lib/math/fmin.cl
+++ b/libclc/r600/lib/math/fmin.cl
@@ -1,6 +1,6 @@
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
 
-#include "../../../generic/lib/clcmacro.h"
 #include "../../../generic/lib/math/math.h"
 
 _CLC_DEF _CLC_OVERLOAD float fmin(float x, float y)
diff --git a/libclc/r600/lib/math/native_rsqrt.cl b/libclc/r600/lib/math/native_rsqrt.cl
index edf473e..78871f3 100644
--- a/libclc/r600/lib/math/native_rsqrt.cl
+++ b/libclc/r600/lib/math/native_rsqrt.cl
@@ -1,6 +1,5 @@
 #include <clc/clc.h>
-
-#include "../../../generic/lib/clcmacro.h"
+#include <clc/clcmacro.h>
 
 _CLC_OVERLOAD _CLC_DEF float native_rsqrt(float x)
 {
diff --git a/libclc/r600/lib/math/rsqrt.cl b/libclc/r600/lib/math/rsqrt.cl
index 37a8037..53f7d40 100644
--- a/libclc/r600/lib/math/rsqrt.cl
+++ b/libclc/r600/lib/math/rsqrt.cl
@@ -1,6 +1,5 @@
 #include <clc/clc.h>
-
-#include "../../../generic/lib/clcmacro.h"
+#include <clc/clcmacro.h>
 
 _CLC_OVERLOAD _CLC_DEF float rsqrt(float x)
 {