aboutsummaryrefslogtreecommitdiff
path: root/libclc/generic/lib
diff options
context:
space:
mode:
Diffstat (limited to 'libclc/generic/lib')
-rw-r--r--libclc/generic/lib/SOURCES139
-rw-r--r--libclc/generic/lib/async/async_work_group_copy.cl9
-rw-r--r--libclc/generic/lib/async/async_work_group_copy.inc17
-rw-r--r--libclc/generic/lib/async/async_work_group_strided_copy.cl9
-rw-r--r--libclc/generic/lib/async/async_work_group_strided_copy.inc34
-rw-r--r--libclc/generic/lib/async/prefetch.cl9
-rw-r--r--libclc/generic/lib/async/prefetch.inc1
-rw-r--r--libclc/generic/lib/async/wait_group_events.cl5
-rw-r--r--libclc/generic/lib/atomic/atomic_impl.ll133
-rw-r--r--libclc/generic/lib/atomic/atomic_xchg.cl9
-rw-r--r--libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_add.cl9
-rw-r--r--libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_cmpxchg.cl9
-rw-r--r--libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_dec.cl9
-rw-r--r--libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_inc.cl9
-rw-r--r--libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_sub.cl9
-rw-r--r--libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_xchg.cl9
-rw-r--r--libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_and.cl9
-rw-r--r--libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_max.cl9
-rw-r--r--libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_min.cl9
-rw-r--r--libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_or.cl9
-rw-r--r--libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_xor.cl9
-rw-r--r--libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_add.cl9
-rw-r--r--libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_cmpxchg.cl9
-rw-r--r--libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_dec.cl9
-rw-r--r--libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_inc.cl9
-rw-r--r--libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_sub.cl9
-rw-r--r--libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_xchg.cl9
-rw-r--r--libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_and.cl9
-rw-r--r--libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_max.cl9
-rw-r--r--libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_min.cl9
-rw-r--r--libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_or.cl9
-rw-r--r--libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_xor.cl9
-rw-r--r--libclc/generic/lib/clcmacro.h148
-rw-r--r--libclc/generic/lib/common/degrees.cl45
-rw-r--r--libclc/generic/lib/common/mix.cl8
-rw-r--r--libclc/generic/lib/common/mix.inc9
-rw-r--r--libclc/generic/lib/common/radians.cl45
-rw-r--r--libclc/generic/lib/common/sign.cl28
-rw-r--r--libclc/generic/lib/common/smoothstep.cl55
-rw-r--r--libclc/generic/lib/common/step.cl54
-rw-r--r--libclc/generic/lib/gen_convert.py388
-rw-r--r--libclc/generic/lib/geometric/cross.cl25
-rw-r--r--libclc/generic/lib/geometric/distance.cl30
-rw-r--r--libclc/generic/lib/geometric/distance.inc25
-rw-r--r--libclc/generic/lib/geometric/dot.cl39
-rw-r--r--libclc/generic/lib/geometric/fast_distance.cl28
-rw-r--r--libclc/generic/lib/geometric/fast_distance.inc25
-rw-r--r--libclc/generic/lib/geometric/fast_length.cl39
-rw-r--r--libclc/generic/lib/geometric/fast_normalize.cl32
-rw-r--r--libclc/generic/lib/geometric/fast_normalize.inc31
-rw-r--r--libclc/generic/lib/geometric/length.cl87
-rw-r--r--libclc/generic/lib/geometric/normalize.cl157
-rw-r--r--libclc/generic/lib/image/get_image_dim.cl9
-rw-r--r--libclc/generic/lib/integer/abs.cl4
-rw-r--r--libclc/generic/lib/integer/abs.inc3
-rw-r--r--libclc/generic/lib/integer/abs_diff.cl4
-rw-r--r--libclc/generic/lib/integer/abs_diff.inc3
-rw-r--r--libclc/generic/lib/integer/add_sat.cl53
-rw-r--r--libclc/generic/lib/integer/add_sat_if.ll55
-rw-r--r--libclc/generic/lib/integer/add_sat_impl.ll83
-rw-r--r--libclc/generic/lib/integer/clz.cl53
-rw-r--r--libclc/generic/lib/integer/clz_if.ll55
-rw-r--r--libclc/generic/lib/integer/clz_impl.ll44
-rw-r--r--libclc/generic/lib/integer/hadd.cl4
-rw-r--r--libclc/generic/lib/integer/hadd.inc6
-rw-r--r--libclc/generic/lib/integer/mad24.cl4
-rw-r--r--libclc/generic/lib/integer/mad24.inc3
-rw-r--r--libclc/generic/lib/integer/mad_sat.cl72
-rw-r--r--libclc/generic/lib/integer/mul24.cl4
-rw-r--r--libclc/generic/lib/integer/mul24.inc11
-rw-r--r--libclc/generic/lib/integer/mul_hi.cl109
-rw-r--r--libclc/generic/lib/integer/rhadd.cl4
-rw-r--r--libclc/generic/lib/integer/rhadd.inc6
-rw-r--r--libclc/generic/lib/integer/rotate.cl4
-rw-r--r--libclc/generic/lib/integer/rotate.inc42
-rw-r--r--libclc/generic/lib/integer/sub_sat.cl53
-rw-r--r--libclc/generic/lib/integer/sub_sat_if.ll55
-rw-r--r--libclc/generic/lib/integer/sub_sat_impl.ll83
-rw-r--r--libclc/generic/lib/integer/upsample.cl34
-rw-r--r--libclc/generic/lib/math/acos.cl8
-rw-r--r--libclc/generic/lib/math/acos.inc29
-rw-r--r--libclc/generic/lib/math/acosh.cl127
-rw-r--r--libclc/generic/lib/math/acospi.cl172
-rw-r--r--libclc/generic/lib/math/asin.cl8
-rw-r--r--libclc/generic/lib/math/asin.inc12
-rw-r--r--libclc/generic/lib/math/asinh.cl293
-rw-r--r--libclc/generic/lib/math/asinpi.cl170
-rw-r--r--libclc/generic/lib/math/atan.cl183
-rw-r--r--libclc/generic/lib/math/atan2.cl237
-rw-r--r--libclc/generic/lib/math/atan2pi.cl221
-rw-r--r--libclc/generic/lib/math/atanh.cl113
-rw-r--r--libclc/generic/lib/math/atanpi.cl182
-rw-r--r--libclc/generic/lib/math/binary_impl.inc22
-rw-r--r--libclc/generic/lib/math/clc_ldexp.cl128
-rw-r--r--libclc/generic/lib/math/clc_nextafter.cl43
-rw-r--r--libclc/generic/lib/math/clc_sqrt.cl37
-rw-r--r--libclc/generic/lib/math/clc_sqrt_impl.inc38
-rw-r--r--libclc/generic/lib/math/copysign.cl12
-rw-r--r--libclc/generic/lib/math/cos.cl77
-rw-r--r--libclc/generic/lib/math/cospi.cl136
-rw-r--r--libclc/generic/lib/math/ep_log.cl94
-rw-r--r--libclc/generic/lib/math/ep_log.h29
-rw-r--r--libclc/generic/lib/math/erfc.cl413
-rw-r--r--libclc/generic/lib/math/exp.cl90
-rw-r--r--libclc/generic/lib/math/exp10.cl8
-rw-r--r--libclc/generic/lib/math/exp10.inc10
-rw-r--r--libclc/generic/lib/math/exp2.cl86
-rw-r--r--libclc/generic/lib/math/exp_helper.cl69
-rw-r--r--libclc/generic/lib/math/exp_helper.h29
-rw-r--r--libclc/generic/lib/math/fmax.cl16
-rw-r--r--libclc/generic/lib/math/fmax.inc18
-rw-r--r--libclc/generic/lib/math/fmin.cl16
-rw-r--r--libclc/generic/lib/math/fmin.inc18
-rw-r--r--libclc/generic/lib/math/fmod.cl12
-rw-r--r--libclc/generic/lib/math/fract.cl30
-rw-r--r--libclc/generic/lib/math/fract.inc49
-rw-r--r--libclc/generic/lib/math/frexp.cl10
-rw-r--r--libclc/generic/lib/math/frexp.inc110
-rw-r--r--libclc/generic/lib/math/half_rsqrt.cl28
-rw-r--r--libclc/generic/lib/math/half_rsqrt.inc25
-rw-r--r--libclc/generic/lib/math/half_sqrt.cl28
-rw-r--r--libclc/generic/lib/math/half_sqrt.inc25
-rw-r--r--libclc/generic/lib/math/hypot.cl8
-rw-r--r--libclc/generic/lib/math/hypot.inc3
-rw-r--r--libclc/generic/lib/math/ldexp.cl41
-rw-r--r--libclc/generic/lib/math/ldexp.inc29
-rw-r--r--libclc/generic/lib/math/log.cl26
-rw-r--r--libclc/generic/lib/math/log10.cl8
-rw-r--r--libclc/generic/lib/math/log10.inc13
-rw-r--r--libclc/generic/lib/math/log1p.cl177
-rw-r--r--libclc/generic/lib/math/log2.cl39
-rw-r--r--libclc/generic/lib/math/log_base.h299
-rw-r--r--libclc/generic/lib/math/mad.cl8
-rw-r--r--libclc/generic/lib/math/mad.inc3
-rw-r--r--libclc/generic/lib/math/math.h90
-rw-r--r--libclc/generic/lib/math/modf.cl32
-rw-r--r--libclc/generic/lib/math/modf.inc37
-rw-r--r--libclc/generic/lib/math/native_log.cl32
-rw-r--r--libclc/generic/lib/math/native_log.inc25
-rw-r--r--libclc/generic/lib/math/native_log2.cl32
-rw-r--r--libclc/generic/lib/math/native_log2.inc25
-rw-r--r--libclc/generic/lib/math/nextafter.cl12
-rw-r--r--libclc/generic/lib/math/pown.cl10
-rw-r--r--libclc/generic/lib/math/sin.cl79
-rw-r--r--libclc/generic/lib/math/sincos.cl8
-rw-r--r--libclc/generic/lib/math/sincos.inc11
-rw-r--r--libclc/generic/lib/math/sincosD_piby4.h78
-rw-r--r--libclc/generic/lib/math/sincos_helpers.cl545
-rw-r--r--libclc/generic/lib/math/sincos_helpers.h35
-rw-r--r--libclc/generic/lib/math/sincospiF_piby4.h56
-rw-r--r--libclc/generic/lib/math/sinpi.cl131
-rw-r--r--libclc/generic/lib/math/sqrt.cl35
-rw-r--r--libclc/generic/lib/math/tables.cl841
-rw-r--r--libclc/generic/lib/math/tables.h53
-rw-r--r--libclc/generic/lib/math/tan.cl8
-rw-r--r--libclc/generic/lib/math/tan.inc17
-rw-r--r--libclc/generic/lib/math/tanh.cl146
-rw-r--r--libclc/generic/lib/relational/all.cl29
-rw-r--r--libclc/generic/lib/relational/any.cl30
-rw-r--r--libclc/generic/lib/relational/bitselect.cl53
-rw-r--r--libclc/generic/lib/relational/bitselect.inc25
-rw-r--r--libclc/generic/lib/relational/isequal.cl30
-rw-r--r--libclc/generic/lib/relational/isfinite.cl18
-rw-r--r--libclc/generic/lib/relational/isgreater.cl22
-rw-r--r--libclc/generic/lib/relational/isgreaterequal.cl22
-rw-r--r--libclc/generic/lib/relational/isinf.cl18
-rw-r--r--libclc/generic/lib/relational/isless.cl22
-rw-r--r--libclc/generic/lib/relational/islessequal.cl22
-rw-r--r--libclc/generic/lib/relational/islessgreater.cl22
-rw-r--r--libclc/generic/lib/relational/isnan.cl18
-rw-r--r--libclc/generic/lib/relational/isnormal.cl18
-rw-r--r--libclc/generic/lib/relational/isnotequal.cl23
-rw-r--r--libclc/generic/lib/relational/isordered.cl23
-rw-r--r--libclc/generic/lib/relational/isunordered.cl22
-rw-r--r--libclc/generic/lib/relational/relational.h117
-rw-r--r--libclc/generic/lib/relational/signbit.cl19
-rw-r--r--libclc/generic/lib/shared/clamp.cl11
-rw-r--r--libclc/generic/lib/shared/clamp.inc9
-rw-r--r--libclc/generic/lib/shared/max.cl11
-rw-r--r--libclc/generic/lib/shared/max.inc9
-rw-r--r--libclc/generic/lib/shared/min.cl11
-rw-r--r--libclc/generic/lib/shared/min.inc9
-rw-r--r--libclc/generic/lib/shared/vload.cl52
-rw-r--r--libclc/generic/lib/shared/vstore.cl52
-rw-r--r--libclc/generic/lib/subnormal_config.cl37
-rw-r--r--libclc/generic/lib/subnormal_disable.ll1
-rw-r--r--libclc/generic/lib/subnormal_helper_func.ll8
-rw-r--r--libclc/generic/lib/subnormal_use_default.ll1
-rw-r--r--libclc/generic/lib/workitem/get_global_id.cl5
-rw-r--r--libclc/generic/lib/workitem/get_global_size.cl5
190 files changed, 9948 insertions, 0 deletions
diff --git a/libclc/generic/lib/SOURCES b/libclc/generic/lib/SOURCES
new file mode 100644
index 0000000..c3a5a8a
--- /dev/null
+++ b/libclc/generic/lib/SOURCES
@@ -0,0 +1,139 @@
+subnormal_config.cl
+subnormal_helper_func.ll
+async/async_work_group_copy.cl
+async/async_work_group_strided_copy.cl
+async/prefetch.cl
+async/wait_group_events.cl
+atomic/atomic_xchg.cl
+atomic/atomic_impl.ll
+cl_khr_global_int32_base_atomics/atom_add.cl
+cl_khr_global_int32_base_atomics/atom_cmpxchg.cl
+cl_khr_global_int32_base_atomics/atom_dec.cl
+cl_khr_global_int32_base_atomics/atom_inc.cl
+cl_khr_global_int32_base_atomics/atom_sub.cl
+cl_khr_global_int32_base_atomics/atom_xchg.cl
+cl_khr_global_int32_extended_atomics/atom_and.cl
+cl_khr_global_int32_extended_atomics/atom_max.cl
+cl_khr_global_int32_extended_atomics/atom_min.cl
+cl_khr_global_int32_extended_atomics/atom_or.cl
+cl_khr_global_int32_extended_atomics/atom_xor.cl
+cl_khr_local_int32_base_atomics/atom_add.cl
+cl_khr_local_int32_base_atomics/atom_cmpxchg.cl
+cl_khr_local_int32_base_atomics/atom_dec.cl
+cl_khr_local_int32_base_atomics/atom_inc.cl
+cl_khr_local_int32_base_atomics/atom_sub.cl
+cl_khr_local_int32_base_atomics/atom_xchg.cl
+cl_khr_local_int32_extended_atomics/atom_and.cl
+cl_khr_local_int32_extended_atomics/atom_max.cl
+cl_khr_local_int32_extended_atomics/atom_min.cl
+cl_khr_local_int32_extended_atomics/atom_or.cl
+cl_khr_local_int32_extended_atomics/atom_xor.cl
+convert.cl
+common/degrees.cl
+common/mix.cl
+common/radians.cl
+common/sign.cl
+common/smoothstep.cl
+common/step.cl
+geometric/cross.cl
+geometric/distance.cl
+geometric/dot.cl
+geometric/fast_distance.cl
+geometric/fast_length.cl
+geometric/fast_normalize.cl
+geometric/length.cl
+geometric/normalize.cl
+integer/abs.cl
+integer/abs_diff.cl
+integer/add_sat.cl
+integer/add_sat_if.ll
+integer/add_sat_impl.ll
+integer/clz.cl
+integer/clz_if.ll
+integer/clz_impl.ll
+integer/hadd.cl
+integer/mad24.cl
+integer/mad_sat.cl
+integer/mul24.cl
+integer/mul_hi.cl
+integer/rhadd.cl
+integer/rotate.cl
+integer/sub_sat.cl
+integer/sub_sat_if.ll
+integer/sub_sat_impl.ll
+integer/upsample.cl
+math/acos.cl
+math/acosh.cl
+math/acospi.cl
+math/asin.cl
+math/asinh.cl
+math/asinpi.cl
+math/atan.cl
+math/atan2.cl
+math/atan2pi.cl
+math/atanh.cl
+math/atanpi.cl
+math/copysign.cl
+math/cos.cl
+math/cospi.cl
+math/ep_log.cl
+math/erfc.cl
+math/exp.cl
+math/exp_helper.cl
+math/exp2.cl
+math/exp10.cl
+math/fmax.cl
+math/fmin.cl
+math/fmod.cl
+math/fract.cl
+math/frexp.cl
+math/half_rsqrt.cl
+math/half_sqrt.cl
+math/hypot.cl
+math/clc_ldexp.cl
+math/ldexp.cl
+math/log.cl
+math/log10.cl
+math/log1p.cl
+math/log2.cl
+math/mad.cl
+math/modf.cl
+math/native_log.cl
+math/native_log2.cl
+math/tables.cl
+math/clc_nextafter.cl
+math/nextafter.cl
+math/pown.cl
+math/sin.cl
+math/sincos.cl
+math/sincos_helpers.cl
+math/sinpi.cl
+math/clc_sqrt.cl
+math/sqrt.cl
+math/tan.cl
+math/tanh.cl
+relational/all.cl
+relational/any.cl
+relational/bitselect.cl
+relational/isequal.cl
+relational/isfinite.cl
+relational/isgreater.cl
+relational/isgreaterequal.cl
+relational/isinf.cl
+relational/isless.cl
+relational/islessequal.cl
+relational/islessgreater.cl
+relational/isnan.cl
+relational/isnormal.cl
+relational/isnotequal.cl
+relational/isordered.cl
+relational/isunordered.cl
+relational/signbit.cl
+shared/clamp.cl
+shared/max.cl
+shared/min.cl
+shared/vload.cl
+shared/vstore.cl
+workitem/get_global_id.cl
+workitem/get_global_size.cl
+image/get_image_dim.cl
diff --git a/libclc/generic/lib/async/async_work_group_copy.cl b/libclc/generic/lib/async/async_work_group_copy.cl
new file mode 100644
index 0000000..fe20ecf
--- /dev/null
+++ b/libclc/generic/lib/async/async_work_group_copy.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64 // only touch the fp64 extension when the target advertises it
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+// __CLC_BODY names the template file; gentype.inc presumably includes it once per supported type (TODO confirm).
+#define __CLC_BODY <async_work_group_copy.inc>
+#include <clc/async/gentype.inc>
+#undef __CLC_BODY
diff --git a/libclc/generic/lib/async/async_work_group_copy.inc b/libclc/generic/lib/async/async_work_group_copy.inc
new file mode 100644
index 0000000..a143ddf
--- /dev/null
+++ b/libclc/generic/lib/async/async_work_group_copy.inc
@@ -0,0 +1,17 @@
+_CLC_OVERLOAD _CLC_DEF event_t async_work_group_copy( // overload: global source -> local destination
+ local __CLC_GENTYPE *dst,
+ const global __CLC_GENTYPE *src,
+ size_t num_gentypes,
+ event_t event) {
+ // A contiguous copy is expressed as a strided copy with a stride of 1.
+ return async_work_group_strided_copy(dst, src, num_gentypes, 1, event);
+}
+
+_CLC_OVERLOAD _CLC_DEF event_t async_work_group_copy( // overload: local source -> global destination
+ global __CLC_GENTYPE *dst,
+ const local __CLC_GENTYPE *src,
+ size_t num_gentypes,
+ event_t event) {
+ // A contiguous copy is expressed as a strided copy with a stride of 1.
+ return async_work_group_strided_copy(dst, src, num_gentypes, 1, event);
+}
diff --git a/libclc/generic/lib/async/async_work_group_strided_copy.cl b/libclc/generic/lib/async/async_work_group_strided_copy.cl
new file mode 100644
index 0000000..61b8898
--- /dev/null
+++ b/libclc/generic/lib/async/async_work_group_strided_copy.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64 // only touch the fp64 extension when the target advertises it
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+// __CLC_BODY names the template file; gentype.inc presumably includes it once per supported type (TODO confirm).
+#define __CLC_BODY <async_work_group_strided_copy.inc>
+#include <clc/async/gentype.inc>
+#undef __CLC_BODY
diff --git a/libclc/generic/lib/async/async_work_group_strided_copy.inc b/libclc/generic/lib/async/async_work_group_strided_copy.inc
new file mode 100644
index 0000000..d81a8b7
--- /dev/null
+++ b/libclc/generic/lib/async/async_work_group_strided_copy.inc
@@ -0,0 +1,34 @@
+
+#define STRIDED_COPY(dst, src, num_gentypes, dst_stride, src_stride) /* copy loop shared by both overloads; declares size, id and i in the expanding scope */ \
+ size_t size = get_local_size(0) * get_local_size(1) * get_local_size(2); /* product of the three local sizes */ \
+ size_t id = (get_local_size(1) * get_local_size(2) * get_local_id(0)) + \
+ (get_local_size(2) * get_local_id(1)) + \
+ get_local_id(2); /* 3-D local id flattened to one index, dimension 2 varying fastest */ \
+ size_t i; \
+ /* This work-item handles elements id, id + size, id + 2 * size, ... below num_gentypes. */ \
+ for (i = id; i < num_gentypes; i += size) { \
+ dst[i * dst_stride] = src[i * src_stride]; \
+ }
+
+
+_CLC_OVERLOAD _CLC_DEF event_t async_work_group_strided_copy( // overload: strided global source -> local destination
+ local __CLC_GENTYPE *dst,
+ const global __CLC_GENTYPE *src,
+ size_t num_gentypes,
+ size_t src_stride,
+ event_t event) {
+ // Destination indices use stride 1; only the source index is scaled by src_stride.
+ STRIDED_COPY(dst, src, num_gentypes, 1, src_stride);
+ return event; // the copy loop above has already run; the incoming event is handed back unchanged
+}
+
+_CLC_OVERLOAD _CLC_DEF event_t async_work_group_strided_copy( // overload: local source -> strided global destination
+ global __CLC_GENTYPE *dst,
+ const local __CLC_GENTYPE *src,
+ size_t num_gentypes,
+ size_t dst_stride,
+ event_t event) {
+ // Source indices use stride 1; only the destination index is scaled by dst_stride.
+ STRIDED_COPY(dst, src, num_gentypes, dst_stride, 1);
+ return event; // the copy loop above has already run; the incoming event is handed back unchanged
+}
diff --git a/libclc/generic/lib/async/prefetch.cl b/libclc/generic/lib/async/prefetch.cl
new file mode 100644
index 0000000..45af21b
--- /dev/null
+++ b/libclc/generic/lib/async/prefetch.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64 // only touch the fp64 extension when the target advertises it
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+// __CLC_BODY names the template file; gentype.inc presumably includes it once per supported type (TODO confirm).
+#define __CLC_BODY <prefetch.inc>
+#include <clc/async/gentype.inc>
+#undef __CLC_BODY
diff --git a/libclc/generic/lib/async/prefetch.inc b/libclc/generic/lib/async/prefetch.inc
new file mode 100644
index 0000000..6747e4c
--- /dev/null
+++ b/libclc/generic/lib/async/prefetch.inc
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DEF void prefetch(const global __CLC_GENTYPE *p, size_t num_gentypes) { } // empty body: this implementation does nothing with p or num_gentypes
diff --git a/libclc/generic/lib/async/wait_group_events.cl b/libclc/generic/lib/async/wait_group_events.cl
new file mode 100644
index 0000000..05c9d58
--- /dev/null
+++ b/libclc/generic/lib/async/wait_group_events.cl
@@ -0,0 +1,5 @@
+#include <clc/clc.h>
+
+_CLC_DEF void wait_group_events(int num_events, event_t *event_list) { // neither argument is read by this implementation
+ barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); // one barrier call carrying both the local and the global fence flag
+}
diff --git a/libclc/generic/lib/atomic/atomic_impl.ll b/libclc/generic/lib/atomic/atomic_impl.ll
new file mode 100644
index 0000000..019147f
--- /dev/null
+++ b/libclc/generic/lib/atomic/atomic_impl.ll
@@ -0,0 +1,133 @@
+define i32 @__clc_atomic_add_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic add through an addrspace(1) pointer
+entry:
+ %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the add
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_add_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic add through an addrspace(3) pointer
+entry:
+ %0 = atomicrmw volatile add i32 addrspace(3)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the add
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_and_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic bitwise AND through an addrspace(1) pointer
+entry:
+ %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the AND
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_and_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic bitwise AND through an addrspace(3) pointer
+entry:
+ %0 = atomicrmw volatile and i32 addrspace(3)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the AND
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_cmpxchg_addr1(i32 addrspace(1)* nocapture %ptr, i32 %compare, i32 %value) nounwind alwaysinline { ; i32 compare-and-exchange through an addrspace(1) pointer
+entry:
+ %0 = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %compare, i32 %value seq_cst seq_cst ; seq_cst for both the success and the failure ordering
+ %1 = extractvalue { i32, i1 } %0, 0 ; keep field 0 (the i32 loaded from %ptr); the i1 success flag is discarded
+ ret i32 %1
+}
+
+define i32 @__clc_atomic_cmpxchg_addr3(i32 addrspace(3)* nocapture %ptr, i32 %compare, i32 %value) nounwind alwaysinline { ; i32 compare-and-exchange through an addrspace(3) pointer
+entry:
+ %0 = cmpxchg volatile i32 addrspace(3)* %ptr, i32 %compare, i32 %value seq_cst seq_cst ; seq_cst for both the success and the failure ordering
+ %1 = extractvalue { i32, i1 } %0, 0 ; keep field 0 (the i32 loaded from %ptr); the i1 success flag is discarded
+ ret i32 %1
+}
+
+define i32 @__clc_atomic_max_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic max (signed comparison) through an addrspace(1) pointer
+entry:
+ %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the update
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_max_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic max (signed comparison) through an addrspace(3) pointer
+entry:
+ %0 = atomicrmw volatile max i32 addrspace(3)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the update
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_min_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic min (signed comparison) through an addrspace(1) pointer
+entry:
+ %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the update
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_min_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic min (signed comparison) through an addrspace(3) pointer
+entry:
+ %0 = atomicrmw volatile min i32 addrspace(3)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the update
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_or_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic bitwise OR through an addrspace(1) pointer
+entry:
+ %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the OR
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_or_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic bitwise OR through an addrspace(3) pointer
+entry:
+ %0 = atomicrmw volatile or i32 addrspace(3)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the OR
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_umax_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic max (unsigned comparison) through an addrspace(1) pointer
+entry:
+ %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the update
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_umax_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic max (unsigned comparison) through an addrspace(3) pointer
+entry:
+ %0 = atomicrmw volatile umax i32 addrspace(3)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the update
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_umin_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic min (unsigned comparison) through an addrspace(1) pointer
+entry:
+ %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the update
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_umin_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic min (unsigned comparison) through an addrspace(3) pointer
+entry:
+ %0 = atomicrmw volatile umin i32 addrspace(3)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the update
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_sub_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic subtract through an addrspace(1) pointer
+entry:
+ %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the subtract
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_sub_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic subtract through an addrspace(3) pointer
+entry:
+ %0 = atomicrmw volatile sub i32 addrspace(3)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the subtract
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_xchg_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic exchange through an addrspace(1) pointer
+entry:
+ %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the exchange
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_xchg_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic exchange through an addrspace(3) pointer
+entry:
+ %0 = atomicrmw volatile xchg i32 addrspace(3)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the exchange
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_xor_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic bitwise XOR through an addrspace(1) pointer
+entry:
+ %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the XOR
+ ret i32 %0
+}
+
+define i32 @__clc_atomic_xor_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { ; i32 atomic bitwise XOR through an addrspace(3) pointer
+entry:
+ %0 = atomicrmw volatile xor i32 addrspace(3)* %ptr, i32 %value seq_cst ; %0 is the atomicrmw result, i.e. the value at %ptr before the XOR
+ ret i32 %0
+}
diff --git a/libclc/generic/lib/atomic/atomic_xchg.cl b/libclc/generic/lib/atomic/atomic_xchg.cl
new file mode 100644
index 0000000..9aee595
--- /dev/null
+++ b/libclc/generic/lib/atomic/atomic_xchg.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+_CLC_OVERLOAD _CLC_DEF float atomic_xchg(volatile global float *p, float val) { // float exchange on a global pointer
+ return as_float(atomic_xchg((volatile global int *)p, as_int(val))); // reuse the int overload on the reinterpreted bits, then reinterpret the result back to float
+}
+
+_CLC_OVERLOAD _CLC_DEF float atomic_xchg(volatile local float *p, float val) { // float exchange on a local pointer
+ return as_float(atomic_xchg((volatile local int *)p, as_int(val))); // reuse the int overload on the reinterpreted bits, then reinterpret the result back to float
+}
diff --git a/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_add.cl b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_add.cl
new file mode 100644
index 0000000..9151b0c
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_add.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_add on a global pointer: thin wrapper that forwards to atomic_add */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_add(global TYPE *p, TYPE val) { \
+ return atomic_add(p, val); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_cmpxchg.cl b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_cmpxchg.cl
new file mode 100644
index 0000000..7647740
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_cmpxchg.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_cmpxchg on a global pointer: thin wrapper that forwards to atomic_cmpxchg */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_cmpxchg(global TYPE *p, TYPE cmp, TYPE val) { \
+ return atomic_cmpxchg(p, cmp, val); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_dec.cl b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_dec.cl
new file mode 100644
index 0000000..a74158d
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_dec.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_dec on a global pointer: implemented as atom_sub with a constant 1 */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_dec(global TYPE *p) { \
+ return atom_sub(p, 1); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_inc.cl b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_inc.cl
new file mode 100644
index 0000000..1404b5a
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_inc.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_inc on a global pointer: implemented as atom_add with a constant 1 */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_inc(global TYPE *p) { \
+ return atom_add(p, 1); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_sub.cl b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_sub.cl
new file mode 100644
index 0000000..7faa3cc
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_sub.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_sub on a global pointer: thin wrapper that forwards to atomic_sub */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_sub(global TYPE *p, TYPE val) { \
+ return atomic_sub(p, val); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_xchg.cl b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_xchg.cl
new file mode 100644
index 0000000..9c77db1
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_xchg.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_xchg on a global pointer: thin wrapper that forwards to atomic_xchg */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_xchg(global TYPE *p, TYPE val) { \
+ return atomic_xchg(p, val); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_and.cl b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_and.cl
new file mode 100644
index 0000000..e587969
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_and.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_and on a global pointer: thin wrapper that forwards to atomic_and */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_and(global TYPE *p, TYPE val) { \
+ return atomic_and(p, val); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
\ No newline at end of file
diff --git a/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_max.cl b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_max.cl
new file mode 100644
index 0000000..09177ed
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_max.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_max on a global pointer: thin wrapper that forwards to atomic_max */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_max(global TYPE *p, TYPE val) { \
+ return atomic_max(p, val); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_min.cl b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_min.cl
new file mode 100644
index 0000000..277c41b
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_min.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_min on a global pointer: thin wrapper that forwards to atomic_min */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_min(global TYPE *p, TYPE val) { \
+ return atomic_min(p, val); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_or.cl b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_or.cl
new file mode 100644
index 0000000..a936a8e
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_or.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_or on a global pointer: thin wrapper that forwards to atomic_or */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_or(global TYPE *p, TYPE val) { \
+ return atomic_or(p, val); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_xor.cl b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_xor.cl
new file mode 100644
index 0000000..1a8e350
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_xor.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_xor on a global pointer: thin wrapper that forwards to atomic_xor */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_xor(global TYPE *p, TYPE val) { \
+ return atomic_xor(p, val); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_add.cl b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_add.cl
new file mode 100644
index 0000000..a5dea18
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_add.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_add on a local pointer: thin wrapper that forwards to atomic_add */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_add(local TYPE *p, TYPE val) { \
+ return atomic_add(p, val); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_cmpxchg.cl b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_cmpxchg.cl
new file mode 100644
index 0000000..16e9579
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_cmpxchg.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_cmpxchg on a local pointer: thin wrapper that forwards to atomic_cmpxchg */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_cmpxchg(local TYPE *p, TYPE cmp, TYPE val) { \
+ return atomic_cmpxchg(p, cmp, val); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_dec.cl b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_dec.cl
new file mode 100644
index 0000000..d22c333
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_dec.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_dec on a local pointer: implemented as atom_sub with a constant 1 */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_dec(local TYPE *p) { \
+ return atom_sub(p, 1); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_inc.cl b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_inc.cl
new file mode 100644
index 0000000..4ba0d06
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_inc.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_inc on a local pointer: implemented as atom_add with a constant 1 */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_inc(local TYPE *p) { \
+ return atom_add(p, 1); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_sub.cl b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_sub.cl
new file mode 100644
index 0000000..c96696a
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_sub.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_sub on a local pointer: thin wrapper that forwards to atomic_sub */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_sub(local TYPE *p, TYPE val) { \
+ return atomic_sub(p, val); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_xchg.cl b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_xchg.cl
new file mode 100644
index 0000000..7d4bcca
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_xchg.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_xchg on a local pointer: thin wrapper that forwards to atomic_xchg */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_xchg(local TYPE *p, TYPE val) { \
+ return atomic_xchg(p, val); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_and.cl b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_and.cl
new file mode 100644
index 0000000..180103a
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_and.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+#define IMPL(TYPE) /* atom_and on a local pointer: thin wrapper that forwards to atomic_and */ \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_and(local TYPE *p, TYPE val) { \
+ return atomic_and(p, val); \
+}
+// Emit the overload for int and for unsigned int.
+IMPL(int)
+IMPL(unsigned int)
\ No newline at end of file
diff --git a/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_max.cl b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_max.cl
new file mode 100644
index 0000000..b90301b
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_max.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+/*
+ * IMPL(TYPE) defines the atom_max overload that takes a local TYPE pointer.
+ * The overload forwards to atomic_max(p, val) and returns that call's result.
+ */
+#define IMPL(TYPE) \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_max(local TYPE *p, TYPE val) { \
+ return atomic_max(p, val); \
+}
+
+/* Instantiate the overload for int and unsigned int. */
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_min.cl b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_min.cl
new file mode 100644
index 0000000..3acedd8
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_min.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+/*
+ * IMPL(TYPE) defines the atom_min overload that takes a local TYPE pointer.
+ * The overload forwards to atomic_min(p, val) and returns that call's result.
+ */
+#define IMPL(TYPE) \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_min(local TYPE *p, TYPE val) { \
+ return atomic_min(p, val); \
+}
+
+/* Instantiate the overload for int and unsigned int. */
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_or.cl b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_or.cl
new file mode 100644
index 0000000..338ff2c
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_or.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+/*
+ * IMPL(TYPE) defines the atom_or overload that takes a local TYPE pointer.
+ * The overload forwards to atomic_or(p, val) and returns that call's result.
+ */
+#define IMPL(TYPE) \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_or(local TYPE *p, TYPE val) { \
+ return atomic_or(p, val); \
+}
+
+/* Instantiate the overload for int and unsigned int. */
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_xor.cl b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_xor.cl
new file mode 100644
index 0000000..51ae3c0
--- /dev/null
+++ b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_xor.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+/*
+ * IMPL(TYPE) defines the atom_xor overload that takes a local TYPE pointer.
+ * The overload forwards to atomic_xor(p, val) and returns that call's result.
+ */
+#define IMPL(TYPE) \
+_CLC_OVERLOAD _CLC_DEF TYPE atom_xor(local TYPE *p, TYPE val) { \
+ return atomic_xor(p, val); \
+}
+
+/* Instantiate the overload for int and unsigned int. */
+IMPL(int)
+IMPL(unsigned int)
diff --git a/libclc/generic/lib/clcmacro.h b/libclc/generic/lib/clcmacro.h
new file mode 100644
index 0000000..88f3b2a
--- /dev/null
+++ b/libclc/generic/lib/clcmacro.h
@@ -0,0 +1,148 @@
+/*
+ * _CLC_UNARY_VECTORIZE defines the 2-, 3-, 4-, 8- and 16-component overloads
+ * of a one-argument FUNCTION. The 2- and 3-component overloads call FUNCTION
+ * on each component; the wider overloads call FUNCTION on the .lo and .hi
+ * halves, i.e. on the next-narrower vector width.
+ *
+ * DECLSPEC  - qualifiers placed in front of every generated definition
+ * RET_TYPE  - element type of the result (the width is pasted on)
+ * FUNCTION  - name of the function being overloaded
+ * ARG1_TYPE - element type of the argument (the width is pasted on)
+ *
+ * A FUNCTION overload taking a single ARG1_TYPE must already be visible,
+ * because the 2- and 3-component bodies call it.
+ */
+#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \
+ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \
+ return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y)); \
+ } \
+\
+ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x) { \
+ return (RET_TYPE##3)(FUNCTION(x.x), FUNCTION(x.y), FUNCTION(x.z)); \
+ } \
+\
+ DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x) { \
+ return (RET_TYPE##4)(FUNCTION(x.lo), FUNCTION(x.hi)); \
+ } \
+\
+ DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x) { \
+ return (RET_TYPE##8)(FUNCTION(x.lo), FUNCTION(x.hi)); \
+ } \
+\
+ DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x) { \
+ return (RET_TYPE##16)(FUNCTION(x.lo), FUNCTION(x.hi)); \
+ }
+
+/*
+ * _CLC_BINARY_VECTORIZE defines the 2-, 3-, 4-, 8- and 16-component overloads
+ * of a two-argument FUNCTION where both arguments are vectors of the same
+ * width. The 2- and 3-component overloads call FUNCTION on matching
+ * components; the wider overloads call FUNCTION on the matching .lo and .hi
+ * halves.
+ *
+ * RET_TYPE, ARG1_TYPE and ARG2_TYPE are element types; the width is pasted on.
+ */
+#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \
+ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) { \
+ return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y)); \
+ } \
+\
+ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y) { \
+ return (RET_TYPE##3)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y), \
+ FUNCTION(x.z, y.z)); \
+ } \
+\
+ DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y) { \
+ return (RET_TYPE##4)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
+ } \
+\
+ DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y) { \
+ return (RET_TYPE##8)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
+ } \
+\
+ DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y) { \
+ return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
+ }
+
+/*
+ * _CLC_V_S_V_VECTORIZE defines the vector overloads of a two-argument
+ * FUNCTION whose first argument stays scalar (ARG1_TYPE) while the second
+ * argument and the result are vectors of 2, 3, 4, 8 or 16 components.
+ * The scalar x is passed unchanged to every call. The 2- and 3-component
+ * overloads call FUNCTION once per component of y; the wider overloads call
+ * FUNCTION on the .lo and .hi halves of y.
+ *
+ * The definition ends on its last closing brace, so it does not depend on
+ * the line that follows it being blank.
+ */
+#define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \
+  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) { \
+    return (RET_TYPE##2)(FUNCTION(x, y.x), FUNCTION(x, y.y)); \
+  } \
+\
+  DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE x, ARG2_TYPE##3 y) { \
+    return (RET_TYPE##3)(FUNCTION(x, y.x), FUNCTION(x, y.y), \
+                         FUNCTION(x, y.z)); \
+  } \
+\
+  DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE x, ARG2_TYPE##4 y) { \
+    return (RET_TYPE##4)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
+  } \
+\
+  DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE x, ARG2_TYPE##8 y) { \
+    return (RET_TYPE##8)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
+  } \
+\
+  DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE x, ARG2_TYPE##16 y) { \
+    return (RET_TYPE##16)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
+  }
+
+/*
+ * _CLC_TERNARY_VECTORIZE defines the 2-, 3-, 4-, 8- and 16-component
+ * overloads of a three-argument FUNCTION where all three arguments are
+ * vectors of the same width. The 2- and 3-component overloads call FUNCTION
+ * on matching components; the wider overloads call FUNCTION on the matching
+ * .lo and .hi halves.
+ *
+ * RET_TYPE and ARG1_TYPE..ARG3_TYPE are element types; the width is pasted on.
+ */
+#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE, ARG3_TYPE) \
+ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, ARG3_TYPE##2 z) { \
+ return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y)); \
+ } \
+\
+ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, ARG3_TYPE##3 z) { \
+ return (RET_TYPE##3)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y), \
+ FUNCTION(x.z, y.z, z.z)); \
+ } \
+\
+ DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y, ARG3_TYPE##4 z) { \
+ return (RET_TYPE##4)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \
+ } \
+\
+ DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y, ARG3_TYPE##8 z) { \
+ return (RET_TYPE##8)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \
+ } \
+\
+ DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y, ARG3_TYPE##16 z) { \
+ return (RET_TYPE##16)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \
+ }
+
+/*
+ * _CLC_V_S_S_V_VECTORIZE defines the vector overloads of a three-argument
+ * FUNCTION whose first two arguments stay scalar (ARG1_TYPE, ARG2_TYPE) while
+ * the third argument and the result are vectors of 2, 3, 4, 8 or 16
+ * components. The scalars x and y are passed unchanged to every call. The
+ * 2- and 3-component overloads call FUNCTION once per component of z; the
+ * wider overloads call FUNCTION on the .lo and .hi halves of z.
+ *
+ * The definition ends on its last closing brace, so it does not depend on
+ * the line that follows it being blank.
+ */
+#define _CLC_V_S_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE, ARG3_TYPE) \
+  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##2 z) { \
+    return (RET_TYPE##2)(FUNCTION(x, y, z.x), FUNCTION(x, y, z.y)); \
+  } \
+\
+  DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##3 z) { \
+    return (RET_TYPE##3)(FUNCTION(x, y, z.x), FUNCTION(x, y, z.y), \
+                         FUNCTION(x, y, z.z)); \
+  } \
+\
+  DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##4 z) { \
+    return (RET_TYPE##4)(FUNCTION(x, y, z.lo), FUNCTION(x, y, z.hi)); \
+  } \
+\
+  DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##8 z) { \
+    return (RET_TYPE##8)(FUNCTION(x, y, z.lo), FUNCTION(x, y, z.hi)); \
+  } \
+\
+  DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##16 z) { \
+    return (RET_TYPE##16)(FUNCTION(x, y, z.lo), FUNCTION(x, y, z.hi)); \
+  }
+
+/*
+ * _CLC_V_V_VP_VECTORIZE defines the vector overloads of a FUNCTION that takes
+ * a value and a pointer. In the generated overloads x is a vector and y
+ * points to a vector of the same width.
+ *
+ * The 2- and 3-component overloads call FUNCTION once per component, passing
+ * y reinterpreted as ARG2_TYPE* and advanced by the component index. The
+ * wider overloads call FUNCTION on x.lo / x.hi, passing y reinterpreted as a
+ * pointer to the half-width vector; for the .hi call that pointer is first
+ * advanced by half the element count.
+ */
+#define _CLC_V_V_VP_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \
+ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 *y) { \
+ return (RET_TYPE##2)(FUNCTION(x.x, (ARG2_TYPE*)y), FUNCTION(x.y, (ARG2_TYPE*)y+1)); \
+ } \
+\
+ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 *y) { \
+ return (RET_TYPE##3)(FUNCTION(x.x, (ARG2_TYPE*)y), FUNCTION(x.y, (ARG2_TYPE*)y+1), \
+ FUNCTION(x.z, (ARG2_TYPE*)y+2)); \
+ } \
+\
+ DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 *y) { \
+ return (RET_TYPE##4)(FUNCTION(x.lo, (ARG2_TYPE##2*)y), FUNCTION(x.hi, (ARG2_TYPE##2*)((ARG2_TYPE*)y+2))); \
+ } \
+\
+ DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 *y) { \
+ return (RET_TYPE##8)(FUNCTION(x.lo, (ARG2_TYPE##4*)y), FUNCTION(x.hi, (ARG2_TYPE##4*)((ARG2_TYPE*)y+4))); \
+ } \
+\
+ DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 *y) { \
+ return (RET_TYPE##16)(FUNCTION(x.lo, (ARG2_TYPE##8*)y), FUNCTION(x.hi, (ARG2_TYPE##8*)((ARG2_TYPE*)y+8))); \
+ }
+
+/*
+ * _CLC_DEFINE_BINARY_BUILTIN defines the scalar FUNCTION(ARG1_TYPE, ARG2_TYPE)
+ * overload as a call to BUILTIN(x, y), then uses _CLC_BINARY_VECTORIZE to add
+ * the 2-, 3-, 4-, 8- and 16-component overloads on top of it.
+ */
+#define _CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \
+ return BUILTIN(x, y); \
+} \
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE)
+
+/*
+ * _CLC_DEFINE_BINARY_BUILTIN_WITH_SCALAR_SECOND_ARG expands to
+ * _CLC_DEFINE_BINARY_BUILTIN followed by
+ * _CLC_BINARY_VECTORIZE_SCALAR_SECOND_ARG with the same element types.
+ *
+ * NOTE(review): _CLC_BINARY_VECTORIZE_SCALAR_SECOND_ARG is not defined in this
+ * header. Confirm it is provided elsewhere before using this macro; otherwise
+ * its expansion will not compile.
+ */
+#define _CLC_DEFINE_BINARY_BUILTIN_WITH_SCALAR_SECOND_ARG(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \
+_CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \
+_CLC_BINARY_VECTORIZE_SCALAR_SECOND_ARG(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE)
+
+/*
+ * _CLC_DEFINE_UNARY_BUILTIN defines the scalar FUNCTION(ARG1_TYPE) overload
+ * as a call to BUILTIN(x), then uses _CLC_UNARY_VECTORIZE to add the 2-, 3-,
+ * 4-, 8- and 16-component overloads on top of it.
+ */
+#define _CLC_DEFINE_UNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE) \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x) { \
+ return BUILTIN(x); \
+} \
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE)
diff --git a/libclc/generic/lib/common/degrees.cl b/libclc/generic/lib/common/degrees.cl
new file mode 100644
index 0000000..5de56f8
--- /dev/null
+++ b/libclc/generic/lib/common/degrees.cl
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "../clcmacro.h"
+
+// Converts an angle from radians to degrees (single precision).
+_CLC_OVERLOAD _CLC_DEF float degrees(float radians) {
+  // 180/pi = ~57.29577951308232087685; rounded to float this is
+  // 0x1.ca5dc2p+5F (the double-precision value is 0x1.ca5dc1a63c1f8p+5).
+  const float degrees_per_radian = 0x1.ca5dc2p+5F;
+  return degrees_per_radian * radians;
+}
+
+// Component-wise overloads for float2/3/4/8/16.
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, degrees, float);
+
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Converts an angle from radians to degrees (double precision).
+_CLC_OVERLOAD _CLC_DEF double degrees(double radians) {
+  // 180/pi = ~57.29577951308232087685, i.e. 0x1.ca5dc1a63c1f8p+5 as a double.
+  const double degrees_per_radian = 0x1.ca5dc1a63c1f8p+5;
+  return degrees_per_radian * radians;
+}
+
+// Component-wise overloads for double2/3/4/8/16.
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, degrees, double);
+
+#endif
diff --git a/libclc/generic/lib/common/mix.cl b/libclc/generic/lib/common/mix.cl
new file mode 100644
index 0000000..294f332e
--- /dev/null
+++ b/libclc/generic/lib/common/mix.cl
@@ -0,0 +1,8 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+// Name mix.inc as the body file, then include the gentype driver header.
+// NOTE(review): presumably clc/math/gentype.inc includes __CLC_BODY once per
+// floating-point gentype with __CLC_GENTYPE set accordingly; confirm there.
+#define __CLC_BODY <mix.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/common/mix.inc b/libclc/generic/lib/common/mix.inc
new file mode 100644
index 0000000..1e8b936
--- /dev/null
+++ b/libclc/generic/lib/common/mix.inc
@@ -0,0 +1,9 @@
+// mix(x, y, a): blends from x towards y by a, evaluated as mad(y - x, a, x).
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE a) {
+  const __CLC_GENTYPE delta = y - x;
+  return mad(delta, a, x);
+}
+
+// Only emitted when __CLC_SCALAR is not defined: overload taking a scalar
+// blend factor, which is cast to __CLC_GENTYPE and handed to the overload
+// whose three arguments are all __CLC_GENTYPE.
+#ifndef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_SCALAR_GENTYPE a) {
+  const __CLC_GENTYPE a_wide = (__CLC_GENTYPE)a;
+  return mix(x, y, a_wide);
+}
+#endif
diff --git a/libclc/generic/lib/common/radians.cl b/libclc/generic/lib/common/radians.cl
new file mode 100644
index 0000000..3838dd6
--- /dev/null
+++ b/libclc/generic/lib/common/radians.cl
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "../clcmacro.h"
+
+// Converts an angle from degrees to radians (single precision).
+_CLC_OVERLOAD _CLC_DEF float radians(float degrees) {
+  // pi/180 = ~0.01745329251994329577; rounded to float this is
+  // 0x1.1df46ap-6F (the double-precision value is 0x1.1df46a2529d39p-6).
+  const float radians_per_degree = 0x1.1df46ap-6F;
+  return radians_per_degree * degrees;
+}
+
+// Component-wise overloads for float2/3/4/8/16.
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, radians, float);
+
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Converts an angle from degrees to radians (double precision).
+_CLC_OVERLOAD _CLC_DEF double radians(double degrees) {
+  // pi/180 = ~0.01745329251994329577, i.e. 0x1.1df46a2529d39p-6 as a double.
+  const double radians_per_degree = 0x1.1df46a2529d39p-6;
+  return radians_per_degree * degrees;
+}
+
+// Component-wise overloads for double2/3/4/8/16.
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, radians, double);
+
+#endif
diff --git a/libclc/generic/lib/common/sign.cl b/libclc/generic/lib/common/sign.cl
new file mode 100644
index 0000000..25832e0
--- /dev/null
+++ b/libclc/generic/lib/common/sign.cl
@@ -0,0 +1,28 @@
+#include <clc/clc.h>
+#include "../clcmacro.h"
+
+/*
+ * SIGN(TYPE, F) defines the scalar sign(TYPE) overload:
+ *   x > 0 gives 1, x < 0 gives -1, NaN gives 0, and a zero of either sign
+ *   is returned unchanged.
+ * The F parameter is not referenced by the expansion.
+ */
+#define SIGN(TYPE, F) \
+_CLC_DEF _CLC_OVERLOAD TYPE sign(TYPE x) { \
+  if (x > 0.0F) \
+    return 1.0F; \
+  if (x < 0.0F) \
+    return -1.0F; \
+  /* Both comparisons are false for NaN and for -0.0 / +0.0. */ \
+  return isnan(x) ? 0.0F : x; \
+}
+
+// Single-precision sign(): scalar overload plus float2/3/4/8/16 overloads.
+SIGN(float, f)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, sign, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Double-precision sign(): scalar overload from SIGN (its second argument is
+// left empty) plus the double2/3/4/8/16 overloads.
+SIGN(double, )
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sign, double)
+
+#endif
diff --git a/libclc/generic/lib/common/smoothstep.cl b/libclc/generic/lib/common/smoothstep.cl
new file mode 100644
index 0000000..68d1a13
--- /dev/null
+++ b/libclc/generic/lib/common/smoothstep.cl
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "../clcmacro.h"
+
+// smoothstep(edge0, edge1, x): maps x to t = (x - edge0) / (edge1 - edge0),
+// clamps t to [0, 1] and returns t * t * (3 - 2 * t).
+_CLC_OVERLOAD _CLC_DEF float smoothstep(float edge0, float edge1, float x) {
+  const float t = clamp((x - edge0) / (edge1 - edge0), 0.0f, 1.0f);
+  const float cubic = t * t * (3.0f - 2.0f * t);
+  return cubic;
+}
+
+// Overloads where all three arguments are float vectors of equal width.
+_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, smoothstep, float, float, float);
+
+// Overloads with scalar float edges and a float vector x.
+_CLC_V_S_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, smoothstep, float, float, float);
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// SMOOTH_STEP_DEF(edge_type, x_type, impl) defines the scalar
+// smoothstep(edge_type, edge_type, x_type) overload. The interpolation is
+// evaluated in double and converted to x_type on return. The impl parameter
+// is not referenced by the expansion.
+#define SMOOTH_STEP_DEF(edge_type, x_type, impl) \
+  _CLC_OVERLOAD _CLC_DEF x_type smoothstep(edge_type edge0, edge_type edge1, x_type x) { \
+    const double t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0); \
+    const double cubic = t * t * (3.0 - 2.0 * t); \
+    return cubic; \
+  }
+
+// All-double scalar overload and its equal-width vector overloads.
+SMOOTH_STEP_DEF(double, double, SMOOTH_STEP_IMPL_D);
+
+_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, smoothstep, double, double, double);
+
+// Mixed-precision scalar overloads: float edges with double x, and double
+// edges with float x.
+SMOOTH_STEP_DEF(float, double, SMOOTH_STEP_IMPL_D);
+SMOOTH_STEP_DEF(double, float, SMOOTH_STEP_IMPL_D);
+
+// Scalar-edge vector overloads for the two mixed-precision combinations.
+_CLC_V_S_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, smoothstep, float, float, double);
+_CLC_V_S_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, smoothstep, double, double, float);
+
+#endif
diff --git a/libclc/generic/lib/common/step.cl b/libclc/generic/lib/common/step.cl
new file mode 100644
index 0000000..4b022f1
--- /dev/null
+++ b/libclc/generic/lib/common/step.cl
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "../clcmacro.h"
+
+// step(edge, x): 0.0f when x < edge, otherwise 1.0f.
+_CLC_OVERLOAD _CLC_DEF float step(float edge, float x) {
+  if (x < edge) {
+    return 0.0f;
+  }
+  return 1.0f;
+}
+
+// Overloads where edge and x are float vectors of equal width.
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, step, float, float);
+
+// Overloads with a scalar float edge and a float vector x.
+_CLC_V_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, step, float, float);
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// STEP_DEF(edge_type, x_type) defines the scalar step(edge_type, x_type)
+// overload: 0.0 when x < edge, otherwise 1.0, converted to x_type on return.
+#define STEP_DEF(edge_type, x_type) \
+  _CLC_OVERLOAD _CLC_DEF x_type step(edge_type edge, x_type x) { \
+    if (x < edge) \
+      return 0.0; \
+    return 1.0; \
+  }
+
+// All-double scalar overload.
+STEP_DEF(double, double);
+
+// Double overloads with a vector edge and with a scalar edge.
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, step, double, double);
+_CLC_V_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, step, double, double);
+
+// Mixed-precision scalar overloads: float edge with double x, and double
+// edge with float x.
+STEP_DEF(float, double);
+STEP_DEF(double, float);
+
+// Scalar-edge vector overloads for the two mixed-precision combinations.
+_CLC_V_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, step, float, double);
+_CLC_V_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, step, double, float);
+
+#endif
diff --git a/libclc/generic/lib/gen_convert.py b/libclc/generic/lib/gen_convert.py
new file mode 100644
index 0000000..f91a89a
--- /dev/null
+++ b/libclc/generic/lib/gen_convert.py
@@ -0,0 +1,388 @@
+#!/usr/bin/env python3
+
+# OpenCL built-in library: type conversion functions
+#
+# Copyright (c) 2013 Victor Oliveira <victormatheus@gmail.com>
+# Copyright (c) 2013 Jesse Towner <jessetowner@lavabit.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+# This script generates the file convert_type.cl, which contains all of the
+# OpenCL functions in the form:
+#
+# convert_<destTypen><_sat><_roundingMode>(<sourceTypen>)
+
+# Scalar type names the generator iterates over, and subsets of them.
+types = ['char', 'uchar', 'short', 'ushort', 'int', 'uint', 'long', 'ulong', 'float', 'double']
+int_types = ['char', 'uchar', 'short', 'ushort', 'int', 'uint', 'long', 'ulong']
+unsigned_types = ['uchar', 'ushort', 'uint', 'ulong']
+float_types = ['float', 'double']
+# Types whose presence makes conditional_guard() emit an extension guard.
+int64_types = ['long', 'ulong']
+float64_types = ['double']
+# Vector-width suffixes; '' stands for the scalar form.
+vector_sizes = ['', '2', '3', '4', '8', '16']
+# (width, half width) pairs used to split a vector into its .lo/.hi halves.
+half_sizes = [('2',''), ('4','2'), ('8','4'), ('16','8')]
+
+# Suffix fragments that make up the generated function names.
+saturation = ['','_sat']
+rounding_modes = ['_rtz','_rte','_rtp','_rtn']
+# NOTE(review): float_prefix and float_suffix are not referenced in the part
+# of the script visible here; confirm whether they are still needed.
+float_prefix = {'float':'FLT_', 'double':'DBL_'}
+float_suffix = {'float':'f', 'double':''}
+
+# Signed integer type with the same size (per sizeof_type below) as the key;
+# used to build the condition argument of the generated select() calls.
+bool_type = {'char' : 'char',
+ 'uchar' : 'char',
+ 'short' : 'short',
+ 'ushort': 'short',
+ 'int' : 'int',
+ 'uint' : 'int',
+ 'long' : 'long',
+ 'ulong' : 'long',
+ 'float' : 'int',
+ 'double' : 'long'}
+
+# Unsigned counterpart of each integer type; used as the type of the
+# generated abs() results.
+unsigned_type = {'char' : 'uchar',
+ 'uchar' : 'uchar',
+ 'short' : 'ushort',
+ 'ushort': 'ushort',
+ 'int' : 'uint',
+ 'uint' : 'uint',
+ 'long' : 'ulong',
+ 'ulong' : 'ulong'}
+
+# Size of each type; compared when choosing how a saturated
+# integer-to-integer conversion clamps its input.
+sizeof_type = {'char' : 1, 'uchar' : 1,
+ 'short' : 2, 'ushort' : 2,
+ 'int' : 4, 'uint' : 4,
+ 'long' : 8, 'ulong' : 8,
+ 'float' : 4, 'double' : 8}
+
+# Text inserted into the generated code as the upper saturation bound.
+limit_max = {'char' : 'CHAR_MAX',
+ 'uchar' : 'UCHAR_MAX',
+ 'short' : 'SHRT_MAX',
+ 'ushort': 'USHRT_MAX',
+ 'int' : 'INT_MAX',
+ 'uint' : 'UINT_MAX',
+ 'long' : 'LONG_MAX',
+ 'ulong' : 'ULONG_MAX'}
+
+# Text inserted into the generated code as the lower saturation bound.
+limit_min = {'char' : 'CHAR_MIN',
+ 'uchar' : '0',
+ 'short' : 'SHRT_MIN',
+ 'ushort': '0',
+ 'int' : 'INT_MIN',
+ 'uint' : '0',
+ 'long' : 'LONG_MIN',
+ 'ulong' : '0'}
+
+def conditional_guard(src, dst):
+ int64_count = 0
+ float64_count = 0
+ if src in int64_types:
+ int64_count = int64_count +1
+ elif src in float64_types:
+ float64_count = float64_count + 1
+ if dst in int64_types:
+ int64_count = int64_count +1
+ elif dst in float64_types:
+ float64_count = float64_count + 1
+ if float64_count > 0 and int64_count > 0:
+ print("#if defined(cl_khr_fp64) && defined(cles_khr_int64)")
+ return True
+ elif float64_count > 0:
+ print("#ifdef cl_khr_fp64")
+ return True
+ elif int64_count > 0:
+ print("#ifdef cles_khr_int64")
+ return True
+ return False
+
+
+print("""/* !!!! AUTOGENERATED FILE generated by convert_type.py !!!!!
+
+ DON'T CHANGE THIS FILE. MAKE YOUR CHANGES TO convert_type.py AND RUN:
+ $ ./generate-conversion-type-cl.sh
+
+ OpenCL type conversion functions
+
+ Copyright (c) 2013 Victor Oliveira <victormatheus@gmail.com>
+ Copyright (c) 2013 Jesse Towner <jessetowner@lavabit.com>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+""")
+
+#
+# Default Conversions
+#
+# All conversions are in accordance with the OpenCL specification,
+# which cites the C99 conversion rules.
+#
+# Casting from floating point to integer results in conversions
+# with truncation, so it should be suitable for the default convert
+# functions.
+#
+# Conversions from integer to floating-point, and floating-point to
+# floating-point through casting are done with the default rounding
+# mode. While C99 allows dynamically changing the rounding mode
+# during runtime, it is not a supported feature in OpenCL according
+# to Section 7.1 - Rounding Modes in the OpenCL 1.2 specification.
+#
+# Therefore, we can assume for optimization purposes that the
+# rounding mode is fixed to round-to-nearest-even. Platform target
+# authors should ensure that the rounding-control registers remain
+# in this state, and that this invariant holds.
+#
+# Also note, even though the OpenCL specification isn't entirely
+# clear on this matter, we implement all rounding mode combinations
+# even for integer-to-integer conversions. When such a conversion
+# is used, the rounding mode is ignored.
+#
+
+def generate_default_conversion(src, dst, mode):
+ close_conditional = conditional_guard(src, dst)
+
+ # scalar conversions
+ print("""_CLC_DEF _CLC_OVERLOAD
+{DST} convert_{DST}{M}({SRC} x)
+{{
+ return ({DST})x;
+}}
+""".format(SRC=src, DST=dst, M=mode))
+
+ # vector conversions, done through decomposition to components
+ for size, half_size in half_sizes:
+ print("""_CLC_DEF _CLC_OVERLOAD
+{DST}{N} convert_{DST}{N}{M}({SRC}{N} x)
+{{
+ return ({DST}{N})(convert_{DST}{H}(x.lo), convert_{DST}{H}(x.hi));
+}}
+""".format(SRC=src, DST=dst, N=size, H=half_size, M=mode))
+
+ # 3-component vector conversions
+ print("""_CLC_DEF _CLC_OVERLOAD
+{DST}3 convert_{DST}3{M}({SRC}3 x)
+{{
+ return ({DST}3)(convert_{DST}2(x.s01), convert_{DST}(x.s2));
+}}""".format(SRC=src, DST=dst, M=mode))
+
+ if close_conditional:
+ print("#endif")
+
+
+for src in types:
+ for dst in types:
+ generate_default_conversion(src, dst, '')
+
+for src in int_types:
+ for dst in int_types:
+ for mode in rounding_modes:
+ generate_default_conversion(src, dst, mode)
+
+#
+# Saturated Conversions To Integers
+#
+# These functions are dependent on the unsaturated conversion functions
+# generated above, and use clamp, max, min, and select to eliminate
+# branching and vectorize the conversions.
+#
+# Again, as above, we allow all rounding modes for integer-to-integer
+# conversions with saturation.
+#
+
+def generate_saturated_conversion(src, dst, size):
+ # Header
+ close_conditional = conditional_guard(src, dst)
+ print("""_CLC_DEF _CLC_OVERLOAD
+{DST}{N} convert_{DST}{N}_sat({SRC}{N} x)
+{{""".format(DST=dst, SRC=src, N=size))
+
+ # FIXME: This is a work around for lack of select function with
+ # signed third argument when the first two arguments are unsigned types.
+ # We cast to the signed type for sign-extension, then do a bitcast to
+ # the unsigned type.
+ if dst in unsigned_types:
+ bool_prefix = "as_{DST}{N}(convert_{BOOL}{N}".format(DST=dst, BOOL=bool_type[dst], N=size);
+ bool_suffix = ")"
+ else:
+ bool_prefix = "convert_{BOOL}{N}".format(BOOL=bool_type[dst], N=size);
+ bool_suffix = ""
+
+ # Body
+ if src == dst:
+
+ # Conversion between same types
+ print(" return x;")
+
+ elif src in float_types:
+
+ # Conversion from float to int
+ print(""" {DST}{N} y = convert_{DST}{N}(x);
+ y = select(y, ({DST}{N}){DST_MIN}, {BP}(x < ({SRC}{N}){DST_MIN}){BS});
+ y = select(y, ({DST}{N}){DST_MAX}, {BP}(x > ({SRC}{N}){DST_MAX}){BS});
+ return y;""".format(SRC=src, DST=dst, N=size,
+ DST_MIN=limit_min[dst], DST_MAX=limit_max[dst],
+ BP=bool_prefix, BS=bool_suffix))
+
+ else:
+
+ # Integer to integer convesion with sizeof(src) == sizeof(dst)
+ if sizeof_type[src] == sizeof_type[dst]:
+ if src in unsigned_types:
+ print(" x = min(x, ({SRC}){DST_MAX});".format(SRC=src, DST_MAX=limit_max[dst]))
+ else:
+ print(" x = max(x, ({SRC})0);".format(SRC=src))
+
+ # Integer to integer conversion where sizeof(src) > sizeof(dst)
+ elif sizeof_type[src] > sizeof_type[dst]:
+ if src in unsigned_types:
+ print(" x = min(x, ({SRC}){DST_MAX});".format(SRC=src, DST_MAX=limit_max[dst]))
+ else:
+ print(" x = clamp(x, ({SRC}){DST_MIN}, ({SRC}){DST_MAX});"
+ .format(SRC=src, DST_MIN=limit_min[dst], DST_MAX=limit_max[dst]))
+
+ # Integer to integer conversion where sizeof(src) < sizeof(dst)
+ elif src not in unsigned_types and dst in unsigned_types:
+ print(" x = max(x, ({SRC})0);".format(SRC=src))
+
+ print(" return convert_{DST}{N}(x);".format(DST=dst, N=size))
+
+ # Footer
+ print("}")
+ if close_conditional:
+ print("#endif")
+
+
+for src in types:
+ for dst in int_types:
+ for size in vector_sizes:
+ generate_saturated_conversion(src, dst, size)
+
+
+def generate_saturated_conversion_with_rounding(src, dst, size, mode):
+ # Header
+ close_conditional = conditional_guard(src, dst)
+
+ # Body
+ print("""_CLC_DEF _CLC_OVERLOAD
+{DST}{N} convert_{DST}{N}_sat{M}({SRC}{N} x)
+{{
+ return convert_{DST}{N}_sat(x);
+}}
+""".format(DST=dst, SRC=src, N=size, M=mode))
+
+ # Footer
+ if close_conditional:
+ print("#endif")
+
+
+for src in int_types:
+ for dst in int_types:
+ for size in vector_sizes:
+ for mode in rounding_modes:
+ generate_saturated_conversion_with_rounding(src, dst, size, mode)
+
+#
+# Conversions To/From Floating-Point With Rounding
+#
+# Note that we assume as above that casts from floating-point to
+# integer are done with truncation, and that the default rounding
+# mode is fixed to round-to-nearest-even, as per C99 and OpenCL
+# rounding rules.
+#
+# These functions rely on the use of abs, ceil, fabs, floor,
+# nextafter, sign, rint and the above generated conversion functions.
+#
+# Only conversions to integers can have saturation.
+#
+
+def generate_float_conversion(src, dst, size, mode, sat):
+ # Header
+ close_conditional = conditional_guard(src, dst)
+ print("""_CLC_DEF _CLC_OVERLOAD
+{DST}{N} convert_{DST}{N}{S}{M}({SRC}{N} x)
+{{""".format(SRC=src, DST=dst, N=size, M=mode, S=sat))
+
+ # Perform conversion
+ if dst in int_types:
+ if mode == '_rte':
+ print(" x = rint(x);");
+ elif mode == '_rtp':
+ print(" x = ceil(x);");
+ elif mode == '_rtn':
+ print(" x = floor(x);");
+ print(" return convert_{DST}{N}{S}(x);".format(DST=dst, N=size, S=sat))
+ elif mode == '_rte':
+ print(" return convert_{DST}{N}(x);".format(DST=dst, N=size))
+ else:
+ print(" {DST}{N} r = convert_{DST}{N}(x);".format(DST=dst, N=size))
+ print(" {SRC}{N} y = convert_{SRC}{N}(y);".format(SRC=src, N=size))
+ if mode == '_rtz':
+ if src in int_types:
+ print(" {USRC}{N} abs_x = abs(x);".format(USRC=unsigned_type[src], N=size))
+ print(" {USRC}{N} abs_y = abs(y);".format(USRC=unsigned_type[src], N=size))
+ else:
+ print(" {SRC}{N} abs_x = fabs(x);".format(SRC=src, N=size))
+ print(" {SRC}{N} abs_y = fabs(y);".format(SRC=src, N=size))
+ print(" return select(r, nextafter(r, sign(r) * ({DST}{N})-INFINITY), convert_{BOOL}{N}(abs_y > abs_x));"
+ .format(DST=dst, N=size, BOOL=bool_type[dst]))
+ if mode == '_rtp':
+ print(" return select(r, nextafter(r, ({DST}{N})INFINITY), convert_{BOOL}{N}(y < x));"
+ .format(DST=dst, N=size, BOOL=bool_type[dst]))
+ if mode == '_rtn':
+ print(" return select(r, nextafter(r, ({DST}{N})-INFINITY), convert_{BOOL}{N}(y > x));"
+ .format(DST=dst, N=size, BOOL=bool_type[dst]))
+
+ # Footer
+ print("}")
+ if close_conditional:
+ print("#endif")
+
+
+for src in float_types:
+ for dst in int_types:
+ for size in vector_sizes:
+ for mode in rounding_modes:
+ for sat in saturation:
+ generate_float_conversion(src, dst, size, mode, sat)
+
+
+for src in types:
+ for dst in float_types:
+ for size in vector_sizes:
+ for mode in rounding_modes:
+ generate_float_conversion(src, dst, size, mode, '')
diff --git a/libclc/generic/lib/geometric/cross.cl b/libclc/generic/lib/geometric/cross.cl
new file mode 100644
index 0000000..3b4ca6c
--- /dev/null
+++ b/libclc/generic/lib/geometric/cross.cl
@@ -0,0 +1,25 @@
+#include <clc/clc.h>
+
+// Cross product of two 3-component float vectors.
+_CLC_OVERLOAD _CLC_DEF float3 cross(float3 p0, float3 p1) {
+  const float cx = p0.y*p1.z - p0.z*p1.y;
+  const float cy = p0.z*p1.x - p0.x*p1.z;
+  const float cz = p0.x*p1.y - p0.y*p1.x;
+  return (float3)(cx, cy, cz);
+}
+
+// Cross product of the xyz parts of two float4 vectors; the w component of
+// the inputs is ignored and the w component of the result is 0.
+_CLC_OVERLOAD _CLC_DEF float4 cross(float4 p0, float4 p1) {
+  const float cx = p0.y*p1.z - p0.z*p1.y;
+  const float cy = p0.z*p1.x - p0.x*p1.z;
+  const float cz = p0.x*p1.y - p0.y*p1.x;
+  return (float4)(cx, cy, cz, 0.f);
+}
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Cross product of two 3-component double vectors.
+_CLC_OVERLOAD _CLC_DEF double3 cross(double3 p0, double3 p1) {
+  const double cx = p0.y*p1.z - p0.z*p1.y;
+  const double cy = p0.z*p1.x - p0.x*p1.z;
+  const double cz = p0.x*p1.y - p0.y*p1.x;
+  return (double3)(cx, cy, cz);
+}
+
+// Cross product of the xyz parts of two double4 vectors; the w component of
+// the inputs is ignored and the w component of the result is 0.
+_CLC_OVERLOAD _CLC_DEF double4 cross(double4 p0, double4 p1) {
+  // Use a double literal for w: every element of a double4 is a double, so a
+  // float literal here only adds an implicit conversion.
+  return (double4)(p0.y*p1.z - p0.z*p1.y, p0.z*p1.x - p0.x*p1.z,
+                   p0.x*p1.y - p0.y*p1.x, 0.0);
+}
+#endif
diff --git a/libclc/generic/lib/geometric/distance.cl b/libclc/generic/lib/geometric/distance.cl
new file mode 100644
index 0000000..4a5b8e25
--- /dev/null
+++ b/libclc/generic/lib/geometric/distance.cl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+// Enable the fp64 extension when the target defines cl_khr_fp64.
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+// __CLC_BODY names the template file (distance.inc) that defines distance()
+// in terms of __CLC_FLOAT / __CLC_FLOATN; floatn.inc presumably includes it
+// once per floating-point type -- confirm against clc/geometric/floatn.inc.
+#define __CLC_BODY <distance.inc>
+#include <clc/geometric/floatn.inc>
diff --git a/libclc/generic/lib/geometric/distance.inc b/libclc/generic/lib/geometric/distance.inc
new file mode 100644
index 0000000..193665e
--- /dev/null
+++ b/libclc/generic/lib/geometric/distance.inc
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// Distance between p0 and p1: the length of their difference.
+_CLC_OVERLOAD _CLC_DEF __CLC_FLOAT distance(__CLC_FLOATN p0, __CLC_FLOATN p1) {
+  const __CLC_FLOATN delta = p0 - p1;
+  return length(delta);
+}
diff --git a/libclc/generic/lib/geometric/dot.cl b/libclc/generic/lib/geometric/dot.cl
new file mode 100644
index 0000000..0d6fe6c
--- /dev/null
+++ b/libclc/generic/lib/geometric/dot.cl
@@ -0,0 +1,39 @@
+#include <clc/clc.h>
+
+// Scalar dot product: just the product of the two values.
+_CLC_OVERLOAD _CLC_DEF float dot(float p0, float p1) {
+ return p0*p1;
+}
+
+// 2-component dot product: sum of the component-wise products.
+_CLC_OVERLOAD _CLC_DEF float dot(float2 p0, float2 p1) {
+ return p0.x*p1.x + p0.y*p1.y;
+}
+
+// 3-component dot product; terms are accumulated left to right (x, y, z).
+_CLC_OVERLOAD _CLC_DEF float dot(float3 p0, float3 p1) {
+ return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z;
+}
+
+// 4-component dot product; terms are accumulated left to right (x, y, z, w).
+_CLC_OVERLOAD _CLC_DEF float dot(float4 p0, float4 p1) {
+ return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z + p0.w*p1.w;
+}
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Scalar dot product: just the product of the two values.
+_CLC_OVERLOAD _CLC_DEF double dot(double p0, double p1) {
+ return p0*p1;
+}
+
+// 2-component dot product: sum of the component-wise products.
+_CLC_OVERLOAD _CLC_DEF double dot(double2 p0, double2 p1) {
+ return p0.x*p1.x + p0.y*p1.y;
+}
+
+// 3-component dot product; terms are accumulated left to right (x, y, z).
+_CLC_OVERLOAD _CLC_DEF double dot(double3 p0, double3 p1) {
+ return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z;
+}
+
+// 4-component dot product; terms are accumulated left to right (x, y, z, w).
+_CLC_OVERLOAD _CLC_DEF double dot(double4 p0, double4 p1) {
+ return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z + p0.w*p1.w;
+}
+
+#endif
diff --git a/libclc/generic/lib/geometric/fast_distance.cl b/libclc/generic/lib/geometric/fast_distance.cl
new file mode 100644
index 0000000..0a4f82c
--- /dev/null
+++ b/libclc/generic/lib/geometric/fast_distance.cl
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+// __CLC_BODY names the template file (fast_distance.inc) that defines
+// fast_distance(). __FLOAT_ONLY is defined only for the duration of the
+// include; presumably it limits floatn.inc to the float types (no double) --
+// confirm against clc/geometric/floatn.inc.
+#define __CLC_BODY <fast_distance.inc>
+#define __FLOAT_ONLY
+#include <clc/geometric/floatn.inc>
+#undef __FLOAT_ONLY
diff --git a/libclc/generic/lib/geometric/fast_distance.inc b/libclc/generic/lib/geometric/fast_distance.inc
new file mode 100644
index 0000000..d8fe3e0
--- /dev/null
+++ b/libclc/generic/lib/geometric/fast_distance.inc
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// Fast distance between p0 and p1: fast_length() of their difference.
+_CLC_OVERLOAD _CLC_DEF __CLC_FLOAT fast_distance(__CLC_FLOATN p0, __CLC_FLOATN p1) {
+  const __CLC_FLOATN delta = p0 - p1;
+  return fast_length(delta);
+}
diff --git a/libclc/generic/lib/geometric/fast_length.cl b/libclc/generic/lib/geometric/fast_length.cl
new file mode 100644
index 0000000..8f6ffc6
--- /dev/null
+++ b/libclc/generic/lib/geometric/fast_length.cl
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+// Scalar case: the length of a single value is its magnitude.
+_CLC_OVERLOAD _CLC_DEF float fast_length(float p) {
+  const float magnitude = fabs(p);
+  return magnitude;
+}
+
+// Vector cases: half_sqrt() of the squared length dot(p, p).
+_CLC_OVERLOAD _CLC_DEF float fast_length(float2 p) {
+  const float l2 = dot(p, p);
+  return half_sqrt(l2);
+}
+
+_CLC_OVERLOAD _CLC_DEF float fast_length(float3 p) {
+  const float l2 = dot(p, p);
+  return half_sqrt(l2);
+}
+
+_CLC_OVERLOAD _CLC_DEF float fast_length(float4 p) {
+  const float l2 = dot(p, p);
+  return half_sqrt(l2);
+}
diff --git a/libclc/generic/lib/geometric/fast_normalize.cl b/libclc/generic/lib/geometric/fast_normalize.cl
new file mode 100644
index 0000000..af5f994
--- /dev/null
+++ b/libclc/generic/lib/geometric/fast_normalize.cl
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+// Scalar fast_normalize simply defers to the scalar normalize().
+_CLC_OVERLOAD _CLC_DEF float fast_normalize(float p) {
+  const float unit = normalize(p);
+  return unit;
+}
+
+// __CLC_BODY names the template file (fast_normalize.inc) that defines the
+// vector overloads. __FLOAT_ONLY is defined only for the duration of the
+// include; presumably it limits floatn.inc to the float types (no double) --
+// confirm against clc/geometric/floatn.inc.
+#define __CLC_BODY <fast_normalize.inc>
+#define __FLOAT_ONLY
+#include <clc/geometric/floatn.inc>
+#undef __FLOAT_ONLY
diff --git a/libclc/generic/lib/geometric/fast_normalize.inc b/libclc/generic/lib/geometric/fast_normalize.inc
new file mode 100644
index 0000000..c1be2b8
--- /dev/null
+++ b/libclc/generic/lib/geometric/fast_normalize.inc
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __CLC_SCALAR
+
+// Only handle vector implementations
+_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN fast_normalize(__CLC_FLOATN p) {
+  const __CLC_FLOAT l2 = dot(p, p);
+  // A squared length of exactly zero: hand the input back unchanged rather
+  // than multiplying by half_rsqrt(0).
+  if (l2 == 0.0f)
+    return p;
+  return p * half_rsqrt(l2);
+}
+
+#endif
diff --git a/libclc/generic/lib/geometric/length.cl b/libclc/generic/lib/geometric/length.cl
new file mode 100644
index 0000000..e7f31b4
--- /dev/null
+++ b/libclc/generic/lib/geometric/length.cl
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+// Scalar case: the length of a single value is its magnitude.
+_CLC_OVERLOAD _CLC_DEF float length(float p) {
+  const float magnitude = fabs(p);
+  return magnitude;
+}
+
+// Function-body macro for the float vector length() overloads: computes the
+// squared length l2 = dot(p, p) and returns its square root. It expands to
+// several statements, including return statements, and modifies p in place,
+// so p must be a modifiable variable and the macro must be the body of a
+// function returning float.
+//  - l2 < FLT_MIN: p is scaled by 2^86 and the result scaled back by 2^-86
+//    (presumably to avoid precision loss from underflow of the square).
+//  - l2 == INFINITY: p is scaled by 2^-65 and the result scaled back by 2^65
+//    (presumably to recover a finite length when only the square overflowed).
+#define V_FLENGTH(p) \
+ float l2 = dot(p, p); \
+ \
+ if (l2 < FLT_MIN) { \
+ p *= 0x1.0p+86F; \
+ return sqrt(dot(p, p)) * 0x1.0p-86F; \
+ } else if (l2 == INFINITY) { \
+ p *= 0x1.0p-65F; \
+ return sqrt(dot(p, p)) * 0x1.0p+65F; \
+ } \
+ \
+ return sqrt(l2);
+
+// Float vector overloads of length(). Each body, including its return
+// statements, comes entirely from the V_FLENGTH macro defined in this file.
+_CLC_OVERLOAD _CLC_DEF float length(float2 p) {
+ V_FLENGTH(p);
+}
+
+_CLC_OVERLOAD _CLC_DEF float length(float3 p) {
+ V_FLENGTH(p);
+}
+
+_CLC_OVERLOAD _CLC_DEF float length(float4 p) {
+ V_FLENGTH(p);
+}
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Scalar case: the length of a single value is its magnitude.
+_CLC_OVERLOAD _CLC_DEF double length(double p) {
+  const double magnitude = fabs(p);
+  return magnitude;
+}
+
+// Function-body macro for the double vector length() overloads: computes the
+// squared length l2 = dot(p, p) and returns its square root. It expands to
+// several statements, including return statements, and modifies p in place,
+// so p must be a modifiable variable and the macro must be the body of a
+// function returning double.
+//  - l2 < DBL_MIN: p is scaled by 2^563 and the result scaled back by 2^-563
+//    (presumably to avoid precision loss from underflow of the square).
+//  - l2 == INFINITY: p is scaled by 2^-513 and the result scaled back by
+//    2^513 (presumably to recover a finite length when only the square
+//    overflowed).
+#define V_DLENGTH(p) \
+ double l2 = dot(p, p); \
+ \
+ if (l2 < DBL_MIN) { \
+ p *= 0x1.0p+563; \
+ return sqrt(dot(p, p)) * 0x1.0p-563; \
+ } else if (l2 == INFINITY) { \
+ p *= 0x1.0p-513; \
+ return sqrt(dot(p, p)) * 0x1.0p+513; \
+ } \
+ \
+ return sqrt(l2);
+
+// Double vector overloads of length(). Each body, including its return
+// statements, comes entirely from the V_DLENGTH macro defined in this file.
+_CLC_OVERLOAD _CLC_DEF double length(double2 p) {
+ V_DLENGTH(p);
+}
+
+_CLC_OVERLOAD _CLC_DEF double length(double3 p) {
+ V_DLENGTH(p);
+}
+
+_CLC_OVERLOAD _CLC_DEF double
+length(double4 p) {
+ V_DLENGTH(p);
+}
+
+#endif
diff --git a/libclc/generic/lib/geometric/normalize.cl b/libclc/generic/lib/geometric/normalize.cl
new file mode 100644
index 0000000..f61ac94
--- /dev/null
+++ b/libclc/generic/lib/geometric/normalize.cl
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+// Scalar case: normalizing a single value reduces to taking sign(p).
+_CLC_OVERLOAD _CLC_DEF float normalize(float p) {
+  const float direction = sign(p);
+  return direction;
+}
+
+// Returns p scaled by the reciprocal square root of its squared length
+// (2-component float version), rescaling p first when the squared length
+// falls below FLT_MIN or evaluates to infinity.
+_CLC_OVERLOAD _CLC_DEF float2 normalize(float2 p) {
+ // A vector whose components all compare equal to zero is returned as is.
+ if (all(p == (float2)0.0F))
+ return p;
+
+ float l2 = dot(p, p);
+
+ if (l2 < FLT_MIN) {
+ // Squared length below FLT_MIN: scale p up by 2^86 and recompute.
+ p *= 0x1.0p+86F;
+ l2 = dot(p, p);
+ } else if (l2 == INFINITY) {
+ // Squared length is infinite: scale p down by 2^-65 and recompute.
+ p *= 0x1.0p-65f;
+ l2 = dot(p, p);
+ if (l2 == INFINITY) {
+ // Still infinite after scaling: replace each infinite component by 1 and
+ // every other component by 0, keeping the original signs.
+ p = copysign(select((float2)0.0F, (float2)1.0F, isinf(p)), p);
+ l2 = dot(p, p);
+ }
+ }
+ // l2 was recomputed from the rescaled p, so the scale factor cancels here.
+ return p * rsqrt(l2);
+}
+
+// Returns p scaled by the reciprocal square root of its squared length
+// (3-component float version), rescaling p first when the squared length
+// falls below FLT_MIN or evaluates to infinity.
+_CLC_OVERLOAD _CLC_DEF float3 normalize(float3 p) {
+ // A vector whose components all compare equal to zero is returned as is.
+ if (all(p == (float3)0.0F))
+ return p;
+
+ float l2 = dot(p, p);
+
+ if (l2 < FLT_MIN) {
+ // Squared length below FLT_MIN: scale p up by 2^86 and recompute.
+ p *= 0x1.0p+86F;
+ l2 = dot(p, p);
+ } else if (l2 == INFINITY) {
+ // Squared length is infinite: scale p down by 2^-66 and recompute.
+ p *= 0x1.0p-66f;
+ l2 = dot(p, p);
+ if (l2 == INFINITY) {
+ // Still infinite after scaling: replace each infinite component by 1 and
+ // every other component by 0, keeping the original signs.
+ p = copysign(select((float3)0.0F, (float3)1.0F, isinf(p)), p);
+ l2 = dot(p, p);
+ }
+ }
+ // l2 was recomputed from the rescaled p, so the scale factor cancels here.
+ return p * rsqrt(l2);
+}
+
+// Returns p scaled by the reciprocal square root of its squared length
+// (4-component float version), rescaling p first when the squared length
+// falls below FLT_MIN or evaluates to infinity.
+_CLC_OVERLOAD _CLC_DEF float4 normalize(float4 p) {
+ // A vector whose components all compare equal to zero is returned as is.
+ if (all(p == (float4)0.0F))
+ return p;
+
+ float l2 = dot(p, p);
+
+ if (l2 < FLT_MIN) {
+ // Squared length below FLT_MIN: scale p up by 2^86 and recompute.
+ p *= 0x1.0p+86F;
+ l2 = dot(p, p);
+ } else if (l2 == INFINITY) {
+ // Squared length is infinite: scale p down by 2^-66 and recompute.
+ p *= 0x1.0p-66f;
+ l2 = dot(p, p);
+ if (l2 == INFINITY) {
+ // Still infinite after scaling: replace each infinite component by 1 and
+ // every other component by 0, keeping the original signs.
+ p = copysign(select((float4)0.0F, (float4)1.0F, isinf(p)), p);
+ l2 = dot(p, p);
+ }
+ }
+ // l2 was recomputed from the rescaled p, so the scale factor cancels here.
+ return p * rsqrt(l2);
+}
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Scalar case: normalizing a single value reduces to taking sign(p).
+_CLC_OVERLOAD _CLC_DEF double normalize(double p) {
+  const double direction = sign(p);
+  return direction;
+}
+
+// Returns p scaled by the reciprocal square root of its squared length
+// (2-component double version), rescaling p first when the squared length
+// falls below DBL_MIN or evaluates to infinity.
+_CLC_OVERLOAD _CLC_DEF double2 normalize(double2 p) {
+ // A vector whose components all compare equal to zero is returned as is.
+ if (all(p == (double2)0.0))
+ return p;
+
+ double l2 = dot(p, p);
+
+ if (l2 < DBL_MIN) {
+ // Squared length below DBL_MIN: scale p up by 2^563 and recompute.
+ p *= 0x1.0p+563;
+ l2 = dot(p, p);
+ } else if (l2 == INFINITY) {
+ // Squared length is infinite: scale p down by 2^-513 and recompute.
+ p *= 0x1.0p-513;
+ l2 = dot(p, p);
+ if (l2 == INFINITY) {
+ // Still infinite after scaling: replace each infinite component by 1 and
+ // every other component by 0, keeping the original signs.
+ p = copysign(select((double2)0.0, (double2)1.0, isinf(p)), p);
+ l2 = dot(p, p);
+ }
+ }
+ // l2 was recomputed from the rescaled p, so the scale factor cancels here.
+ return p * rsqrt(l2);
+}
+
+// Returns p scaled by the reciprocal square root of its squared length
+// (3-component double version), rescaling p first when the squared length
+// falls below DBL_MIN or evaluates to infinity.
+_CLC_OVERLOAD _CLC_DEF double3 normalize(double3 p) {
+ // A vector whose components all compare equal to zero is returned as is.
+ if (all(p == (double3)0.0))
+ return p;
+
+ double l2 = dot(p, p);
+
+ if (l2 < DBL_MIN) {
+ // Squared length below DBL_MIN: scale p up by 2^563 and recompute.
+ p *= 0x1.0p+563;
+ l2 = dot(p, p);
+ } else if (l2 == INFINITY) {
+ // Squared length is infinite: scale p down by 2^-514 and recompute.
+ p *= 0x1.0p-514;
+ l2 = dot(p, p);
+ if (l2 == INFINITY) {
+ // Still infinite after scaling: replace each infinite component by 1 and
+ // every other component by 0, keeping the original signs.
+ p = copysign(select((double3)0.0, (double3)1.0, isinf(p)), p);
+ l2 = dot(p, p);
+ }
+ }
+ // l2 was recomputed from the rescaled p, so the scale factor cancels here.
+ return p * rsqrt(l2);
+}
+
+// Returns p scaled by the reciprocal square root of its squared length
+// (4-component double version), rescaling p first when the squared length
+// falls below DBL_MIN or evaluates to infinity.
+_CLC_OVERLOAD _CLC_DEF double4 normalize(double4 p) {
+ // A vector whose components all compare equal to zero is returned as is.
+ if (all(p == (double4)0.0))
+ return p;
+
+ double l2 = dot(p, p);
+
+ if (l2 < DBL_MIN) {
+ // Squared length below DBL_MIN: scale p up by 2^563 and recompute.
+ p *= 0x1.0p+563;
+ l2 = dot(p, p);
+ } else if (l2 == INFINITY) {
+ // Squared length is infinite: scale p down by 2^-514 and recompute.
+ p *= 0x1.0p-514;
+ l2 = dot(p, p);
+ if (l2 == INFINITY) {
+ // Still infinite after scaling: replace each infinite component by 1 and
+ // every other component by 0, keeping the original signs.
+ p = copysign(select((double4)0.0, (double4)1.0, isinf(p)), p);
+ l2 = dot(p, p);
+ }
+ }
+ // l2 was recomputed from the rescaled p, so the scale factor cancels here.
+ return p * rsqrt(l2);
+}
+
+#endif
diff --git a/libclc/generic/lib/image/get_image_dim.cl b/libclc/generic/lib/image/get_image_dim.cl
new file mode 100644
index 0000000..26dbd00
--- /dev/null
+++ b/libclc/generic/lib/image/get_image_dim.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+// Dimensions of a 2D image as (width, height).
+_CLC_OVERLOAD _CLC_DEF int2 get_image_dim(image2d_t image) {
+  const int width = get_image_width(image);
+  const int height = get_image_height(image);
+  return (int2)(width, height);
+}
+// Dimensions of a 3D image as (width, height, depth, 0).
+_CLC_OVERLOAD _CLC_DEF int4 get_image_dim(image3d_t image) {
+  const int width = get_image_width(image);
+  const int height = get_image_height(image);
+  const int depth = get_image_depth(image);
+  return (int4)(width, height, depth, 0);
+}
diff --git a/libclc/generic/lib/integer/abs.cl b/libclc/generic/lib/integer/abs.cl
new file mode 100644
index 0000000..faff8d0
--- /dev/null
+++ b/libclc/generic/lib/integer/abs.cl
@@ -0,0 +1,4 @@
+#include <clc/clc.h>
+
+// __CLC_BODY names the template file (abs.inc) that defines abs() in terms of
+// __CLC_GENTYPE; gentype.inc presumably includes it once per integer type --
+// confirm against clc/integer/gentype.inc.
+#define __CLC_BODY <abs.inc>
+#include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/integer/abs.inc b/libclc/generic/lib/integer/abs.inc
new file mode 100644
index 0000000..cfe7bfe
--- /dev/null
+++ b/libclc/generic/lib/integer/abs.inc
@@ -0,0 +1,3 @@
+// Absolute value of x, returned in the unsigned counterpart of the input
+// type.
+//
+// The negation is performed on the unsigned reinterpretation of x, where
+// arithmetic wraps modulo 2^N. Negating the signed value directly overflows
+// for the most negative int/long, which is undefined; the unsigned negation
+// yields the same bit pattern for every other input and the correct magnitude
+// (2^(N-1)) for that one.
+_CLC_OVERLOAD _CLC_DEF __CLC_U_GENTYPE abs(__CLC_GENTYPE x) {
+  __CLC_U_GENTYPE ux = __builtin_astype(x, __CLC_U_GENTYPE);
+  // The comparison is still done on the original (possibly signed) value.
+  return x > (__CLC_GENTYPE)(0) ? ux : -ux;
+}
diff --git a/libclc/generic/lib/integer/abs_diff.cl b/libclc/generic/lib/integer/abs_diff.cl
new file mode 100644
index 0000000..3d75105
--- /dev/null
+++ b/libclc/generic/lib/integer/abs_diff.cl
@@ -0,0 +1,4 @@
+#include <clc/clc.h>
+
+// __CLC_BODY names the template file (abs_diff.inc) that defines abs_diff()
+// in terms of __CLC_GENTYPE; gentype.inc presumably includes it once per
+// integer type -- confirm against clc/integer/gentype.inc.
+#define __CLC_BODY <abs_diff.inc>
+#include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/integer/abs_diff.inc b/libclc/generic/lib/integer/abs_diff.inc
new file mode 100644
index 0000000..f39c3ff
--- /dev/null
+++ b/libclc/generic/lib/integer/abs_diff.inc
@@ -0,0 +1,3 @@
+// |x - y| without modulo overflow, returned in the unsigned counterpart of
+// the input type.
+//
+// The subtraction is performed on the unsigned reinterpretations of x and y,
+// where arithmetic wraps modulo 2^N. Subtracting the signed values directly
+// overflows when the operands are far apart (e.g. INT_MAX and INT_MIN), which
+// is undefined; the unsigned difference has the intended bit pattern for all
+// inputs.
+_CLC_OVERLOAD _CLC_DEF __CLC_U_GENTYPE abs_diff(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+  __CLC_U_GENTYPE ux = __builtin_astype(x, __CLC_U_GENTYPE);
+  __CLC_U_GENTYPE uy = __builtin_astype(y, __CLC_U_GENTYPE);
+  // The ordering test is still done on the original (possibly signed) values.
+  return x > y ? ux - uy : uy - ux;
+}
diff --git a/libclc/generic/lib/integer/add_sat.cl b/libclc/generic/lib/integer/add_sat.cl
new file mode 100644
index 0000000..d4df66d
--- /dev/null
+++ b/libclc/generic/lib/integer/add_sat.cl
@@ -0,0 +1,53 @@
+#include <clc/clc.h>
+#include "../clcmacro.h"
+
+// Scalar saturating-add helpers, one per integer width and signedness.
+// They are defined in LLVM IR (add_sat_if.ll in this directory) rather than
+// in OpenCL C, so only their declarations appear here.
+_CLC_DECL char __clc_add_sat_s8(char, char);
+_CLC_DECL uchar __clc_add_sat_u8(uchar, uchar);
+_CLC_DECL short __clc_add_sat_s16(short, short);
+_CLC_DECL ushort __clc_add_sat_u16(ushort, ushort);
+_CLC_DECL int __clc_add_sat_s32(int, int);
+_CLC_DECL uint __clc_add_sat_u32(uint, uint);
+_CLC_DECL long __clc_add_sat_s64(long, long);
+_CLC_DECL ulong __clc_add_sat_u64(ulong, ulong);
+
+// Scalar add_sat overloads. Each one forwards its arguments to the
+// __clc_add_sat_<suffix> helper for the matching width and signedness.
+#define __CLC_ADD_SAT_SCALAR(TYPE, SUFFIX) \
+  _CLC_OVERLOAD _CLC_DEF TYPE add_sat(TYPE x, TYPE y) { \
+    return __clc_add_sat_##SUFFIX(x, y); \
+  }
+
+__CLC_ADD_SAT_SCALAR(char, s8)
+__CLC_ADD_SAT_SCALAR(uchar, u8)
+__CLC_ADD_SAT_SCALAR(short, s16)
+__CLC_ADD_SAT_SCALAR(ushort, u16)
+__CLC_ADD_SAT_SCALAR(int, s32)
+__CLC_ADD_SAT_SCALAR(uint, u32)
+__CLC_ADD_SAT_SCALAR(long, s64)
+__CLC_ADD_SAT_SCALAR(ulong, u64)
+
+#undef __CLC_ADD_SAT_SCALAR
+
+// One _CLC_BINARY_VECTORIZE invocation per integer type. The macro comes from
+// ../clcmacro.h (not shown here); presumably it defines the vector overloads
+// of add_sat in terms of the scalar overloads -- confirm against clcmacro.h.
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, char, add_sat, char, char)
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uchar, add_sat, uchar, uchar)
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, short, add_sat, short, short)
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, add_sat, ushort, ushort)
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, add_sat, int, int)
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, add_sat, uint, uint)
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, long, add_sat, long, long)
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ulong, add_sat, ulong, ulong)
diff --git a/libclc/generic/lib/integer/add_sat_if.ll b/libclc/generic/lib/integer/add_sat_if.ll
new file mode 100644
index 0000000..bcbe4c0
--- /dev/null
+++ b/libclc/generic/lib/integer/add_sat_if.ll
@@ -0,0 +1,55 @@
+; Interface layer for saturating add. Each @__clc_add_sat_<suffix> function
+; below is a one-call wrapper that forwards both operands to the
+; @__clc_add_sat_impl_<suffix> function of the same width and signedness,
+; which is only declared in this file.
+
+; 8-bit signed.
+declare i8 @__clc_add_sat_impl_s8(i8 %x, i8 %y)
+
+define i8 @__clc_add_sat_s8(i8 %x, i8 %y) nounwind readnone alwaysinline {
+ %call = call i8 @__clc_add_sat_impl_s8(i8 %x, i8 %y)
+ ret i8 %call
+}
+
+; 8-bit unsigned.
+declare i8 @__clc_add_sat_impl_u8(i8 %x, i8 %y)
+
+define i8 @__clc_add_sat_u8(i8 %x, i8 %y) nounwind readnone alwaysinline {
+ %call = call i8 @__clc_add_sat_impl_u8(i8 %x, i8 %y)
+ ret i8 %call
+}
+
+; 16-bit signed.
+declare i16 @__clc_add_sat_impl_s16(i16 %x, i16 %y)
+
+define i16 @__clc_add_sat_s16(i16 %x, i16 %y) nounwind readnone alwaysinline {
+ %call = call i16 @__clc_add_sat_impl_s16(i16 %x, i16 %y)
+ ret i16 %call
+}
+
+; 16-bit unsigned.
+declare i16 @__clc_add_sat_impl_u16(i16 %x, i16 %y)
+
+define i16 @__clc_add_sat_u16(i16 %x, i16 %y) nounwind readnone alwaysinline {
+ %call = call i16 @__clc_add_sat_impl_u16(i16 %x, i16 %y)
+ ret i16 %call
+}
+
+; 32-bit signed.
+declare i32 @__clc_add_sat_impl_s32(i32 %x, i32 %y)
+
+define i32 @__clc_add_sat_s32(i32 %x, i32 %y) nounwind readnone alwaysinline {
+ %call = call i32 @__clc_add_sat_impl_s32(i32 %x, i32 %y)
+ ret i32 %call
+}
+
+; 32-bit unsigned.
+declare i32 @__clc_add_sat_impl_u32(i32 %x, i32 %y)
+
+define i32 @__clc_add_sat_u32(i32 %x, i32 %y) nounwind readnone alwaysinline {
+ %call = call i32 @__clc_add_sat_impl_u32(i32 %x, i32 %y)
+ ret i32 %call
+}
+
+; 64-bit signed.
+declare i64 @__clc_add_sat_impl_s64(i64 %x, i64 %y)
+
+define i64 @__clc_add_sat_s64(i64 %x, i64 %y) nounwind readnone alwaysinline {
+ %call = call i64 @__clc_add_sat_impl_s64(i64 %x, i64 %y)
+ ret i64 %call
+}
+
+; 64-bit unsigned.
+declare i64 @__clc_add_sat_impl_u64(i64 %x, i64 %y)
+
+define i64 @__clc_add_sat_u64(i64 %x, i64 %y) nounwind readnone alwaysinline {
+ %call = call i64 @__clc_add_sat_impl_u64(i64 %x, i64 %y)
+ ret i64 %call
+}
diff --git a/libclc/generic/lib/integer/add_sat_impl.ll b/libclc/generic/lib/integer/add_sat_impl.ll
new file mode 100644
index 0000000..c150ecb
--- /dev/null
+++ b/libclc/generic/lib/integer/add_sat_impl.ll
@@ -0,0 +1,83 @@
+; Saturating-add implementations built on the llvm.{s,u}add.with.overflow
+; intrinsics, which return the sum (field 0) and an overflow flag (field 1).
+;
+; Signed variants: when the overflow flag is set, the result is replaced by
+; %x.limit. %x.msb is x shifted arithmetically by (width - 1), i.e. 0 for
+; non-negative x and -1 for negative x; xor-ing it with the maximum signed
+; value gives that maximum for non-negative x and the minimum signed value for
+; negative x.
+;
+; Unsigned variants: when the overflow flag is set, the result is replaced by
+; -1, the all-ones bit pattern.
+
+declare {i8, i1} @llvm.sadd.with.overflow.i8(i8, i8)
+declare {i8, i1} @llvm.uadd.with.overflow.i8(i8, i8)
+
+; 8-bit signed: saturates to 127 or -128.
+define i8 @__clc_add_sat_impl_s8(i8 %x, i8 %y) nounwind readnone alwaysinline {
+ %call = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %x, i8 %y)
+ %res = extractvalue {i8, i1} %call, 0
+ %over = extractvalue {i8, i1} %call, 1
+ %x.msb = ashr i8 %x, 7
+ %x.limit = xor i8 %x.msb, 127
+ %sat = select i1 %over, i8 %x.limit, i8 %res
+ ret i8 %sat
+}
+
+; 8-bit unsigned: saturates to all ones.
+define i8 @__clc_add_sat_impl_u8(i8 %x, i8 %y) nounwind readnone alwaysinline {
+ %call = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %x, i8 %y)
+ %res = extractvalue {i8, i1} %call, 0
+ %over = extractvalue {i8, i1} %call, 1
+ %sat = select i1 %over, i8 -1, i8 %res
+ ret i8 %sat
+}
+
+declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16)
+declare {i16, i1} @llvm.uadd.with.overflow.i16(i16, i16)
+
+; 16-bit signed: saturates to 32767 or -32768.
+define i16 @__clc_add_sat_impl_s16(i16 %x, i16 %y) nounwind readnone alwaysinline {
+ %call = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %x, i16 %y)
+ %res = extractvalue {i16, i1} %call, 0
+ %over = extractvalue {i16, i1} %call, 1
+ %x.msb = ashr i16 %x, 15
+ %x.limit = xor i16 %x.msb, 32767
+ %sat = select i1 %over, i16 %x.limit, i16 %res
+ ret i16 %sat
+}
+
+; 16-bit unsigned: saturates to all ones.
+define i16 @__clc_add_sat_impl_u16(i16 %x, i16 %y) nounwind readnone alwaysinline {
+ %call = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %x, i16 %y)
+ %res = extractvalue {i16, i1} %call, 0
+ %over = extractvalue {i16, i1} %call, 1
+ %sat = select i1 %over, i16 -1, i16 %res
+ ret i16 %sat
+}
+
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32)
+
+; 32-bit signed: saturates to 2147483647 or -2147483648.
+define i32 @__clc_add_sat_impl_s32(i32 %x, i32 %y) nounwind readnone alwaysinline {
+ %call = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %x, i32 %y)
+ %res = extractvalue {i32, i1} %call, 0
+ %over = extractvalue {i32, i1} %call, 1
+ %x.msb = ashr i32 %x, 31
+ %x.limit = xor i32 %x.msb, 2147483647
+ %sat = select i1 %over, i32 %x.limit, i32 %res
+ ret i32 %sat
+}
+
+; 32-bit unsigned: saturates to all ones.
+define i32 @__clc_add_sat_impl_u32(i32 %x, i32 %y) nounwind readnone alwaysinline {
+ %call = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
+ %res = extractvalue {i32, i1} %call, 0
+ %over = extractvalue {i32, i1} %call, 1
+ %sat = select i1 %over, i32 -1, i32 %res
+ ret i32 %sat
+}
+
+declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64)
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64)
+
+; 64-bit signed: saturates to the maximum or minimum i64 value.
+define i64 @__clc_add_sat_impl_s64(i64 %x, i64 %y) nounwind readnone alwaysinline {
+ %call = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %x, i64 %y)
+ %res = extractvalue {i64, i1} %call, 0
+ %over = extractvalue {i64, i1} %call, 1
+ %x.msb = ashr i64 %x, 63
+ %x.limit = xor i64 %x.msb, 9223372036854775807
+ %sat = select i1 %over, i64 %x.limit, i64 %res
+ ret i64 %sat
+}
+
+; 64-bit unsigned: saturates to all ones.
+define i64 @__clc_add_sat_impl_u64(i64 %x, i64 %y) nounwind readnone alwaysinline {
+ %call = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %x, i64 %y)
+ %res = extractvalue {i64, i1} %call, 0
+ %over = extractvalue {i64, i1} %call, 1
+ %sat = select i1 %over, i64 -1, i64 %res
+ ret i64 %sat
+}
diff --git a/libclc/generic/lib/integer/clz.cl b/libclc/generic/lib/integer/clz.cl
new file mode 100644
index 0000000..17e3fe0
--- /dev/null
+++ b/libclc/generic/lib/integer/clz.cl
@@ -0,0 +1,53 @@
+#include <clc/clc.h>
+#include "../clcmacro.h"
+
+// Scalar count-leading-zeros helpers, one per integer width and signedness.
+// They are defined in LLVM IR (clz_if.ll in this directory) rather than in
+// OpenCL C, so only their declarations appear here.
+_CLC_DECL char __clc_clz_s8(char);
+_CLC_DECL uchar __clc_clz_u8(uchar);
+_CLC_DECL short __clc_clz_s16(short);
+_CLC_DECL ushort __clc_clz_u16(ushort);
+_CLC_DECL int __clc_clz_s32(int);
+_CLC_DECL uint __clc_clz_u32(uint);
+_CLC_DECL long __clc_clz_s64(long);
+_CLC_DECL ulong __clc_clz_u64(ulong);
+
+// Scalar clz overloads. Each one forwards its argument to the
+// __clc_clz_<suffix> helper for the matching width and signedness.
+#define __CLC_CLZ_SCALAR(TYPE, SUFFIX) \
+  _CLC_OVERLOAD _CLC_DEF TYPE clz(TYPE x) { \
+    return __clc_clz_##SUFFIX(x); \
+  }
+
+__CLC_CLZ_SCALAR(char, s8)
+__CLC_CLZ_SCALAR(uchar, u8)
+__CLC_CLZ_SCALAR(short, s16)
+__CLC_CLZ_SCALAR(ushort, u16)
+__CLC_CLZ_SCALAR(int, s32)
+__CLC_CLZ_SCALAR(uint, u32)
+__CLC_CLZ_SCALAR(long, s64)
+__CLC_CLZ_SCALAR(ulong, u64)
+
+#undef __CLC_CLZ_SCALAR
+
+// One _CLC_UNARY_VECTORIZE invocation per integer type. The macro comes from
+// ../clcmacro.h (not shown here); presumably it defines the vector overloads
+// of clz in terms of the scalar overloads -- confirm against clcmacro.h.
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, char, clz, char)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uchar, clz, uchar)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, short, clz, short)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, clz, ushort)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, clz, int)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, clz, uint)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, long, clz, long)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ulong, clz, ulong)
diff --git a/libclc/generic/lib/integer/clz_if.ll b/libclc/generic/lib/integer/clz_if.ll
new file mode 100644
index 0000000..23dfc74
--- /dev/null
+++ b/libclc/generic/lib/integer/clz_if.ll
@@ -0,0 +1,55 @@
+; Interface layer for count-leading-zeros. Each @__clc_clz_<suffix> function
+; below is a one-call wrapper that forwards its operand to the
+; @__clc_clz_impl_<suffix> function of the same width and signedness, which is
+; only declared in this file.
+
+; 8-bit signed.
+declare i8 @__clc_clz_impl_s8(i8 %x)
+
+define i8 @__clc_clz_s8(i8 %x) nounwind readnone alwaysinline {
+ %call = call i8 @__clc_clz_impl_s8(i8 %x)
+ ret i8 %call
+}
+
+; 8-bit unsigned.
+declare i8 @__clc_clz_impl_u8(i8 %x)
+
+define i8 @__clc_clz_u8(i8 %x) nounwind readnone alwaysinline {
+ %call = call i8 @__clc_clz_impl_u8(i8 %x)
+ ret i8 %call
+}
+
+; 16-bit signed.
+declare i16 @__clc_clz_impl_s16(i16 %x)
+
+define i16 @__clc_clz_s16(i16 %x) nounwind readnone alwaysinline {
+ %call = call i16 @__clc_clz_impl_s16(i16 %x)
+ ret i16 %call
+}
+
+; 16-bit unsigned.
+declare i16 @__clc_clz_impl_u16(i16 %x)
+
+define i16 @__clc_clz_u16(i16 %x) nounwind readnone alwaysinline {
+ %call = call i16 @__clc_clz_impl_u16(i16 %x)
+ ret i16 %call
+}
+
+; 32-bit signed.
+declare i32 @__clc_clz_impl_s32(i32 %x)
+
+define i32 @__clc_clz_s32(i32 %x) nounwind readnone alwaysinline {
+ %call = call i32 @__clc_clz_impl_s32(i32 %x)
+ ret i32 %call
+}
+
+; 32-bit unsigned.
+declare i32 @__clc_clz_impl_u32(i32 %x)
+
+define i32 @__clc_clz_u32(i32 %x) nounwind readnone alwaysinline {
+ %call = call i32 @__clc_clz_impl_u32(i32 %x)
+ ret i32 %call
+}
+
+; 64-bit signed.
+declare i64 @__clc_clz_impl_s64(i64 %x)
+
+define i64 @__clc_clz_s64(i64 %x) nounwind readnone alwaysinline {
+ %call = call i64 @__clc_clz_impl_s64(i64 %x)
+ ret i64 %call
+}
+
+; 64-bit unsigned.
+declare i64 @__clc_clz_impl_u64(i64 %x)
+
+define i64 @__clc_clz_u64(i64 %x) nounwind readnone alwaysinline {
+ %call = call i64 @__clc_clz_impl_u64(i64 %x)
+ ret i64 %call
+}
diff --git a/libclc/generic/lib/integer/clz_impl.ll b/libclc/generic/lib/integer/clz_impl.ll
new file mode 100644
index 0000000..b5c3d98
--- /dev/null
+++ b/libclc/generic/lib/integer/clz_impl.ll
@@ -0,0 +1,44 @@
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i8 @__clc_clz_impl_s8(i8 %x) nounwind readnone alwaysinline {
+ %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
+ ret i8 %call
+}
+
+define i8 @__clc_clz_impl_u8(i8 %x) nounwind readnone alwaysinline {
+ %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
+ ret i8 %call
+}
+
+define i16 @__clc_clz_impl_s16(i16 %x) nounwind readnone alwaysinline {
+ %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
+ ret i16 %call
+}
+
+define i16 @__clc_clz_impl_u16(i16 %x) nounwind readnone alwaysinline {
+ %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
+ ret i16 %call
+}
+
+define i32 @__clc_clz_impl_s32(i32 %x) nounwind readnone alwaysinline {
+ %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+ ret i32 %call
+}
+
+define i32 @__clc_clz_impl_u32(i32 %x) nounwind readnone alwaysinline {
+ %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+ ret i32 %call
+}
+
+define i64 @__clc_clz_impl_s64(i64 %x) nounwind readnone alwaysinline {
+ %call = call i64 @llvm.ctlz.i64(i64 %x, i1 0)
+ ret i64 %call
+}
+
+define i64 @__clc_clz_impl_u64(i64 %x) nounwind readnone alwaysinline {
+ %call = call i64 @llvm.ctlz.i64(i64 %x, i1 0)
+ ret i64 %call
+}
diff --git a/libclc/generic/lib/integer/hadd.cl b/libclc/generic/lib/integer/hadd.cl
new file mode 100644
index 0000000..749026e
--- /dev/null
+++ b/libclc/generic/lib/integer/hadd.cl
@@ -0,0 +1,4 @@
+#include <clc/clc.h>
+
+#define __CLC_BODY <hadd.inc>
+#include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/integer/hadd.inc b/libclc/generic/lib/integer/hadd.inc
new file mode 100644
index 0000000..ea59d9b
--- /dev/null
+++ b/libclc/generic/lib/integer/hadd.inc
@@ -0,0 +1,6 @@
+//hadd = (x+y)>>1
+//This can be simplified to x>>1 + y>>1 + (1 if both x and y have the 1s bit set)
+//This saves us having to do any checks for overflow in the addition sum
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE hadd(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+ return (x>>(__CLC_GENTYPE)1)+(y>>(__CLC_GENTYPE)1)+(x&y&(__CLC_GENTYPE)1);
+}
diff --git a/libclc/generic/lib/integer/mad24.cl b/libclc/generic/lib/integer/mad24.cl
new file mode 100644
index 0000000..e29e99f
--- /dev/null
+++ b/libclc/generic/lib/integer/mad24.cl
@@ -0,0 +1,4 @@
+#include <clc/clc.h>
+
+#define __CLC_BODY <mad24.inc>
+#include <clc/integer/integer-gentype.inc>
diff --git a/libclc/generic/lib/integer/mad24.inc b/libclc/generic/lib/integer/mad24.inc
new file mode 100644
index 0000000..902b0aa
--- /dev/null
+++ b/libclc/generic/lib/integer/mad24.inc
@@ -0,0 +1,3 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mad24(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z){
+ return mul24(x, y) + z;
+}
diff --git a/libclc/generic/lib/integer/mad_sat.cl b/libclc/generic/lib/integer/mad_sat.cl
new file mode 100644
index 0000000..1708b29
--- /dev/null
+++ b/libclc/generic/lib/integer/mad_sat.cl
@@ -0,0 +1,72 @@
+#include <clc/clc.h>
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF char mad_sat(char x, char y, char z) {
+ return clamp((short)mad24((short)x, (short)y, (short)z), (short)CHAR_MIN, (short) CHAR_MAX);
+}
+
+_CLC_OVERLOAD _CLC_DEF uchar mad_sat(uchar x, uchar y, uchar z) {
+ return clamp((ushort)mad24((ushort)x, (ushort)y, (ushort)z), (ushort)0, (ushort) UCHAR_MAX);
+}
+
+_CLC_OVERLOAD _CLC_DEF short mad_sat(short x, short y, short z) {
+ return clamp((int)mad24((int)x, (int)y, (int)z), (int)SHRT_MIN, (int) SHRT_MAX);
+}
+
+_CLC_OVERLOAD _CLC_DEF ushort mad_sat(ushort x, ushort y, ushort z) {
+ return clamp((uint)mad24((uint)x, (uint)y, (uint)z), (uint)0, (uint) USHRT_MAX);
+}
+
+_CLC_OVERLOAD _CLC_DEF int mad_sat(int x, int y, int z) {
+  /* Rebuild the full-width product from its high and low 32-bit halves,
+   * add z to it, then saturate the wide sum to the int range. */
+  uint low = x * y;
+  long wide = upsample(mul_hi(x, y), low) + z;
+  if (wide < INT_MIN)
+    return INT_MIN;
+  if (wide > INT_MAX)
+    return INT_MAX;
+  return wide;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint mad_sat(uint x, uint y, uint z) {
+ if (mul_hi(x, y) != 0)
+ return UINT_MAX;
+ return add_sat(x * y, z);
+}
+
+_CLC_OVERLOAD _CLC_DEF long mad_sat(long x, long y, long z) {
+  /* Saturating x * y + z for 64-bit signed operands.
+   * The exact product is hi * 2^64 + lo. The earlier sign-based checks
+   * returned LONG_MIN for a zero product with one negative factor, mishandled
+   * products equal to LONG_MIN, and let the final addition wrap. */
+  long hi = mul_hi(x, y);
+  ulong lo = (ulong)x * (ulong)y;
+  ulong sum = lo + (ulong)z; /* low 64 bits of the exact result */
+  /* Product fits in a long (hi is the sign extension of lo): saturating add */
+  if (hi == ((long)lo >> 63))
+    return add_sat((long)lo, z);
+  /* Product is at least 2^64 away from zero: no z can bring it back */
+  if (hi != 0 && hi != -1)
+    return hi > 0 ? LONG_MAX : LONG_MIN;
+  /* Product in [2^63, 2^64): only a negative z can pull the sum into range */
+  if (hi == 0)
+    return (z < 0 && sum <= (ulong)LONG_MAX) ? (long)sum : LONG_MAX;
+  /* Product in [-2^64, -2^63): only a positive z can lift the sum into range */
+  return (z > 0 && sum > (ulong)LONG_MAX) ? (long)sum : LONG_MIN;
+}
+
+_CLC_OVERLOAD _CLC_DEF ulong mad_sat(ulong x, ulong y, ulong z) {
+ if (mul_hi(x, y) != 0)
+ return ULONG_MAX;
+ return add_sat(x * y, z);
+}
+
+_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, char, mad_sat, char, char, char)
+_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uchar, mad_sat, uchar, uchar, uchar)
+_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, short, mad_sat, short, short, short)
+_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, mad_sat, ushort, ushort, ushort)
+_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, mad_sat, int, int, int)
+_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, mad_sat, uint, uint, uint)
+_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, long, mad_sat, long, long, long)
+_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ulong, mad_sat, ulong, ulong, ulong)
diff --git a/libclc/generic/lib/integer/mul24.cl b/libclc/generic/lib/integer/mul24.cl
new file mode 100644
index 0000000..8aedca6
--- /dev/null
+++ b/libclc/generic/lib/integer/mul24.cl
@@ -0,0 +1,4 @@
+#include <clc/clc.h>
+
+#define __CLC_BODY <mul24.inc>
+#include <clc/integer/integer-gentype.inc>
diff --git a/libclc/generic/lib/integer/mul24.inc b/libclc/generic/lib/integer/mul24.inc
new file mode 100644
index 0000000..95a2f1d
--- /dev/null
+++ b/libclc/generic/lib/integer/mul24.inc
@@ -0,0 +1,11 @@
+
+// We need to use shifts here in order to maintain the sign bit for signed
+// integers. The compiler should optimize this to (x & 0x00FFFFFF) for
+// unsigned integers.
+#define CONVERT_TO_24BIT(x) (((x) << 8) >> 8)
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mul24(__CLC_GENTYPE x, __CLC_GENTYPE y){
+ return CONVERT_TO_24BIT(x) * CONVERT_TO_24BIT(y);
+}
+
+#undef CONVERT_TO_24BIT
diff --git a/libclc/generic/lib/integer/mul_hi.cl b/libclc/generic/lib/integer/mul_hi.cl
new file mode 100644
index 0000000..174d893
--- /dev/null
+++ b/libclc/generic/lib/integer/mul_hi.cl
@@ -0,0 +1,109 @@
+#include <clc/clc.h>
+
+//For all types EXCEPT long, which is implemented separately
+#define __CLC_MUL_HI_IMPL(BGENTYPE, GENTYPE, GENSIZE) \
+ _CLC_OVERLOAD _CLC_DEF GENTYPE mul_hi(GENTYPE x, GENTYPE y){ \
+ return (GENTYPE)(((BGENTYPE)x * (BGENTYPE)y) >> GENSIZE); \
+ } \
+
+//FOIL-based long mul_hi
+//
+// Summary: Treat mul_hi(long x, long y) as:
+// (a+b) * (c+d) where a and c are the high-order parts of x and y respectively
+// and b and d are the low-order parts of x and y.
+// Thinking back to algebra, we use FOIL to do the work.
+
+_CLC_OVERLOAD _CLC_DEF long mul_hi(long x, long y){
+ long f, o, i;
+ ulong l;
+
+ //Move the high/low halves of x/y into the lower 32-bits of variables so
+ //that we can multiply them without worrying about overflow.
+ long x_hi = x >> 32;
+ long x_lo = x & UINT_MAX;
+ long y_hi = y >> 32;
+ long y_lo = y & UINT_MAX;
+
+ //Multiply all of the components according to FOIL method
+ f = x_hi * y_hi;
+ o = x_hi * y_lo;
+ i = x_lo * y_hi;
+ l = x_lo * y_lo;
+
+ //Now add the components back together in the following steps:
+ //F: doesn't need to be modified
+ //O/I: Need to be added together.
+ //L: Shift right by 32-bits, then add into the sum of O and I
+ //Once O/I/L are summed up, then shift the sum by 32-bits and add to F.
+ //
+ //We use hadd to give us a bit of extra precision for the intermediate sums
+ //but as a result, we shift by 31 bits instead of 32
+ return (long)(f + (hadd(o, (i + (long)((ulong)l>>32))) >> 31));
+}
+
+_CLC_OVERLOAD _CLC_DEF ulong mul_hi(ulong x, ulong y){
+ ulong f, o, i;
+ ulong l;
+
+ //Move the high/low halves of x/y into the lower 32-bits of variables so
+ //that we can multiply them without worrying about overflow.
+ ulong x_hi = x >> 32;
+ ulong x_lo = x & UINT_MAX;
+ ulong y_hi = y >> 32;
+ ulong y_lo = y & UINT_MAX;
+
+ //Multiply all of the components according to FOIL method
+ f = x_hi * y_hi;
+ o = x_hi * y_lo;
+ i = x_lo * y_hi;
+ l = x_lo * y_lo;
+
+ //Now add the components back together, taking care to respect the fact that:
+ //F: doesn't need to be modified
+ //O/I: Need to be added together.
+ //L: Shift right by 32-bits, then add into the sum of O and I
+ //Once O/I/L are summed up, then shift the sum by 32-bits and add to F.
+ //
+ //We use hadd to give us a bit of extra precision for the intermediate sums
+ //but as a result, we shift by 31 bits instead of 32
+ return (f + (hadd(o, (i + (l>>32))) >> 31));
+}
+
+#define __CLC_MUL_HI_VEC(GENTYPE) \
+ _CLC_OVERLOAD _CLC_DEF GENTYPE##2 mul_hi(GENTYPE##2 x, GENTYPE##2 y){ \
+ return (GENTYPE##2){mul_hi(x.s0, y.s0), mul_hi(x.s1, y.s1)}; \
+ } \
+ _CLC_OVERLOAD _CLC_DEF GENTYPE##3 mul_hi(GENTYPE##3 x, GENTYPE##3 y){ \
+ return (GENTYPE##3){mul_hi(x.s0, y.s0), mul_hi(x.s1, y.s1), mul_hi(x.s2, y.s2)}; \
+ } \
+ _CLC_OVERLOAD _CLC_DEF GENTYPE##4 mul_hi(GENTYPE##4 x, GENTYPE##4 y){ \
+ return (GENTYPE##4){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \
+ } \
+ _CLC_OVERLOAD _CLC_DEF GENTYPE##8 mul_hi(GENTYPE##8 x, GENTYPE##8 y){ \
+ return (GENTYPE##8){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \
+ } \
+ _CLC_OVERLOAD _CLC_DEF GENTYPE##16 mul_hi(GENTYPE##16 x, GENTYPE##16 y){ \
+ return (GENTYPE##16){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \
+ } \
+
+#define __CLC_MUL_HI_DEC_IMPL(BTYPE, TYPE, BITS) \
+ __CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS) \
+ __CLC_MUL_HI_VEC(TYPE)
+
+#define __CLC_MUL_HI_TYPES() \
+ __CLC_MUL_HI_DEC_IMPL(short, char, 8) \
+ __CLC_MUL_HI_DEC_IMPL(ushort, uchar, 8) \
+ __CLC_MUL_HI_DEC_IMPL(int, short, 16) \
+ __CLC_MUL_HI_DEC_IMPL(uint, ushort, 16) \
+ __CLC_MUL_HI_DEC_IMPL(long, int, 32) \
+ __CLC_MUL_HI_DEC_IMPL(ulong, uint, 32) \
+ __CLC_MUL_HI_VEC(long) \
+ __CLC_MUL_HI_VEC(ulong)
+
+__CLC_MUL_HI_TYPES()
+
+#undef __CLC_MUL_HI_TYPES
+#undef __CLC_MUL_HI_DEC_IMPL
+#undef __CLC_MUL_HI_IMPL
+#undef __CLC_MUL_HI_VEC
+#undef __CLC_B32
diff --git a/libclc/generic/lib/integer/rhadd.cl b/libclc/generic/lib/integer/rhadd.cl
new file mode 100644
index 0000000..c985870
--- /dev/null
+++ b/libclc/generic/lib/integer/rhadd.cl
@@ -0,0 +1,4 @@
+#include <clc/clc.h>
+
+#define __CLC_BODY <rhadd.inc>
+#include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/integer/rhadd.inc b/libclc/generic/lib/integer/rhadd.inc
new file mode 100644
index 0000000..3d60768
--- /dev/null
+++ b/libclc/generic/lib/integer/rhadd.inc
@@ -0,0 +1,6 @@
+//rhadd = (x+y+1)>>1
+//This can be simplified to x>>1 + y>>1 + (1 if either x or y have the 1s bit set)
+//This saves us having to do any checks for overflow in the addition sums
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE rhadd(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+ return (x>>(__CLC_GENTYPE)1)+(y>>(__CLC_GENTYPE)1)+((x&(__CLC_GENTYPE)1)|(y&(__CLC_GENTYPE)1));
+}
diff --git a/libclc/generic/lib/integer/rotate.cl b/libclc/generic/lib/integer/rotate.cl
new file mode 100644
index 0000000..27ce515
--- /dev/null
+++ b/libclc/generic/lib/integer/rotate.cl
@@ -0,0 +1,4 @@
+#include <clc/clc.h>
+
+#define __CLC_BODY <rotate.inc>
+#include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/integer/rotate.inc b/libclc/generic/lib/integer/rotate.inc
new file mode 100644
index 0000000..33bb0a8
--- /dev/null
+++ b/libclc/generic/lib/integer/rotate.inc
@@ -0,0 +1,42 @@
+/**
+ * Not necessarily optimal... but it produces correct results (at least for int)
+ * If we're lucky, LLVM will recognize the pattern and produce rotate
+ * instructions:
+ * http://llvm.1065342.n5.nabble.com/rotate-td47679.html
+ *
+ * Eventually, someone should feel free to implement an llvm-specific version
+ */
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE rotate(__CLC_GENTYPE x, __CLC_GENTYPE n){
+ //Try to avoid extra work if someone's spinning the value through multiple
+ //full rotations
+ n = n % (__CLC_GENTYPE)__CLC_GENSIZE;
+
+#ifdef __CLC_SCALAR
+ if (n > 0){
+ return (x << n) | (((__CLC_U_GENTYPE)x) >> (__CLC_GENSIZE - n));
+ } else if (n == 0){
+ return x;
+ } else {
+ return ( (((__CLC_U_GENTYPE)x) >> -n) | (x << (__CLC_GENSIZE + n)) );
+ }
+#else
+ //XXX: There's a lot of __builtin_astype calls to cast everything to
+ // unsigned ... This should be improved so that if __CLC_GENTYPE==__CLC_U_GENTYPE, no
+ // casts are required.
+
+ __CLC_U_GENTYPE x_1 = __builtin_astype(x, __CLC_U_GENTYPE);
+
+ //XXX: Is (__CLC_U_GENTYPE >> S__CLC_GENTYPE) | (__CLC_U_GENTYPE << S__CLC_GENTYPE) legal?
+ // If so, then combine the amt and shifts into a single set of statements
+
+ __CLC_U_GENTYPE amt;
+ amt = (n < (__CLC_GENTYPE)0 ? __builtin_astype((__CLC_GENTYPE)0-n, __CLC_U_GENTYPE) : (__CLC_U_GENTYPE)0);
+ x_1 = (x_1 >> amt) | (x_1 << ((__CLC_U_GENTYPE)__CLC_GENSIZE - amt));
+
+ amt = (n < (__CLC_GENTYPE)0 ? (__CLC_U_GENTYPE)0 : __builtin_astype(n, __CLC_U_GENTYPE));
+ x_1 = (x_1 << amt) | (x_1 >> ((__CLC_U_GENTYPE)__CLC_GENSIZE - amt));
+
+ return __builtin_astype(x_1, __CLC_GENTYPE);
+#endif
+}
diff --git a/libclc/generic/lib/integer/sub_sat.cl b/libclc/generic/lib/integer/sub_sat.cl
new file mode 100644
index 0000000..6b42cc8
--- /dev/null
+++ b/libclc/generic/lib/integer/sub_sat.cl
@@ -0,0 +1,53 @@
+#include <clc/clc.h>
+#include "../clcmacro.h"
+
+// From sub_sat.ll
+_CLC_DECL char __clc_sub_sat_s8(char, char);
+_CLC_DECL uchar __clc_sub_sat_u8(uchar, uchar);
+_CLC_DECL short __clc_sub_sat_s16(short, short);
+_CLC_DECL ushort __clc_sub_sat_u16(ushort, ushort);
+_CLC_DECL int __clc_sub_sat_s32(int, int);
+_CLC_DECL uint __clc_sub_sat_u32(uint, uint);
+_CLC_DECL long __clc_sub_sat_s64(long, long);
+_CLC_DECL ulong __clc_sub_sat_u64(ulong, ulong);
+
+_CLC_OVERLOAD _CLC_DEF char sub_sat(char x, char y) {
+ return __clc_sub_sat_s8(x, y);
+}
+
+_CLC_OVERLOAD _CLC_DEF uchar sub_sat(uchar x, uchar y) {
+ return __clc_sub_sat_u8(x, y);
+}
+
+_CLC_OVERLOAD _CLC_DEF short sub_sat(short x, short y) {
+ return __clc_sub_sat_s16(x, y);
+}
+
+_CLC_OVERLOAD _CLC_DEF ushort sub_sat(ushort x, ushort y) {
+ return __clc_sub_sat_u16(x, y);
+}
+
+_CLC_OVERLOAD _CLC_DEF int sub_sat(int x, int y) {
+ return __clc_sub_sat_s32(x, y);
+}
+
+_CLC_OVERLOAD _CLC_DEF uint sub_sat(uint x, uint y) {
+ return __clc_sub_sat_u32(x, y);
+}
+
+_CLC_OVERLOAD _CLC_DEF long sub_sat(long x, long y) {
+ return __clc_sub_sat_s64(x, y);
+}
+
+_CLC_OVERLOAD _CLC_DEF ulong sub_sat(ulong x, ulong y) {
+ return __clc_sub_sat_u64(x, y);
+}
+
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, char, sub_sat, char, char)
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uchar, sub_sat, uchar, uchar)
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, short, sub_sat, short, short)
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, sub_sat, ushort, ushort)
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, sub_sat, int, int)
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, sub_sat, uint, uint)
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, long, sub_sat, long, long)
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ulong, sub_sat, ulong, ulong)
diff --git a/libclc/generic/lib/integer/sub_sat_if.ll b/libclc/generic/lib/integer/sub_sat_if.ll
new file mode 100644
index 0000000..7252574
--- /dev/null
+++ b/libclc/generic/lib/integer/sub_sat_if.ll
@@ -0,0 +1,55 @@
+declare i8 @__clc_sub_sat_impl_s8(i8 %x, i8 %y)
+
+define i8 @__clc_sub_sat_s8(i8 %x, i8 %y) nounwind readnone alwaysinline {
+ %call = call i8 @__clc_sub_sat_impl_s8(i8 %x, i8 %y)
+ ret i8 %call
+}
+
+declare i8 @__clc_sub_sat_impl_u8(i8 %x, i8 %y)
+
+define i8 @__clc_sub_sat_u8(i8 %x, i8 %y) nounwind readnone alwaysinline {
+ %call = call i8 @__clc_sub_sat_impl_u8(i8 %x, i8 %y)
+ ret i8 %call
+}
+
+declare i16 @__clc_sub_sat_impl_s16(i16 %x, i16 %y)
+
+define i16 @__clc_sub_sat_s16(i16 %x, i16 %y) nounwind readnone alwaysinline {
+ %call = call i16 @__clc_sub_sat_impl_s16(i16 %x, i16 %y)
+ ret i16 %call
+}
+
+declare i16 @__clc_sub_sat_impl_u16(i16 %x, i16 %y)
+
+define i16 @__clc_sub_sat_u16(i16 %x, i16 %y) nounwind readnone alwaysinline {
+ %call = call i16 @__clc_sub_sat_impl_u16(i16 %x, i16 %y)
+ ret i16 %call
+}
+
+declare i32 @__clc_sub_sat_impl_s32(i32 %x, i32 %y)
+
+define i32 @__clc_sub_sat_s32(i32 %x, i32 %y) nounwind readnone alwaysinline {
+ %call = call i32 @__clc_sub_sat_impl_s32(i32 %x, i32 %y)
+ ret i32 %call
+}
+
+declare i32 @__clc_sub_sat_impl_u32(i32 %x, i32 %y)
+
+define i32 @__clc_sub_sat_u32(i32 %x, i32 %y) nounwind readnone alwaysinline {
+ %call = call i32 @__clc_sub_sat_impl_u32(i32 %x, i32 %y)
+ ret i32 %call
+}
+
+declare i64 @__clc_sub_sat_impl_s64(i64 %x, i64 %y)
+
+define i64 @__clc_sub_sat_s64(i64 %x, i64 %y) nounwind readnone alwaysinline {
+ %call = call i64 @__clc_sub_sat_impl_s64(i64 %x, i64 %y)
+ ret i64 %call
+}
+
+declare i64 @__clc_sub_sat_impl_u64(i64 %x, i64 %y)
+
+define i64 @__clc_sub_sat_u64(i64 %x, i64 %y) nounwind readnone alwaysinline {
+ %call = call i64 @__clc_sub_sat_impl_u64(i64 %x, i64 %y)
+ ret i64 %call
+}
diff --git a/libclc/generic/lib/integer/sub_sat_impl.ll b/libclc/generic/lib/integer/sub_sat_impl.ll
new file mode 100644
index 0000000..e82b632
--- /dev/null
+++ b/libclc/generic/lib/integer/sub_sat_impl.ll
@@ -0,0 +1,83 @@
+declare {i8, i1} @llvm.ssub.with.overflow.i8(i8, i8)
+declare {i8, i1} @llvm.usub.with.overflow.i8(i8, i8)
+
+define i8 @__clc_sub_sat_impl_s8(i8 %x, i8 %y) nounwind readnone alwaysinline {
+ %call = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %x, i8 %y)
+ %res = extractvalue {i8, i1} %call, 0
+ %over = extractvalue {i8, i1} %call, 1
+ %x.msb = ashr i8 %x, 7
+ %x.limit = xor i8 %x.msb, 127
+ %sat = select i1 %over, i8 %x.limit, i8 %res
+ ret i8 %sat
+}
+
+define i8 @__clc_sub_sat_impl_u8(i8 %x, i8 %y) nounwind readnone alwaysinline {
+ %call = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %x, i8 %y)
+ %res = extractvalue {i8, i1} %call, 0
+ %over = extractvalue {i8, i1} %call, 1
+ %sat = select i1 %over, i8 0, i8 %res
+ ret i8 %sat
+}
+
+declare {i16, i1} @llvm.ssub.with.overflow.i16(i16, i16)
+declare {i16, i1} @llvm.usub.with.overflow.i16(i16, i16)
+
+define i16 @__clc_sub_sat_impl_s16(i16 %x, i16 %y) nounwind readnone alwaysinline {
+ %call = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %x, i16 %y)
+ %res = extractvalue {i16, i1} %call, 0
+ %over = extractvalue {i16, i1} %call, 1
+ %x.msb = ashr i16 %x, 15
+ %x.limit = xor i16 %x.msb, 32767
+ %sat = select i1 %over, i16 %x.limit, i16 %res
+ ret i16 %sat
+}
+
+define i16 @__clc_sub_sat_impl_u16(i16 %x, i16 %y) nounwind readnone alwaysinline {
+ %call = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %x, i16 %y)
+ %res = extractvalue {i16, i1} %call, 0
+ %over = extractvalue {i16, i1} %call, 1
+ %sat = select i1 %over, i16 0, i16 %res
+ ret i16 %sat
+}
+
+declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32)
+declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32)
+
+define i32 @__clc_sub_sat_impl_s32(i32 %x, i32 %y) nounwind readnone alwaysinline {
+ %call = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %x, i32 %y)
+ %res = extractvalue {i32, i1} %call, 0
+ %over = extractvalue {i32, i1} %call, 1
+ %x.msb = ashr i32 %x, 31
+ %x.limit = xor i32 %x.msb, 2147483647
+ %sat = select i1 %over, i32 %x.limit, i32 %res
+ ret i32 %sat
+}
+
+define i32 @__clc_sub_sat_impl_u32(i32 %x, i32 %y) nounwind readnone alwaysinline {
+ %call = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %x, i32 %y)
+ %res = extractvalue {i32, i1} %call, 0
+ %over = extractvalue {i32, i1} %call, 1
+ %sat = select i1 %over, i32 0, i32 %res
+ ret i32 %sat
+}
+
+declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64)
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64)
+
+define i64 @__clc_sub_sat_impl_s64(i64 %x, i64 %y) nounwind readnone alwaysinline {
+ %call = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %x, i64 %y)
+ %res = extractvalue {i64, i1} %call, 0
+ %over = extractvalue {i64, i1} %call, 1
+ %x.msb = ashr i64 %x, 63
+ %x.limit = xor i64 %x.msb, 9223372036854775807
+ %sat = select i1 %over, i64 %x.limit, i64 %res
+ ret i64 %sat
+}
+
+define i64 @__clc_sub_sat_impl_u64(i64 %x, i64 %y) nounwind readnone alwaysinline {
+ %call = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %x, i64 %y)
+ %res = extractvalue {i64, i1} %call, 0
+ %over = extractvalue {i64, i1} %call, 1
+ %sat = select i1 %over, i64 0, i64 %res
+ ret i64 %sat
+}
diff --git a/libclc/generic/lib/integer/upsample.cl b/libclc/generic/lib/integer/upsample.cl
new file mode 100644
index 0000000..da77315
--- /dev/null
+++ b/libclc/generic/lib/integer/upsample.cl
@@ -0,0 +1,34 @@
+#include <clc/clc.h>
+
+#define __CLC_UPSAMPLE_IMPL(BGENTYPE, GENTYPE, UGENTYPE, GENSIZE) \
+ _CLC_OVERLOAD _CLC_DEF BGENTYPE upsample(GENTYPE hi, UGENTYPE lo){ \
+ return ((BGENTYPE)hi << GENSIZE) | lo; \
+ } \
+ _CLC_OVERLOAD _CLC_DEF BGENTYPE##2 upsample(GENTYPE##2 hi, UGENTYPE##2 lo){ \
+ return (BGENTYPE##2){upsample(hi.s0, lo.s0), upsample(hi.s1, lo.s1)}; \
+ } \
+ _CLC_OVERLOAD _CLC_DEF BGENTYPE##3 upsample(GENTYPE##3 hi, UGENTYPE##3 lo){ \
+ return (BGENTYPE##3){upsample(hi.s0, lo.s0), upsample(hi.s1, lo.s1), upsample(hi.s2, lo.s2)}; \
+ } \
+ _CLC_OVERLOAD _CLC_DEF BGENTYPE##4 upsample(GENTYPE##4 hi, UGENTYPE##4 lo){ \
+ return (BGENTYPE##4){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \
+ } \
+ _CLC_OVERLOAD _CLC_DEF BGENTYPE##8 upsample(GENTYPE##8 hi, UGENTYPE##8 lo){ \
+ return (BGENTYPE##8){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \
+ } \
+ _CLC_OVERLOAD _CLC_DEF BGENTYPE##16 upsample(GENTYPE##16 hi, UGENTYPE##16 lo){ \
+ return (BGENTYPE##16){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \
+ } \
+
+#define __CLC_UPSAMPLE_TYPES() \
+ __CLC_UPSAMPLE_IMPL(short, char, uchar, 8) \
+ __CLC_UPSAMPLE_IMPL(ushort, uchar, uchar, 8) \
+ __CLC_UPSAMPLE_IMPL(int, short, ushort, 16) \
+ __CLC_UPSAMPLE_IMPL(uint, ushort, ushort, 16) \
+ __CLC_UPSAMPLE_IMPL(long, int, uint, 32) \
+ __CLC_UPSAMPLE_IMPL(ulong, uint, uint, 32) \
+
+__CLC_UPSAMPLE_TYPES()
+
+#undef __CLC_UPSAMPLE_TYPES
+#undef __CLC_UPSAMPLE_IMPL
diff --git a/libclc/generic/lib/math/acos.cl b/libclc/generic/lib/math/acos.cl
new file mode 100644
index 0000000..3ce9655
--- /dev/null
+++ b/libclc/generic/lib/math/acos.cl
@@ -0,0 +1,8 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <acos.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/acos.inc b/libclc/generic/lib/math/acos.inc
new file mode 100644
index 0000000..cac94992
--- /dev/null
+++ b/libclc/generic/lib/math/acos.inc
@@ -0,0 +1,29 @@
+/*
+ * There are multiple formulas for calculating arccosine of x:
+ * 1) acos(x) = pi/2 + i * ln(i*x + sqrt(1-x^2)) (notice the 'i'...)
+ * 2) acos(x) = pi/2 + asin(-x) (asin isn't implemented yet)
+ * 3) acos(x) = pi/2 - asin(x) (ditto)
+ * 4) acos(x) = 2*atan2(sqrt(1-x), sqrt(1+x))
+ * 5) acos(x) = pi/2 - atan2(x, ( sqrt(1-x^2) ) )
+ *
+ * Options 1-3 are not currently usable, #5 generates more concise radeonsi
+ * bitcode and assembly than #4 (134 vs 132 instructions on radeonsi), but
+ * precision of #4 may be better.
+ */
+
+#if __CLC_FPSIZE == 32
+#define __CLC_CONST(x) x ## f
+#else
+#define __CLC_CONST(x) x
+#endif
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE acos(__CLC_GENTYPE x) {
+ return (
+ (__CLC_GENTYPE) __CLC_CONST(2.0) * atan2(
+ sqrt((__CLC_GENTYPE) __CLC_CONST(1.0) - x),
+ sqrt((__CLC_GENTYPE) __CLC_CONST(1.0) + x)
+ )
+ );
+}
+
+#undef __CLC_CONST
diff --git a/libclc/generic/lib/math/acosh.cl b/libclc/generic/lib/math/acosh.cl
new file mode 100644
index 0000000..cc10dd4
--- /dev/null
+++ b/libclc/generic/lib/math/acosh.cl
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "ep_log.h"
+#include "math.h"
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float acosh(float x) {
+ uint ux = as_uint(x);
+
+ // Arguments greater than 1/sqrt(epsilon) in magnitude are
+ // approximated by acosh(x) = ln(2) + ln(x)
+ // For 2.0 <= x <= 1/sqrt(epsilon) the approximation is
+ // acosh(x) = ln(x + sqrt(x*x-1))
+ int high = ux > 0x46000000U;
+ int med = ux > 0x40000000U;
+
+ float w = x - 1.0f;
+ float s = w*w + 2.0f*w;
+ float t = x*x - 1.0f;
+ float r = sqrt(med ? t : s) + (med ? x : w);
+ float v = (high ? x : r) - (med ? 1.0f : 0.0f);
+ float z = log1p(v) + (high ? 0x1.62e430p-1f : 0.0f);
+
+ z = ux >= PINFBITPATT_SP32 ? x : z;
+ z = x < 1.0f ? as_float(QNANBITPATT_SP32) : z;
+
+ return z;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, acosh, float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double acosh(double x) {
+ const double recrteps = 0x1.6a09e667f3bcdp+26; // 1/sqrt(eps) = 9.49062656242515593767e+07
+ //log2_lead and log2_tail sum to an extra-precise version of log(2)
+ const double log2_lead = 0x1.62e42ep-1;
+ const double log2_tail = 0x1.efa39ef35793cp-25;
+
+ // Handle x >= 128 here
+ int xlarge = x > recrteps;
+ double r = x + sqrt(fma(x, x, -1.0));
+ r = xlarge ? x : r;
+
+ int xexp;
+ double r1, r2;
+ __clc_ep_log(r, &xexp, &r1, &r2);
+
+ double dxexp = xexp + xlarge;
+ r1 = fma(dxexp, log2_lead, r1);
+ r2 = fma(dxexp, log2_tail, r2);
+
+ double ret1 = r1 + r2;
+
+ // Handle 1 < x < 128 here
+ // We compute the value
+ // t = x - 1.0 + sqrt(2.0*(x - 1.0) + (x - 1.0)*(x - 1.0))
+ // using simulated quad precision.
+ double t = x - 1.0;
+ double u1 = t * 2.0;
+
+ // (t,0) * (t,0) -> (v1, v2)
+ double v1 = t * t;
+ double v2 = fma(t, t, -v1);
+
+ // (u1,0) + (v1,v2) -> (w1,w2)
+ r = u1 + v1;
+ double s = (((u1 - r) + v1) + v2);
+ double w1 = r + s;
+ double w2 = (r - w1) + s;
+
+ // sqrt(w1,w2) -> (u1,u2)
+ double p1 = sqrt(w1);
+ double a1 = p1*p1;
+ double a2 = fma(p1, p1, -a1);
+ double temp = (((w1 - a1) - a2) + w2);
+ double p2 = MATH_DIVIDE(temp * 0.5, p1);
+ u1 = p1 + p2;
+ double u2 = (p1 - u1) + p2;
+
+ // (u1,u2) + (t,0) -> (r1,r2)
+ r = u1 + t;
+ s = ((u1 - r) + t) + u2;
+ // r1 = r + s;
+ // r2 = (r - r1) + s;
+ // t = r1 + r2;
+ t = r + s;
+
+ // For arguments 1.13 <= x <= 1.5 the log1p function is good enough
+ double ret2 = log1p(t);
+
+ ulong ux = as_ulong(x);
+ double ret = x >= 128.0 ? ret1 : ret2;
+
+ ret = ux >= 0x7FF0000000000000 ? x : ret;
+ ret = x == 1.0 ? 0.0 : ret;
+ ret = (ux & SIGNBIT_DP64) != 0UL | x < 1.0 ? as_double(QNANBITPATT_DP64) : ret;
+
+ return ret;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, acosh, double)
+
+#endif
diff --git a/libclc/generic/lib/math/acospi.cl b/libclc/generic/lib/math/acospi.cl
new file mode 100644
index 0000000..c91fc41
--- /dev/null
+++ b/libclc/generic/lib/math/acospi.cl
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float acospi(float x) {
+ // Computes arccos(x) / pi.
+ // The argument is first reduced by noting that arccos(x)
+ // is invalid for abs(x) > 1. For denormal and small
+ // arguments arccos(x) = pi/2 to machine accuracy.
+ // Remaining argument ranges are handled as follows.
+ // For abs(x) <= 0.5 use
+ // arccos(x) = pi/2 - arcsin(x)
+ // = pi/2 - (x + x^3*R(x^2))
+ // where R(x^2) is a rational minimax approximation to
+ // (arcsin(x) - x)/x^3.
+ // For abs(x) > 0.5 exploit the identity:
+ // arccos(x) = 2*arcsin(sqrt((1-x)/2)), arccos(-x) = pi - arccos(x)
+ // together with the above rational approximation, and
+ // reconstruct the terms carefully.
+
+
+ // Some constants and split constants.
+ const float pi = 3.1415926535897933e+00f;
+ const float piby2_head = 1.5707963267948965580e+00f; /* 0x3ff921fb54442d18 */
+ const float piby2_tail = 6.12323399573676603587e-17f; /* 0x3c91a62633145c07 */
+
+ uint ux = as_uint(x);
+ uint aux = ux & ~SIGNBIT_SP32;
+ int xneg = ux != aux;
+ int xexp = (int)(aux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+
+ float y = as_float(aux);
+
+ // transform if |x| >= 0.5
+ int transform = xexp >= -1;
+
+ float y2 = y * y;
+ float yt = 0.5f * (1.0f - y);
+ float r = transform ? yt : y2;
+
+ // Use a rational approximation for [0.0, 0.5]
+ float a = mad(r, mad(r, mad(r, -0.00396137437848476485201154797087F, -0.0133819288943925804214011424456F),
+ -0.0565298683201845211985026327361F),
+ 0.184161606965100694821398249421F);
+ float b = mad(r, -0.836411276854206731913362287293F, 1.10496961524520294485512696706F);
+ float u = r * MATH_DIVIDE(a, b);
+
+ float s = MATH_SQRT(r);
+ y = s;
+ float s1 = as_float(as_uint(s) & 0xffff0000);
+ float c = MATH_DIVIDE(r - s1 * s1, s + s1);
+ // float rettn = 1.0f - MATH_DIVIDE(2.0f * (s + (y * u - piby2_tail)), pi);
+ float rettn = 1.0f - MATH_DIVIDE(2.0f * (s + mad(y, u, -piby2_tail)), pi);
+ // float rettp = MATH_DIVIDE(2.0F * s1 + (2.0F * c + 2.0F * y * u), pi);
+ float rettp = MATH_DIVIDE(2.0f*(s1 + mad(y, u, c)), pi);
+ float rett = xneg ? rettn : rettp;
+ // float ret = MATH_DIVIDE(piby2_head - (x - (piby2_tail - x * u)), pi);
+ float ret = MATH_DIVIDE(piby2_head - (x - mad(x, -u, piby2_tail)), pi);
+
+ ret = transform ? rett : ret;
+ ret = aux > 0x3f800000U ? as_float(QNANBITPATT_SP32) : ret;
+ ret = ux == 0x3f800000U ? 0.0f : ret;
+ ret = ux == 0xbf800000U ? 1.0f : ret;
+ ret = xexp < -26 ? 0.5f : ret;
+ return ret;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, acospi, float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double acospi(double x) {
+ // Computes arccos(x) / pi.
+ // The argument is first reduced by noting that arccos(x)
+ // is invalid for abs(x) > 1. For denormal and small
+ // arguments arccos(x) = pi/2 to machine accuracy.
+ // Remaining argument ranges are handled as follows.
+ // For abs(x) <= 0.5 use
+ // arccos(x) = pi/2 - arcsin(x)
+ // = pi/2 - (x + x^3*R(x^2))
+ // where R(x^2) is a rational minimax approximation to
+ // (arcsin(x) - x)/x^3.
+ // For abs(x) > 0.5 exploit the identity:
+ // arccos(x) = 2*arcsin(sqrt((1-x)/2)), arccos(-x) = pi - arccos(x)
+ // together with the above rational approximation, and
+ // reconstruct the terms carefully.
+
+ const double pi = 0x1.921fb54442d18p+1;
+ const double piby2_tail = 6.12323399573676603587e-17; /* 0x3c91a62633145c07 */
+
+ double y = fabs(x);
+ int xneg = as_int2(x).hi < 0;
+ int xexp = (as_int2(y).hi >> 20) - EXPBIAS_DP64;
+
+ // abs(x) >= 0.5
+ int transform = xexp >= -1;
+
+ // Transform y into the range [0,0.5)
+ double r1 = 0.5 * (1.0 - y);
+ double s = sqrt(r1);
+ double r = y * y;
+ r = transform ? r1 : r;
+ y = transform ? s : y;
+
+ // Use a rational approximation for [0.0, 0.5]
+ double un = fma(r,
+ fma(r,
+ fma(r,
+ fma(r,
+ fma(r, 0.0000482901920344786991880522822991,
+ 0.00109242697235074662306043804220),
+ -0.0549989809235685841612020091328),
+ 0.275558175256937652532686256258),
+ -0.445017216867635649900123110649),
+ 0.227485835556935010735943483075);
+
+ double ud = fma(r,
+ fma(r,
+ fma(r,
+ fma(r, 0.105869422087204370341222318533,
+ -0.943639137032492685763471240072),
+ 2.76568859157270989520376345954),
+ -3.28431505720958658909889444194),
+ 1.36491501334161032038194214209);
+
+ double u = r * MATH_DIVIDE(un, ud);
+
+ // Reconstruct acos carefully in transformed region
+ double res1 = fma(-2.0, MATH_DIVIDE(s + fma(y, u, -piby2_tail), pi), 1.0);
+ double s1 = as_double(as_ulong(s) & 0xffffffff00000000UL);
+ double c = MATH_DIVIDE(fma(-s1, s1, r), s + s1);
+ double res2 = MATH_DIVIDE(fma(2.0, s1, fma(2.0, c, 2.0 * y * u)), pi);
+ res1 = xneg ? res1 : res2;
+ res2 = 0.5 - fma(x, u, x) / pi;
+ res1 = transform ? res1 : res2;
+
+ const double qnan = as_double(QNANBITPATT_DP64);
+ res2 = x == 1.0 ? 0.0 : qnan;
+ res2 = x == -1.0 ? 1.0 : res2;
+ res1 = xexp >= 0 ? res2 : res1;
+ res1 = xexp < -56 ? 0.5 : res1;
+
+ return res1;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, acospi, double)
+
+#endif
diff --git a/libclc/generic/lib/math/asin.cl b/libclc/generic/lib/math/asin.cl
new file mode 100644
index 0000000..d56dbd7
--- /dev/null
+++ b/libclc/generic/lib/math/asin.cl
@@ -0,0 +1,8 @@
+// Translation unit for asin: the function body itself is in asin.inc, which is
+// named as __CLC_BODY below.
+#include <clc/clc.h>
+
+// Turn on double-precision support when the cl_khr_fp64 macro is defined.
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+// NOTE(review): gentype.inc presumably includes __CLC_BODY once for every
+// supported __CLC_GENTYPE -- confirm against clc/math/gentype.inc.
+#define __CLC_BODY <asin.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/asin.inc b/libclc/generic/lib/math/asin.inc
new file mode 100644
index 0000000..4643cf8
--- /dev/null
+++ b/libclc/generic/lib/math/asin.inc
@@ -0,0 +1,12 @@
+
+// __CLC_CONST(x) spells a floating-point literal for the current gentype:
+// 32-bit instantiations get an 'f' suffix, everything else uses x unchanged.
+#if __CLC_FPSIZE != 32
+#define __CLC_CONST(x) x
+#else
+#define __CLC_CONST(x) x ## f
+#endif
+
+// Generic asin, evaluated as atan2(x, sqrt(1 - x*x)).
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE asin(__CLC_GENTYPE x) {
+    const __CLC_GENTYPE one = (__CLC_GENTYPE)__CLC_CONST(1.0);
+    return atan2(x, sqrt(one - (x*x)));
+}
+
+#undef __CLC_CONST
diff --git a/libclc/generic/lib/math/asinh.cl b/libclc/generic/lib/math/asinh.cl
new file mode 100644
index 0000000..cfddb31c
--- /dev/null
+++ b/libclc/generic/lib/math/asinh.cl
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "ep_log.h"
+#include "../clcmacro.h"
+
+// Single-precision inverse hyperbolic sine.
+//
+// The path is chosen from the bit pattern of |x|:
+//   |x| < 2^-12, Inf or NaN : x is returned unchanged
+//   |x| <= 2                : x + x^3 * P(x^2) / Q(x^2)
+//   2 < |x| <= 2^13         : log(|x| + sqrt(x^2 + 1)) with the sign of x
+//   |x| > 2^13              : log(|x|) + ln(2) with the sign of x
+_CLC_OVERLOAD _CLC_DEF float asinh(float x) {
+    // Split x into its sign bit and the bits of |x|.
+    const uint xbits = as_uint(x);
+    const uint absbits = xbits & EXSIGNBIT_SP32;
+    const uint signbit = xbits ^ absbits;
+
+    // Rational approximation in w = x^2, used for |x| <= 2.
+    const float w = x * x;
+    const float num = mad(w,
+                          mad(w,
+                              mad(w,
+                                  mad(w, -1.177198915954942694e-4f, -4.162727710583425360e-2f),
+                                  -5.063201055468483248e-1f),
+                              -1.480204186473758321f),
+                          -1.152965835871758072f);
+    const float den = mad(w,
+                          mad(w,
+                              mad(w,
+                                  mad(w, 6.284381367285534560e-2f, 1.260024978680227945f),
+                                  6.582362487198468066f),
+                              11.99423176003939087f),
+                          6.917795026025976739f);
+    const float ratio = MATH_DIVIDE(num, den);
+    const float small_result = mad(x*w, ratio, x);
+
+    // Logarithmic form, used for |x| > 2.  Above 2^13 (0x46000000) the
+    // sqrt term is dropped and ln(2) is added after the log instead.
+    const float mag = as_float(absbits);
+    const int very_large = absbits > 0x46000000U;
+    float log_arg = MATH_SQRT(mag * mag + 1.0f) + mag;
+    log_arg = very_large ? mag : log_arg;
+    const float log_val = log(log_arg) + (very_large ? 0x1.62e430p-1f : 0.0f);
+    // Re-attach the sign of x to the (non-negative) log result.
+    const float large_result = as_float(signbit | as_uint(log_val));
+
+    // 0x40000000 is 2.0f.
+    float result = absbits <= 0x40000000 ? small_result : large_result;
+
+    // 0x39800000 is 2^-12: below it, and for Inf/NaN, hand x straight back.
+    const int passthrough = absbits < 0x39800000U | absbits >= PINFBITPATT_SP32;
+    result = passthrough ? x : result;
+
+    return result;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, asinh, float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Coefficients for the double-precision asinh rational approximations.
+// Each N<k>i / D<k>i family holds the numerator / denominator polynomial
+// coefficients (index i = power of t = x*x) that asinh(double) selects for
+// one interval of |x|; the interval for each family is noted below.
+
+// Family A: |x| < 0.25
+#define NA0 -0.12845379283524906084997e0
+#define NA1 -0.21060688498409799700819e0
+#define NA2 -0.10188951822578188309186e0
+#define NA3 -0.13891765817243625541799e-1
+#define NA4 -0.10324604871728082428024e-3
+
+#define DA0 0.77072275701149440164511e0
+#define DA1 0.16104665505597338100747e1
+#define DA2 0.11296034614816689554875e1
+#define DA3 0.30079351943799465092429e0
+#define DA4 0.235224464765951442265117e-1
+
+// Family B: 0.25 <= |x| < 0.5
+#define NB0 -0.12186605129448852495563e0
+#define NB1 -0.19777978436593069928318e0
+#define NB2 -0.94379072395062374824320e-1
+#define NB3 -0.12620141363821680162036e-1
+#define NB4 -0.903396794842691998748349e-4
+
+#define DB0 0.73119630776696495279434e0
+#define DB1 0.15157170446881616648338e1
+#define DB2 0.10524909506981282725413e1
+#define DB3 0.27663713103600182193817e0
+#define DB4 0.21263492900663656707646e-1
+
+// Family C: 0.5 <= |x| < 0.75
+#define NC0 -0.81210026327726247622500e-1
+#define NC1 -0.12327355080668808750232e0
+#define NC2 -0.53704925162784720405664e-1
+#define NC3 -0.63106739048128554465450e-2
+#define NC4 -0.35326896180771371053534e-4
+
+#define DC0 0.48726015805581794231182e0
+#define DC1 0.95890837357081041150936e0
+#define DC2 0.62322223426940387752480e0
+#define DC3 0.15028684818508081155141e0
+#define DC4 0.10302171620320141529445e-1
+
+// Family D: 0.75 <= |x| <= 1.0
+#define ND0 -0.4638179204422665073e-1
+#define ND1 -0.7162729496035415183e-1
+#define ND2 -0.3247795155696775148e-1
+#define ND3 -0.4225785421291932164e-2
+#define ND4 -0.3808984717603160127e-4
+#define ND5 0.8023464184964125826e-6
+
+#define DD0 0.2782907534642231184e0
+#define DD1 0.5549945896829343308e0
+#define DD2 0.3700732511330698879e0
+#define DD3 0.9395783438240780722e-1
+#define DD4 0.7200057974217143034e-2
+
+// Family E: 1.0 < |x| < 1.5
+#define NE0 -0.121224194072430701e-4
+#define NE1 -0.273145455834305218e-3
+#define NE2 -0.152866982560895737e-2
+#define NE3 -0.292231744584913045e-2
+#define NE4 -0.174670900236060220e-2
+#define NE5 -0.891754209521081538e-12
+
+#define DE0 0.499426632161317606e-4
+#define DE1 0.139591210395547054e-2
+#define DE2 0.107665231109108629e-1
+#define DE3 0.325809818749873406e-1
+#define DE4 0.415222526655158363e-1
+#define DE5 0.186315628774716763e-1
+
+// Family F: 1.5 <= |x| < 2.0
+#define NF0 -0.195436610112717345e-4
+#define NF1 -0.233315515113382977e-3
+#define NF2 -0.645380957611087587e-3
+#define NF3 -0.478948863920281252e-3
+#define NF4 -0.805234112224091742e-12
+#define NF5 0.246428598194879283e-13
+
+#define DF0 0.822166621698664729e-4
+#define DF1 0.135346265620413852e-2
+#define DF2 0.602739242861830658e-2
+#define DF3 0.972227795510722956e-2
+#define DF4 0.510878800983771167e-2
+
+// Family G: 2.0 <= |x| < 4.0
+#define NG0 -0.209689451648100728e-6
+#define NG1 -0.219252358028695992e-5
+#define NG2 -0.551641756327550939e-5
+#define NG3 -0.382300259826830258e-5
+#define NG4 -0.421182121910667329e-17
+#define NG5 0.492236019998237684e-19
+
+#define DG0 0.889178444424237735e-6
+#define DG1 0.131152171690011152e-4
+#define DG2 0.537955850185616847e-4
+#define DG3 0.814966175170941864e-4
+#define DG4 0.407786943832260752e-4
+
+// Family H: 4.0 <= |x| < 8.0
+#define NH0 -0.178284193496441400e-6
+#define NH1 -0.928734186616614974e-6
+#define NH2 -0.923318925566302615e-6
+#define NH3 -0.776417026702577552e-19
+#define NH4 0.290845644810826014e-21
+
+#define DH0 0.786694697277890964e-6
+#define DH1 0.685435665630965488e-5
+#define DH2 0.153780175436788329e-4
+#define DH3 0.984873520613417917e-5
+
+// Family I: |x| >= 8.0
+#define NI0 -0.538003743384069117e-10
+#define NI1 -0.273698654196756169e-9
+#define NI2 -0.268129826956403568e-9
+#define NI3 -0.804163374628432850e-29
+
+#define DI0 0.238083376363471960e-9
+#define DI1 0.203579344621125934e-8
+#define DI2 0.450836980450693209e-8
+#define DI3 0.286005148753497156e-8
+
+// Double-precision inverse hyperbolic sine.
+//
+// All candidate results are computed unconditionally and the final value is
+// picked with selects at the end of the function:
+//   NaN, +-Inf, or |x| < rteps : x is returned unchanged
+//   |x| <= 1                   : result1 = |x| + |x|^3 * pn/pd
+//   1 < |x| <= 32              : result2, a log term plus (pn/pd + 0.25)/x^2
+//   |x| > 32                   : result3, the log term alone
+// The sign of x is applied to the selected magnitude just before the
+// pass-through check.
+// NOTE(review): the log terms assume __clc_ep_log (ep_log.h) hands back an
+// exponent plus a head/tail pair such that log(y) is approximately
+// xexp*log(2) + r1 + r2 -- confirm against ep_log.h.
+_CLC_OVERLOAD _CLC_DEF double asinh(double x) {
+ // Thresholds for "asinh(x) == x" and for dropping the sqrt term below.
+ const double rteps = 0x1.6a09e667f3bcdp-27;
+ const double recrteps = 0x1.6a09e667f3bcdp+26;
+
+ // log2_lead and log2_tail sum to an extra-precise version of log(2)
+ const double log2_lead = 0x1.62e42ep-1;
+ const double log2_tail = 0x1.efa39ef35793cp-25;
+
+ // Clear the sign bit to obtain |x| both as bits (ax) and as a double (absx).
+ ulong ux = as_ulong(x);
+ ulong ax = ux & ~SIGNBIT_DP64;
+ double absx = as_double(ax);
+
+ // All polynomials below are evaluated in t = x^2.
+ double t = x * x;
+ double pn, tn, pd, td;
+
+ // XXX we are betting here that we can evaluate 8 pairs of
+ // polys faster than we can grab 12 coefficients from a table
+ // This also uses fewer registers
+
+ // Start with the |x| >= 8 pair, then let each narrower interval override
+ // pn/pd; the last select that fires (the smallest matching bound) wins.
+
+ // |x| >= 8
+ pn = fma(t, fma(t, fma(t, NI3, NI2), NI1), NI0);
+ pd = fma(t, fma(t, fma(t, DI3, DI2), DI1), DI0);
+
+ // 4 <= |x| < 8
+ tn = fma(t, fma(t, fma(t, fma(t, NH4, NH3), NH2), NH1), NH0);
+ td = fma(t, fma(t, fma(t, DH3, DH2), DH1), DH0);
+ pn = absx < 8.0 ? tn : pn;
+ pd = absx < 8.0 ? td : pd;
+
+ // 2 <= |x| < 4
+ tn = fma(t, fma(t, fma(t, fma(t, fma(t, NG5, NG4), NG3), NG2), NG1), NG0);
+ td = fma(t, fma(t, fma(t, fma(t, DG4, DG3), DG2), DG1), DG0);
+ pn = absx < 4.0 ? tn : pn;
+ pd = absx < 4.0 ? td : pd;
+
+ // 1.5 <= |x| < 2
+ tn = fma(t, fma(t, fma(t, fma(t, fma(t, NF5, NF4), NF3), NF2), NF1), NF0);
+ td = fma(t, fma(t, fma(t, fma(t, DF4, DF3), DF2), DF1), DF0);
+ pn = absx < 2.0 ? tn : pn;
+ pd = absx < 2.0 ? td : pd;
+
+ // 1 < |x| < 1.5
+ tn = fma(t, fma(t, fma(t, fma(t, fma(t, NE5, NE4), NE3), NE2), NE1), NE0);
+ td = fma(t, fma(t, fma(t, fma(t, fma(t, DE5, DE4), DE3), DE2), DE1), DE0);
+ pn = absx < 1.5 ? tn : pn;
+ pd = absx < 1.5 ? td : pd;
+
+ // 0.75 <= |x| <= 1 (note the inclusive upper bound, unlike the others)
+ tn = fma(t, fma(t, fma(t, fma(t, fma(t, ND5, ND4), ND3), ND2), ND1), ND0);
+ td = fma(t, fma(t, fma(t, fma(t, DD4, DD3), DD2), DD1), DD0);
+ pn = absx <= 1.0 ? tn : pn;
+ pd = absx <= 1.0 ? td : pd;
+
+ // 0.5 <= |x| < 0.75
+ tn = fma(t, fma(t, fma(t, fma(t, NC4, NC3), NC2), NC1), NC0);
+ td = fma(t, fma(t, fma(t, fma(t, DC4, DC3), DC2), DC1), DC0);
+ pn = absx < 0.75 ? tn : pn;
+ pd = absx < 0.75 ? td : pd;
+
+ // 0.25 <= |x| < 0.5
+ tn = fma(t, fma(t, fma(t, fma(t, NB4, NB3), NB2), NB1), NB0);
+ td = fma(t, fma(t, fma(t, fma(t, DB4, DB3), DB2), DB1), DB0);
+ pn = absx < 0.5 ? tn : pn;
+ pd = absx < 0.5 ? td : pd;
+
+ // |x| < 0.25
+ tn = fma(t, fma(t, fma(t, fma(t, NA4, NA3), NA2), NA1), NA0);
+ td = fma(t, fma(t, fma(t, fma(t, DA4, DA3), DA2), DA1), DA0);
+ pn = absx < 0.25 ? tn : pn;
+ pd = absx < 0.25 ? td : pd;
+
+ // Value of the selected rational function.
+ double pq = MATH_DIVIDE(pn, pd);
+
+ // |x| <= 1
+ double result1 = fma(absx*t, pq, absx);
+
+ // Other ranges
+ // xout marks the ranges where the log argument is |x| itself rather than
+ // |x| + sqrt(x^2 + 1); in those ranges one extra log(2) is added below.
+ int xout = absx <= 32.0 | absx > recrteps;
+ double y = absx + sqrt(fma(absx, absx, 1.0));
+ y = xout ? absx : y;
+
+ double r1, r2;
+ int xexp;
+ __clc_ep_log(y, &xexp, &r1, &r2);
+
+ // Fold the exponent (plus one when xout is set) times log(2) into the
+ // head (r1) and tail (r2) parts separately.
+ double dxexp = (double)(xexp + xout);
+ r1 = fma(dxexp, log2_lead, r1);
+ r2 = fma(dxexp, log2_tail, r2);
+
+ // 1 < x <= 32
+ // Add the rational correction v2 to the head/tail log value, carrying the
+ // rounding error of each addition forward into the next one.
+ double v2 = (pq + 0.25) / t;
+ double r = v2 + r1;
+ double s = ((r1 - r) + v2) + r2;
+ double v1 = r + s;
+ v2 = (r - v1) + s;
+ double result2 = v1 + v2;
+
+ // x > 32
+ double result3 = r1 + r2;
+
+ // Pick the magnitude by range, then apply the sign of x.
+ double ret = absx > 1.0 ? result2 : result1;
+ ret = absx > 32.0 ? result3 : ret;
+ ret = x < 0.0 ? -ret : ret;
+
+ // NaN, +-Inf, or x small enough that asinh(x) = x
+ ret = ax >= PINFBITPATT_DP64 | absx < rteps ? x : ret;
+ return ret;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, asinh, double)
+
+#endif
diff --git a/libclc/generic/lib/math/asinpi.cl b/libclc/generic/lib/math/asinpi.cl
new file mode 100644
index 0000000..511d74e
--- /dev/null
+++ b/libclc/generic/lib/math/asinpi.cl
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "../clcmacro.h"
+
+// Single-precision asinpi(x) = asin(x) / pi.
+//
+// asin is odd, so the work is done on |x| and the sign bit is OR-ed back in.
+//   exponent of x below -14 : x / pi
+//   |x| < 0.5               : asin(|x|) = |x| + |x|^3 * R(x^2), with R a
+//                             rational approximation to (asin(x) - x) / x^3
+//   0.5 <= |x| < 1          : asin(|x|) = pi/2 - 2*asin(sqrt((1 - |x|)/2)),
+//                             rebuilt from head/tail pieces to limit
+//                             cancellation error
+//   |x| == 1                : +-0.5
+//   |x| > 1 or NaN          : quiet NaN
+_CLC_OVERLOAD _CLC_DEF float asinpi(float x) {
+    const float pi = 3.1415926535897933e+00f;
+    const float piby2_tail = 7.5497894159e-08F; /* 0x33a22168 */
+    const float hpiby2_head = 7.8539812565e-01F; /* 0x3f490fda */
+
+    // Separate the sign bit from the magnitude bits.
+    const uint xbits = as_uint(x);
+    const uint absbits = xbits & EXSIGNBIT_SP32;
+    const uint signbit = xbits ^ absbits;
+    const float signed_half = as_float(signbit | as_uint(0.5f));
+
+    const int expo = (int)(absbits >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+    const float mag = as_float(absbits);
+
+    // |x| >= 0.5 goes through the half-angle transformation.
+    const int reduce = expo >= -1;
+
+    // Argument of the rational approximation: (1 - |x|)/2 or x^2.
+    const float squared = mag * mag;
+    const float halved = 0.5f * (1.0f - mag);
+    const float arg = reduce ? halved : squared;
+
+    // Rational approximation on [0.0, 0.5]
+    const float num = mad(arg,
+                          mad(arg,
+                              mad(arg, -0.00396137437848476485201154797087F, -0.0133819288943925804214011424456F),
+                              -0.0565298683201845211985026327361F),
+                          0.184161606965100694821398249421F);
+    const float den = mad(arg, -0.836411276854206731913362287293F, 1.10496961524520294485512696706F);
+    const float corr = arg * MATH_DIVIDE(num, den);
+
+    // sqrt(arg) split into a truncated head and a correction term so that
+    // root_hi + root_lo approximates sqrt(arg) more closely than root alone.
+    const float root = MATH_SQRT(arg);
+    const float root_hi = as_float(as_uint(root) & 0xffff0000);
+    const float root_lo = MATH_DIVIDE(mad(-root_hi, root_hi, arg), root + root_hi);
+
+    // Transformed-region reconstruction of asin(|x|).
+    const float p = mad(2.0f*root, corr, -mad(root_lo, -2.0f, piby2_tail));
+    const float q = mad(root_hi, -2.0f, hpiby2_head);
+    const float reduced_asin = hpiby2_head - (p - q);
+
+    // Direct evaluation for |x| < 0.5.
+    const float direct_asin = mad(mag, corr, mag);
+
+    float asin_mag = reduce ? reduced_asin : direct_asin;
+    asin_mag = MATH_DIVIDE(asin_mag, pi);
+    const float tiny_result = MATH_DIVIDE(x, pi);
+
+    // Restore the sign, then apply the special cases in priority order.
+    float result = as_float(signbit | as_uint(asin_mag));
+    result = absbits > 0x3f800000U ? as_float(QNANBITPATT_SP32) : result;
+    result = absbits == 0x3f800000U ? signed_half : result;
+    result = expo < -14 ? tiny_result : result;
+
+    return result;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, asinpi, float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Double-precision asinpi(x) = asin(x) / pi.
+//
+// asin is odd, so the work is done on |x| and the result is negated for
+// negative x.
+//   exponent of x below -28 : |x| / pi
+//   |x| < 0.5               : asin(|x|) = |x| + |x|^3 * R(x^2), with R a
+//                             rational approximation to (asin(x) - x) / x^3
+//   0.5 <= |x| < 1          : asin(|x|) = pi/2 - 2*asin(sqrt((1 - |x|)/2)),
+//                             rebuilt from head/tail pieces to limit
+//                             cancellation error
+//   |x| == 1                : 0.5
+//   |x| > 1, Inf or NaN     : quiet NaN
+_CLC_OVERLOAD _CLC_DEF double asinpi(double x) {
+    const double pi = 0x1.921fb54442d18p+1;
+    const double piby2_tail = 6.1232339957367660e-17; /* 0x3c91a62633145c07 */
+    const double hpiby2_head = 7.8539816339744831e-01; /* 0x3fe921fb54442d18 */
+
+    const double mag = fabs(x);
+    const int negative = as_int2(x).hi < 0;
+    const int expo = (as_int2(mag).hi >> 20) - EXPBIAS_DP64;
+
+    // |x| >= 0.5 goes through the half-angle transformation.
+    const int reduce = expo >= -1;
+
+    // Argument of the rational approximation: (1 - |x|)/2 or x^2.
+    const double halved = 0.5 * (1.0 - mag);
+    const double squared = mag * mag;
+    const double arg = reduce ? halved : squared;
+
+    // Rational approximation on [0.0, 0.5]
+    const double num = fma(arg,
+                           fma(arg,
+                               fma(arg,
+                                   fma(arg,
+                                       fma(arg, 0.0000482901920344786991880522822991,
+                                           0.00109242697235074662306043804220),
+                                       -0.0549989809235685841612020091328),
+                                   0.275558175256937652532686256258),
+                               -0.445017216867635649900123110649),
+                           0.227485835556935010735943483075);
+
+    const double den = fma(arg,
+                           fma(arg,
+                               fma(arg,
+                                   fma(arg, 0.105869422087204370341222318533,
+                                       -0.943639137032492685763471240072),
+                                   2.76568859157270989520376345954),
+                               -3.28431505720958658909889444194),
+                           1.36491501334161032038194214209);
+
+    const double corr = arg * MATH_DIVIDE(num, den);
+
+    // sqrt(arg) split into a truncated head (upper 32 bits) and a correction
+    // term so that root_hi + root_lo approximates sqrt(arg) more closely.
+    const double root = sqrt(arg);
+    const double root_hi = as_double(as_ulong(root) & 0xffffffff00000000UL);
+    const double root_lo = MATH_DIVIDE(fma(-root_hi, root_hi, arg), root + root_hi);
+
+    // Transformed-region reconstruction of asin(|x|).
+    const double p = fma(2.0*root, corr, -fma(-2.0, root_lo, piby2_tail));
+    const double q = fma(-2.0, root_hi, hpiby2_head);
+    const double reduced_asin = hpiby2_head - (p - q);
+
+    // Direct evaluation for |x| < 0.5.
+    const double direct_asin = fma(mag, corr, mag);
+
+    double result = reduce ? reduced_asin : direct_asin;
+
+    // Very small inputs use |x| itself as asin(|x|).
+    result = expo < -28 ? mag : result;
+    result = MATH_DIVIDE(result, pi);
+
+    // Out-of-domain inputs give NaN; exactly +-1 overrides that with 0.5.
+    result = expo >= 0 ? as_double(QNANBITPATT_DP64) : result;
+    result = mag == 1.0 ? 0.5 : result;
+    return negative ? -result : result;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, asinpi, double)
+
+#endif
diff --git a/libclc/generic/lib/math/atan.cl b/libclc/generic/lib/math/atan.cl
new file mode 100644
index 0000000..fa3633c
--- /dev/null
+++ b/libclc/generic/lib/math/atan.cl
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math.h"
+#include "../clcmacro.h"
+
+#include <clc/clc.h>
+
+// Single-precision arctangent.
+//
+// |x| is reduced to a small argument t around one of the anchor points
+// 0, 0.5, 1, 1.5 or infinity, so that atan(|x|) = base + atan(t), where
+// base is atan() of the anchor.  atan(t) is then t minus a rational
+// correction, and the sign bit of x is OR-ed back in.
+//   NaN            : x is returned
+//   |x| < 2^-19    : x is returned
+//   |x| >= 2^26    : +-pi/2
+_CLC_OVERLOAD _CLC_DEF float atan(float x)
+{
+    const float piby2 = 1.5707963267948966f; // pi/2
+
+    // Separate the sign bit from the magnitude bits.
+    const uint xbits = as_uint(x);
+    const uint absbits = xbits & EXSIGNBIT_SP32;
+    const uint signbit = xbits ^ absbits;
+
+    const float signed_piby2 = as_float(signbit | as_uint(piby2));
+    const float mag = as_float(absbits);
+
+    // Start from the widest interval and let each narrower one override.
+
+    // 39/16 <= |x| < 2^26: atan(|x|) = atan(infinity) + atan(-1/|x|)
+    float t = -MATH_RECIP(mag);
+    float base = 1.57079632679489655800f; // atan(infinity)
+
+    // 19/16 <= |x| < 39/16
+    int below = absbits < 0x401c0000;
+    float cand = MATH_DIVIDE(mag - 1.5f, mad(mag, 1.5f, 1.0f));
+    t = below ? cand : t;
+    base = below ? 9.82793723247329054082e-01f : base; // atan(1.5)
+
+    // 11/16 <= |x| < 19/16
+    below = absbits < 0x3f980000U;
+    cand = MATH_DIVIDE(mag - 1.0f, 1.0f + mag);
+    t = below ? cand : t;
+    base = below ? 7.85398163397448278999e-01f : base; // atan(1)
+
+    // 7/16 <= |x| < 11/16
+    below = absbits < 0x3f300000;
+    cand = MATH_DIVIDE(mad(mag, 2.0f, -1.0f), 2.0f + mag);
+    t = below ? cand : t;
+    base = below ? 4.63647609000806093515e-01f : base; // atan(0.5)
+
+    // 2^-19 <= |x| < 7/16: no reduction needed
+    below = absbits < 0x3ee00000;
+    t = below ? mag : t;
+    base = below ? 0.0f : base;
+
+    // Core approximation: Remez(2,2) on [-7/16,7/16]
+    const float t2 = t * t;
+    const float num = mad(t2,
+                          mad(t2, 0.470677934286149214138357545549e-2f, 0.192324546402108583211697690500f),
+                          0.296528598819239217902158651186f);
+
+    const float den = mad(t2,
+                          mad(t2, 0.299309699959659728404442796915f, 0.111072499995399550138837673349e1f),
+                          0.889585796862432286486651434570f);
+
+    const float corr = t * t2 * MATH_DIVIDE(num, den);
+
+    const float approx = base - (corr - t);
+    const float signed_approx = as_float(signbit | as_uint(approx));
+
+    // Final selection; later lines take priority over earlier ones.
+    float result = x;                                             // NaN
+    result = absbits <= PINFBITPATT_SP32 ? signed_piby2 : result; // 2^26 <= |x| <= Inf
+    result = absbits < 0x4c800000 ? signed_approx : result;       // |x| < 2^26
+    result = absbits < 0x36000000 ? as_float(xbits) : result;     // |x| < 2^-19
+    return result;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, atan, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+// Double-precision arctangent.
+//
+// v = |x| is reduced to r = a / b around one of the anchor points 0, 0.5, 1,
+// 1.5 or infinity, so that atan(v) = (chi + clo) + atan(r), where chi + clo is
+// a head/tail representation of atan() of the anchor.  atan(r) is r minus a
+// rational correction.  The magnitude result is never negative, and the sign
+// of x is transferred onto it at the end.
+//   NaN          : NaN is returned
+//   |x| < 2^-26  : x is returned (including -0.0, which keeps its sign)
+//   |x| > 2^56   : +-pi/2
+_CLC_OVERLOAD _CLC_DEF double atan(double x)
+{
+    const double piby2 = 1.5707963267948966e+00; // 0x3ff921fb54442d18
+
+    double v = fabs(x);
+
+    // Start from the widest interval and let each narrower one override.
+
+    // 2^56 > v > 39/16
+    double a = -1.0;
+    double b = v;
+    // (chi + clo) = arctan(infinity)
+    double chi = 1.57079632679489655800e+00;
+    double clo = 6.12323399573676480327e-17;
+
+    double ta = v - 1.5;
+    double tb = 1.0 + 1.5 * v;
+    int l = v <= 0x1.38p+1; // 39/16 > v > 19/16
+    a = l ? ta : a;
+    b = l ? tb : b;
+    // (chi + clo) = arctan(1.5)
+    chi = l ? 9.82793723247329054082e-01 : chi;
+    clo = l ? 1.39033110312309953701e-17 : clo;
+
+    ta = v - 1.0;
+    tb = 1.0 + v;
+    l = v <= 0x1.3p+0; // 19/16 > v > 11/16
+    a = l ? ta : a;
+    b = l ? tb : b;
+    // (chi + clo) = arctan(1.)
+    chi = l ? 7.85398163397448278999e-01 : chi;
+    clo = l ? 3.06161699786838240164e-17 : clo;
+
+    ta = 2.0 * v - 1.0;
+    tb = 2.0 + v;
+    l = v <= 0x1.6p-1; // 11/16 > v > 7/16
+    a = l ? ta : a;
+    b = l ? tb : b;
+    // (chi + clo) = arctan(0.5)
+    chi = l ? 4.63647609000806093515e-01 : chi;
+    clo = l ? 2.26987774529616809294e-17 : clo;
+
+    l = v <= 0x1.cp-2; // v < 7/16: no reduction, r = v
+    a = l ? v : a;
+    b = l ? 1.0 : b;
+    chi = l ? 0.0 : chi;
+    clo = l ? 0.0 : clo;
+
+    // Core approximation: Remez(4,4) on [-7/16,7/16]
+    double r = a / b;
+    double s = r * r;
+    double qn = fma(s,
+                    fma(s,
+                        fma(s,
+                            fma(s, 0.142316903342317766e-3,
+                                0.304455919504853031e-1),
+                            0.220638780716667420e0),
+                        0.447677206805497472e0),
+                    0.268297920532545909e0);
+
+    double qd = fma(s,
+                    fma(s,
+                        fma(s,
+                            fma(s, 0.389525873944742195e-1,
+                                0.424602594203847109e0),
+                            0.141254259931958921e1),
+                        0.182596787737507063e1),
+                    0.804893761597637733e0);
+
+    double q = r * s * qn / qd;
+    // atan(v) = chi + clo + r - q, ordered so the small terms combine first.
+    r = chi - ((q - clo) - r);
+
+    // Magnitude selection; later lines take priority over earlier ones.
+    double z = isnan(x) ? x : piby2;
+    z = v <= 0x1.0p+56 ? r : z;
+    z = v < 0x1.0p-26 ? v : z;
+
+    // Transfer the sign of x.  copysign is used instead of comparing x with
+    // v because -0.0 == +0.0 compares equal, which made atan(-0.0) return
+    // +0.0 rather than the required -0.0.
+    return copysign(z, x);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atan, double);
+
+#endif // cl_khr_fp64
diff --git a/libclc/generic/lib/math/atan2.cl b/libclc/generic/lib/math/atan2.cl
new file mode 100644
index 0000000..a2f104f
--- /dev/null
+++ b/libclc/generic/lib/math/atan2.cl
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "tables.h"
+#include "../clcmacro.h"
+
+// Single-precision atan2(y, x).
+//
+// The ratio min(|x|,|y|) / max(|x|,|y|) lies in [0, 1], so its arctangent is
+// an angle in the first octant, evaluated as ratio + P/Q with a rational
+// correction.  That angle is then reflected into the right octant from the
+// relative size of |y| and |x| and the sign of x, special cases are patched
+// in, and finally the sign of y is copied onto the result.
+_CLC_OVERLOAD _CLC_DEF float atan2(float y, float x)
+{
+    const float pi = 0x1.921fb6p+1f;
+    const float piby2 = 0x1.921fb6p+0f;
+    const float piby4 = 0x1.921fb6p-1f;
+    const float threepiby4 = 0x1.2d97c8p+1f;
+
+    const float absx = fabs(x);
+    const float absy = fabs(y);
+    const float smaller = min(absx, absy);
+    const float larger = max(absx, absy);
+
+    // Pre-scale a very large denominator, as a "regular" divide would.
+    const float scale = larger > 0x1.0p+96f ? 0x1.0p-32f : 1.0f;
+    const float ratio = scale * MATH_DIVIDE(smaller, scale*larger);
+
+    const float ratio2 = ratio * ratio;
+
+#define USE_2_2_APPROXIMATION
+#if defined USE_2_2_APPROXIMATION
+    const float num = mad(ratio2, mad(ratio2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * ratio2 * ratio;
+    const float den = mad(ratio2, mad(ratio2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f);
+#else
+    const float num = mad(ratio2, mad(ratio2, -0x1.55cd22p-5f, -0x1.26cf76p-2f), -0x1.55554ep-2f) * ratio2 * ratio;
+    const float den = mad(ratio2, mad(ratio2, mad(ratio2, 0x1.9f1304p-5f, 0x1.2656fap-1f), 0x1.76b4b8p+0f), 1.0f);
+#endif
+
+    // First-octant angle.
+    float angle = mad(num, MATH_RECIP(den), ratio);
+
+    // Reflect into the other octants of the upper half plane.
+    angle = absy > absx ? piby2 - angle : angle;
+    angle = x < 0.0F ? pi - angle : angle;
+
+    // y == 0: 0 when the sign bit of x is clear, pi when it is set.
+    const float on_axis = as_int(x) < 0 ? pi : 0.0f;
+    angle = y == 0.0f ? on_axis : angle;
+
+    // The non-finite cases below are always handled (no finite-only mode).
+
+    // Both x and y infinite: pi/4 or 3*pi/4 depending on the sign of x.
+    const float diagonal = x > 0.0f ? piby4 : threepiby4;
+    angle = absx == INFINITY & absy == INFINITY ? diagonal : angle;
+
+    // Either argument NaN: quiet NaN.
+    angle = isnan(x) | isnan(y) ? as_float(QNANBITPATT_SP32) : angle;
+
+    // The sign of the result follows the sign of y.
+    return copysign(angle, y);
+}
+
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, atan2, float, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Double-precision atan2(y, x).
+//
+// Outline:
+//   1. Record signs and biased exponents of x and y; when both biased
+//      exponents are below 1021, work on copies scaled up by 2^1024.
+//   2. With u = max(|x|,|y|) and v = min(|x|,|y|), compute atan(v/u) as a
+//      head/tail pair (q1, q2) by one of three methods chosen on v/u:
+//      table lookup plus short series (v/u > 0.0625), v/u itself (v/u below
+//      0x1.d12ed0af1a27fp-27), or a longer polynomial (in between).
+//   3. Reflect the angle according to swap_vu and the signs of x and y.
+//   4. Override with special-case results: both infinite, very large exponent
+//      differences, zero x, zero y, and NaN inputs.
+// All candidate values are computed unconditionally and chosen with selects.
+_CLC_OVERLOAD _CLC_DEF double atan2(double y, double x)
+{
+ const double pi = 3.1415926535897932e+00; /* 0x400921fb54442d18 */
+ const double piby2 = 1.5707963267948966e+00; /* 0x3ff921fb54442d18 */
+ const double piby4 = 7.8539816339744831e-01; /* 0x3fe921fb54442d18 */
+ const double three_piby4 = 2.3561944901923449e+00; /* 0x4002d97c7f3321d2 */
+ // head/tail pairs used when reflecting the head/tail angle (q1, q2)
+ const double pi_head = 3.1415926218032836e+00; /* 0x400921fb50000000 */
+ const double pi_tail = 3.1786509547056392e-08; /* 0x3e6110b4611a6263 */
+ const double piby2_head = 1.5707963267948965e+00; /* 0x3ff921fb54442d18 */
+ const double piby2_tail = 6.1232339957367660e-17; /* 0x3c91a62633145c07 */
+
+ // x2 / y2 keep the caller's original arguments; x and y may be rescaled.
+ double x2 = x;
+ int xneg = as_int2(x).hi < 0;
+ int xexp = (as_int2(x).hi >> 20) & 0x7ff;
+
+ double y2 = y;
+ int yneg = as_int2(y).hi < 0;
+ int yexp = (as_int2(y).hi >> 20) & 0x7ff;
+
+ // cond2: both biased exponents are below 1021 (both magnitudes below 1/4)
+ int cond2 = (xexp < 1021) & (yexp < 1021);
+ int diffexp = yexp - xexp;
+
+ // Scale up both x and y if they are both below 1/4
+ double x1 = ldexp(x, 1024);
+ int xexp1 = (as_int2(x1).hi >> 20) & 0x7ff;
+ double y1 = ldexp(y, 1024);
+ int yexp1 = (as_int2(y1).hi >> 20) & 0x7ff;
+ int diffexp1 = yexp1 - xexp1;
+
+ // In the scaled case the exponent difference is taken from the scaled values.
+ diffexp = cond2 ? diffexp1 : diffexp;
+ x = cond2 ? x1 : x;
+ y = cond2 ? y1 : y;
+
+ // General case: take absolute values of arguments
+ double u = fabs(x);
+ double v = fabs(y);
+
+ // Swap u and v if necessary to obtain 0 < v < u. Compute v/u.
+ int swap_vu = u < v;
+ double uu = u;
+ u = swap_vu ? v : u;
+ v = swap_vu ? uu : v;
+
+ double vbyu = v / u;
+ double q1, q2;
+
+ // General values of v/u. Use a look-up table and series expansion.
+
+ {
+ // 0.063 is substituted when v/u <= 0.0625 so the table index below stays
+ // non-negative; this path's result is discarded in that case.
+ double val = vbyu > 0.0625 ? vbyu : 0.063;
+ int index = convert_int(fma(256.0, val, 0.5));
+ // The table entry supplies the head (s0) and tail (s1) of the starting value.
+ double2 tv = USE_TABLE(atan_jby256_tbl, index - 16);
+ q1 = tv.s0;
+ q2 = tv.s1;
+ // c = index/256, the grid point nearest to v/u
+ double c = (double)index * 0x1.0p-8;
+
+ // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1
+ // u_exponent could be EMAX so we have to do it in 2 steps
+ int m = -((int)(as_ulong(u) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
+ //double um = __amdil_ldexp_f64(u, m);
+ //double vm = __amdil_ldexp_f64(v, m);
+ double um = ldexp(u, m);
+ double vm = ldexp(v, m);
+
+ // 26 leading bits of u
+ double u1 = as_double(as_ulong(um) & 0xfffffffff8000000UL);
+ double u2 = um - u1;
+
+ // r = (vm - c*um) / (um + c*vm), with c*um applied as c*u1 then c*u2
+ double r = MATH_DIVIDE(fma(-c, u2, fma(-c, u1, vm)), fma(c, vm, um));
+
+ // Polynomial approximation to atan(r)
+ double s = r * r;
+ q2 = q2 + fma((s * fma(-s, 0.19999918038989143496, 0.33333333333224095522)), -r, r);
+ }
+
+
+ // Tiny v/u: the result is v/u itself (head 0, tail v/u).
+ double q3, q4;
+ {
+ q3 = 0.0;
+ q4 = vbyu;
+ }
+
+ // Small v/u: polynomial in s = (v/u)^2 plus a correction for the rounding
+ // error of the quotient v/u, computed from 32-bit-truncated heads of u and v/u.
+ double q5, q6;
+ {
+ double u1 = as_double(as_ulong(u) & 0xffffffff00000000UL);
+ double u2 = u - u1;
+ double vu1 = as_double(as_ulong(vbyu) & 0xffffffff00000000UL);
+ double vu2 = vbyu - vu1;
+
+ q5 = 0.0;
+ double s = vbyu * vbyu;
+ q6 = vbyu + fma(-vbyu * s,
+ fma(-s,
+ fma(-s,
+ fma(-s,
+ fma(-s, 0.90029810285449784439E-01,
+ 0.11110736283514525407),
+ 0.14285713561807169030),
+ 0.19999999999393223405),
+ 0.33333333333333170500),
+ MATH_DIVIDE(fma(-u, vu2, fma(-u2, vu1, fma(-u1, vu1, v))), u));
+ }
+
+
+ // Choose between the tiny and small paths ...
+ q3 = vbyu < 0x1.d12ed0af1a27fp-27 ? q3 : q5;
+ q4 = vbyu < 0x1.d12ed0af1a27fp-27 ? q4 : q6;
+
+ // ... and between that and the table path.
+ q1 = vbyu > 0.0625 ? q1 : q3;
+ q2 = vbyu > 0.0625 ? q2 : q4;
+
+ // Tidy-up according to which quadrant the arguments lie in
+ double res1, res2, res3, res4;
+ // |y| > |x|: angle becomes pi/2 - angle (head and tail handled separately)
+ q1 = swap_vu ? piby2_head - q1 : q1;
+ q2 = swap_vu ? piby2_tail - q2 : q2;
+ // x negative: angle becomes pi - angle
+ q1 = xneg ? pi_head - q1 : q1;
+ q2 = xneg ? pi_tail - q2 : q2;
+ q1 = q1 + q2;
+ // res4: general-case result carrying the sign of y
+ res4 = yneg ? -q1 : q1;
+
+ // Both arguments infinite: +-pi/4 or +-3*pi/4
+ res1 = yneg ? -three_piby4 : three_piby4;
+ res2 = yneg ? -piby4 : piby4;
+ res3 = xneg ? res1 : res2;
+
+ res3 = isinf(x2) & isinf(y2) ? res3 : res4;
+ // res1 now holds +-pi with the sign of y
+ res1 = yneg ? -pi : pi;
+
+ // abs(x)/abs(y) > 2^56 and x < 0
+ res3 = (diffexp < -56 && xneg) ? res1 : res3;
+
+ res4 = MATH_DIVIDE(y, x);
+ // x positive and dominant over y by a factor of 2^28
+ res3 = diffexp < -28 & xneg == 0 ? res4 : res3;
+
+ // abs(y)/abs(x) > 2^56
+ res4 = yneg ? -piby2 : piby2; // atan(y/x) is insignificant compared to piby2
+ res3 = diffexp > 56 ? res4 : res3;
+
+ res3 = x2 == 0.0 ? res4 : res3; // Zero x gives +- pi/2 depending on sign of y
+ res4 = xneg ? res1 : y2;
+
+ res3 = y2 == 0.0 ? res4 : res3; // Zero y gives +-0 for positive x and +-pi for negative x
+ // NaN inputs take priority over everything above; a NaN x wins over a NaN y.
+ res3 = isnan(y2) ? y2 : res3;
+ res3 = isnan(x2) ? x2 : res3;
+
+ return res3;
+}
+
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atan2, double, double);
+
+#endif
diff --git a/libclc/generic/lib/math/atan2pi.cl b/libclc/generic/lib/math/atan2pi.cl
new file mode 100644
index 0000000..a15b14f
--- /dev/null
+++ b/libclc/generic/lib/math/atan2pi.cl
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "tables.h"
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float atan2pi(float y, float x) { // Returns atan2(y, x) / pi; the sign is copied from y at the end
+ const float pi = 0x1.921fb6p+1f;
+
+ float ax = fabs(x);
+ float ay = fabs(y);
+ float v = min(ax, ay);
+ float u = max(ax, ay);
+
+ // Scale since u could be large, as in "regular" divide
+ float s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f;
+ float vbyu = s * MATH_DIVIDE(v, s*u); // smaller magnitude over larger magnitude
+
+ float vbyu2 = vbyu * vbyu;
+
+ float p = mad(vbyu2, mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * vbyu2 * vbyu;
+ float q = mad(vbyu2, mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f);
+
+ // Octant 0 result: (vbyu + p/q) / pi, with p/q a rational correction to vbyu
+ float a = MATH_DIVIDE(mad(p, MATH_RECIP(q), vbyu), pi);
+
+ // Fix up 3 other octants
+ float at = 0.5f - a;
+ a = ay > ax ? at : a;
+ at = 1.0f - a;
+ a = x < 0.0F ? at : a;
+
+ // y == 0 => 0 when the sign bit of x is clear, 1 (result is in units of pi) when it is set
+ at = as_int(x) < 0 ? 1.0f : 0.0f;
+ a = y == 0.0f ? at : a;
+
+ // if (!FINITE_ONLY()) {
+ // x and y are +- Inf => 0.25 for positive x, 0.75 for negative x
+ at = x > 0.0f ? 0.25f : 0.75f;
+ a = ax == INFINITY & ay == INFINITY ? at : a;
+
+ // x or y is NaN
+ a = isnan(x) | isnan(y) ? as_float(QNANBITPATT_SP32) : a;
+ // }
+
+ // Fixup sign and return
+ return copysign(a, y);
+}
+
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, atan2pi, float, float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double atan2pi(double y, double x) { // Returns atan2(y, x) / pi
+ const double pi = 3.1415926535897932e+00; /* 0x400921fb54442d18 */
+ const double pi_head = 3.1415926218032836e+00; /* 0x400921fb50000000 */
+ const double pi_tail = 3.1786509547056392e-08; /* 0x3e6110b4611a6263 */
+ const double piby2_head = 1.5707963267948965e+00; /* 0x3ff921fb54442d18 */
+ const double piby2_tail = 6.1232339957367660e-17; /* 0x3c91a62633145c07 */
+
+ double x2 = x; // unscaled copy of x, used by the special-case tests at the end
+ int xneg = as_int2(x).hi < 0; // sign bit of x
+ int xexp = (as_int2(x).hi >> 20) & 0x7ff; // biased exponent field of x
+
+ double y2 = y; // unscaled copy of y, used by the special-case tests at the end
+ int yneg = as_int2(y).hi < 0; // sign bit of y
+ int yexp = (as_int2(y).hi >> 20) & 0x7ff; // biased exponent field of y
+
+ int cond2 = (xexp < 1021) & (yexp < 1021);
+ int diffexp = yexp - xexp;
+
+ // Scale up both x and y if they are both below 1/4
+ double x1 = ldexp(x, 1024);
+ int xexp1 = (as_int2(x1).hi >> 20) & 0x7ff;
+ double y1 = ldexp(y, 1024);
+ int yexp1 = (as_int2(y1).hi >> 20) & 0x7ff;
+ int diffexp1 = yexp1 - xexp1;
+
+ diffexp = cond2 ? diffexp1 : diffexp;
+ x = cond2 ? x1 : x;
+ y = cond2 ? y1 : y;
+
+ // General case: take absolute values of arguments
+ double u = fabs(x);
+ double v = fabs(y);
+
+ // Swap u and v if necessary to obtain 0 < v < u. Compute v/u.
+ int swap_vu = u < v;
+ double uu = u;
+ u = swap_vu ? v : u;
+ v = swap_vu ? uu : v;
+
+ double vbyu = v / u;
+ double q1, q2; // result is carried as the pair q1 + q2 until the final division by pi
+
+ // General values of v/u. Use a look-up table and series expansion.
+
+ {
+ double val = vbyu > 0.0625 ? vbyu : 0.063; // keeps the table index in range when this path is not the one selected
+ int index = convert_int(fma(256.0, val, 0.5));
+ double2 tv = USE_TABLE(atan_jby256_tbl, (index - 16));
+ q1 = tv.s0;
+ q2 = tv.s1;
+ double c = (double)index * 0x1.0p-8;
+
+ // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1
+ // u_exponent could be EMAX so we have to do it in 2 steps
+ int m = -((int)(as_ulong(u) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
+ double um = ldexp(u, m);
+ double vm = ldexp(v, m);
+
+ // 26 leading bits of u
+ double u1 = as_double(as_ulong(um) & 0xfffffffff8000000UL);
+ double u2 = um - u1;
+
+ double r = MATH_DIVIDE(fma(-c, u2, fma(-c, u1, vm)), fma(c, vm, um)); // (vm - c*um) / (um + c*vm)
+
+ // Polynomial approximation to atan(r)
+ double s = r * r;
+ q2 = q2 + fma((s * fma(-s, 0.19999918038989143496, 0.33333333333224095522)), -r, r);
+ }
+
+
+ double q3, q4; // tiny v/u: the result is v/u itself
+ {
+ q3 = 0.0;
+ q4 = vbyu;
+ }
+
+ double q5, q6; // small v/u (up to 0.0625): series in v/u plus a correction for the rounding of v/u
+ {
+ double u1 = as_double(as_ulong(u) & 0xffffffff00000000UL);
+ double u2 = u - u1;
+ double vu1 = as_double(as_ulong(vbyu) & 0xffffffff00000000UL);
+ double vu2 = vbyu - vu1;
+
+ q5 = 0.0;
+ double s = vbyu * vbyu;
+ q6 = vbyu + fma(-vbyu * s,
+ fma(-s,
+ fma(-s,
+ fma(-s,
+ fma(-s, 0.90029810285449784439E-01,
+ 0.11110736283514525407),
+ 0.14285713561807169030),
+ 0.19999999999393223405),
+ 0.33333333333333170500),
+ MATH_DIVIDE(fma(-u, vu2, fma(-u2, vu1, fma(-u1, vu1, v))), u));
+ }
+
+
+ q3 = vbyu < 0x1.d12ed0af1a27fp-27 ? q3 : q5;
+ q4 = vbyu < 0x1.d12ed0af1a27fp-27 ? q4 : q6;
+
+ q1 = vbyu > 0.0625 ? q1 : q3;
+ q2 = vbyu > 0.0625 ? q2 : q4;
+
+ // Tidy-up according to which quadrant the arguments lie in
+ double res1, res2, res3, res4;
+ q1 = swap_vu ? piby2_head - q1 : q1;
+ q2 = swap_vu ? piby2_tail - q2 : q2;
+ q1 = xneg ? pi_head - q1 : q1;
+ q2 = xneg ? pi_tail - q2 : q2;
+ q1 = MATH_DIVIDE(q1 + q2, pi); // convert from radians to units of pi
+ res4 = yneg ? -q1 : q1;
+
+ res1 = yneg ? -0.75 : 0.75;
+ res2 = yneg ? -0.25 : 0.25;
+ res3 = xneg ? res1 : res2;
+
+ res3 = isinf(y2) & isinf(x2) ? res3 : res4; // both infinite: +-0.25 or +-0.75
+ res1 = yneg ? -1.0 : 1.0;
+
+ // abs(x)/abs(y) > 2^56 and x < 0
+ res3 = (diffexp < -56 && xneg) ? res1 : res3;
+
+ res4 = MATH_DIVIDE(MATH_DIVIDE(y, x), pi);
+ // x positive and dominant over y by a factor of 2^28
+ res3 = diffexp < -28 & xneg == 0 ? res4 : res3;
+
+ // abs(y)/abs(x) > 2^56
+ res4 = yneg ? -0.5 : 0.5; // the result is +-0.5 (pi/2 over pi); the remaining term is insignificant
+ res3 = diffexp > 56 ? res4 : res3;
+
+ res3 = x2 == 0.0 ? res4 : res3; // Zero x gives +-0.5 (pi/2 over pi) depending on sign of y
+ res4 = xneg ? res1 : y2;
+
+ res3 = y2 == 0.0 ? res4 : res3; // Zero y gives +-0 for positive x and +-1 (pi over pi) for negative x
+ res3 = isnan(y2) ? y2 : res3;
+ res3 = isnan(x2) ? x2 : res3;
+
+ return res3;
+}
+
+
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atan2pi, double, double)
+
+#endif
diff --git a/libclc/generic/lib/math/atanh.cl b/libclc/generic/lib/math/atanh.cl
new file mode 100644
index 0000000..4af2f45
--- /dev/null
+++ b/libclc/generic/lib/math/atanh.cl
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float atanh(float x) { // Inverse hyperbolic tangent; each range below overrides z in turn
+ uint ux = as_uint(x);
+ uint ax = ux & EXSIGNBIT_SP32; // bits of x kept by EXSIGNBIT_SP32 (the magnitude, going by the name)
+ uint xs = ux ^ ax; // the remaining bits of x (the sign, going by the name)
+
+ // |x| > 1 or NaN
+ float z = as_float(QNANBITPATT_SP32);
+
+ // |x| == 1
+ float t = as_float(xs | PINFBITPATT_SP32);
+ z = ax == 0x3f800000U ? t : z;
+
+ // 1/2 <= |x| < 1
+ t = as_float(ax);
+ t = MATH_DIVIDE(2.0f*t, 1.0f - t);
+ t = 0.5f * log1p(t); // 0.5 * log1p(2|x| / (1 - |x|))
+ t = as_float(xs | as_uint(t)); // reattach the sign bits of x
+ z = ax < 0x3f800000U ? t : z;
+
+ // |x| < 1/2
+ t = x * x;
+ float a = mad(mad(0.92834212715e-2f, t, -0.28120347286e0f), t, 0.39453629046e0f);
+ float b = mad(mad(0.45281890445e0f, t, -0.15537744551e1f), t, 0.11836088638e1f);
+ float p = MATH_DIVIDE(a, b);
+ t = mad(x*t, p, x); // x + x^3 * a/b
+ z = ax < 0x3f000000 ? t : z;
+
+ // |x| < 2^-13
+ z = ax < 0x39000000U ? x : z;
+
+ return z;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, atanh, float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double atanh(double x) { // Inverse hyperbolic tangent for double
+ double absx = fabs(x);
+
+ double ret = absx == 1.0 ? as_double(PINFBITPATT_DP64) : as_double(QNANBITPATT_DP64); // default for |x| == 1, |x| > 1 and NaN
+
+ // |x| >= 0.5
+ // Note that atanh(x) = 0.5 * ln((1+x)/(1-x))
+ // For greater accuracy we use
+ // ln((1+x)/(1-x)) = ln(1 + 2x/(1-x)) = log1p(2x/(1-x)).
+ double r = 0.5 * log1p(2.0 * absx / (1.0 - absx));
+ ret = absx < 1.0 ? r : ret;
+
+ r = -ret;
+ ret = x < 0.0 ? r : ret; // the value so far was built from |x|; negate it for negative x
+
+ // Arguments up to 0.5 in magnitude are
+ // approximated by a [5,5] minimax polynomial
+ double t = x * x;
+
+ double pn = fma(t,
+ fma(t,
+ fma(t,
+ fma(t,
+ fma(t, -0.10468158892753136958e-3, 0.28728638600548514553e-1),
+ -0.28180210961780814148e0),
+ 0.88468142536501647470e0),
+ -0.11028356797846341457e1),
+ 0.47482573589747356373e0);
+
+ double pd = fma(t,
+ fma(t,
+ fma(t,
+ fma(t,
+ fma(t, -0.35861554370169537512e-1, 0.49561196555503101989e0),
+ -0.22608883748988489342e1),
+ 0.45414700626084508355e1),
+ -0.41631933639693546274e1),
+ 0.14244772076924206909e1);
+
+ r = fma(x*t, pn/pd, x); // x + x^3 * pn/pd; uses signed x, so no separate sign fix-up is needed
+ ret = absx < 0.5 ? r : ret;
+
+ return ret;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atanh, double)
+
+#endif
diff --git a/libclc/generic/lib/math/atanpi.cl b/libclc/generic/lib/math/atanpi.cl
new file mode 100644
index 0000000..2e2f032
--- /dev/null
+++ b/libclc/generic/lib/math/atanpi.cl
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float atanpi(float x) { // Returns atan(x) / pi
+ const float pi = 3.1415926535897932f;
+
+ uint ux = as_uint(x);
+ uint aux = ux & EXSIGNBIT_SP32; // bits of x kept by EXSIGNBIT_SP32 (the magnitude, going by the name)
+ uint sx = ux ^ aux; // the remaining bits of x (the sign, going by the name)
+
+ float xbypi = MATH_DIVIDE(x, pi);
+ float shalf = as_float(sx | as_uint(0.5f)); // 0.5 carrying the sign bits of x
+
+ float v = as_float(aux);
+
+ // Return for NaN
+ float ret = x;
+
+ // 2^26 <= |x| <= Inf => atan(x) is close to piby2, so the result is +-0.5
+ ret = aux <= PINFBITPATT_SP32 ? shalf : ret;
+
+ // Reduce arguments 2^-19 <= |x| < 2^26
+
+ // 39/16 <= x < 2^26
+ x = -MATH_RECIP(v); // from here on x holds the reduced argument, not the input
+ float c = 1.57079632679489655800f; // atan(infinity)
+
+ // 19/16 <= x < 39/16
+ int l = aux < 0x401c0000;
+ float xx = MATH_DIVIDE(v - 1.5f, mad(v, 1.5f, 1.0f));
+ x = l ? xx : x;
+ c = l ? 9.82793723247329054082e-01f : c; // atan(1.5)
+
+ // 11/16 <= x < 19/16
+ l = aux < 0x3f980000U;
+ xx = MATH_DIVIDE(v - 1.0f, 1.0f + v);
+ x = l ? xx : x;
+ c = l ? 7.85398163397448278999e-01f : c; // atan(1)
+
+ // 7/16 <= x < 11/16
+ l = aux < 0x3f300000;
+ xx = MATH_DIVIDE(mad(v, 2.0f, -1.0f), 2.0f + v);
+ x = l ? xx : x;
+ c = l ? 4.63647609000806093515e-01f : c; // atan(0.5)
+
+ // 2^-19 <= x < 7/16
+ l = aux < 0x3ee00000;
+ x = l ? v : x;
+ c = l ? 0.0f : c;
+
+ // Core approximation: Remez(2,2) on [-7/16,7/16]
+
+ float s = x * x;
+ float a = mad(s,
+ mad(s, 0.470677934286149214138357545549e-2f, 0.192324546402108583211697690500f),
+ 0.296528598819239217902158651186f);
+
+ float b = mad(s,
+ mad(s, 0.299309699959659728404442796915f, 0.111072499995399550138837673349e1f),
+ 0.889585796862432286486651434570f);
+
+ float q = x * s * MATH_DIVIDE(a, b);
+
+ float z = c - (q - x); // c + (x - q): offset constant plus approximation at the reduced argument
+ z = MATH_DIVIDE(z, pi);
+ float zs = as_float(sx | as_uint(z)); // reattach the sign bits of the input
+
+ ret = aux < 0x4c800000 ? zs : ret;
+
+ // |x| < 2^-19
+ ret = aux < 0x36000000 ? xbypi : ret;
+ return ret;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, atanpi, float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double atanpi(double x) { // Returns atan(x) / pi
+ const double pi = 0x1.921fb54442d18p+1;
+
+ double v = fabs(x);
+
+ // 2^56 > v > 39/16
+ double a = -1.0; // reduced argument is a / b; the defaults give -1 / v
+ double b = v;
+ // (chi + clo) = arctan(infinity)
+ double chi = 1.57079632679489655800e+00;
+ double clo = 6.12323399573676480327e-17;
+
+ double ta = v - 1.5;
+ double tb = 1.0 + 1.5 * v;
+ int l = v <= 0x1.38p+1; // 39/16 > v > 19/16
+ a = l ? ta : a;
+ b = l ? tb : b;
+ // (chi + clo) = arctan(1.5)
+ chi = l ? 9.82793723247329054082e-01 : chi;
+ clo = l ? 1.39033110312309953701e-17 : clo;
+
+ ta = v - 1.0;
+ tb = 1.0 + v;
+ l = v <= 0x1.3p+0; // 19/16 > v > 11/16
+ a = l ? ta : a;
+ b = l ? tb : b;
+ // (chi + clo) = arctan(1.)
+ chi = l ? 7.85398163397448278999e-01 : chi;
+ clo = l ? 3.06161699786838240164e-17 : clo;
+
+ ta = 2.0 * v - 1.0;
+ tb = 2.0 + v;
+ l = v <= 0x1.6p-1; // 11/16 > v > 7/16
+ a = l ? ta : a;
+ b = l ? tb : b;
+ // (chi + clo) = arctan(0.5)
+ chi = l ? 4.63647609000806093515e-01 : chi;
+ clo = l ? 2.26987774529616809294e-17 : clo;
+
+ l = v <= 0x1.cp-2; // v < 7/16
+ a = l ? v : a;
+ b = l ? 1.0 : b;
+ chi = l ? 0.0 : chi;
+ clo = l ? 0.0 : clo;
+
+ // Core approximation: Remez(4,4) on [-7/16,7/16]
+ double r = a / b;
+ double s = r * r;
+ double qn = fma(s,
+ fma(s,
+ fma(s,
+ fma(s, 0.142316903342317766e-3,
+ 0.304455919504853031e-1),
+ 0.220638780716667420e0),
+ 0.447677206805497472e0),
+ 0.268297920532545909e0);
+
+ double qd = fma(s,
+ fma(s,
+ fma(s,
+ fma(s, 0.389525873944742195e-1,
+ 0.424602594203847109e0),
+ 0.141254259931958921e1),
+ 0.182596787737507063e1),
+ 0.804893761597637733e0);
+
+ double q = r * s * qn / qd;
+ r = (chi - ((q - clo) - r)) / pi; // (chi + clo + r - q) / pi
+ double vp = v / pi;
+
+ double z = isnan(x) ? x : 0.5; // NaN passes through; 0.5 covers v above 2^56, including Inf
+ z = v <= 0x1.0p+56 ? r : z;
+ z = v < 0x1.0p-26 ? vp : z; // tiny v: the result is v / pi
+ return x == v ? z : -z; // z was computed from |x|; negate it for negative x
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atanpi, double)
+
+#endif
diff --git a/libclc/generic/lib/math/binary_impl.inc b/libclc/generic/lib/math/binary_impl.inc
new file mode 100644
index 0000000..c9bf972
--- /dev/null
+++ b/libclc/generic/lib/math/binary_impl.inc
@@ -0,0 +1,22 @@
+
+#ifndef __CLC_SCALAR // the same-type overload is emitted only when __CLC_SCALAR is not defined
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, __CLC_GENTYPE y) { // forwards both arguments unchanged to FUNCTION_IMPL
+ return FUNCTION_IMPL(x, y);
+}
+
+#endif
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, float y) { // float scalar second argument
+ __CLC_GENTYPE vec_y = (__CLC_GENTYPE) (y); // convert the scalar to the generic type before forwarding
+ return FUNCTION_IMPL(x, vec_y);
+}
+
+#ifdef cl_khr_fp64
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, double y) { // double scalar second argument
+ __CLC_GENTYPE vec_y = (__CLC_GENTYPE) (y); // convert the scalar to the generic type before forwarding
+ return FUNCTION_IMPL(x, vec_y);
+}
+
+#endif
diff --git a/libclc/generic/lib/math/clc_ldexp.cl b/libclc/generic/lib/math/clc_ldexp.cl
new file mode 100644
index 0000000..61e34a5
--- /dev/null
+++ b/libclc/generic/lib/math/clc_ldexp.cl
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+#include "config.h"
+#include "../clcmacro.h"
+#include "math.h"
+
+_CLC_DEF _CLC_OVERLOAD float __clc_ldexp(float x, int n) { // Computes x * 2^n by editing the exponent field of x
+
+ if (!__clc_fp32_subnormals_supported()) {
+
+ // This treats subnormals as zeros
+ int i = as_int(x);
+ int e = (i >> 23) & 0xff; // biased exponent
+ int m = i & 0x007fffff; // mantissa bits
+ int s = i & 0x80000000; // sign bit
+ int v = add_sat(e, n); // saturating, so a huge |n| cannot wrap around
+ v = clamp(v, 0, 0xff);
+ int mr = e == 0 | v == 0 | v == 0xff ? 0 : m; // zero mantissa for zero/subnormal input, underflow and overflow
+ int c = e == 0xff; // input is Inf or NaN
+ mr = c ? m : mr; // Inf/NaN keep their mantissa ...
+ int er = c ? e : v; // ... and their exponent
+ er = e ? er : e; // zero/subnormal input keeps exponent 0
+ return as_float( s | (er << 23) | mr );
+ }
+
+ /* supports denormal values */
+ const int multiplier = 24;
+ float val_f;
+ uint val_ui;
+ uint sign;
+ int exponent;
+ val_ui = as_uint(x);
+ sign = val_ui & 0x80000000;
+ val_ui = val_ui & 0x7fffffff;/* remove the sign bit */
+ int val_x = val_ui;
+
+ exponent = val_ui >> 23; /* get the exponent */
+ int dexp = exponent; // original biased exponent; 0 selects the denormal-input result below
+
+ /* denormal support */
+ int fbh = 127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0f)) >> 23);
+ int dexponent = 25 - fbh;
+ uint dval_ui = (( (val_ui << fbh) & 0x007fffff) | (dexponent << 23));
+ int ex = dexponent + n - multiplier;
+ dexponent = ex;
+ uint val = sign | (ex << 23) | (dval_ui & 0x007fffff);
+ int ex1 = dexponent + multiplier;
+ ex1 = -ex1 +25;
+ dval_ui = (((dval_ui & 0x007fffff )| 0x800000) >> ex1);
+ dval_ui = dexponent > 0 ? val :dval_ui;
+ dval_ui = dexponent > 254 ? 0x7f800000 :dval_ui; /*overflow*/
+ dval_ui = dexponent < -multiplier ? 0 : dval_ui; /*underflow*/
+ dval_ui = dval_ui | sign;
+ val_f = as_float(dval_ui);
+
+ exponent += n;
+
+ val = sign | (exponent << 23) | (val_ui & 0x007fffff);
+ ex1 = exponent + multiplier;
+ ex1 = -ex1 +25;
+ val_ui = (((val_ui & 0x007fffff )| 0x800000) >> ex1);
+ val_ui = exponent > 0 ? val :val_ui;
+ val_ui = exponent > 254 ? 0x7f800000 :val_ui; /*overflow*/
+ val_ui = exponent < -multiplier ? 0 : val_ui; /*underflow*/
+ val_ui = val_ui | sign;
+
+ val_ui = dexp == 0? dval_ui : val_ui; // denormal input: take the result computed in the denormal section
+ val_f = as_float(val_ui);
+
+ val_f = isnan(x) | isinf(x) | val_x == 0 ? x : val_f; // NaN, Inf and +-0 are returned unchanged
+ return val_f;
+}
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DEF _CLC_OVERLOAD double __clc_ldexp(double x, int n) { // Computes x * 2^n by editing the exponent field of x
+ long l = as_ulong(x);
+ int e = (l >> 52) & 0x7ff; // biased exponent of x
+ long s = l & 0x8000000000000000; // sign bit of x
+
+ ulong ux = as_ulong(x * 0x1.0p+53); // x scaled by 2^53, used when the exponent field of x is 0
+ int de = ((int)(ux >> 52) & 0x7ff) - 53; // exponent of the scaled value with the 2^53 taken back out
+ int c = e == 0;
+ e = c ? de: e;
+
+ ux = c ? ux : l;
+
+ int v = add_sat(e, n); // saturating: plain e + n overflows int when |n| is near INT_MAX
+ v = clamp(v, -0x7ff, 0x7ff);
+
+ ux &= ~EXPBITS_DP64;
+
+ double mr = as_double(ux | ((ulong)(v+53) << 52)); // result for v <= 0: build with exponent v+53 ...
+ mr = mr * 0x1.0p-53; // ... then scale down by 2^53
+
+ mr = v > 0 ? as_double(ux | ((ulong)v << 52)) : mr; // positive exponent: insert it directly
+
+ mr = v == 0x7ff ? as_double(s | PINFBITPATT_DP64) : mr; // overflow: signed infinity
+ mr = v < -53 ? as_double(s) : mr; // underflow: signed zero
+
+ mr = ((n == 0) | isnan(x) | isinf(x) | (x == 0) ) ? x : mr; // NaN, Inf, zero and n == 0 return x unchanged
+ return mr;
+}
+
+#endif
diff --git a/libclc/generic/lib/math/clc_nextafter.cl b/libclc/generic/lib/math/clc_nextafter.cl
new file mode 100644
index 0000000..e53837d
--- /dev/null
+++ b/libclc/generic/lib/math/clc_nextafter.cl
@@ -0,0 +1,43 @@
+#include <clc/clc.h>
+#include "../clcmacro.h"
+
+// This file provides OpenCL C implementations of nextafter for targets that
+// don't support the clang builtin.
+
+#define FLT_NAN (0.0f/0.0f)
+
+#define NEXTAFTER(FLOAT_TYPE, UINT_TYPE, NAN, ZERO, NEXTAFTER_ZERO) \
+_CLC_OVERLOAD _CLC_DEF FLOAT_TYPE __clc_nextafter(FLOAT_TYPE x, FLOAT_TYPE y) { \
+ /* Returns the representable value next to x in the direction of y. */ \
+ union { \
+ FLOAT_TYPE f; \
+ UINT_TYPE i; \
+ } next; \
+ if (isnan(x) || isnan(y)) { \
+ return NAN; \
+ } \
+ if (x == y) { \
+ return y; \
+ } \
+ next.f = x; \
+ if (x == ZERO) { \
+ /* +0 or -0: step to the smallest subnormal on the same side as y. */ \
+ next.i = (x < y) ? 1 : NEXTAFTER_ZERO; \
+ } else if ((x < y) == (x > ZERO)) { \
+ next.i++; /* moving away from zero: the bit pattern grows */ \
+ } else { \
+ next.i--; /* moving toward zero: the bit pattern shrinks */ \
+ } \
+ return next.f; \
+}
+
+NEXTAFTER(float, uint, FLT_NAN, 0.0f, 0x80000001) // last argument: bit pattern assigned when stepping down from zero
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_nextafter, float, float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#define DBL_NAN 0.0/0.0
+
+NEXTAFTER(double, ulong, DBL_NAN, 0.0, 0x8000000000000001) // last argument: bit pattern assigned when stepping down from zero
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __clc_nextafter, double, double)
+#endif
diff --git a/libclc/generic/lib/math/clc_sqrt.cl b/libclc/generic/lib/math/clc_sqrt.cl
new file mode 100644
index 0000000..86a874d
--- /dev/null
+++ b/libclc/generic/lib/math/clc_sqrt.cl
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+// Declare __clc_llvm_intr_sqrt for the llvm.sqrt intrinsic; unary_intrin.inc presumably reads the two macros below.
+#define __CLC_FUNCTION __clc_llvm_intr_sqrt
+#define __CLC_INTRINSIC "llvm.sqrt"
+#include <clc/math/unary_intrin.inc>
+#undef __CLC_FUNCTION
+#undef __CLC_INTRINSIC
+
+#define __CLC_BODY <clc_sqrt_impl.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/clc_sqrt_impl.inc b/libclc/generic/lib/math/clc_sqrt_impl.inc
new file mode 100644
index 0000000..e97b540
--- /dev/null
+++ b/libclc/generic/lib/math/clc_sqrt_impl.inc
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __CLC_FPSIZE == 32 // pick NaN and zero constants matching the element width
+#define __CLC_NAN NAN
+#define ZERO 0.0f
+#elif __CLC_FPSIZE == 64
+#define __CLC_NAN __builtin_nan("")
+#define ZERO 0.0
+#else
+#error "Invalid value for __CLC_FPSIZE"
+#endif
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sqrt(__CLC_GENTYPE val) { // sqrt that returns NaN for inputs below zero
+ return val < ZERO ? __CLC_NAN : __clc_llvm_intr_sqrt(val); // -0.0 is not < 0, so it goes to the intrinsic
+}
+
+#undef __CLC_NAN
+#undef ZERO
diff --git a/libclc/generic/lib/math/copysign.cl b/libclc/generic/lib/math/copysign.cl
new file mode 100644
index 0000000..4e0c51b
--- /dev/null
+++ b/libclc/generic/lib/math/copysign.cl
@@ -0,0 +1,12 @@
+#include <clc/clc.h>
+#include "../clcmacro.h"
+
+_CLC_DEFINE_BINARY_BUILTIN(float, copysign, __builtin_copysignf, float, float) // float copysign defined via __builtin_copysignf
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DEFINE_BINARY_BUILTIN(double, copysign, __builtin_copysign, double, double) // double copysign defined via __builtin_copysign
+
+#endif
diff --git a/libclc/generic/lib/math/cos.cl b/libclc/generic/lib/math/cos.cl
new file mode 100644
index 0000000..157447f
--- /dev/null
+++ b/libclc/generic/lib/math/cos.cl
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "sincos_helpers.h"
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float cos(float x)
+{
+ // cos is even: clear the sign bit and work with |x| from here on
+ int abs_bits = as_int(x) & 0x7fffffff;
+ float abs_x = as_float(abs_bits);
+
+ float red0, red1;
+ int region = __clc_argReductionS(&red0, &red1, abs_x);
+
+ float neg_sin = -__clc_sinf_piby4(red0, red1);
+ float pos_cos = __clc_cosf_piby4(red0, red1);
+
+ // Odd regions use the negated sine helper, even regions use the cosine helper
+ float result = (region & 1) ? neg_sin : pos_cos;
+ // Regions above 1 additionally flip the sign bit
+ result = as_float(as_int(result) ^ ((region > 1) << 31));
+ // Bit patterns at or above PINFBITPATT_SP32 (Inf and NaN) give QNANBITPATT_SP32
+ return abs_bits >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : result;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cos, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double cos(double x) {
+ const double mag = fabs(x); // cos is even, so only |x| is used below
+
+ double rem, rem_tail;
+ int region;
+ if (mag < 0x1.0p+47) {
+ __clc_remainder_piby2_medium(mag, &rem, &rem_tail, &region);
+ } else {
+ __clc_remainder_piby2_large(mag, &rem, &rem_tail, &region);
+ }
+ double2 sc = __clc_sincos_piby4(rem, rem_tail);
+ // Odd regions take the negated .lo component, even regions take .hi
+ int2 bits = as_int2((region & 1) ? -sc.lo : sc.hi);
+ bits.hi ^= (region > 1) << 31; // regions above 1 flip bit 31 of the high word
+ if (isnan(mag) | isinf(mag)) {
+ return as_double(QNANBITPATT_DP64);
+ }
+ return as_double(bits);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cos, double);
+
+#endif
diff --git a/libclc/generic/lib/math/cospi.cl b/libclc/generic/lib/math/cospi.cl
new file mode 100644
index 0000000..108b637
--- /dev/null
+++ b/libclc/generic/lib/math/cospi.cl
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "sincos_helpers.h"
+#include "sincospiF_piby4.h"
+#include "../clcmacro.h"
+#ifdef cl_khr_fp64
+#include "sincosD_piby4.h"
+#endif
+
+_CLC_OVERLOAD _CLC_DEF float cospi(float x) // Returns cos(pi * x), assembled as a bit pattern in ir
+{
+ int ix = as_int(x) & 0x7fffffff; // bit pattern of |x|
+ float ax = as_float(ix);
+ int iax = (int)ax; // integer part of |x|; only used where |x| < 2^24
+ float r = ax - iax; // fractional part of |x|
+ int xodd = iax & 0x1 ? 0x80000000 : 0; // sign bit set when the integer part is odd
+
+ // Initialize with return for +-Inf and NaN
+ int ir = 0x7fc00000;
+
+ // 2^24 <= |x| < Inf, x is always an even integer, so the result is 1
+ ir = ix < 0x7f800000 ? 0x3f800000 : ir;
+
+ // 2^23 <= |x| < 2^24, x is always an integer, so the result is +-1 by parity
+ ir = ix < 0x4b800000 ? xodd | 0x3f800000 : ir;
+
+ // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
+
+ // r < 1.0
+ float a = 1.0f - r; // a: reduced argument passed to the helper as a * pi
+ int e = 1; // e: 1 selects t.hi below, 0 selects t.lo
+ int s = xodd ^ 0x80000000; // s: sign bit XORed into the result
+
+ // r <= 0.75
+ int c = r <= 0.75f;
+ a = c ? r - 0.5f : a;
+ e = c ? 0 : e;
+
+ // r < 0.5
+ c = r < 0.5f;
+ a = c ? 0.5f - r : a;
+ s = c ? xodd : s;
+
+ // r <= 0.25
+ c = r <= 0.25f;
+ a = c ? r : a;
+ e = c ? 1 : e;
+
+ float2 t = __libclc__sincosf_piby4(a * M_PI_F);
+ int jr = s ^ as_int(e ? t.hi : t.lo);
+
+ ir = ix < 0x4b000000 ? jr : ir; // |x| < 2^23: use the interval result
+
+ return as_float(ir);
+}
+
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cospi, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double cospi(double x) { // Returns cos(pi * x), assembled as a bit pattern in ir
+
+ long ix = as_long(x) & 0x7fffffffffffffffL; // bit pattern of |x|
+ double ax = as_double(ix);
+ long iax = (long)ax; // integer part of |x|; only used where |x| < 2^53
+ double r = ax - (double)iax; // fractional part of |x|
+ long xodd = iax & 0x1L ? 0x8000000000000000L : 0L; // sign bit set when the integer part is odd
+
+ // Initialize with return for +-Inf and NaN
+ long ir = 0x7ff8000000000000L;
+
+ // 2^53 <= |x| < Inf, x is always an even integer, so the result is 1
+ ir = ix < 0x7ff0000000000000 ? 0x3ff0000000000000L : ir;
+
+ // 2^52 <= |x| < 2^53, x is always an integer, so the result is +-1 by parity
+ ir = ax < 0x1.0p+53 ? xodd | 0x3ff0000000000000L : ir;
+
+ // 0x1.0p-7 <= |x| < 2^52, result depends on which 0.25 interval
+
+ // r < 1.0
+ double a = 1.0 - r; // a: reduced argument passed to the helper as a * pi
+ int e = 1; // e: 1 selects sc.hi below, 0 selects sc.lo
+ long s = xodd ^ 0x8000000000000000L; // s: sign bit XORed into the result
+
+ // r <= 0.75
+ int c = r <= 0.75;
+ double t = r - 0.5;
+ a = c ? t : a;
+ e = c ? 0 : e;
+
+ // r < 0.5
+ c = r < 0.5;
+ t = 0.5 - r;
+ a = c ? t : a;
+ s = c ? xodd : s;
+
+ // r <= 0.25
+ c = r <= 0.25;
+ a = c ? r : a;
+ e = c ? 1 : e;
+
+ double2 sc = __libclc__sincos_piby4(a * M_PI, 0.0);
+ long jr = s ^ as_long(e ? sc.hi : sc.lo);
+
+ ir = ax < 0x1.0p+52 ? jr : ir; // |x| < 2^52: use the interval result
+
+ return as_double(ir);
+}
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cospi, double);
+#endif
diff --git a/libclc/generic/lib/math/ep_log.cl b/libclc/generic/lib/math/ep_log.cl
new file mode 100644
index 0000000..3c2c62c
--- /dev/null
+++ b/libclc/generic/lib/math/ep_log.cl
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifdef cl_khr_fp64
+
+#include <clc/clc.h>
+#include "ep_log.h"
+#include "math.h"
+#include "tables.h"
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Polynomial coefficients in v = u*u used by __clc_ep_log below.
+// LN0..LN3: degree-3 polynomial (z21), selected when x is near one.
+#define LN0 8.33333333333317923934e-02
+#define LN1 1.25000000037717509602e-02
+#define LN2 2.23213998791944806202e-03
+#define LN3 4.34887777707614552256e-04
+
+// LF0..LF2: degree-2 polynomial (z22), selected for all other x.
+#define LF0 8.33333333333333593622e-02
+#define LF1 1.24999999978138668903e-02
+#define LF2 2.23219810758559851206e-03
+
+// Extra-precision building block for natural log.
+// x: input value.
+// xexp: receives the unbiased exponent of x, or 0 when x is near one.
+// r1: receives the leading part (table value, or x - 1 when x is near one).
+// r2: receives the trailing correction term.
+// NOTE(review): callers presumably combine these as xexp*ln(2) + r1 + r2 --
+// confirm at the call sites; this function does not form that sum itself.
+_CLC_DEF void __clc_ep_log(double x, int *xexp, double *r1, double *r2)
+{
+ // Computes natural log(x). Algorithm based on:
+ // Ping-Tak Peter Tang
+ // "Table-driven implementation of the logarithm function in IEEE
+ // floating-point arithmetic"
+ // ACM Transactions on Mathematical Software (TOMS)
+ // Volume 16, Issue 4 (December 1990)
+ // Bitwise & on the two comparison results: both are always evaluated
+ int near_one = x >= 0x1.e0faap-1 & x <= 0x1.1082cp+0;
+
+ // Inputs whose bits are below IMPBIT_DP64 are rescaled: OR-ing in exponent
+ // 0x03d and subtracting 2^-962 multiplies the value by 2^60, and expadjust
+ // removes those 60 from the exponent again.
+ // NOTE(review): IMPBIT_DP64 is presumably the implicit-bit mask, making this
+ // the subnormal path -- confirm in math.h.
+ ulong ux = as_ulong(x);
+ ulong uxs = as_ulong(as_double(0x03d0000000000000UL | ux) - 0x1.0p-962);
+ int c = ux < IMPBIT_DP64;
+ ux = c ? uxs : ux;
+ int expadjust = c ? 60 : 0;
+
+ // Store the exponent of x in xexp and put f into the range [0.5,1)
+ int xexp1 = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64 - expadjust;
+ double f = as_double(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64));
+ *xexp = near_one ? 0 : xexp1;
+
+ // Near-one path: work directly with r = x - 1 and u1 = 2r / (2 + r)
+ double r = x - 1.0;
+ double u1 = MATH_DIVIDE(r, 2.0 + r);
+ double ru1 = -r * u1;
+ u1 = u1 + u1;
+
+ // index = 64 + top six mantissa bits, rounded up by the seventh bit,
+ // so index lies in [64, 128] and f1 = index / 128
+ int index = as_int2(ux).hi >> 13;
+ index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1);
+
+ double f1 = index * 0x1.0p-7;
+ double f2 = f - f1;
+ double u2 = MATH_DIVIDE(f2, fma(0.5, f2, f1));
+
+ // Table entry for f1; s0 becomes the leading part, s1 is added to the tail
+ double2 tv = USE_TABLE(ln_tbl, (index - 64));
+ double z1 = tv.s0;
+ double q = tv.s1;
+
+ // Pick the near-one or the table-driven quantities
+ z1 = near_one ? r : z1;
+ q = near_one ? 0.0 : q;
+ double u = near_one ? u1 : u2;
+ double v = u*u;
+
+ double cc = near_one ? ru1 : u2;
+
+ // Both polynomials are evaluated; the select keeps the one that applies
+ double z21 = fma(v, fma(v, fma(v, LN3, LN2), LN1), LN0);
+ double z22 = fma(v, fma(v, LF2, LF1), LF0);
+ double z2 = near_one ? z21 : z22;
+ z2 = fma(u*v, z2, cc) + q;
+
+ *r1 = z1;
+ *r2 = z2;
+}
+
+#endif
diff --git a/libclc/generic/lib/math/ep_log.h b/libclc/generic/lib/math/ep_log.h
new file mode 100644
index 0000000..7f99ac6
--- /dev/null
+++ b/libclc/generic/lib/math/ep_log.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Extra-precision natural-log helper, declared only when doubles are available.
+// Writes an exponent to *xexp and a two-part result to *r1 (leading part) and
+// *r2 (trailing part).
+// NOTE(review): exact meaning of the outputs is set by the definition in
+// ep_log.cl -- confirm there before relying on it.
+_CLC_DECL void __clc_ep_log(double x, int *xexp, double *r1, double *r2);
+
+#endif
diff --git a/libclc/generic/lib/math/erfc.cl b/libclc/generic/lib/math/erfc.cl
new file mode 100644
index 0000000..c322f86
--- /dev/null
+++ b/libclc/generic/lib/math/erfc.cl
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "../clcmacro.h"
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+// Constant added/subtracted around the rational term on [0.84375, 1.25)
+#define erx_f 8.4506291151e-01f /* 0x3f58560b */
+
+// Coefficients for approximation to erf on [0, 0.84375]
+// (efx and efx8 are not referenced by the float erfc in this file)
+
+#define efx 1.2837916613e-01f /* 0x3e0375d4 */
+#define efx8 1.0270333290e+00f /* 0x3f8375d4 */
+
+#define pp0 1.2837916613e-01f /* 0x3e0375d4 */
+#define pp1 -3.2504209876e-01f /* 0xbea66beb */
+#define pp2 -2.8481749818e-02f /* 0xbce9528f */
+#define pp3 -5.7702702470e-03f /* 0xbbbd1489 */
+#define pp4 -2.3763017452e-05f /* 0xb7c756b1 */
+#define qq1 3.9791721106e-01f /* 0x3ecbbbce */
+#define qq2 6.5022252500e-02f /* 0x3d852a63 */
+#define qq3 5.0813062117e-03f /* 0x3ba68116 */
+#define qq4 1.3249473704e-04f /* 0x390aee49 */
+#define qq5 -3.9602282413e-06f /* 0xb684e21a */
+
+// Coefficients for approximation to erf in [0.84375, 1.25]
+
+#define pa0 -2.3621185683e-03f /* 0xbb1acdc6 */
+#define pa1 4.1485610604e-01f /* 0x3ed46805 */
+#define pa2 -3.7220788002e-01f /* 0xbebe9208 */
+#define pa3 3.1834661961e-01f /* 0x3ea2fe54 */
+#define pa4 -1.1089469492e-01f /* 0xbde31cc2 */
+#define pa5 3.5478305072e-02f /* 0x3d1151b3 */
+#define pa6 -2.1663755178e-03f /* 0xbb0df9c0 */
+#define qa1 1.0642088205e-01f /* 0x3dd9f331 */
+#define qa2 5.4039794207e-01f /* 0x3f0a5785 */
+#define qa3 7.1828655899e-02f /* 0x3d931ae7 */
+#define qa4 1.2617121637e-01f /* 0x3e013307 */
+#define qa5 1.3637083583e-02f /* 0x3c5f6e13 */
+#define qa6 1.1984500103e-02f /* 0x3c445aa3 */
+
+// Coefficients for approximation to erfc in [1.25, 1/0.35]
+
+#define ra0 -9.8649440333e-03f /* 0xbc21a093 */
+#define ra1 -6.9385856390e-01f /* 0xbf31a0b7 */
+#define ra2 -1.0558626175e+01f /* 0xc128f022 */
+#define ra3 -6.2375331879e+01f /* 0xc2798057 */
+#define ra4 -1.6239666748e+02f /* 0xc322658c */
+#define ra5 -1.8460508728e+02f /* 0xc3389ae7 */
+#define ra6 -8.1287437439e+01f /* 0xc2a2932b */
+#define ra7 -9.8143291473e+00f /* 0xc11d077e */
+#define sa1 1.9651271820e+01f /* 0x419d35ce */
+#define sa2 1.3765776062e+02f /* 0x4309a863 */
+#define sa3 4.3456588745e+02f /* 0x43d9486f */
+#define sa4 6.4538726807e+02f /* 0x442158c9 */
+#define sa5 4.2900814819e+02f /* 0x43d6810b */
+#define sa6 1.0863500214e+02f /* 0x42d9451f */
+#define sa7 6.5702495575e+00f /* 0x40d23f7c */
+#define sa8 -6.0424413532e-02f /* 0xbd777f97 */
+
+// Coefficients for approximation to erfc in [1/0.35, 28]
+
+#define rb0 -9.8649431020e-03f /* 0xbc21a092 */
+#define rb1 -7.9928326607e-01f /* 0xbf4c9dd4 */
+#define rb2 -1.7757955551e+01f /* 0xc18e104b */
+#define rb3 -1.6063638306e+02f /* 0xc320a2ea */
+#define rb4 -6.3756646729e+02f /* 0xc41f6441 */
+#define rb5 -1.0250950928e+03f /* 0xc480230b */
+#define rb6 -4.8351919556e+02f /* 0xc3f1c275 */
+#define sb1 3.0338060379e+01f /* 0x41f2b459 */
+#define sb2 3.2579251099e+02f /* 0x43a2e571 */
+#define sb3 1.5367296143e+03f /* 0x44c01759 */
+#define sb4 3.1998581543e+03f /* 0x4547fdbb */
+#define sb5 2.5530502930e+03f /* 0x451f90ce */
+#define sb6 4.7452853394e+02f /* 0x43ed43a7 */
+#define sb7 -2.2440952301e+01f /* 0xc1b38712 */
+
+// Single-precision erfc(x), branch-free.
+// All four rational approximations are evaluated on every call and the one
+// matching |x| is kept by selects; the selects run from the widest interval to
+// the narrowest so that later ones override earlier ones.
+// Results outside the polynomial ranges: 0 for x >= 28, 2 for x < -6, and x
+// itself when x is NaN.
+_CLC_OVERLOAD _CLC_DEF float erfc(float x) {
+ int hx = as_int(x);
+ int ix = hx & 0x7fffffff;
+ float absx = as_float(ix);
+
+ // Argument for polys:
+ // 1/x^2 for |x| >= 1.25, |x| - 1 for 0.84375 <= |x| < 1.25, x^2 below that
+ float x2 = absx * absx;
+ float t = 1.0f / x2;
+ float tt = absx - 1.0f;
+ t = absx < 1.25f ? tt : t;
+ t = absx < 0.84375f ? x2 : t;
+
+ // Evaluate polys
+ float tu, tv, u, v;
+
+ // Default pair: rb/sb, kept when |x| >= 0x1.6db6dap+1 (about 1/0.35)
+ u = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, rb6, rb5), rb4), rb3), rb2), rb1), rb0);
+ v = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, sb7, sb6), sb5), sb4), sb3), sb2), sb1);
+
+ // ra/sa pair for |x| below about 1/0.35
+ tu = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
+ tv = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1);
+ u = absx < 0x1.6db6dap+1f ? tu : u;
+ v = absx < 0x1.6db6dap+1f ? tv : v;
+
+ // pa/qa pair for |x| < 1.25
+ tu = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
+ tv = mad(t, mad(t, mad(t, mad(t, mad(t, qa6, qa5), qa4), qa3), qa2), qa1);
+ u = absx < 1.25f ? tu : u;
+ v = absx < 1.25f ? tv : v;
+
+ // pp/qq pair for |x| < 0.84375
+ tu = mad(t, mad(t, mad(t, mad(t, pp4, pp3), pp2), pp1), pp0);
+ tv = mad(t, mad(t, mad(t, mad(t, qq5, qq4), qq3), qq2), qq1);
+ u = absx < 0.84375f ? tu : u;
+ v = absx < 0.84375f ? tv : v;
+
+ // Denominator is 1 + t * v(t); q is the rational term
+ v = mad(t, v, 1.0f);
+
+ float q = MATH_DIVIDE(u, v);
+
+ // Stays 0 for |x| >= 28 unless one of the selects below replaces it
+ float ret = 0.0f;
+
+ // 1.25 <= |x| < 28: z is |x| with its low 12 bits cleared, so -x*x is formed
+ // as -z*z + (z - |x|)*(z + |x|) across the two exp calls
+ float z = as_float(ix & 0xfffff000);
+ float r = exp(mad(-z, z, -0.5625f)) * exp(mad(z - absx, z + absx, q));
+ r = MATH_DIVIDE(r, absx);
+ t = 2.0f - r;
+ r = x < 0.0f ? t : r;
+ ret = absx < 28.0f ? r : ret;
+
+ // 0.84375 <= |x| < 1.25
+ r = 1.0f - erx_f - q;
+ t = erx_f + q + 1.0f;
+ r = x < 0.0f ? t : r;
+ ret = absx < 1.25f ? r : ret;
+
+ // |x| < 0.84375: 0.5 - (x*q + x - 0.5)
+ r = 0.5f - mad(x, q, x - 0.5f);
+ ret = absx < 0.84375f ? r : ret;
+
+ ret = x < -6.0f ? 2.0f : ret;
+
+ ret = isnan(x) ? x : ret;
+
+ return ret;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, erfc, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+/* double erf(double x)
+ * double erfc(double x)
+ * x
+ * 2 |\
+ * erf(x) = --------- | exp(-t*t)dt
+ * sqrt(pi) \|
+ * 0
+ *
+ * erfc(x) = 1-erf(x)
+ * Note that
+ * erf(-x) = -erf(x)
+ * erfc(-x) = 2 - erfc(x)
+ *
+ * Method:
+ * 1. For |x| in [0, 0.84375]
+ * erf(x) = x + x*R(x^2)
+ * erfc(x) = 1 - erf(x) if x in [-.84375,0.25]
+ * = 0.5 + ((0.5-x)-x*R) if x in [0.25,0.84375]
+ * where R = P/Q where P is an odd poly of degree 8 and
+ * Q is an odd poly of degree 10.
+ * -57.90
+ * | R - (erf(x)-x)/x | <= 2
+ *
+ *
+ * Remark. The formula is derived by noting
+ * erf(x) = (2/sqrt(pi))*(x - x^3/3 + x^5/10 - x^7/42 + ....)
+ * and that
+ * 2/sqrt(pi) = 1.128379167095512573896158903121545171688
+ * is close to one. The interval is chosen because the fix
+ * point of erf(x) is near 0.6174 (i.e., erf(x)=x when x is
+ * near 0.6174), and by some experiment, 0.84375 is chosen to
+ * guarantee the error is less than one ulp for erf.
+ *
+ * 2. For |x| in [0.84375,1.25], let s = |x| - 1, and
+ * c = 0.84506291151 rounded to single (24 bits)
+ * erf(x) = sign(x) * (c + P1(s)/Q1(s))
+ * erfc(x) = (1-c) - P1(s)/Q1(s) if x > 0
+ * 1+(c+P1(s)/Q1(s)) if x < 0
+ * |P1/Q1 - (erf(|x|)-c)| <= 2**-59.06
+ * Remark: here we use the taylor series expansion at x=1.
+ * erf(1+s) = erf(1) + s*Poly(s)
+ * = 0.845.. + P1(s)/Q1(s)
+ * That is, we use rational approximation to approximate
+ * erf(1+s) - (c = (single)0.84506291151)
+ * Note that |P1/Q1|< 0.078 for x in [0.84375,1.25]
+ * where
+ * P1(s) = degree 6 poly in s
+ * Q1(s) = degree 6 poly in s
+ *
+ * 3. For x in [1.25,1/0.35(~2.857143)],
+ * erfc(x) = (1/x)*exp(-x*x-0.5625+R1/S1)
+ * erf(x) = 1 - erfc(x)
+ * where
+ * R1(z) = degree 7 poly in z, (z=1/x^2)
+ * S1(z) = degree 8 poly in z
+ *
+ * 4. For x in [1/0.35,28]
+ * erfc(x) = (1/x)*exp(-x*x-0.5625+R2/S2) if x > 0
+ * = 2.0 - (1/x)*exp(-x*x-0.5625+R2/S2) if -6<x<0
+ * = 2.0 - tiny (if x <= -6)
+ * erf(x) = sign(x)*(1.0 - erfc(x)) if x < 6, else
+ * erf(x) = sign(x)*(1.0 - tiny)
+ * where
+ * R2(z) = degree 6 poly in z, (z=1/x^2)
+ * S2(z) = degree 7 poly in z
+ *
+ * Note1:
+ * To compute exp(-x*x-0.5625+R/S), let s be a single
+ * precision number and s := x; then
+ * -x*x = -s*s + (s-x)*(s+x)
+ * exp(-x*x-0.5625+R/S) =
+ * exp(-s*s-0.5625)*exp((s-x)*(s+x)+R/S);
+ * Note2:
+ * Here 4 and 5 make use of the asymptotic series
+ * exp(-x*x)
+ * erfc(x) ~ ---------- * ( 1 + Poly(1/x^2) )
+ * x*sqrt(pi)
+ * We use rational approximation to approximate
+ * g(s)=f(1/x^2) = log(erfc(x)*x) - x*x + 0.5625
+ * Here is the error bound for R1/S1 and R2/S2
+ * |R1/S1 - f(x)| < 2**(-62.57)
+ * |R2/S2 - f(x)| < 2**(-61.52)
+ *
+ * 5. For inf > x >= 28
+ * erf(x) = sign(x) *(1 - tiny) (raise inexact)
+ * erfc(x) = tiny*tiny (raise underflow) if x > 0
+ * = 2 - tiny if x<0
+ *
+ * 7. Special case:
+ * erf(0) = 0, erf(inf) = 1, erf(-inf) = -1,
+ * erfc(0) = 1, erfc(inf) = 0, erfc(-inf) = 2,
+ * erfc/erf(NaN) is NaN
+ */
+
+// Coefficients for the double erfc below. Each xU set is a numerator
+// polynomial in t and each xV set feeds the denominator 1 + t * V(t).
+
+// AU/AV: default pair, kept when |x| >= 0x1.6db6dp+1 (about 1/0.35)
+#define AU0 -9.86494292470009928597e-03
+#define AU1 -7.99283237680523006574e-01
+#define AU2 -1.77579549177547519889e+01
+#define AU3 -1.60636384855821916062e+02
+#define AU4 -6.37566443368389627722e+02
+#define AU5 -1.02509513161107724954e+03
+#define AU6 -4.83519191608651397019e+02
+
+#define AV0 3.03380607434824582924e+01
+#define AV1 3.25792512996573918826e+02
+#define AV2 1.53672958608443695994e+03
+#define AV3 3.19985821950859553908e+03
+#define AV4 2.55305040643316442583e+03
+#define AV5 4.74528541206955367215e+02
+#define AV6 -2.24409524465858183362e+01
+
+// BU/BV: selected when |x| < 0x1.6db6dp+1
+#define BU0 -9.86494403484714822705e-03
+#define BU1 -6.93858572707181764372e-01
+#define BU2 -1.05586262253232909814e+01
+#define BU3 -6.23753324503260060396e+01
+#define BU4 -1.62396669462573470355e+02
+#define BU5 -1.84605092906711035994e+02
+#define BU6 -8.12874355063065934246e+01
+#define BU7 -9.81432934416914548592e+00
+
+#define BV0 1.96512716674392571292e+01
+#define BV1 1.37657754143519042600e+02
+#define BV2 4.34565877475229228821e+02
+#define BV3 6.45387271733267880336e+02
+#define BV4 4.29008140027567833386e+02
+#define BV5 1.08635005541779435134e+02
+#define BV6 6.57024977031928170135e+00
+#define BV7 -6.04244152148580987438e-02
+
+// CU/CV: selected when |x| < 1.25
+#define CU0 -2.36211856075265944077e-03
+#define CU1 4.14856118683748331666e-01
+#define CU2 -3.72207876035701323847e-01
+#define CU3 3.18346619901161753674e-01
+#define CU4 -1.10894694282396677476e-01
+#define CU5 3.54783043256182359371e-02
+#define CU6 -2.16637559486879084300e-03
+
+#define CV0 1.06420880400844228286e-01
+#define CV1 5.40397917702171048937e-01
+#define CV2 7.18286544141962662868e-02
+#define CV3 1.26171219808761642112e-01
+#define CV4 1.36370839120290507362e-02
+#define CV5 1.19844998467991074170e-02
+
+// DU/DV: selected when |x| < 0.84375
+#define DU0 1.28379167095512558561e-01
+#define DU1 -3.25042107247001499370e-01
+#define DU2 -2.84817495755985104766e-02
+#define DU3 -5.77027029648944159157e-03
+#define DU4 -2.37630166566501626084e-05
+
+#define DV0 3.97917223959155352819e-01
+#define DV1 6.50222499887672944485e-02
+#define DV2 5.08130628187576562776e-03
+#define DV3 1.32494738004321644526e-04
+#define DV4 -3.96022827877536812320e-06
+
+// Double-precision erfc(x), branch-free.
+// Every rational approximation is evaluated and the one matching |x| is kept
+// by selects that run from the widest interval to the narrowest.
+// Fixed results: 0 for x >= 28, 2 for x <= -6, and x itself when x is NaN.
+_CLC_OVERLOAD _CLC_DEF double erfc(double x) {
+ long lx = as_long(x);
+ long ax = lx & 0x7fffffffffffffffL;
+ double absx = as_double(ax);
+ // True when the sign bit of x is set
+ int xneg = lx != ax;
+
+ // Poly arg:
+ // 1/x^2 for |x| >= 1.25, |x| - 1 for 0.84375 <= |x| < 1.25, x^2 below that
+ double x2 = x * x;
+ double xm1 = absx - 1.0;
+ double t = 1.0 / x2;
+ t = absx < 1.25 ? xm1 : t;
+ t = absx < 0.84375 ? x2 : t;
+
+
+ // Evaluate rational poly
+ // XXX Need to evaluate if we can grab the 14 coefficients from a
+ // table faster than evaluating 3 pairs of polys
+ double tu, tv, u, v;
+
+ // |x| < 28
+ u = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, AU6, AU5), AU4), AU3), AU2), AU1), AU0);
+ v = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, AV6, AV5), AV4), AV3), AV2), AV1), AV0);
+
+ // |x| < 0x1.6db6dp+1 (about 1/0.35)
+ tu = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, BU7, BU6), BU5), BU4), BU3), BU2), BU1), BU0);
+ tv = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, BV7, BV6), BV5), BV4), BV3), BV2), BV1), BV0);
+ u = absx < 0x1.6db6dp+1 ? tu : u;
+ v = absx < 0x1.6db6dp+1 ? tv : v;
+
+ // |x| < 1.25
+ tu = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, CU6, CU5), CU4), CU3), CU2), CU1), CU0);
+ tv = fma(t, fma(t, fma(t, fma(t, fma(t, CV5, CV4), CV3), CV2), CV1), CV0);
+ u = absx < 1.25 ? tu : u;
+ v = absx < 1.25 ? tv : v;
+
+ // |x| < 0.84375
+ tu = fma(t, fma(t, fma(t, fma(t, DU4, DU3), DU2), DU1), DU0);
+ tv = fma(t, fma(t, fma(t, fma(t, DV4, DV3), DV2), DV1), DV0);
+ u = absx < 0.84375 ? tu : u;
+ v = absx < 0.84375 ? tv : v;
+
+ // Denominator is 1 + t * v(t); q is the rational term
+ v = fma(t, v, 1.0);
+ double q = u / v;
+
+
+ // Evaluate return value
+
+ // |x| < 28: z is |x| with its low 32 bits cleared, so -x*x is formed as
+ // -z*z + (z - |x|)*(z + |x|) across the two exp calls
+ double z = as_double(ax & 0xffffffff00000000UL);
+ double ret = exp(-z * z - 0.5625) * exp((z - absx) * (z + absx) + q) / absx;
+ t = 2.0 - ret;
+ ret = xneg ? t : ret;
+
+ // 0.84375 <= |x| < 1.25
+ const double erx = 8.45062911510467529297e-01;
+ z = erx + q + 1.0;
+ t = 1.0 - erx - q;
+ t = xneg ? z : t;
+ ret = absx < 1.25 ? t : ret;
+
+ // |x| < 0.84375: 1 - x - x*q as a single fma. The three commented lines are
+ // an earlier formulation that is not compiled.
+ // z = 1.0 - fma(x, q, x);
+ // t = 0.5 - fma(x, q, x - 0.5);
+ // t = xneg == 1 | absx < 0.25 ? z : t;
+ t = fma(-x, q, 1.0 - x);
+ ret = absx < 0.84375 ? t : ret;
+
+ // Saturated ranges, then NaN (bits above the +Inf pattern) returns x
+ ret = x >= 28.0 ? 0.0 : ret;
+ ret = x <= -6.0 ? 2.0 : ret;
+ ret = ax > 0x7ff0000000000000UL ? x : ret;
+
+ return ret;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, erfc, double);
+
+#endif
diff --git a/libclc/generic/lib/math/exp.cl b/libclc/generic/lib/math/exp.cl
new file mode 100644
index 0000000..37f693c
--- /dev/null
+++ b/libclc/generic/lib/math/exp.cl
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "../clcmacro.h"
+
+// Single-precision exp(x).
+// Writes x as p*ln(2) + t with p an integer, evaluates a rational expression
+// in t, then adds p to the exponent field of the result.
+// Returns 0 for x < llim, +Inf for x >= ulim, and x itself when x is NaN.
+_CLC_OVERLOAD _CLC_DEF float exp(float x) {
+
+ // Reduce x
+ // ln(2) is split into a high part (ln2HI) and a low correction (ln2LO)
+ const float ln2HI = 0x1.62e300p-1f;
+ const float ln2LO = 0x1.2fefa2p-17f;
+ const float invln2 = 0x1.715476p+0f;
+
+ // Adding +-0.5 before the float-to-int conversion rounds x/ln(2) to nearest
+ float fhalF = x < 0.0f ? -0.5f : 0.5f;
+ int p = mad(x, invln2, fhalF);
+ float fp = (float)p;
+ float hi = mad(fp, -ln2HI, x); // fp*ln2HI is exact here
+ float lo = -fp*ln2LO;
+
+ // Evaluate poly
+ // v = t - tt * P(tt), with P given by the nested mad chain
+ float t = hi + lo;
+ float tt = t*t;
+ float v = mad(tt,
+ -mad(tt,
+ mad(tt,
+ mad(tt,
+ mad(tt, 0x1.637698p-25f, -0x1.bbd41cp-20f),
+ 0x1.1566aap-14f),
+ -0x1.6c16c2p-9f),
+ 0x1.555556p-3f),
+ t);
+
+ // hi and lo are added back separately rather than through t
+ float y = 1.0f - (((-lo) - MATH_DIVIDE(t * v, 2.0f - v)) - hi);
+
+ // Scale by 2^p
+ // done on the bit pattern: p is added directly to the exponent field
+ float r = as_float(as_int(y) + (p << 23));
+
+ const float ulim = 0x1.62e430p+6f; // ln(largest_normal) = 88.72283905206835305366
+ const float llim = -0x1.5d589ep+6f; // ln(smallest_normal) = -87.33654475055310898657
+
+ // Range clamps override whatever the bit arithmetic above produced
+ r = x < llim ? 0.0f : r;
+ r = x < ulim ? r : as_float(0x7f800000);
+ return isnan(x) ? x : r;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, exp, float)
+
+#ifdef cl_khr_fp64
+
+#include "exp_helper.h"
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Double-precision exp(x): reduces x by multiples of ln(2)/64 and hands the
+// remainder r and the multiple n to __clc_exp_helper together with the
+// argument bounds.
+_CLC_OVERLOAD _CLC_DEF double exp(double x) {
+
+ // Argument bounds forwarded to the helper
+ const double lower_bound = -0x1.74910d52d3051p+9; // -1075*ln(2)
+ const double upper_bound = 0x1.62e42fefa39efp+9; // 1024*ln(2)
+
+ // 64/ln(2), and ln(2)/64 split into head and tail
+ const double sixty_four_by_ln2 = 0x1.71547652b82fep+6; // 64/ln(2)
+ const double ln2_by_64_head = 0x1.62e42fefa0000p-7; // head ln(2)/64
+ const double ln2_by_64_tail = 0x1.cf79abc9e3b39p-46; // tail ln(2)/64
+
+ int n = convert_int(x * sixty_four_by_ln2);
+ double dn = (double)n;
+
+ // r = x - n*head - n*tail, head removed first
+ double head_removed = fma(-ln2_by_64_head, dn, x);
+ double r = fma(-ln2_by_64_tail, dn, head_removed);
+
+ return __clc_exp_helper(x, lower_bound, upper_bound, r, n);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, exp, double)
+
+#endif
diff --git a/libclc/generic/lib/math/exp10.cl b/libclc/generic/lib/math/exp10.cl
new file mode 100644
index 0000000..c8039cb
--- /dev/null
+++ b/libclc/generic/lib/math/exp10.cl
@@ -0,0 +1,8 @@
+#include <clc/clc.h>
+
+// Enable doubles when the target supports them so the double variants can be
+// instantiated as well.
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+// NOTE(review): clc/math/gentype.inc presumably includes __CLC_BODY once per
+// floating-point gentype, which instantiates exp10.inc for each -- confirm.
+#define __CLC_BODY <exp10.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/exp10.inc b/libclc/generic/lib/math/exp10.inc
new file mode 100644
index 0000000..a592c19
--- /dev/null
+++ b/libclc/generic/lib/math/exp10.inc
@@ -0,0 +1,10 @@
+// exp10 for the current gentype, computed as exp2(val * log2(10)).
+// The log2(10) literal is typed to match the element size (__CLC_FPSIZE) so
+// the 32-bit variants do not pull in double arithmetic.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE exp10(__CLC_GENTYPE val) {
+ // exp10(x) = exp2(x * log2(10))
+#if __CLC_FPSIZE == 32
+ return exp2(val * log2(10.0f));
+#elif __CLC_FPSIZE == 64
+ return exp2(val * log2(10.0));
+#else
+ // Message names the macro that is actually tested above
+#error unknown __CLC_FPSIZE
+#endif
+}
diff --git a/libclc/generic/lib/math/exp2.cl b/libclc/generic/lib/math/exp2.cl
new file mode 100644
index 0000000..1ddccbd
--- /dev/null
+++ b/libclc/generic/lib/math/exp2.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "../clcmacro.h"
+
+// Single-precision exp2(x).
+// Splits x into the nearest integer p and a remainder, scales the remainder
+// by ln(2) (in two parts), evaluates a rational expression in it, then adds
+// p to the exponent field of the result.
+// Returns 0 for x < -126, +Inf for x >= 128, and x itself when x is NaN.
+_CLC_OVERLOAD _CLC_DEF float exp2(float x) {
+
+ // Reduce x
+ // ln(2) is split into a high part (ln2HI) and a low correction (ln2LO)
+ const float ln2HI = 0x1.62e300p-1f;
+ const float ln2LO = 0x1.2fefa2p-17f;
+
+ // t and tt are reused: here t = rint(x) and tt = x - t
+ float t = rint(x);
+ int p = (int)t;
+ float tt = x - t;
+ float hi = tt * ln2HI;
+ float lo = tt * ln2LO;
+
+ // Evaluate poly
+ // from here on t = hi + lo and tt = t*t; v = t - tt * P(tt)
+ t = hi + lo;
+ tt = t*t;
+ float v = mad(tt,
+ -mad(tt,
+ mad(tt,
+ mad(tt,
+ mad(tt, 0x1.637698p-25f, -0x1.bbd41cp-20f),
+ 0x1.1566aap-14f),
+ -0x1.6c16c2p-9f),
+ 0x1.555556p-3f),
+ t);
+
+ // hi and lo are added back separately rather than through t
+ float y = 1.0f - (((-lo) - MATH_DIVIDE(t * v, 2.0f - v)) - hi);
+
+ // Scale by 2^p
+ // done on the bit pattern: p is added directly to the exponent field
+ float r = as_float(as_int(y) + (p << 23));
+
+ const float ulim = 128.0f;
+ const float llim = -126.0f;
+
+ // Range clamps override whatever the bit arithmetic above produced
+ r = x < llim ? 0.0f : r;
+ r = x < ulim ? r : as_float(0x7f800000);
+ return isnan(x) ? x : r;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, exp2, float)
+
+#ifdef cl_khr_fp64
+
+#include "exp_helper.h"
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Double-precision exp2(x): n counts 64ths of x, the leftover is converted to
+// a natural-log argument r, and __clc_exp_helper finishes the computation
+// with the bounds -1074 and 1024.
+_CLC_OVERLOAD _CLC_DEF double exp2(double x) {
+ const double ln2 = 0x1.62e42fefa39efp-1; // ln(2)
+ const double one_by_64 = 1.0 / 64.0;
+
+ int n = convert_int(x * 64.0);
+
+ // Remainder of x after removing n/64, then scaled by ln(2)
+ double leftover = fma(-one_by_64, (double)n, x);
+ double r = ln2 * leftover;
+
+ return __clc_exp_helper(x, -1074.0, 1024.0, r, n);
+}
+
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, exp2, double)
+
+#endif
diff --git a/libclc/generic/lib/math/exp_helper.cl b/libclc/generic/lib/math/exp_helper.cl
new file mode 100644
index 0000000..046f306
--- /dev/null
+++ b/libclc/generic/lib/math/exp_helper.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2014, 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "tables.h"
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Shared tail of the double-precision exponential functions.
+// x: original argument, used only for the NaN and range checks.
+// x_min: arguments below this return 0.0.
+// x_max: arguments above this return +Inf.
+// r: reduced argument fed to the polynomial.
+// n: integer from the caller's reduction, split as n = 64*m + j.
+// Returns the table value for j times the polynomial in r, scaled by 2^m,
+// subject to the NaN / x_max / x_min overrides at the end.
+_CLC_DEF double __clc_exp_helper(double x, double x_min, double x_max, double r, int n) {
+
+ // j: low six bits (table index), m: remaining bits (power of two)
+ int j = n & 0x3f;
+ int m = n >> 6;
+
+ // 6 term tail of Taylor expansion of e^r
+ // i.e. r + r^2/2 + r^3/6 + r^4/24 + r^5/120 + r^6/720
+ double z2 = r * fma(r,
+ fma(r,
+ fma(r,
+ fma(r,
+ fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7),
+ 0x1.5555555555555p-5),
+ 0x1.5555555555555p-3),
+ 0x1.0000000000000p-1),
+ 1.0);
+
+ // NOTE(review): presumably tv.s0 / tv.s1 are the head and tail of 2^(j/64)
+ // -- confirm in tables.h. The sum is (s0 + s1) * z2 + s1 + s0.
+ double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
+ z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;
+
+ // Results at or below the bottom of the normal exponent range take the
+ // two-step scaling path below instead of ldexp
+ int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0));
+
+ // Two-step scaling: multiply by 2^n1 and then 2^n2, with n1 + n2 = m; each
+ // factor is built directly from its exponent bits
+ int n1 = m >> 2;
+ int n2 = m-n1;
+ double z3= z2 * as_double(((long)n1 + 1023) << 52);
+ z3 *= as_double(((long)n2 + 1023) << 52);
+
+ // Both scalings are computed; the select keeps the one that applies
+ z2 = ldexp(z2, m);
+ z2 = small_value ? z3: z2;
+
+ z2 = isnan(x) ? x : z2;
+
+ z2 = x > x_max ? as_double(PINFBITPATT_DP64) : z2;
+ z2 = x < x_min ? 0.0 : z2;
+
+ return z2;
+}
+
+#endif // cl_khr_fp64
diff --git a/libclc/generic/lib/math/exp_helper.h b/libclc/generic/lib/math/exp_helper.h
new file mode 100644
index 0000000..e6df2fd
--- /dev/null
+++ b/libclc/generic/lib/math/exp_helper.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2014, 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+// Shared back end for the double-precision exponential functions, declared
+// only when doubles are available. Takes the original argument x, the
+// lower/upper argument bounds, the reduced argument r and the integer n
+// produced by the caller's range reduction.
+_CLC_DECL double __clc_exp_helper(double x, double x_min, double x_max, double r, int n);
+
+#endif
diff --git a/libclc/generic/lib/math/fmax.cl b/libclc/generic/lib/math/fmax.cl
new file mode 100644
index 0000000..239da3d
--- /dev/null
+++ b/libclc/generic/lib/math/fmax.cl
@@ -0,0 +1,16 @@
+#include <clc/clc.h>
+
+#include "../clcmacro.h"
+
+// NOTE(review): _CLC_DEFINE_BINARY_BUILTIN comes from ../clcmacro.h; presumably
+// it defines fmax for float (and its vector types) on top of __builtin_fmaxf
+// -- confirm.
+_CLC_DEFINE_BINARY_BUILTIN(float, fmax, __builtin_fmaxf, float, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Same for double, on top of __builtin_fmax, when doubles are available
+_CLC_DEFINE_BINARY_BUILTIN(double, fmax, __builtin_fmax, double, double);
+
+#endif
+
+// fmax.inc adds the (vector, scalar) overloads for each non-scalar gentype
+#define __CLC_BODY <fmax.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/fmax.inc b/libclc/generic/lib/math/fmax.inc
new file mode 100644
index 0000000..8315c5f
--- /dev/null
+++ b/libclc/generic/lib/math/fmax.inc
@@ -0,0 +1,18 @@
+
+#ifndef __CLC_SCALAR
+
+// fmax(vector, float): converts y to the element type, casts it to the vector
+// gentype and forwards to the (vector, vector) overload.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fmax(__CLC_GENTYPE x, float y) {
+ const __CLC_SCALAR_GENTYPE y_elem = (__CLC_SCALAR_GENTYPE)y;
+ const __CLC_GENTYPE y_wide = (__CLC_GENTYPE)(y_elem);
+ return fmax(x, y_wide);
+}
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// fmax(vector, double): same forwarding for a double scalar operand.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fmax(__CLC_GENTYPE x, double y) {
+ const __CLC_SCALAR_GENTYPE y_elem = (__CLC_SCALAR_GENTYPE)y;
+ const __CLC_GENTYPE y_wide = (__CLC_GENTYPE)(y_elem);
+ return fmax(x, y_wide);
+}
+
+#endif // ifdef cl_khr_fp64
+
+#endif // !defined(__CLC_SCALAR)
diff --git a/libclc/generic/lib/math/fmin.cl b/libclc/generic/lib/math/fmin.cl
new file mode 100644
index 0000000..28c7d01
--- /dev/null
+++ b/libclc/generic/lib/math/fmin.cl
@@ -0,0 +1,16 @@
+#include <clc/clc.h>
+
+#include "../clcmacro.h"
+
+// NOTE(review): _CLC_DEFINE_BINARY_BUILTIN comes from ../clcmacro.h; presumably
+// it defines fmin for float (and its vector types) on top of __builtin_fminf
+// -- confirm.
+_CLC_DEFINE_BINARY_BUILTIN(float, fmin, __builtin_fminf, float, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Same for double, on top of __builtin_fmin, when doubles are available
+_CLC_DEFINE_BINARY_BUILTIN(double, fmin, __builtin_fmin, double, double);
+
+#endif
+
+// fmin.inc adds the (vector, scalar) overloads for each non-scalar gentype
+#define __CLC_BODY <fmin.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/fmin.inc b/libclc/generic/lib/math/fmin.inc
new file mode 100644
index 0000000..d4b5ac2
--- /dev/null
+++ b/libclc/generic/lib/math/fmin.inc
@@ -0,0 +1,18 @@
+
+#ifndef __CLC_SCALAR
+
+// fmin(vector, float): converts y to the element type, casts it to the vector
+// gentype and forwards to the (vector, vector) overload.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fmin(__CLC_GENTYPE x, float y) {
+ const __CLC_SCALAR_GENTYPE y_elem = (__CLC_SCALAR_GENTYPE)y;
+ const __CLC_GENTYPE y_wide = (__CLC_GENTYPE)(y_elem);
+ return fmin(x, y_wide);
+}
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// fmin(vector, double): same forwarding for a double scalar operand.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fmin(__CLC_GENTYPE x, double y) {
+ const __CLC_SCALAR_GENTYPE y_elem = (__CLC_SCALAR_GENTYPE)y;
+ const __CLC_GENTYPE y_wide = (__CLC_GENTYPE)(y_elem);
+ return fmin(x, y_wide);
+}
+
+#endif // ifdef cl_khr_fp64
+
+#endif // !defined(__CLC_SCALAR)
diff --git a/libclc/generic/lib/math/fmod.cl b/libclc/generic/lib/math/fmod.cl
new file mode 100644
index 0000000..f9a4e31
--- /dev/null
+++ b/libclc/generic/lib/math/fmod.cl
@@ -0,0 +1,12 @@
+#include <clc/clc.h>
+#include "../clcmacro.h"
+
+_CLC_DEFINE_BINARY_BUILTIN(float, fmod, __builtin_fmodf, float, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DEFINE_BINARY_BUILTIN(double, fmod, __builtin_fmod, double, double)
+
+#endif
diff --git a/libclc/generic/lib/math/fract.cl b/libclc/generic/lib/math/fract.cl
new file mode 100644
index 0000000..b434071
--- /dev/null
+++ b/libclc/generic/lib/math/fract.cl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <fract.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/fract.inc b/libclc/generic/lib/math/fract.inc
new file mode 100644
index 0000000..8d2a4d7
--- /dev/null
+++ b/libclc/generic/lib/math/fract.inc
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+// MIN_CONSTANT: largest value below 1.0 in the current precision (1 - 2^-24 for float, 1 - 2^-53 for double); fract() clamps to it.
+#if __CLC_FPSIZE == 32
+#define MIN_CONSTANT 0x1.fffffep-1f
+#else
+#define MIN_CONSTANT 0x1.fffffffffffffp-1
+#endif
+// fract(x, iptr): stores floor(x) in *iptr and returns x - floor(x) capped at MIN_CONSTANT; yields 0 for infinite x and x itself for NaN.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fract(__CLC_GENTYPE x, private __CLC_GENTYPE *iptr) {
+ const __CLC_GENTYPE whole = floor(x);
+ const __CLC_GENTYPE capped = fmin(x - whole, MIN_CONSTANT);
+ const __CLC_GENTYPE finite_or_zero = isinf(x) ? 0.0f : capped;
+ *iptr = whole;
+ return isnan(x) ? x : finite_or_zero;
+}
+
+// FRACT_DEF(addrspace): emits fract() for a pointer in `addrspace` by computing into a private temporary and copying the integral part out.
+#define FRACT_DEF(addrspace) \
+ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fract(__CLC_GENTYPE x, addrspace __CLC_GENTYPE *iptr) { \
+ __CLC_GENTYPE private_iptr; \
+ __CLC_GENTYPE ret = fract(x, &private_iptr); /* resolves to the private-pointer overload */ \
+ *iptr = private_iptr; \
+ return ret; \
+ }
+
+FRACT_DEF(local);
+FRACT_DEF(global);
+
+#undef MIN_CONSTANT
diff --git a/libclc/generic/lib/math/frexp.cl b/libclc/generic/lib/math/frexp.cl
new file mode 100644
index 0000000..acd5d93
--- /dev/null
+++ b/libclc/generic/lib/math/frexp.cl
@@ -0,0 +1,10 @@
+#include <clc/clc.h>
+
+#include "math.h"
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <frexp.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/frexp.inc b/libclc/generic/lib/math/frexp.inc
new file mode 100644
index 0000000..0f5ddea
--- /dev/null
+++ b/libclc/generic/lib/math/frexp.inc
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ * Copyright (c) 2016 Aaron Watry
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#if __CLC_FPSIZE == 32
+#ifdef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(float x, private int *ep) { // returns mantissa with |m| in [0.5, 1); power-of-two exponent goes to *ep
+ int i = as_int(x);
+ int ai = i & 0x7fffffff; // bits of |x|
+ int d = ai > 0 & ai < 0x00800000; // 1 when x is a nonzero subnormal
+ // scale subnormal by 2^26 without multiplying: OR in exponent field 27 (2^-100), then subtract 2^-100
+ float s = as_float(ai | 0x0d800000) - 0x1.0p-100F;
+ ai = d ? as_int(s) : ai;
+ int e = (ai >> 23) - 126 - (d ? 26 : 0); // exponent relative to 0.5; undo the 2^26 scaling for subnormals
+ int t = ai == 0 | e == 129; // zero, or exponent field 255 (inf/NaN): 255 - 126 == 129
+ i = (i & 0x80000000) | 0x3f000000 | (ai & 0x007fffff); // original sign, exponent of 0.5f, original mantissa bits
+ *ep = t ? 0 : e;
+ return t ? x : as_float(i); // zero/inf/NaN are returned unchanged with exponent 0
+}
+#define __CLC_FREXP_VEC(width) /* float vector frexp; same steps as the scalar version, with lane masks + bitselect instead of ?: */ \
+_CLC_OVERLOAD _CLC_DEF float##width frexp(float##width x, private int##width *ep) { \
+ int##width i = as_int##width(x); \
+ int##width ai = i & 0x7fffffff; /* bits of |x| */ \
+ int##width d = ai > 0 & ai < 0x00800000; /* per-lane mask: nonzero subnormal */ \
+ /* scale subnormal by 2^26 without multiplying */ \
+ float##width s = as_float##width(ai | 0x0d800000) - 0x1.0p-100F; \
+ ai = bitselect(ai, as_int##width(s), d); \
+ int##width e = (ai >> 23) - 126 - bitselect((int##width)0, (int##width)26, d); \
+ int##width t = ai == (int##width)0 | e == (int##width)129; /* per-lane mask: zero, or exponent field 255 (inf/NaN) */ \
+ i = (i & (int##width)0x80000000) | (int##width)0x3f000000 | (ai & 0x007fffff); /* sign | exponent of 0.5f | mantissa */ \
+ *ep = bitselect(e, (int##width)0, t); \
+ return bitselect(as_float##width(i), x, as_float##width(t)); /* special lanes return x unchanged */ \
+}
+__CLC_FREXP_VEC(2)
+__CLC_FREXP_VEC(3)
+__CLC_FREXP_VEC(4)
+__CLC_FREXP_VEC(8)
+__CLC_FREXP_VEC(16)
+#undef __CLC_FREXP_VEC
+#endif
+#endif
+
+#if __CLC_FPSIZE == 64
+#ifdef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(__CLC_GENTYPE x, private __CLC_INTN *ep) { // double scalar: mantissa with |m| in [0.5, 1) returned, exponent to *ep
+ long i = as_long(x);
+ long ai = i & 0x7fffffffffffffffL; // bits of |x|
+ int d = ai > 0 & ai < 0x0010000000000000L; // 1 when x is a nonzero subnormal
+ // scale subnormal by 2^54 without multiplying: OR in exponent field 55 (2^-968), then subtract 2^-968
+ double s = as_double(ai | 0x0370000000000000L) - 0x1.0p-968;
+ ai = d ? as_long(s) : ai;
+ int e = (int)(ai >> 52) - 1022 - (d ? 54 : 0); // exponent relative to 0.5; undo the 2^54 scaling for subnormals
+ int t = ai == 0 | e == 1025; // zero, or exponent field 2047 (inf/NaN): 2047 - 1022 == 1025
+ i = (i & 0x8000000000000000L) | 0x3fe0000000000000L | (ai & 0x000fffffffffffffL); // sign | exponent of 0.5 | mantissa
+ *ep = t ? 0 : e;
+ return t ? x : as_double(i); // zero/inf/NaN are returned unchanged with exponent 0
+}
+#define __CLC_FREXP_VEC(width) /* double vector frexp; same steps as the scalar double version, with lane masks + bitselect */ \
+_CLC_OVERLOAD _CLC_DEF double##width frexp(double##width x, private int##width *ep) { \
+ long##width i = as_long##width(x); \
+ long##width ai = i & 0x7fffffffffffffffL; /* bits of |x| */ \
+ long##width d = ai > 0 & ai < 0x0010000000000000L; /* per-lane mask: nonzero subnormal */ \
+ /* scale subnormal by 2^54 without multiplying */ \
+ double##width s = as_double##width(ai | 0x0370000000000000L) - 0x1.0p-968; \
+ ai = bitselect(ai, as_long##width(s), d); \
+ int##width e = convert_int##width(ai >> 52) - 1022 - bitselect((int##width)0, (int##width)54, convert_int##width(d)); \
+ int##width t = convert_int##width(ai == (long##width)0) | (e == (int##width)1025); /* zero, or exponent field 2047 (inf/NaN): 2047 - 1022 == 1025, not the float value 129 */ \
+ i = (i & (long##width)0x8000000000000000L) | (long##width)0x3fe0000000000000L | (ai & 0x000fffffffffffffL); /* sign | exponent of 0.5 | mantissa */ \
+ *ep = bitselect(e, (int##width)0, t); \
+ return bitselect(as_double##width(i), x, as_double##width(convert_long##width(t))); /* special lanes return x unchanged */ \
+}
+__CLC_FREXP_VEC(2)
+__CLC_FREXP_VEC(3)
+__CLC_FREXP_VEC(4)
+__CLC_FREXP_VEC(8)
+__CLC_FREXP_VEC(16)
+#undef __CLC_FREXP_VEC
+#endif
+#endif
+// __CLC_FREXP_DEF(addrspace): emits frexp() for an exponent pointer in `addrspace` by computing into a private temporary and copying it out.
+#define __CLC_FREXP_DEF(addrspace) \
+ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(__CLC_GENTYPE x, addrspace __CLC_INTN *iptr) { \
+ __CLC_INTN private_iptr; \
+ __CLC_GENTYPE ret = frexp(x, &private_iptr); /* resolves to the private-pointer overload */ \
+ *iptr = private_iptr; \
+ return ret; \
+}
+
+__CLC_FREXP_DEF(local);
+__CLC_FREXP_DEF(global);
+
+#undef __CLC_FREXP_DEF
diff --git a/libclc/generic/lib/math/half_rsqrt.cl b/libclc/generic/lib/math/half_rsqrt.cl
new file mode 100644
index 0000000..726f65c
--- /dev/null
+++ b/libclc/generic/lib/math/half_rsqrt.cl
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#define __CLC_BODY <half_rsqrt.inc>
+#define __FLOAT_ONLY
+#include <clc/math/gentype.inc>
+#undef __FLOAT_ONLY
diff --git a/libclc/generic/lib/math/half_rsqrt.inc b/libclc/generic/lib/math/half_rsqrt.inc
new file mode 100644
index 0000000..33ce6c2
--- /dev/null
+++ b/libclc/generic/lib/math/half_rsqrt.inc
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+// half_rsqrt: implemented by forwarding to the full rsqrt() for the same gentype.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE half_rsqrt(__CLC_GENTYPE val) {
+ return rsqrt(val);
+}
diff --git a/libclc/generic/lib/math/half_sqrt.cl b/libclc/generic/lib/math/half_sqrt.cl
new file mode 100644
index 0000000..a02896a
--- /dev/null
+++ b/libclc/generic/lib/math/half_sqrt.cl
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#define __CLC_BODY <half_sqrt.inc>
+#define __FLOAT_ONLY
+#include <clc/math/gentype.inc>
+#undef __FLOAT_ONLY
diff --git a/libclc/generic/lib/math/half_sqrt.inc b/libclc/generic/lib/math/half_sqrt.inc
new file mode 100644
index 0000000..bcdc6a9
--- /dev/null
+++ b/libclc/generic/lib/math/half_sqrt.inc
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+// half_sqrt: implemented by forwarding to the full sqrt() for the same gentype.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE half_sqrt(__CLC_GENTYPE val) {
+ return sqrt(val);
+}
diff --git a/libclc/generic/lib/math/hypot.cl b/libclc/generic/lib/math/hypot.cl
new file mode 100644
index 0000000..eca042c
--- /dev/null
+++ b/libclc/generic/lib/math/hypot.cl
@@ -0,0 +1,8 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <hypot.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/hypot.inc b/libclc/generic/lib/math/hypot.inc
new file mode 100644
index 0000000..036cee7
--- /dev/null
+++ b/libclc/generic/lib/math/hypot.inc
@@ -0,0 +1,3 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE hypot(__CLC_GENTYPE x, __CLC_GENTYPE y) { // direct sqrt(x^2 + y^2), no scaling
+ return sqrt(x*x + y*y); // NOTE(review): x*x / y*y can overflow or underflow even when the true result is representable; hypot(inf, NaN) yields NaN here - confirm acceptable
+}
diff --git a/libclc/generic/lib/math/ldexp.cl b/libclc/generic/lib/math/ldexp.cl
new file mode 100644
index 0000000..9be3127
--- /dev/null
+++ b/libclc/generic/lib/math/ldexp.cl
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+#include "config.h"
+#include "../clcmacro.h"
+#include "math.h"
+#include "math/clc_ldexp.h"
+
+_CLC_DEFINE_BINARY_BUILTIN(float, ldexp, __clc_ldexp, float, int)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DEFINE_BINARY_BUILTIN(double, ldexp, __clc_ldexp, double, int)
+
+#endif
+
+// This defines all the ldexp(GENTYPE, int) variants
+#define __CLC_BODY <ldexp.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/ldexp.inc b/libclc/generic/lib/math/ldexp.inc
new file mode 100644
index 0000000..6e28fbb
--- /dev/null
+++ b/libclc/generic/lib/math/ldexp.inc
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __CLC_SCALAR
+// Non-scalar gentype with a single int exponent: widen n to __CLC_INTN and defer to ldexp(gentype, intn).
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE ldexp(__CLC_GENTYPE x, int n) {
+ return ldexp(x, (__CLC_INTN)n);
+}
+
+#endif
diff --git a/libclc/generic/lib/math/log.cl b/libclc/generic/lib/math/log.cl
new file mode 100644
index 0000000..ec1faa1
--- /dev/null
+++ b/libclc/generic/lib/math/log.cl
@@ -0,0 +1,26 @@
+#include <clc/clc.h>
+#include "../clcmacro.h"
+
+/*
+ * log(x) = log2(x) * (1/log2(e))
+ */
+// Single precision: natural log computed as log2(x) scaled by the constant 1.0f / M_LOG2E_F.
+_CLC_OVERLOAD _CLC_DEF float log(float x)
+{
+ return log2(x) * (1.0f / M_LOG2E_F);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, log, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+// Double precision: natural log computed as log2(x) scaled by the constant 1.0 / M_LOG2E.
+_CLC_OVERLOAD _CLC_DEF double log(double x)
+{
+ return log2(x) * (1.0 / M_LOG2E);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log, double);
+
+#endif // cl_khr_fp64
diff --git a/libclc/generic/lib/math/log10.cl b/libclc/generic/lib/math/log10.cl
new file mode 100644
index 0000000..d65764a
--- /dev/null
+++ b/libclc/generic/lib/math/log10.cl
@@ -0,0 +1,8 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <log10.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/log10.inc b/libclc/generic/lib/math/log10.inc
new file mode 100644
index 0000000..423308a0
--- /dev/null
+++ b/libclc/generic/lib/math/log10.inc
@@ -0,0 +1,13 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE log10(__CLC_GENTYPE val) {
+ // log10(x) = log2(x) / log2(10)
+ // 1 / log2(10) = 0.30102999566 = log10(2)
+ // SP representation is 0.30103 (0x1.344136p-2)
+ // DP representation is 0.301029995659999993762312442414(0x1.34413509E61D8p-2)
+#if __CLC_FPSIZE == 32
+ return log2(val) * 0x1.344136p-2f;
+#elif __CLC_FPSIZE == 64
+ return log2(val) * 0x1.34413509E61D8p-2;
+#else
+#error unknown _CLC_FPSIZE
+#endif
+}
diff --git a/libclc/generic/lib/math/log1p.cl b/libclc/generic/lib/math/log1p.cl
new file mode 100644
index 0000000..be25c64
--- /dev/null
+++ b/libclc/generic/lib/math/log1p.cl
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "tables.h"
+#include "../clcmacro.h"
+// Single-precision log1p(x) = log(1 + x): a small-|x| series and a table-driven result are both computed, then one is selected.
+_CLC_OVERLOAD _CLC_DEF float log1p(float x)
+{
+ float w = x;
+ uint ux = as_uint(x);
+ uint ax = ux & EXSIGNBIT_SP32;
+
+ // Candidate for |x| < 2^-4: series in u = 2x / (2 + x)
+ float u2 = MATH_DIVIDE(x, 2.0f + x);
+ float u = u2 + u2;
+ float v = u * u;
+ // 2/(5 * 2^5), 2/(3 * 2^3)
+ float zsmall = mad(-u2, x, mad(v, 0x1.99999ap-7f, 0x1.555556p-4f) * v * u) + x;
+
+ // Candidate for |x| >= 2^-4: work on the bits of 1 + x
+ ux = as_uint(x + 1.0f);
+
+ int m = (int)((ux >> EXPSHIFTBITS_SP32) & 0xff) - EXPBIAS_SP32;
+ float mf = (float)m;
+ uint indx = (ux & 0x007f0000) + ((ux & 0x00008000) << 1);
+ float F = as_float(indx | 0x3f000000);
+
+ // x > 2^24
+ float fg24 = F - as_float(0x3f000000 | (ux & MANTBITS_SP32));
+
+ // x <= 2^24: xt is (1 + x) - xh, the part of 1 + x not held in the truncated xh
+ uint xhi = ux & 0xffff8000;
+ float xh = as_float(xhi);
+ float xt = (1.0f - xh) + w;
+ uint xnm = ((~(xhi & 0x7f800000)) - 0x00800000) & 0x7f800000;
+ xt = xt * as_float(xnm) * 0.5f;
+ float fl24 = F - as_float(0x3f000000 | (xhi & MANTBITS_SP32)) - xt;
+
+ float f = mf > 24.0f ? fg24 : fl24;
+
+ indx = indx >> 16;
+ float r = f * USE_TABLE(log_inv_tbl, indx);
+
+ // 1/3, 1/2
+ float poly = mad(mad(r, 0x1.555556p-2f, 0x1.0p-1f), r*r, r);
+
+ const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
+ const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
+
+ float2 tv = USE_TABLE(loge_tbl, indx);
+ float z1 = mad(mf, LOG2_HEAD, tv.s0);
+ float z2 = mad(mf, LOG2_TAIL, -poly) + tv.s1;
+ float z = z1 + z2;
+
+ z = ax < 0x3d800000U ? zsmall : z;
+ // 0x3d800000 is the bit pattern of 2^-4: below it the series result replaces the table result
+
+
+ // Edge cases
+ z = ax >= PINFBITPATT_SP32 ? w : z;
+ z = w < -1.0f ? as_float(QNANBITPATT_SP32) : z;
+ z = w == -1.0f ? as_float(NINFBITPATT_SP32) : z;
+ // ax below 0x33800000 (the bit pattern of 2^-24, so this includes subnormals): return x itself
+ z = ax < 0x33800000 ? x : z;
+
+ return z;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, log1p, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+// Double-precision log1p(x) = log(1 + x): a table-driven result and a near-zero series result are both computed, then one is selected.
+_CLC_OVERLOAD _CLC_DEF double log1p(double x)
+{
+ // Computes natural log(1+x). Algorithm based on:
+ // Ping-Tak Peter Tang
+ // "Table-driven implementation of the logarithm function in IEEE
+ // floating-point arithmetic"
+ // ACM Transactions on Mathematical Software (TOMS)
+ // Volume 16, Issue 4 (December 1990)
+ // Note that we use a lookup table of size 64 rather than 128,
+ // and compensate by having extra terms in the minimax polynomial
+ // for the kernel approximation.
+
+ // Table-driven path (result1): selected at the end when x is outside [log1p_thresh1, log1p_thresh2]
+ ulong ux = as_ulong(1.0 + x);
+ int xexp = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64;
+ double f = as_double(ONEEXPBITS_DP64 | (ux & MANTBITS_DP64));
+
+ int j = as_int2(ux).hi >> 13;
+ j = ((0x80 | (j & 0x7e)) >> 1) + (j & 0x1); // 64 + top six mantissa bits, rounded up by the seventh: j in [64, 128]
+ double f1 = (double)j * 0x1.0p-6;
+ j -= 64; // table index
+
+ double f2temp = f - f1;
+ double m2 = as_double(convert_ulong(0x3ff - xexp) << EXPSHIFTBITS_DP64);
+ double f2l = fma(m2, x, m2 - f1);
+ double f2g = fma(m2, x, -f1) + m2;
+ double f2 = xexp <= MANTLENGTH_DP64-1 ? f2l : f2g;
+ f2 = (xexp <= -2) | (xexp >= MANTLENGTH_DP64+8) ? f2temp : f2;
+
+ double2 tv = USE_TABLE(ln_tbl, j);
+ double z1 = tv.s0;
+ double q = tv.s1;
+
+ double u = MATH_DIVIDE(f2, fma(0.5, f2, f1));
+ double v = u * u;
+
+ double poly = v * fma(v,
+ fma(v, 2.23219810758559851206e-03, 1.24999999978138668903e-02),
+ 8.33333333333333593622e-02);
+
+ // log2_lead and log2_tail sum to an extra-precise version of log(2)
+ const double log2_lead = 6.93147122859954833984e-01; /* 0x3fe62e42e0000000 */
+ const double log2_tail = 5.76999904754328540596e-08; /* 0x3e6efa39ef35793c */
+
+ double z2 = q + fma(u, poly, u);
+ double dxexp = (double)xexp;
+ double r1 = fma(dxexp, log2_lead, z1);
+ double r2 = fma(dxexp, log2_tail, z2);
+ double result1 = r1 + r2;
+
+ // Near-zero path (result2): series in u = 2x / (2 + x), kept when x lies within the thresholds below
+ double r = x;
+ u = r / (2.0 + r);
+ double correction = r * u;
+ u = u + u;
+ v = u * u;
+ r1 = r;
+
+ poly = fma(v,
+ fma(v,
+ fma(v, 4.34887777707614552256e-04, 2.23213998791944806202e-03),
+ 1.25000000037717509602e-02),
+ 8.33333333333317923934e-02);
+
+ r2 = fma(u*v, poly, -correction);
+
+ // The values exp(-1/16)-1 and exp(1/16)-1
+ const double log1p_thresh1 = -0x1.f0540438fd5c3p-5;
+ const double log1p_thresh2 = 0x1.082b577d34ed8p-4;
+ double result2 = r1 + r2;
+ result2 = x < log1p_thresh1 | x > log1p_thresh2 ? result1 : result2;
+
+ result2 = isinf(x) ? x : result2; // -inf is overridden by the x < -1.0 case on the next line
+ result2 = x < -1.0 ? as_double(QNANBITPATT_DP64) : result2;
+ result2 = x == -1.0 ? as_double(NINFBITPATT_DP64) : result2;
+ return result2;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log1p, double);
+
+#endif // cl_khr_fp64
diff --git a/libclc/generic/lib/math/log2.cl b/libclc/generic/lib/math/log2.cl
new file mode 100644
index 0000000..8776a80
--- /dev/null
+++ b/libclc/generic/lib/math/log2.cl
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+#include "../clcmacro.h"
+#include "tables.h"
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif // cl_khr_fp64
+
+#define COMPILING_LOG2
+#include "log_base.h"
+#undef COMPILING_LOG2
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, log2, float);
+
+#ifdef cl_khr_fp64
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log2, double);
+#endif // cl_khr_fp64
diff --git a/libclc/generic/lib/math/log_base.h b/libclc/generic/lib/math/log_base.h
new file mode 100644
index 0000000..bf2f82b
--- /dev/null
+++ b/libclc/generic/lib/math/log_base.h
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "math.h"
+
+/*
+ Algorithm:
+
+ Based on:
+ Ping-Tak Peter Tang
+ "Table-driven implementation of the logarithm function in IEEE
+ floating-point arithmetic"
+ ACM Transactions on Mathematical Software (TOMS)
+ Volume 16, Issue 4 (December 1990)
+
+
+ x very close to 1.0 is handled differently, for x everywhere else
+ a brief explanation is given below
+
+ x = (2^m)*A
+ x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-8))
+ x = (2^m)*2*(G/2+g/2)
+ x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-9))
+
+ Y = (2^(-1))*(2^(-m))*(2^m)*A
+ Now, range of Y is: 0.5 <= Y < 1
+
+ F = 0x80 + (first 7 mantissa bits) + (8th mantissa bit)
+ Now, range of F is: 128 <= F <= 256
+ F = F / 256
+ Now, range of F is: 0.5 <= F <= 1
+
+ f = -(Y-F), with (f <= 2^(-9))
+
+ log(x) = m*log(2) + log(2) + log(F-f)
+ log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))
+ log(x) = m*log(2) + log(2*F) + log(1-r)
+
+ r = (f/F), with (r <= 2^(-8))
+ r = f*(1/F) with (1/F) precomputed to avoid division
+
+ log(x) = m*log(2) + log(G) - poly
+
+ log(G) is precomputed
+ poly = (r + (r^2)/2 + (r^3)/3 + (r^4)/4 + (r^5)/5)
+
+ log(2) and log(G) need to be maintained in extra precision
+ to avoid losing precision in the calculations
+
+
+ For x close to 1.0, we employ the following technique to
+ ensure faster convergence.
+
+ log(x) = log((1+s)/(1-s)) = 2*s + (2/3)*s^3 + (2/5)*s^5 + (2/7)*s^7
+ x = ((1+s)/(1-s))
+ x = 1 + r
+ s = r/(2+r)
+
+*/
+
+_CLC_OVERLOAD _CLC_DEF float // single-precision logarithm; the COMPILING_* macros pick the function name and the base
+#if defined(COMPILING_LOG2)
+log2(float x)
+#elif defined(COMPILING_LOG10)
+log10(float x)
+#else
+log(float x)
+#endif
+{ // result: x itself for +inf/NaN, NaN for any input with the sign bit set, -inf for +/-0 (see "Corner cases")
+
+#if defined(COMPILING_LOG2)
+ const float LOG2E = 0x1.715476p+0f; // 1.4426950408889634
+ const float LOG2E_HEAD = 0x1.700000p+0f; // 1.4375 (leading part of LOG2E)
+ const float LOG2E_TAIL = 0x1.547652p-8f; // 0.00519504072 (HEAD + TAIL ~= LOG2E)
+#elif defined(COMPILING_LOG10)
+ USE_TABLE(float2, p_log, LOG10_TBL); // presumably binds p_log to the log10 table from tables.h -- confirm
+ const float LOG10E = 0x1.bcb7b2p-2f; // 0.43429448190325182
+ const float LOG10E_HEAD = 0x1.bc0000p-2f; // 0.43359375 (leading part of LOG10E)
+ const float LOG10E_TAIL = 0x1.6f62a4p-11f; // 0.0007007319 (HEAD + TAIL ~= LOG10E)
+ const float LOG10_2_HEAD = 0x1.340000p-2f; // 0.30078125 (leading part of log10(2))
+ const float LOG10_2_TAIL = 0x1.04d426p-12f; // 0.000248745637 (HEAD + TAIL ~= log10(2))
+#else
+ USE_TABLE(float2, p_log, LOGE_TBL); // presumably binds p_log to the natural-log table from tables.h -- confirm
+ const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234 (leading part of ln(2))
+ const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833 (HEAD + TAIL ~= ln(2))
+#endif
+
+ uint xi = as_uint(x); // raw bit pattern of x
+ uint ax = xi & EXSIGNBIT_SP32; // bit pattern of |x| (sign bit cleared)
+
+ // Calculations for |x-1| < 2^-4
+ float r = x - 1.0f;
+ int near1 = fabs(r) < 0x1.0p-4f; // decides at the end which of the two results is returned
+ float u2 = MATH_DIVIDE(r, 2.0f + r); // s = r/(2+r) from the algorithm notes
+ float corr = u2 * r;
+ float u = u2 + u2; // u = 2*s
+ float v = u * u;
+ float znear1, z1, z2;
+
+ // 2/(5 * 2^5), 2/(3 * 2^3)
+ z2 = mad(u, mad(v, 0x1.99999ap-7f, 0x1.555556p-4f)*v, -corr); // u*v*(c3 + c5*v) - corr
+
+#if defined(COMPILING_LOG2)
+ z1 = as_float(as_int(r) & 0xffff0000); // head of r: low 16 bits cleared
+ z2 = z2 + (r - z1); // fold the bits dropped from r into the tail
+ znear1 = mad(z1, LOG2E_HEAD, mad(z2, LOG2E_HEAD, mad(z1, LOG2E_TAIL, z2*LOG2E_TAIL))); // (z1 + z2) * (HEAD + TAIL), smallest terms summed first
+#elif defined(COMPILING_LOG10)
+ z1 = as_float(as_int(r) & 0xffff0000); // head of r: low 16 bits cleared
+ z2 = z2 + (r - z1); // fold the bits dropped from r into the tail
+ znear1 = mad(z1, LOG10E_HEAD, mad(z2, LOG10E_HEAD, mad(z1, LOG10E_TAIL, z2*LOG10E_TAIL))); // (z1 + z2) * (HEAD + TAIL), smallest terms summed first
+#else
+ znear1 = z2 + r;
+#endif
+
+ // Calculations for x not near 1
+ int m = (int)(xi >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; // unbiased exponent (meaningful for positive x; negative x is overridden below)
+
+ // Normalize subnormal
+ uint xis = as_uint(as_float(xi | 0x3f800000) - 1.0f); // put the mantissa under the exponent of 1.0, then subtract 1.0 to renormalize it
+ int ms = (int)(xis >> EXPSHIFTBITS_SP32) - 253; // 253 = 127 (bias) + 126 (scaling introduced by the line above)
+ int c = m == -127; // exponent field is zero: subnormal (or zero) input
+ m = c ? ms : m;
+ uint xin = c ? xis : xi;
+
+ float mf = (float)m;
+ uint indx = (xin & 0x007f0000) + ((xin & 0x00008000) << 1); // top 7 mantissa bits, rounded up by the 8th
+
+ // F - Y
+ float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (xin & MANTBITS_SP32)); // both operands carry the exponent of 0.5; F becomes 1.0 if the rounding carried
+
+ indx = indx >> 16; // table index, 0..128
+ r = f * USE_TABLE(log_inv_tbl, indx); // r = f * (1/F) per the notes; table contents are defined elsewhere -- confirm
+
+ // 1/3, 1/2
+ float poly = mad(mad(r, 0x1.555556p-2f, 0.5f), r*r, r); // r + r^2/2 + r^3/3
+
+#if defined(COMPILING_LOG2)
+ float2 tv = USE_TABLE(log2_tbl, indx); // presumably a (head, tail) pair for log2 of the table point -- confirm
+ z1 = tv.s0 + mf;
+ z2 = mad(poly, -LOG2E, tv.s1); // tail - poly*log2(e)
+#elif defined(COMPILING_LOG10)
+ float2 tv = p_log[indx]; // presumably a (head, tail) pair -- confirm
+ z1 = mad(mf, LOG10_2_HEAD, tv.s0);
+ z2 = mad(poly, -LOG10E, mf*LOG10_2_TAIL) + tv.s1;
+#else
+ float2 tv = p_log[indx]; // presumably a (head, tail) pair -- confirm
+ z1 = mad(mf, LOG2_HEAD, tv.s0);
+ z2 = mad(mf, LOG2_TAIL, -poly) + tv.s1;
+#endif
+
+ float z = z1 + z2;
+ z = near1 ? znear1 : z;
+
+ // Corner cases
+ z = ax >= PINFBITPATT_SP32 ? x : z; // inf or NaN magnitude: pass x through
+ z = xi != ax ? as_float(QNANBITPATT_SP32) : z; // sign bit set: NaN
+ z = ax == 0 ? as_float(NINFBITPATT_SP32) : z; // +/-0: -inf (applied last, so it wins for -0)
+
+ return z;
+}
+
+#ifdef cl_khr_fp64
+
+_CLC_OVERLOAD _CLC_DEF double // double-precision logarithm; the COMPILING_* macros pick the function name and the base
+#if defined(COMPILING_LOG2)
+log2(double x)
+#elif defined(COMPILING_LOG10)
+log10(double x)
+#else
+log(double x)
+#endif
+{ // result: +inf for +inf, NaN for NaN or x < 0, -inf for x == 0 (see the last three assignments)
+
+#ifndef COMPILING_LOG2
+ // log2_lead and log2_tail sum to an extra-precise version of ln(2)
+ const double log2_lead = 6.93147122859954833984e-01; /* 0x3fe62e42e0000000 */
+ const double log2_tail = 5.76999904754328540596e-08; /* 0x3e6efa39ef35793c */
+#endif
+
+#if defined(COMPILING_LOG10)
+ // log10e_lead and log10e_tail sum to an extra-precision version of log10(e) (19 bits in lead)
+ const double log10e_lead = 4.34293746948242187500e-01; /* 0x3fdbcb7800000000 */
+ const double log10e_tail = 7.3495500964015109100644e-7; /* 0x3ea8a93728719535 */
+#elif defined(COMPILING_LOG2)
+ // log2e_lead and log2e_tail sum to an extra-precision version of log2(e) (19 bits in lead)
+ const double log2e_lead = 1.44269180297851562500E+00; /* 0x3FF7154400000000 */
+ const double log2e_tail = 3.23791044778235969970E-06; /* 0x3ECB295C17F0BBBE */
+#endif
+
+ // log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000
+ // log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000
+ const double log_thresh1 = 0x1.e0faap-1;
+ const double log_thresh2 = 0x1.1082cp+0;
+
+ int is_near = x >= log_thresh1 & x <= log_thresh2; // bitwise & of two 0/1 comparison results
+
+ // Near 1 code
+ double r = x - 1.0;
+ double u = r / (2.0 + r); // s = r/(2+r) from the algorithm notes
+ double correction = r * u;
+ u = u + u; // u = 2*s
+ double v = u * u;
+ double r1 = r;
+
+ const double ca_1 = 8.33333333333317923934e-02; /* 0x3fb55555555554e6 */ // ~ 1/12
+ const double ca_2 = 1.25000000037717509602e-02; /* 0x3f89999999bac6d4 */ // ~ 1/80
+ const double ca_3 = 2.23213998791944806202e-03; /* 0x3f62492307f1519f */ // ~ 1/448
+ const double ca_4 = 4.34887777707614552256e-04; /* 0x3f3c8034c85dfff0 */ // ~ 1/2304
+
+ double r2 = fma(u*v, fma(v, fma(v, fma(v, ca_4, ca_3), ca_2), ca_1), -correction); // u*v*(ca_1 + ca_2*v + ca_3*v^2 + ca_4*v^3) - correction
+
+#if defined(COMPILING_LOG10)
+ r = r1;
+ r1 = as_double(as_ulong(r1) & 0xffffffff00000000); // head of r: low 32 bits cleared
+ r2 = r2 + (r - r1); // fold the bits dropped from r into the tail
+ double ret_near = fma(log10e_lead, r1, fma(log10e_lead, r2, fma(log10e_tail, r1, log10e_tail * r2))); // (r1 + r2) * (lead + tail), smallest terms summed first
+#elif defined(COMPILING_LOG2)
+ r = r1;
+ r1 = as_double(as_ulong(r1) & 0xffffffff00000000); // head of r: low 32 bits cleared
+ r2 = r2 + (r - r1); // fold the bits dropped from r into the tail
+ double ret_near = fma(log2e_lead, r1, fma(log2e_lead, r2, fma(log2e_tail, r1, log2e_tail*r2))); // (r1 + r2) * (lead + tail), smallest terms summed first
+#else
+ double ret_near = r1 + r2;
+#endif
+
+ // This is the far from 1 code
+
+ // Deal with subnormal
+ ulong ux = as_ulong(x);
+ ulong uxs = as_ulong(as_double(0x03d0000000000000UL | ux) - 0x1.0p-962); // give the mantissa exponent 2^-962 and subtract 2^-962: x scaled by 2^60 and renormalized
+ int c = ux < IMPBIT_DP64; // sign and exponent fields both zero: positive subnormal (or +0)
+ ux = c ? uxs : ux;
+ int expadjust = c ? 60 : 0; // undo the 2^60 scaling in the exponent
+
+ int xexp = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64 - expadjust; // unbiased exponent, corrected for the subnormal scaling
+ double f = as_double(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64)); // mantissa under the exponent of 0.5, so f is in [0.5, 1)
+ int index = as_int2(ux).hi >> 13; // low 7 bits now hold the top 7 mantissa bits
+ index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1); // 64 + top 6 mantissa bits, rounded up by the 7th: 64..128
+
+ double2 tv = USE_TABLE(ln_tbl, index - 64); // table index 0..64; presumably a (lead, tail) pair for ln of the table point -- confirm
+ double z1 = tv.s0;
+ double q = tv.s1;
+
+ double f1 = index * 0x1.0p-7; // F = index/128
+ double f2 = f - f1;
+ u = f2 / fma(f2, 0.5, f1); // f2 / (f1 + f2/2)
+ v = u * u;
+
+ const double cb_1 = 8.33333333333333593622e-02; /* 0x3fb5555555555557 */ // ~ 1/12
+ const double cb_2 = 1.24999999978138668903e-02; /* 0x3f89999999865ede */ // ~ 1/80
+ const double cb_3 = 2.23219810758559851206e-03; /* 0x3f6249423bd94741 */ // ~ 1/448
+
+ double poly = v * fma(v, fma(v, cb_3, cb_2), cb_1); // cb_1*v + cb_2*v^2 + cb_3*v^3
+ double z2 = q + fma(u, poly, u); // q + u*(1 + poly)
+
+ double dxexp = (double)xexp;
+#if defined (COMPILING_LOG10)
+ // Add xexp * log(2) to z1,z2 to get log(x)
+ r1 = fma(dxexp, log2_lead, z1);
+ r2 = fma(dxexp, log2_tail, z2);
+ double ret_far = fma(log10e_lead, r1, fma(log10e_lead, r2, fma(log10e_tail, r1, log10e_tail*r2))); // natural log (r1 + r2) scaled by log10(e)
+#elif defined(COMPILING_LOG2)
+ r1 = fma(log2e_lead, z1, dxexp); // the exponent contributes to log2 directly
+ r2 = fma(log2e_lead, z2, fma(log2e_tail, z1, log2e_tail*z2));
+ double ret_far = r1 + r2;
+#else
+ r1 = fma(dxexp, log2_lead, z1); // xexp * ln(2), lead part
+ r2 = fma(dxexp, log2_tail, z2); // xexp * ln(2), tail part
+ double ret_far = r1 + r2;
+#endif
+
+ double ret = is_near ? ret_near : ret_far;
+
+ ret = isinf(x) ? as_double(PINFBITPATT_DP64) : ret; // -inf is turned into NaN by the next line
+ ret = isnan(x) | (x < 0.0) ? as_double(QNANBITPATT_DP64) : ret;
+ ret = x == 0.0 ? as_double(NINFBITPATT_DP64) : ret; // matches both +0 and -0
+ return ret;
+}
+
+#endif // cl_khr_fp64
diff --git a/libclc/generic/lib/math/mad.cl b/libclc/generic/lib/math/mad.cl
new file mode 100644
index 0000000..6c7b90d
--- /dev/null
+++ b/libclc/generic/lib/math/mad.cl
@@ -0,0 +1,8 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <mad.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/mad.inc b/libclc/generic/lib/math/mad.inc
new file mode 100644
index 0000000..d32c783
--- /dev/null
+++ b/libclc/generic/lib/math/mad.inc
@@ -0,0 +1,3 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mad(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c) { // generic multiply-add, instantiated once per __CLC_GENTYPE
+ return a * b + c; // written as a plain expression, not a call to fma()
+}
diff --git a/libclc/generic/lib/math/math.h b/libclc/generic/lib/math/math.h
new file mode 100644
index 0000000..f46c7ea
--- /dev/null
+++ b/libclc/generic/lib/math/math.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define SNAN 0x001 // the ten values below are distinct single-bit flags, so they can be OR-ed into a mask
+#define QNAN 0x002
+#define NINF 0x004
+#define NNOR 0x008
+#define NSUB 0x010
+#define NZER 0x020
+#define PZER 0x040
+#define PSUB 0x080
+#define PNOR 0x100
+#define PINF 0x200
+
+#define HAVE_HW_FMA32() (1) // compile-time feature switches; presumably describe the generic target -- confirm
+#define HAVE_BITALIGN() (0)
+#define HAVE_FAST_FMA32() (0)
+
+#define MATH_DIVIDE(X, Y) ((X) / (Y)) // plain division
+#define MATH_RECIP(X) (1.0f / (X)) // reciprocal with a float literal numerator
+#define MATH_SQRT(X) sqrt(X)
+
+#define SIGNBIT_SP32 0x80000000
+#define EXSIGNBIT_SP32 0x7fffffff
+#define EXPBITS_SP32 0x7f800000
+#define MANTBITS_SP32 0x007fffff
+#define ONEEXPBITS_SP32 0x3f800000
+#define TWOEXPBITS_SP32 0x40000000
+#define HALFEXPBITS_SP32 0x3f000000
+#define IMPBIT_SP32 0x00800000
+#define QNANBITPATT_SP32 0x7fc00000
+#define INDEFBITPATT_SP32 0xffc00000
+#define PINFBITPATT_SP32 0x7f800000
+#define NINFBITPATT_SP32 0xff800000
+#define EXPBIAS_SP32 127
+#define EXPSHIFTBITS_SP32 23
+#define BIASEDEMIN_SP32 1
+#define EMIN_SP32 -126
+#define BIASEDEMAX_SP32 254
+#define EMAX_SP32 127
+#define LAMBDA_SP32 1.0e30
+#define MANTLENGTH_SP32 24
+#define BASEDIGITS_SP32 7
+
+#ifdef cl_khr_fp64
+
+#define SIGNBIT_DP64 0x8000000000000000L
+#define EXSIGNBIT_DP64 0x7fffffffffffffffL
+#define EXPBITS_DP64 0x7ff0000000000000L
+#define MANTBITS_DP64 0x000fffffffffffffL
+#define ONEEXPBITS_DP64 0x3ff0000000000000L
+#define TWOEXPBITS_DP64 0x4000000000000000L
+#define HALFEXPBITS_DP64 0x3fe0000000000000L
+#define IMPBIT_DP64 0x0010000000000000L
+#define QNANBITPATT_DP64 0x7ff8000000000000L
+#define INDEFBITPATT_DP64 0xfff8000000000000L
+#define PINFBITPATT_DP64 0x7ff0000000000000L
+#define NINFBITPATT_DP64 0xfff0000000000000L
+#define EXPBIAS_DP64 1023
+#define EXPSHIFTBITS_DP64 52
+#define BIASEDEMIN_DP64 1
+#define EMIN_DP64 -1022
+#define BIASEDEMAX_DP64 2046 /* 0x7fe */
+#define EMAX_DP64 1023 /* 0x3ff */
+#define LAMBDA_DP64 1.0e300
+#define MANTLENGTH_DP64 53
+#define BASEDIGITS_DP64 15
+
+#endif // cl_khr_fp64
+
+#define ALIGNED(x) __attribute__((aligned(x)))
diff --git a/libclc/generic/lib/math/modf.cl b/libclc/generic/lib/math/modf.cl
new file mode 100644
index 0000000..3294fbc
--- /dev/null
+++ b/libclc/generic/lib/math/modf.cl
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <modf.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/modf.inc b/libclc/generic/lib/math/modf.inc
new file mode 100644
index 0000000..1486b76
--- /dev/null
+++ b/libclc/generic/lib/math/modf.inc
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, __CLC_GENTYPE *iptr) { // splits x: integral part stored through iptr, fractional part returned
+ *iptr = trunc(x); // integral part, rounded toward zero
+ return copysign(isinf(x) ? 0.0f : x - *iptr, x); // fraction is forced to 0 for infinite x (inf - inf would be NaN); the result always takes the sign of x
+}
+
+#define MODF_DEF(addrspace) /* modf overload whose iptr lives in the given address space */ \
+ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, addrspace __CLC_GENTYPE *iptr) { \
+ __CLC_GENTYPE private_iptr; \
+ __CLC_GENTYPE ret = modf(x, &private_iptr); /* delegate to the overload taking an unqualified pointer */ \
+ *iptr = private_iptr; /* copy the integral part out to the caller's address space */ \
+ return ret; \
+}
+
+MODF_DEF(local); // overload for local pointers
+MODF_DEF(global); // overload for global pointers
diff --git a/libclc/generic/lib/math/native_log.cl b/libclc/generic/lib/math/native_log.cl
new file mode 100644
index 0000000..f64f012
--- /dev/null
+++ b/libclc/generic/lib/math/native_log.cl
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#define __CLC_FUNCTION __clc_native_log // name of the helper to be declared by unary_intrin.inc
+#define __CLC_INTRINSIC "llvm.log" // intrinsic the helper is presumably bound to -- confirm in unary_intrin.inc
+#undef cl_khr_fp64 // hides fp64 from the includes below; presumably so that no double variants are emitted -- confirm
+#include <clc/math/unary_intrin.inc>
+
+#define __CLC_BODY <native_log.inc>
+#define __FLOAT_ONLY // presumably restricts gentype.inc to the float types -- confirm
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/native_log.inc b/libclc/generic/lib/math/native_log.inc
new file mode 100644
index 0000000..cb4db3f
--- /dev/null
+++ b/libclc/generic/lib/math/native_log.inc
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE native_log(__CLC_GENTYPE val) { // thin wrapper, instantiated once per __CLC_GENTYPE
+ return __clc_native_log(val); // helper set up in native_log.cl via __CLC_FUNCTION / __CLC_INTRINSIC "llvm.log"
+}
diff --git a/libclc/generic/lib/math/native_log2.cl b/libclc/generic/lib/math/native_log2.cl
new file mode 100644
index 0000000..35ed18b
--- /dev/null
+++ b/libclc/generic/lib/math/native_log2.cl
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#define __CLC_FUNCTION __clc_native_log2 // name of the helper to be declared by unary_intrin.inc
+#define __CLC_INTRINSIC "llvm.log2" // intrinsic the helper is presumably bound to -- confirm in unary_intrin.inc
+#undef cl_khr_fp64 // hides fp64 from the includes below; presumably so that no double variants are emitted -- confirm
+#include <clc/math/unary_intrin.inc>
+
+#define __CLC_BODY <native_log2.inc>
+#define __FLOAT_ONLY // presumably restricts gentype.inc to the float types -- confirm
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/native_log2.inc b/libclc/generic/lib/math/native_log2.inc
new file mode 100644
index 0000000..0f6a509
--- /dev/null
+++ b/libclc/generic/lib/math/native_log2.inc
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE native_log2(__CLC_GENTYPE val) { // thin wrapper, instantiated once per __CLC_GENTYPE
+ return __clc_native_log2(val); // helper set up in native_log2.cl via __CLC_FUNCTION / __CLC_INTRINSIC "llvm.log2"
+}
diff --git a/libclc/generic/lib/math/nextafter.cl b/libclc/generic/lib/math/nextafter.cl
new file mode 100644
index 0000000..cbe54cd
--- /dev/null
+++ b/libclc/generic/lib/math/nextafter.cl
@@ -0,0 +1,12 @@
+#include <clc/clc.h>
+#include "../clcmacro.h"
+
+_CLC_DEFINE_BINARY_BUILTIN(float, nextafter, __builtin_nextafterf, float, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DEFINE_BINARY_BUILTIN(double, nextafter, __builtin_nextafter, double, double)
+
+#endif
diff --git a/libclc/generic/lib/math/pown.cl b/libclc/generic/lib/math/pown.cl
new file mode 100644
index 0000000..f3b27d4
--- /dev/null
+++ b/libclc/generic/lib/math/pown.cl
@@ -0,0 +1,10 @@
+#include <clc/clc.h>
+#include "../clcmacro.h"
+
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, pown, float, int)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, pown, double, int)
+#endif
diff --git a/libclc/generic/lib/math/sin.cl b/libclc/generic/lib/math/sin.cl
new file mode 100644
index 0000000..3a40749
--- /dev/null
+++ b/libclc/generic/lib/math/sin.cl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "sincos_helpers.h"
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float sin(float x) // single-precision sine: reduce |x|, evaluate a kernel, then restore the sign
+{
+ int ix = as_int(x); // raw bits of x
+ int ax = ix & 0x7fffffff; // bits of |x|
+ float dx = as_float(ax); // |x|
+
+ float r0, r1;
+ int regn = __clc_argReductionS(&r0, &r1, dx); // presumably returns the quadrant and leaves the reduced argument in r0 (head) / r1 (tail) -- confirm
+
+ float ss = __clc_sinf_piby4(r0, r1);
+ float cc = __clc_cosf_piby4(r0, r1);
+
+ float s = (regn & 1) != 0 ? cc : ss; // odd region: take the cosine kernel
+ s = as_float(as_int(s) ^ ((regn > 1) << 31) ^ (ix ^ ax)); // flip the sign when regn > 1, then apply the sign bit of x (ix ^ ax isolates it)
+
+ s = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : s; // inf or NaN input: NaN
+
+ //Subnormals
+ s = x == 0.0f ? x : s; // any x comparing equal to zero is returned unchanged (presumably also covers flushed subnormals -- confirm)
+
+ return s;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, sin, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double sin(double x) { // double-precision sine: reduce |x|, evaluate the sin/cos kernel, then restore the sign
+ double y = fabs(x);
+
+ double r, rr; // reduced argument; presumably head and tail -- confirm in sincos_helpers
+ int regn;
+
+ if (y < 0x1.0p+47) // below 2^47 the "medium" reduction is used, otherwise the "large" one
+ __clc_remainder_piby2_medium(y, &r, &rr, &regn);
+ else
+ __clc_remainder_piby2_large(y, &r, &rr, &regn);
+
+ double2 sc = __clc_sincos_piby4(r, rr); // presumably .lo = sin, .hi = cos -- confirm
+
+ int2 s = as_int2(regn & 1 ? sc.hi : sc.lo); // odd region: take the .hi component
+ s.hi ^= ((regn > 1) << 31) ^ ((x < 0.0) << 31); // the upper word holds the sign bit: flip for regn > 1 and for negative x
+
+ return isinf(x) | isnan(x) ? as_double(QNANBITPATT_DP64) : as_double(s); // inf or NaN input: NaN
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sin, double);
+
+#endif
diff --git a/libclc/generic/lib/math/sincos.cl b/libclc/generic/lib/math/sincos.cl
new file mode 100644
index 0000000..eace5ad
--- /dev/null
+++ b/libclc/generic/lib/math/sincos.cl
@@ -0,0 +1,8 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <sincos.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/sincos.inc b/libclc/generic/lib/math/sincos.inc
new file mode 100644
index 0000000..e97f0f9
--- /dev/null
+++ b/libclc/generic/lib/math/sincos.inc
@@ -0,0 +1,11 @@
+#define __CLC_DECLARE_SINCOS(ADDRSPACE, TYPE) /* sincos overload for one address space of the cosval pointer */ \
+ _CLC_OVERLOAD _CLC_DEF TYPE sincos (TYPE x, ADDRSPACE TYPE * cosval) { \
+ *cosval = cos(x); /* cosine is stored through the pointer */ \
+ return sin(x); /* sine is the return value; sin and cos are evaluated independently */ \
+ }
+
+__CLC_DECLARE_SINCOS(global, __CLC_GENTYPE) // one overload per address space
+__CLC_DECLARE_SINCOS(local, __CLC_GENTYPE)
+__CLC_DECLARE_SINCOS(private, __CLC_GENTYPE)
+
+#undef __CLC_DECLARE_SINCOS
diff --git a/libclc/generic/lib/math/sincosD_piby4.h b/libclc/generic/lib/math/sincosD_piby4.h
new file mode 100644
index 0000000..a00db85
--- /dev/null
+++ b/libclc/generic/lib/math/sincosD_piby4.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_INLINE double2 // returns sin in .lo and cos in .hi for the argument x + xx
+__libclc__sincos_piby4(double x, double xx) // x: head of the argument, xx: its tail (see notes below)
+{
+ // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+ // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+ // = x * f(w)
+ // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+ // We use a minimax approximation of (f(w) - 1) / w
+ // because this produces an expansion in even powers of x.
+ // If xx (the tail of x) is non-zero, we add a correction
+ // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
+ // is an approximation to cos(x)*sin(xx) valid because
+ // xx is tiny relative to x.
+
+ // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+ // = f(w)
+ // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+ // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+ // because this produces an expansion in even powers of x.
+ // If xx (the tail of x) is non-zero, we subtract a correction
+ // term g(x,xx) = x*xx to the result, where g(x,xx)
+ // is an approximation to sin(x)*sin(xx) valid because
+ // xx is tiny relative to x.
+
+ const double sc1 = -0.166666666666666646259241729; // sine coefficients; sc1 ~ -1/3!
+ const double sc2 = 0.833333333333095043065222816e-2; // ~ 1/5!
+ const double sc3 = -0.19841269836761125688538679e-3; // ~ -1/7!
+ const double sc4 = 0.275573161037288022676895908448e-5; // ~ 1/9!
+ const double sc5 = -0.25051132068021699772257377197e-7; // ~ -1/11!
+ const double sc6 = 0.159181443044859136852668200e-9; // ~ 1/13!
+
+ const double cc1 = 0.41666666666666665390037e-1; // cosine coefficients; cc1 ~ 1/4!
+ const double cc2 = -0.13888888888887398280412e-2; // ~ -1/6!
+ const double cc3 = 0.248015872987670414957399e-4; // ~ 1/8!
+ const double cc4 = -0.275573172723441909470836e-6; // ~ -1/10!
+ const double cc5 = 0.208761463822329611076335e-8; // ~ 1/12!
+ const double cc6 = -0.113826398067944859590880e-10; // ~ -1/14!
+
+ double x2 = x * x; // w in the notes above
+ double x3 = x2 * x;
+ double r = 0.5 * x2;
+ double t = 1.0 - r; // leading terms of the cosine: 1 - w/2
+
+ double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2); // Horner form: sc2 + sc3*w + sc4*w^2 + sc5*w^3 + sc6*w^4
+
+ double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1),
+ x2*x2, fma(x, xx, (1.0 - t) - r)); // (1.0 - t) - r recovers the rounding error of t; x*xx is the tail term
+
+ double2 ret;
+ ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx)); // sine, including the xx correction
+ ret.hi = cp; // cosine
+
+ return ret;
+}
diff --git a/libclc/generic/lib/math/sincos_helpers.cl b/libclc/generic/lib/math/sincos_helpers.cl
new file mode 100644
index 0000000..251b7f9
--- /dev/null
+++ b/libclc/generic/lib/math/sincos_helpers.cl
@@ -0,0 +1,545 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "tables.h"
+#include "sincos_helpers.h"
+
+// Funnel shift: yields the low 32 bits of the 64-bit quantity (hi:lo)
+// shifted right by `shift` bits, for 32-bit unsigned hi and lo.
+// The caller must keep `shift` strictly between 0 and 32; both end points
+// would shift a 32-bit operand by its full width.
+// The expansion is fully parenthesized and has no trailing semicolon, so the
+// macro can be used as an ordinary sub-expression.
+#define bitalign(hi, lo, shift) \
+    (((hi) << (32 - (shift))) | ((lo) >> (shift)))
+/*
+ * bytealign(src0, src1, src2): for 32-bit unsigned src0 and src1, builds the
+ * 64-bit value with src0 in the upper half and src1 in the lower half,
+ * shifts it right by (src2 & 3) bytes and returns the low 32 bits as a uint.
+ */
+#define bytealign(src0, src1, src2) \
+ ((uint) (((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3)*8)))
+/*
+ * Polynomial evaluation of sin(x + y) for an already-reduced argument.
+ * x is the head of the argument and y a small tail; y only enters through
+ * the first-order correction y * (1 - x*x/2).
+ * NOTE(review): the name suggests |x| <= pi/4 is expected - confirm with callers.
+ */
+_CLC_DEF float __clc_sinf_piby4(float x, float y) {
+ // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+ // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+ // = x * f(w)
+ // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+ // We use a minimax approximation of (f(w) - 1) / w
+ // because this produces an expansion in even powers of x.
+
+ const float c1 = -0.1666666666e0f;
+ const float c2 = 0.8333331876e-2f;
+ const float c3 = -0.198400874e-3f;
+ const float c4 = 0.272500015e-5f;
+ const float c5 = -2.5050759689e-08f; // 0xb2d72f34
+ const float c6 = 1.5896910177e-10f; // 0x2f2ec9d3
+
+ float z = x * x; // z = x^2
+ float v = z * x; // v = x^3
+ float r = mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2); // Horner form of c2 + c3*z + c4*z^2 + c5*z^3 + c6*z^4
+ float ret = x - mad(v, -c1, mad(z, mad(y, 0.5f, -v*r), -y)); // algebraically x + c1*v + z*v*r + y*(1 - z/2)
+
+ return ret;
+}
+/*
+ * Polynomial evaluation of cos(x + y) for an already-reduced argument with
+ * head x and small tail y.
+ * Algebraically the result is 1 - z/2 + z*r - x*y with z = x*x. The offset qx
+ * is subtracted from both 1 and z/2 and cancels in exact arithmetic
+ * (presumably chosen to limit rounding error - confirm).
+ */
+_CLC_DEF float __clc_cosf_piby4(float x, float y) {
+ // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+ // = f(w)
+ // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+ // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+ // because this produces an expansion in even powers of x.
+
+ const float c1 = 0.416666666e-1f;
+ const float c2 = -0.138888876e-2f;
+ const float c3 = 0.248006008e-4f;
+ const float c4 = -0.2730101334e-6f;
+ const float c5 = 2.0875723372e-09f; // 0x310f74f6
+ const float c6 = -1.1359647598e-11f; // 0xad47d74e
+
+ float z = x * x;
+ float r = z * mad(z, mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2), c1); // r = z * (c1 + c2*z + ... + c6*z^5)
+
+ // if |x| < 0.3
+ float qx = 0.0f;
+
+ int ix = as_int(x) & EXSIGNBIT_SP32; // NOTE(review): presumably the bit pattern of |x|; EXSIGNBIT_SP32 is defined elsewhere
+
+ // 0.78125 >= |x| >= 0.3
+ float xby4 = as_float(ix - 0x01000000); // exponent field lowered by 2, i.e. |x| / 4
+ qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx;
+
+ // |x| > 0.78125
+ qx = ix > 0x3f480000 ? 0.28125f : qx;
+
+ float hz = mad(z, 0.5f, -qx); // z/2 - qx
+ float a = 1.0f - qx;
+ float ret = a - (hz - mad(z, r, -x*y)); // (1 - qx) - (z/2 - qx) + z*r - x*y
+ return ret;
+}
+
+// Splits the product a * b into a rounded head (*hi) and a correction term
+// (*lo). bh and bt are a head/tail split of b supplied by the caller; they
+// are only consulted when no hardware FMA is available.
+_CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh, float bt)
+{
+    // The rounded product is the head on both paths.
+    const float prod = a * b;
+    float err;
+
+    if (HAVE_HW_FMA32()) {
+        // A fused multiply-add recovers the rounding error of prod directly.
+        err = fma(a, b, -prod);
+    } else {
+        // Split a by masking off its low 12 significand bits, then sum the
+        // four partial products against prod, innermost term first.
+        const float a_head = as_float(as_uint(a) & 0xfffff000U);
+        const float a_tail = a - a_head;
+        err = mad(a_head, bh, -prod);
+        err = mad(a_head, bt, err);
+        err = mad(a_tail, bh, err);
+        err = mad(a_tail, bt, err);
+    }
+
+    *hi = prod;
+    *lo = err;
+}
+/*
+ * Removes a whole number of pi/2 from x.
+ * The multiple fnpi2 = trunc(x * 2/pi + 0.5) is subtracted in three steps,
+ * one per 24-bit piece of pi/2, using __clc_fullMulS for each product.
+ * The reduced value is written as head *hi and tail *lo; fnpi2 is returned
+ * as a float.
+ */
+_CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x)
+{
+ // 72 bits of pi/2
+ const float fpiby2_1 = (float) 0xC90FDA / 0x1.0p+23f;
+ const float fpiby2_1_h = (float) 0xC90 / 0x1.0p+11f;
+ const float fpiby2_1_t = (float) 0xFDA / 0x1.0p+23f;
+
+ const float fpiby2_2 = (float) 0xA22168 / 0x1.0p+47f;
+ const float fpiby2_2_h = (float) 0xA22 / 0x1.0p+35f;
+ const float fpiby2_2_t = (float) 0x168 / 0x1.0p+47f;
+
+ const float fpiby2_3 = (float) 0xC234C4 / 0x1.0p+71f;
+ const float fpiby2_3_h = (float) 0xC23 / 0x1.0p+59f;
+ const float fpiby2_3_t = (float) 0x4C4 / 0x1.0p+71f;
+
+ const float twobypi = 0x1.45f306p-1f;
+
+ float fnpi2 = trunc(mad(x, twobypi, 0.5f)); // multiple of pi/2 to remove (nearest integer when x >= 0)
+
+ // subtract n * pi/2 from x
+ float rhead, rtail;
+ __clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t);
+ float v = x - rhead;
+ float rem = v + (((x - v) - rhead) - rtail); // fold the rounding error of the subtraction and the product tail back in
+
+ float rhead2, rtail2;
+ __clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t);
+ v = rem - rhead2;
+ rem = v + (((rem - v) - rhead2) - rtail2);
+
+ float rhead3, rtail3;
+ __clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t);
+ v = rem - rhead3;
+
+ *hi = v + ((rem - v) - rhead3);
+ *lo = -rtail3; // the last product tail is handed back as the tail of the result
+ return fnpi2;
+}
+
+// Reduces x via __clc_removePi2S, storing the reduced head in *r and tail in
+// *rr, and returns the two low bits of the removed multiple of pi/2.
+_CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x)
+{
+    const int multiple = (int)__clc_removePi2S(r, rr, x);
+    return multiple & 0x3;
+}
+
+#define FULL_MUL(A, B, HI, LO) \
+ LO = A * B; \
+ HI = mul_hi(A, B)
+
+#define FULL_MAD(A, B, C, HI, LO) \
+ LO = ((A) * (B) + (C)); \
+ HI = mul_hi(A, B); \
+ HI += LO < C
+/*
+ * Argument reduction for large float arguments.
+ * Multiplies the 24-bit significand of x by 224 bits of 2/pi in integer
+ * arithmetic, aligns the product so that the two low integer bits sit at the
+ * top of p7, converts the remaining fraction into two floats q1 + q0 and
+ * multiplies that by pi/2.
+ * The reduced value is written as head *r and tail *rr; the return value is
+ * the multiple of pi/2 removed, modulo 4.
+ * NOTE(review): the sign bit of x is not masked when the exponent is
+ * extracted, so x is presumably expected to be non-negative - confirm.
+ */
+_CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x)
+{
+ int xe = (int)(as_uint(x) >> 23) - 127; // unbiased exponent
+ uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU); // significand with the hidden bit restored
+
+ // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 FE5163AB
+ const uint b6 = 0xA2F9836EU;
+ const uint b5 = 0x4E441529U;
+ const uint b4 = 0xFC2757D1U;
+ const uint b3 = 0xF534DDC0U;
+ const uint b2 = 0xDB629599U;
+ const uint b1 = 0x3C439041U;
+ const uint b0 = 0xFE5163ABU;
+
+ uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1;
+
+ FULL_MUL(xm, b0, c0, p0); // p7..p0 = xm * (b6..b0), least significant word first; c0/c1 alternate as the carry word
+ FULL_MAD(xm, b1, c0, c1, p1);
+ FULL_MAD(xm, b2, c1, c0, p2);
+ FULL_MAD(xm, b3, c0, c1, p3);
+ FULL_MAD(xm, b4, c1, c0, p4);
+ FULL_MAD(xm, b5, c0, c1, p5);
+ FULL_MAD(xm, b6, c1, p7, p6);
+
+ uint fbits = 224 + 23 - xe; // number of fraction bits in the 256-bit product
+
+ // shift amount to get 2 lsb of integer part at top 2 bits
+ // min: 25 (xe=18) max: 134 (xe=127)
+ uint shift = 256U - 2 - fbits;
+
+ // Shift by up to 134/32 = 4 words
+ int c = shift > 31;
+ p7 = c ? p6 : p7;
+ p6 = c ? p5 : p6;
+ p5 = c ? p4 : p5;
+ p4 = c ? p3 : p4;
+ p3 = c ? p2 : p3;
+ p2 = c ? p1 : p2;
+ p1 = c ? p0 : p1;
+ shift -= (-c) & 32; // subtract 32 only when a whole-word move was performed
+
+ c = shift > 31;
+ p7 = c ? p6 : p7;
+ p6 = c ? p5 : p6;
+ p5 = c ? p4 : p5;
+ p4 = c ? p3 : p4;
+ p3 = c ? p2 : p3;
+ p2 = c ? p1 : p2;
+ shift -= (-c) & 32;
+
+ c = shift > 31;
+ p7 = c ? p6 : p7;
+ p6 = c ? p5 : p6;
+ p5 = c ? p4 : p5;
+ p4 = c ? p3 : p4;
+ p3 = c ? p2 : p3;
+ shift -= (-c) & 32;
+
+ c = shift > 31;
+ p7 = c ? p6 : p7;
+ p6 = c ? p5 : p6;
+ p5 = c ? p4 : p5;
+ p4 = c ? p3 : p4;
+ shift -= (-c) & 32;
+
+ // bitalign cannot handle a shift of 32
+ c = shift > 0;
+ shift = 32 - shift;
+ uint t7 = bitalign(p7, p6, shift);
+ uint t6 = bitalign(p6, p5, shift);
+ uint t5 = bitalign(p5, p4, shift);
+ p7 = c ? t7 : p7;
+ p6 = c ? t6 : p6;
+ p5 = c ? t5 : p5;
+
+ // Get 2 lsb of int part and msb of fraction
+ int i = p7 >> 29;
+
+ // Scoot up 2 more bits so only fraction remains
+ p7 = bitalign(p7, p6, 30);
+ p6 = bitalign(p6, p5, 30);
+ p5 = bitalign(p5, p4, 30);
+
+ // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5
+ uint flip = i & 1 ? 0xffffffffU : 0U;
+ uint sign = i & 1 ? 0x80000000U : 0U;
+ p7 = p7 ^ flip;
+ p6 = p6 ^ flip;
+ p5 = p5 ^ flip;
+
+ // Find exponent and shift away leading zeroes and hidden bit
+ xe = clz(p7) + 1;
+ shift = 32 - xe;
+ p7 = bitalign(p7, p6, shift);
+ p6 = bitalign(p6, p5, shift);
+
+ // Most significant part of fraction
+ float q1 = as_float(sign | ((127 - xe) << 23) | (p7 >> 9));
+
+ // Shift out bits we captured on q1
+ p7 = bitalign(p7, p6, 32-23);
+
+ // Get 24 more bits of fraction in another float, there are not long strings of zeroes here
+ int xxe = clz(p7) + 1;
+ p7 = bitalign(p7, p6, 32-xxe);
+ float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9));
+
+ // At this point, the fraction q1 + q0 is correct to at least 48 bits
+ // Now we need to multiply the fraction by pi/2
+ // This loses us about 4 bits
+ // pi/2 = C90 FDA A22 168 C23 4C4
+
+ const float pio2h = (float)0xc90fda / 0x1.0p+23f;
+ const float pio2hh = (float)0xc90 / 0x1.0p+11f;
+ const float pio2ht = (float)0xfda / 0x1.0p+23f;
+ const float pio2t = (float)0xa22168 / 0x1.0p+47f;
+
+ float rh, rt;
+
+ if (HAVE_HW_FMA32()) {
+ rh = q1 * pio2h;
+ rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh)));
+ } else {
+ float q1h = as_float(as_uint(q1) & 0xfffff000); // head/tail split of q1 for the FMA-free product
+ float q1t = q1 - q1h;
+ rh = q1 * pio2h;
+ rt = mad(q1t, pio2ht, mad(q1t, pio2hh, mad(q1h, pio2ht, mad(q1h, pio2hh, -rh))));
+ rt = mad(q0, pio2h, mad(q1, pio2t, rt));
+ }
+
+ float t = rh + rt; // renormalize into head t and tail rt
+ rt = rt - (t - rh);
+
+ *r = t;
+ *rr = rt;
+ return ((i >> 1) + (i & 1)) & 0x3; // integer bits, plus one when the fraction was >= 0.5, modulo 4
+}
+
+// Dispatches float argument reduction on the magnitude of x: arguments below
+// 2^23 go to the small-argument routine, everything else (including NaN, for
+// which the comparison is false) to the large-argument routine.
+// The reduced head is stored in *r, the tail in *rr, and the value returned
+// by the chosen routine is passed through.
+_CLC_DEF int __clc_argReductionS(float *r, float *rr, float x)
+{
+    return (x < 0x1.0p+23f) ? __clc_argReductionSmallS(r, rr, x)
+                            : __clc_argReductionLargeS(r, rr, x);
+}
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+/*
+ * Reduction for medium sized arguments.
+ * Removes dnpi2 * pi/2 from x, where dnpi2 = trunc(x * 2/pi + 0.5), using a
+ * three-part representation of pi/2. The reduced value is written as head *r
+ * and tail *rr, and the two low bits of dnpi2 are written to *regn.
+ */
+_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn) {
+ // How many pi/2 is x a multiple of?
+ const double two_by_pi = 0x1.45f306dc9c883p-1;
+ double dnpi2 = trunc(fma(x, two_by_pi, 0.5));
+
+ const double piby2_h = -7074237752028440.0 / 0x1.0p+52; // head, middle and tail of -pi/2 (note the negative signs)
+ const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
+ const double piby2_t = -3956492004828932.0 / 0x1.0p+158;
+
+ // Compute product of npi2 with 159 bits of -pi/2 (each product as a rounded value plus its fma-recovered error)
+ double p_hh = piby2_h * dnpi2;
+ double p_ht = fma(piby2_h, dnpi2, -p_hh);
+ double p_mh = piby2_m * dnpi2;
+ double p_mt = fma(piby2_m, dnpi2, -p_mh);
+ double p_th = piby2_t * dnpi2;
+ double p_tt = fma(piby2_t, dnpi2, -p_th);
+
+ // Reduce to 159 bits
+ double ph = p_hh;
+ double pm = p_ht + p_mh;
+ double t = p_mh - (pm - p_ht);
+ double pt = p_th + t + p_mt + p_tt;
+ t = ph + pm; pm = pm - (t - ph); ph = t;
+ t = pm + pt; pt = pt - (t - pm); pm = t;
+
+ // Subtract from x (the product is already negative, hence the additions)
+ t = x + ph;
+ double qh = t + pm;
+ double qt = pm - (qh - t) + pt;
+
+ *r = qh;
+ *rr = qt;
+ *regn = (int)(long)dnpi2 & 0x3;
+}
+
+// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
+// extra precision, and return the result in r, rr.
+// Return value "regn" tells how many lots of pi/2 were subtracted
+// from x to put it in the range [-pi/4,pi/4], mod 4.
+// All three results are written through the pointer arguments; the function itself returns void.
+_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn) {
+
+ long ux = as_long(x);
+ int e = (int)(ux >> 52) - 1023; // unbiased exponent (x is positive per the comment above)
+ int i = max(23, (e >> 3) + 17);
+ int j = 150 - i; // byte offset into the table of 2/pi bits
+ int j16 = j & ~0xf; // j rounded down to a multiple of 16
+ double fract_temp;
+
+ // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary
+ uint4 q0 = USE_TABLE(pibits_tbl, j16); // NOTE(review): USE_TABLE is defined elsewhere; presumably loads 16 bytes at byte offset j16
+ uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16));
+ uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32));
+
+ int k = (j >> 2) & 0x3; // 32-bit word offset of j within its 16-byte chunk
+ int4 c = (int4)k == (int4)(0, 1, 2, 3); // lane n is set when k == n
+
+ uint u0, u1, u2, u3, u4, u5, u6;
+
+ u0 = c.s1 ? q0.s1 : q0.s0;
+ u0 = c.s2 ? q0.s2 : u0;
+ u0 = c.s3 ? q0.s3 : u0;
+
+ u1 = c.s1 ? q0.s2 : q0.s1;
+ u1 = c.s2 ? q0.s3 : u1;
+ u1 = c.s3 ? q1.s0 : u1;
+
+ u2 = c.s1 ? q0.s3 : q0.s2;
+ u2 = c.s2 ? q1.s0 : u2;
+ u2 = c.s3 ? q1.s1 : u2;
+
+ u3 = c.s1 ? q1.s0 : q0.s3;
+ u3 = c.s2 ? q1.s1 : u3;
+ u3 = c.s3 ? q1.s2 : u3;
+
+ u4 = c.s1 ? q1.s1 : q1.s0;
+ u4 = c.s2 ? q1.s2 : u4;
+ u4 = c.s3 ? q1.s3 : u4;
+
+ u5 = c.s1 ? q1.s2 : q1.s1;
+ u5 = c.s2 ? q1.s3 : u5;
+ u5 = c.s3 ? q2.s0 : u5;
+
+ u6 = c.s1 ? q1.s3 : q1.s2;
+ u6 = c.s2 ? q2.s0 : u6;
+ u6 = c.s3 ? q2.s1 : u6;
+
+ uint v0 = bytealign(u1, u0, j); // bytealign only uses j & 3, the byte offset within a word
+ uint v1 = bytealign(u2, u1, j);
+ uint v2 = bytealign(u3, u2, j);
+ uint v3 = bytealign(u4, u3, j);
+ uint v4 = bytealign(u5, u4, j);
+ uint v5 = bytealign(u6, u5, j);
+
+ // Place those 192 bits in 4 48-bit doubles along with correct exponent
+ // If i > 1018 we would get subnormals so we scale p up and x down to get the same product
+ i = 2 + 8*i;
+ x *= i > 1018 ? 0x1.0p-136 : 1.0;
+ i -= i > 1018 ? 136 : 0;
+
+ uint ua = (uint)(1023 + 52 - i) << 20; // high word of a double carrying the chosen exponent
+ double a = as_double((uint2)(0, ua));
+ double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a; // subtracting a removes the implicit leading 1
+ ua += 0x03000000U; // exponent field raised by 48 for the next 48-bit piece
+ a = as_double((uint2)(0, ua));
+ double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a;
+ ua += 0x03000000U;
+ a = as_double((uint2)(0, ua));
+ double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a;
+ ua += 0x03000000U;
+ a = as_double((uint2)(0, ua));
+ double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a;
+
+ // Exact multiply
+ double f0h = p0 * x;
+ double f0l = fma(p0, x, -f0h);
+ double f1h = p1 * x;
+ double f1l = fma(p1, x, -f1h);
+ double f2h = p2 * x;
+ double f2l = fma(p2, x, -f2h);
+ double f3h = p3 * x;
+ double f3l = fma(p3, x, -f3h);
+
+ // Accumulate product into 4 doubles
+ double s, t;
+
+ double f3 = f3h + f2h;
+ t = f2h - (f3 - f3h);
+ s = f3l + t;
+ t = t - (s - f3l);
+
+ double f2 = s + f1h;
+ t = f1h - (f2 - s) + t;
+ s = f2l + t;
+ t = t - (s - f2l);
+
+ double f1 = s + f0h;
+ t = f0h - (f1 - s) + t;
+ s = f1l + t;
+
+ double f0 = s + f0l;
+
+ // Strip off unwanted large integer bits
+ f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp);
+ f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
+
+ // Compute least significant integer bits
+ t = f3 + f2;
+ double di = t - fract(t, &fract_temp);
+ i = (float)di; // NOTE(review): converted via float before the implicit int conversion; di is presumably small after the 2^10 strip above
+
+ // Shift out remaining integer part
+ f3 -= di;
+ s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t;
+ s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t;
+ f1 += f0;
+
+ // Subtract 1 if fraction is >= 0.5, and update regn
+ int g = f3 >= 0.5;
+ i += g;
+ f3 -= (float)g;
+
+ // Shift up bits
+ s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1;
+
+ // Multiply precise fraction by pi/2 to get radians
+ const double p2h = 7074237752028440.0 / 0x1.0p+52;
+ const double p2t = 4967757600021510.0 / 0x1.0p+106;
+
+ double rhi = f3 * p2h;
+ double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
+
+ *r = rhi + rlo;
+ *rr = rlo - (*r - rhi);
+ *regn = i & 0x3;
+}
+
+
+/*
+ * Evaluates sin(x + xx) and cos(x + xx) for a reduced argument with head x
+ * and tail xx. Returns a double2 holding the sine in .lo and the cosine
+ * in .hi.
+ */
+_CLC_DEF double2 __clc_sincos_piby4(double x, double xx) {
+    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+    //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+    //                      = x * f(w)
+    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+    // We use a minimax approximation of (f(w) - 1) / w
+    // because this produces an expansion in even powers of x.
+    // If xx (the tail of x) is non-zero, we add a correction
+    // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
+    // is an approximation to cos(x)*sin(xx) valid because
+    // xx is tiny relative to x.
+
+    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+    //                      = f(w)
+    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+    // because this produces an expansion in even powers of x.
+    // If xx (the tail of x) is non-zero, we subtract a correction
+    // term g(x,xx) = x*xx to the result, where g(x,xx)
+    // is an approximation to sin(x)*sin(xx) valid because
+    // xx is tiny relative to x.
+
+    const double sc1 = -0.166666666666666646259241729;
+    const double sc2 =  0.833333333333095043065222816e-2;
+    const double sc3 = -0.19841269836761125688538679e-3;
+    const double sc4 =  0.275573161037288022676895908448e-5;
+    const double sc5 = -0.25051132068021699772257377197e-7;
+    const double sc6 =  0.159181443044859136852668200e-9;
+
+    const double cc1 =  0.41666666666666665390037e-1;
+    const double cc2 = -0.13888888888887398280412e-2;
+    const double cc3 =  0.248015872987670414957399e-4;
+    const double cc4 = -0.275573172723441909470836e-6;
+    const double cc5 =  0.208761463822329611076335e-8;
+    const double cc6 = -0.113826398067944859590880e-10;
+
+    double x2 = x * x;
+    double x3 = x2 * x;
+    double r = 0.5 * x2;
+    double t = 1.0 - r;
+
+    // sp = sc2 + sc3*x2 + ... + sc6*x2^4 (Horner form)
+    double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
+
+    // cos(x + xx) ~= cos(x) - x*xx, so the tail correction is subtracted
+    // (hence -x in the innermost fma); the sine below likewise treats xx as
+    // an additive tail. (1.0 - t) - r recovers the rounding error made when
+    // forming t = 1 - r.
+    double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1),
+                        x2*x2, fma(-x, xx, (1.0 - t) - r));
+
+    double2 ret;
+    // Algebraically x + sc1*x3 + x2*x3*sp + xx*(1 - x2/2).
+    ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx));
+    ret.hi = cp;
+
+    return ret;
+}
+
+#endif
diff --git a/libclc/generic/lib/math/sincos_helpers.h b/libclc/generic/lib/math/sincos_helpers.h
new file mode 100644
index 0000000..2565d44
--- /dev/null
+++ b/libclc/generic/lib/math/sincos_helpers.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+/*
+ * Single-precision helpers implemented in sincos_helpers.cl.
+ * __clc_sinf_piby4 / __clc_cosf_piby4 evaluate sin / cos of a reduced
+ * argument given as head x and tail y.
+ * __clc_argReductionS reduces x, storing the reduced head in *r and tail in
+ * *rr, and returns the removed multiple of pi/2 modulo 4.
+ */
+_CLC_DECL float __clc_sinf_piby4(float x, float y);
+_CLC_DECL float __clc_cosf_piby4(float x, float y);
+_CLC_DECL int __clc_argReductionS(float *r, float *rr, float x);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+/*
+ * Double-precision helpers implemented in sincos_helpers.cl.
+ * The two remainder routines reduce x, storing the reduced head in *r, the
+ * tail in *rr and the removed multiple of pi/2 (modulo 4) in *regn.
+ * __clc_sincos_piby4 returns the sine in .lo and the cosine in .hi for a
+ * reduced argument with head x and tail xx.
+ */
+_CLC_DECL void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn);
+_CLC_DECL void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn);
+_CLC_DECL double2 __clc_sincos_piby4(double x, double xx);
+
+#endif
diff --git a/libclc/generic/lib/math/sincospiF_piby4.h b/libclc/generic/lib/math/sincospiF_piby4.h
new file mode 100644
index 0000000..90ecb1d
--- /dev/null
+++ b/libclc/generic/lib/math/sincospiF_piby4.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// Evaluate single precision sin and cos of a value in the interval [-pi/4, pi/4]; the sine is returned in .x and the cosine in .y
+_CLC_INLINE float2
+__libclc__sincosf_piby4(float x)
+{
+ // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+ // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+ // = x * f(w)
+ // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+ // We use a minimax approximation of (f(w) - 1) / w
+ // because this produces an expansion in even powers of x.
+
+ // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+ // = f(w)
+ // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+ // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+ // because this produces an expansion in even powers of x.
+
+ const float sc1 = -0.166666666638608441788607926e0F;
+ const float sc2 = 0.833333187633086262120839299e-2F;
+ const float sc3 = -0.198400874359527693921333720e-3F;
+ const float sc4 = 0.272500015145584081596826911e-5F;
+
+ const float cc1 = 0.41666666664325175238031e-1F;
+ const float cc2 = -0.13888887673175665567647e-2F;
+ const float cc3 = 0.24800600878112441958053e-4F;
+ const float cc4 = -0.27301013343179832472841e-6F;
+
+ float x2 = x * x;
+
+ float2 ret;
+ ret.x = mad(x*x2, mad(x2, mad(x2, mad(x2, sc4, sc3), sc2), sc1), x); // sin: x + x^3 * (sc1 + sc2*x2 + sc3*x2^2 + sc4*x2^3)
+ ret.y = mad(x2*x2, mad(x2, mad(x2, mad(x2, cc4, cc3), cc2), cc1), mad(x2, -0.5f, 1.0f)); // cos: (1 - x2/2) + x2^2 * (cc1 + cc2*x2 + cc3*x2^2 + cc4*x2^3)
+ return ret;
+}
diff --git a/libclc/generic/lib/math/sinpi.cl b/libclc/generic/lib/math/sinpi.cl
new file mode 100644
index 0000000..dbb995f
--- /dev/null
+++ b/libclc/generic/lib/math/sinpi.cl
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "sincospiF_piby4.h"
+#include "../clcmacro.h"
+#ifdef cl_khr_fp64
+#include "sincosD_piby4.h"
+#endif
+/*
+ * sinpi(x) = sin(pi * x), single precision.
+ * |x| is split into an integer part iax and a fraction r in [0, 1). r is
+ * folded onto a in [0, 0.25] so that the result is either sin(a*pi) (e == 0)
+ * or cos(a*pi) (e == 1); the sign comes from the sign of x and the parity
+ * of iax. Inf and NaN return a NaN bit pattern; finite |x| >= 2^23 returns a
+ * zero carrying the sign of x.
+ */
+_CLC_OVERLOAD _CLC_DEF float sinpi(float x)
+{
+ int ix = as_int(x);
+ int xsgn = ix & 0x80000000;
+ ix ^= xsgn; // ix now holds the bits of |x|
+ float ax = as_float(ix);
+ int iax = (int)ax;
+ float r = ax - iax;
+ int xodd = xsgn ^ (iax & 0x1 ? 0x80000000 : 0); // sign of x, flipped when the integer part is odd
+
+ // Initialize with return for +-Inf and NaN
+ int ir = 0x7fc00000;
+
+ // 2^23 <= |x| < Inf, the result is always integer
+ ir = ix < 0x7f800000 ? xsgn : ir;
+
+ // |x| < 2^23, result depends on which 0.25 interval r falls in
+
+ // r < 1.0
+ float a = 1.0f - r;
+ int e = 0;
+
+ // r <= 0.75
+ int c = r <= 0.75f;
+ a = c ? r - 0.5f : a;
+ e = c ? 1 : e;
+
+ // r < 0.5
+ c = r < 0.5f;
+ a = c ? 0.5f - r : a;
+
+ // r <= 0.25
+ c = r <= 0.25f;
+ a = c ? r : a;
+ e = c ? 0 : e;
+
+ float2 t = __libclc__sincosf_piby4(a * M_PI_F);
+ int jr = xodd ^ as_int(e ? t.hi : t.lo); // t.lo is the sine, t.hi the cosine
+
+ ir = ix < 0x4b000000 ? jr : ir; // 0x4b000000 is the bit pattern of 2^23
+
+ return as_float(ir);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, sinpi, float); // NOTE(review): presumably generates the vector overloads of sinpi - confirm in clcmacro.h
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+/*
+ * sinpi(x) = sin(pi * x), double precision.
+ * |x| is split into an integer part iax and a fraction r in [0, 1). r is
+ * folded onto a in [0, 0.25] so that the result is either sin(a*pi) (e == 0)
+ * or cos(a*pi) (e == 1); the sign comes from the sign of x and the parity
+ * of iax. Inf and NaN return a NaN bit pattern; finite |x| >= 2^52 returns a
+ * zero carrying the sign of x.
+ */
+_CLC_OVERLOAD _CLC_DEF double sinpi(double x)
+{
+ long ix = as_long(x);
+ long xsgn = ix & 0x8000000000000000L;
+ ix ^= xsgn; // ix now holds the bits of |x|
+ double ax = as_double(ix);
+ long iax = (long)ax;
+ double r = ax - (double)iax;
+ long xodd = xsgn ^ (iax & 0x1L ? 0x8000000000000000L : 0L); // sign of x, flipped when the integer part is odd
+
+ // Initialize with return for +-Inf and NaN
+ long ir = 0x7ff8000000000000L;
+
+ // 2^52 <= |x| < Inf, the result is always integer
+ ir = ix < 0x7ff0000000000000 ? xsgn : ir;
+
+ // |x| < 2^52, result depends on which 0.25 interval r falls in
+
+ // r < 1.0
+ double a = 1.0 - r;
+ int e = 0;
+
+ // r <= 0.75
+ int c = r <= 0.75;
+ double t = r - 0.5;
+ a = c ? t : a;
+ e = c ? 1 : e;
+
+ // r < 0.5
+ c = r < 0.5;
+ t = 0.5 - r;
+ a = c ? t : a;
+
+ // r <= 0.25
+ c = r <= 0.25;
+ a = c ? r : a;
+ e = c ? 0 : e;
+
+ double api = a * M_PI;
+ double2 sc = __libclc__sincos_piby4(api, 0.0); // NOTE(review): defined elsewhere (presumably sincosD_piby4.h); the code below uses .lo when e == 0 and .hi when e == 1
+ long jr = xodd ^ as_long(e ? sc.hi : sc.lo);
+
+ ir = ax < 0x1.0p+52 ? jr : ir;
+
+ return as_double(ir);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sinpi, double)
+
+#endif
diff --git a/libclc/generic/lib/math/sqrt.cl b/libclc/generic/lib/math/sqrt.cl
new file mode 100644
index 0000000..300e274
--- /dev/null
+++ b/libclc/generic/lib/math/sqrt.cl
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+#include "../clcmacro.h"
+#include "math/clc_sqrt.h"
+/* NOTE(review): presumably defines the float sqrt overloads by forwarding to __clc_sqrt - confirm against _CLC_DEFINE_UNARY_BUILTIN in clcmacro.h. */
+_CLC_DEFINE_UNARY_BUILTIN(float, sqrt, __clc_sqrt, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+/* NOTE(review): presumably defines the double sqrt overloads by forwarding to __clc_sqrt - confirm against _CLC_DEFINE_UNARY_BUILTIN in clcmacro.h. */
+_CLC_DEFINE_UNARY_BUILTIN(double, sqrt, __clc_sqrt, double)
+
+#endif
diff --git a/libclc/generic/lib/math/tables.cl b/libclc/generic/lib/math/tables.cl
new file mode 100644
index 0000000..8286efb
--- /dev/null
+++ b/libclc/generic/lib/math/tables.cl
@@ -0,0 +1,841 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "tables.h"
+
+DECLARE_TABLE(float2, LOGE_TBL, 129) = { // natural-log table: entry i (0..128) is ln(1 + i/128) as (head, tail); head keeps only leading mantissa bits, head + tail gives the value
+ (float2)(0x0.000000p+0f, 0x0.000000p+0f), // i = 0: ln(1) = 0
+ (float2)(0x1.fe0000p-8f, 0x1.535882p-23f),
+ (float2)(0x1.fc0000p-7f, 0x1.5161f8p-20f),
+ (float2)(0x1.7b8000p-6f, 0x1.1b07d4p-18f),
+ (float2)(0x1.f82000p-6f, 0x1.361cf0p-19f),
+ (float2)(0x1.39e000p-5f, 0x1.0f73fcp-18f),
+ (float2)(0x1.774000p-5f, 0x1.63d8cap-19f),
+ (float2)(0x1.b42000p-5f, 0x1.bae232p-18f),
+ (float2)(0x1.f0a000p-5f, 0x1.86008ap-20f),
+ (float2)(0x1.164000p-4f, 0x1.36eea2p-16f),
+ (float2)(0x1.340000p-4f, 0x1.d7961ap-16f),
+ (float2)(0x1.51a000p-4f, 0x1.073f06p-16f),
+ (float2)(0x1.6f0000p-4f, 0x1.a515cap-17f),
+ (float2)(0x1.8c2000p-4f, 0x1.45d630p-16f),
+ (float2)(0x1.a92000p-4f, 0x1.b4e92ap-18f),
+ (float2)(0x1.c5e000p-4f, 0x1.523d6ep-18f),
+ (float2)(0x1.e26000p-4f, 0x1.076e2ap-16f),
+ (float2)(0x1.fec000p-4f, 0x1.2263b6p-17f),
+ (float2)(0x1.0d6000p-3f, 0x1.7e7cd0p-15f),
+ (float2)(0x1.1b6000p-3f, 0x1.2ad52ep-15f),
+ (float2)(0x1.294000p-3f, 0x1.52f81ep-15f),
+ (float2)(0x1.370000p-3f, 0x1.fc201ep-15f),
+ (float2)(0x1.44c000p-3f, 0x1.2b6ccap-15f),
+ (float2)(0x1.526000p-3f, 0x1.cbc742p-16f),
+ (float2)(0x1.5fe000p-3f, 0x1.3070a6p-15f),
+ (float2)(0x1.6d6000p-3f, 0x1.fce33ap-20f),
+ (float2)(0x1.7aa000p-3f, 0x1.890210p-15f),
+ (float2)(0x1.87e000p-3f, 0x1.a06520p-15f),
+ (float2)(0x1.952000p-3f, 0x1.6a73d0p-17f),
+ (float2)(0x1.a22000p-3f, 0x1.bc1fe2p-15f),
+ (float2)(0x1.af2000p-3f, 0x1.c94e80p-15f),
+ (float2)(0x1.bc2000p-3f, 0x1.0ce85ap-16f),
+ (float2)(0x1.c8e000p-3f, 0x1.f7c79ap-15f),
+ (float2)(0x1.d5c000p-3f, 0x1.0b5a7cp-18f),
+ (float2)(0x1.e26000p-3f, 0x1.076e2ap-15f),
+ (float2)(0x1.ef0000p-3f, 0x1.5b97b8p-16f),
+ (float2)(0x1.fb8000p-3f, 0x1.186d5ep-15f),
+ (float2)(0x1.040000p-2f, 0x1.2ca5a6p-17f),
+ (float2)(0x1.0a2000p-2f, 0x1.24e272p-14f),
+ (float2)(0x1.104000p-2f, 0x1.8bf9aep-14f),
+ (float2)(0x1.166000p-2f, 0x1.5cabaap-14f),
+ (float2)(0x1.1c8000p-2f, 0x1.3182d2p-15f),
+ (float2)(0x1.228000p-2f, 0x1.41fbcep-14f),
+ (float2)(0x1.288000p-2f, 0x1.5a13dep-14f),
+ (float2)(0x1.2e8000p-2f, 0x1.c575c2p-15f),
+ (float2)(0x1.346000p-2f, 0x1.dd9a98p-14f),
+ (float2)(0x1.3a6000p-2f, 0x1.3155a4p-16f),
+ (float2)(0x1.404000p-2f, 0x1.843434p-17f),
+ (float2)(0x1.460000p-2f, 0x1.8bc21cp-14f),
+ (float2)(0x1.4be000p-2f, 0x1.7e55dcp-16f),
+ (float2)(0x1.51a000p-2f, 0x1.5b0e5ap-15f),
+ (float2)(0x1.576000p-2f, 0x1.dc5d14p-16f),
+ (float2)(0x1.5d0000p-2f, 0x1.bdbf58p-14f),
+ (float2)(0x1.62c000p-2f, 0x1.05e572p-15f),
+ (float2)(0x1.686000p-2f, 0x1.903d36p-15f),
+ (float2)(0x1.6e0000p-2f, 0x1.1d5456p-15f),
+ (float2)(0x1.738000p-2f, 0x1.d7f6bap-14f),
+ (float2)(0x1.792000p-2f, 0x1.4abfbap-15f),
+ (float2)(0x1.7ea000p-2f, 0x1.f07704p-15f),
+ (float2)(0x1.842000p-2f, 0x1.a3b43cp-15f),
+ (float2)(0x1.89a000p-2f, 0x1.9c360ap-17f),
+ (float2)(0x1.8f0000p-2f, 0x1.1e8736p-14f),
+ (float2)(0x1.946000p-2f, 0x1.941c20p-14f),
+ (float2)(0x1.99c000p-2f, 0x1.958116p-14f),
+ (float2)(0x1.9f2000p-2f, 0x1.23ecbep-14f),
+ (float2)(0x1.a48000p-2f, 0x1.024396p-16f),
+ (float2)(0x1.a9c000p-2f, 0x1.d93534p-15f),
+ (float2)(0x1.af0000p-2f, 0x1.293246p-14f),
+ (float2)(0x1.b44000p-2f, 0x1.eef798p-15f),
+ (float2)(0x1.b98000p-2f, 0x1.625a4cp-16f),
+ (float2)(0x1.bea000p-2f, 0x1.4d9da6p-14f),
+ (float2)(0x1.c3c000p-2f, 0x1.d7a7ccp-14f),
+ (float2)(0x1.c8e000p-2f, 0x1.f7c79ap-14f),
+ (float2)(0x1.ce0000p-2f, 0x1.af0b84p-14f),
+ (float2)(0x1.d32000p-2f, 0x1.fcfc00p-15f),
+ (float2)(0x1.d82000p-2f, 0x1.e7258ap-14f),
+ (float2)(0x1.dd4000p-2f, 0x1.a81306p-16f),
+ (float2)(0x1.e24000p-2f, 0x1.1034f8p-15f),
+ (float2)(0x1.e74000p-2f, 0x1.09875ap-16f),
+ (float2)(0x1.ec2000p-2f, 0x1.99d246p-14f),
+ (float2)(0x1.f12000p-2f, 0x1.1ebf5ep-15f),
+ (float2)(0x1.f60000p-2f, 0x1.23fa70p-14f),
+ (float2)(0x1.fae000p-2f, 0x1.588f78p-14f),
+ (float2)(0x1.ffc000p-2f, 0x1.2e0856p-14f),
+ (float2)(0x1.024000p-1f, 0x1.52a5a4p-13f),
+ (float2)(0x1.04a000p-1f, 0x1.df9da8p-13f),
+ (float2)(0x1.072000p-1f, 0x1.f2e0e6p-16f),
+ (float2)(0x1.098000p-1f, 0x1.bd3d5cp-15f),
+ (float2)(0x1.0be000p-1f, 0x1.cb9094p-15f),
+ (float2)(0x1.0e4000p-1f, 0x1.261746p-15f),
+ (float2)(0x1.108000p-1f, 0x1.f39e2cp-13f),
+ (float2)(0x1.12e000p-1f, 0x1.719592p-13f),
+ (float2)(0x1.154000p-1f, 0x1.87a5e8p-14f),
+ (float2)(0x1.178000p-1f, 0x1.eabbd8p-13f),
+ (float2)(0x1.19e000p-1f, 0x1.cd68cep-14f),
+ (float2)(0x1.1c2000p-1f, 0x1.b81f70p-13f),
+ (float2)(0x1.1e8000p-1f, 0x1.7d79c0p-15f),
+ (float2)(0x1.20c000p-1f, 0x1.b9a324p-14f),
+ (float2)(0x1.230000p-1f, 0x1.30d7bep-13f),
+ (float2)(0x1.254000p-1f, 0x1.5bce98p-13f),
+ (float2)(0x1.278000p-1f, 0x1.5e1288p-13f),
+ (float2)(0x1.29c000p-1f, 0x1.37fec2p-13f),
+ (float2)(0x1.2c0000p-1f, 0x1.d3da88p-14f),
+ (float2)(0x1.2e4000p-1f, 0x1.d0db90p-15f),
+ (float2)(0x1.306000p-1f, 0x1.d7334ep-13f),
+ (float2)(0x1.32a000p-1f, 0x1.133912p-13f),
+ (float2)(0x1.34e000p-1f, 0x1.44ece6p-16f),
+ (float2)(0x1.370000p-1f, 0x1.17b546p-13f),
+ (float2)(0x1.392000p-1f, 0x1.e0d356p-13f),
+ (float2)(0x1.3b6000p-1f, 0x1.0893fep-14f),
+ (float2)(0x1.3d8000p-1f, 0x1.026a70p-13f),
+ (float2)(0x1.3fa000p-1f, 0x1.5b84d0p-13f),
+ (float2)(0x1.41c000p-1f, 0x1.8fe846p-13f),
+ (float2)(0x1.43e000p-1f, 0x1.9fe2f8p-13f),
+ (float2)(0x1.460000p-1f, 0x1.8bc21cp-13f),
+ (float2)(0x1.482000p-1f, 0x1.53d1eap-13f),
+ (float2)(0x1.4a4000p-1f, 0x1.f0bb60p-14f),
+ (float2)(0x1.4c6000p-1f, 0x1.e6bf32p-15f),
+ (float2)(0x1.4e6000p-1f, 0x1.d811b6p-13f),
+ (float2)(0x1.508000p-1f, 0x1.13cc00p-13f),
+ (float2)(0x1.52a000p-1f, 0x1.6932dep-16f),
+ (float2)(0x1.54a000p-1f, 0x1.246798p-13f),
+ (float2)(0x1.56a000p-1f, 0x1.f9d5b2p-13f),
+ (float2)(0x1.58c000p-1f, 0x1.5b6b9ap-14f),
+ (float2)(0x1.5ac000p-1f, 0x1.404c34p-13f),
+ (float2)(0x1.5cc000p-1f, 0x1.b1dc6cp-13f),
+ (float2)(0x1.5ee000p-1f, 0x1.54920ap-20f),
+ (float2)(0x1.60e000p-1f, 0x1.97a23cp-16f),
+ (float2)(0x1.62e000p-1f, 0x1.0bfbe8p-15f), // i = 128: ln(2)
+};
+
+DECLARE_TABLE(float, LOG_INV_TBL, 129) = { // reciprocal table: entry i (0..128) is 256/(128 + i), i.e. 1/(0.5 + i/256), falling from 2.0 to 1.0
+ 0x1.000000p+1f, // i = 0: 2.0
+ 0x1.fc07f0p+0f,
+ 0x1.f81f82p+0f,
+ 0x1.f4465ap+0f,
+ 0x1.f07c20p+0f,
+ 0x1.ecc07cp+0f,
+ 0x1.e9131ap+0f,
+ 0x1.e573acp+0f,
+ 0x1.e1e1e2p+0f,
+ 0x1.de5d6ep+0f,
+ 0x1.dae608p+0f,
+ 0x1.d77b66p+0f,
+ 0x1.d41d42p+0f,
+ 0x1.d0cb58p+0f,
+ 0x1.cd8568p+0f,
+ 0x1.ca4b30p+0f,
+ 0x1.c71c72p+0f,
+ 0x1.c3f8f0p+0f,
+ 0x1.c0e070p+0f,
+ 0x1.bdd2b8p+0f,
+ 0x1.bacf92p+0f,
+ 0x1.b7d6c4p+0f,
+ 0x1.b4e81cp+0f,
+ 0x1.b20364p+0f,
+ 0x1.af286cp+0f,
+ 0x1.ac5702p+0f,
+ 0x1.a98ef6p+0f,
+ 0x1.a6d01ap+0f,
+ 0x1.a41a42p+0f,
+ 0x1.a16d40p+0f,
+ 0x1.9ec8eap+0f,
+ 0x1.9c2d14p+0f,
+ 0x1.99999ap+0f,
+ 0x1.970e50p+0f,
+ 0x1.948b10p+0f,
+ 0x1.920fb4p+0f,
+ 0x1.8f9c18p+0f,
+ 0x1.8d3018p+0f,
+ 0x1.8acb90p+0f,
+ 0x1.886e60p+0f,
+ 0x1.861862p+0f,
+ 0x1.83c978p+0f,
+ 0x1.818182p+0f,
+ 0x1.7f4060p+0f,
+ 0x1.7d05f4p+0f,
+ 0x1.7ad220p+0f,
+ 0x1.78a4c8p+0f,
+ 0x1.767dcep+0f,
+ 0x1.745d18p+0f,
+ 0x1.724288p+0f,
+ 0x1.702e06p+0f,
+ 0x1.6e1f76p+0f,
+ 0x1.6c16c2p+0f,
+ 0x1.6a13cep+0f,
+ 0x1.681682p+0f,
+ 0x1.661ec6p+0f,
+ 0x1.642c86p+0f,
+ 0x1.623fa8p+0f,
+ 0x1.605816p+0f,
+ 0x1.5e75bcp+0f,
+ 0x1.5c9882p+0f,
+ 0x1.5ac056p+0f,
+ 0x1.58ed24p+0f,
+ 0x1.571ed4p+0f,
+ 0x1.555556p+0f,
+ 0x1.539094p+0f,
+ 0x1.51d07ep+0f,
+ 0x1.501502p+0f,
+ 0x1.4e5e0ap+0f,
+ 0x1.4cab88p+0f,
+ 0x1.4afd6ap+0f,
+ 0x1.49539ep+0f,
+ 0x1.47ae14p+0f,
+ 0x1.460cbcp+0f,
+ 0x1.446f86p+0f,
+ 0x1.42d662p+0f,
+ 0x1.414142p+0f,
+ 0x1.3fb014p+0f,
+ 0x1.3e22ccp+0f,
+ 0x1.3c995ap+0f,
+ 0x1.3b13b2p+0f,
+ 0x1.3991c2p+0f,
+ 0x1.381382p+0f,
+ 0x1.3698e0p+0f,
+ 0x1.3521d0p+0f,
+ 0x1.33ae46p+0f,
+ 0x1.323e34p+0f,
+ 0x1.30d190p+0f,
+ 0x1.2f684cp+0f,
+ 0x1.2e025cp+0f,
+ 0x1.2c9fb4p+0f,
+ 0x1.2b404ap+0f,
+ 0x1.29e412p+0f,
+ 0x1.288b02p+0f,
+ 0x1.27350cp+0f,
+ 0x1.25e228p+0f,
+ 0x1.24924ap+0f,
+ 0x1.234568p+0f,
+ 0x1.21fb78p+0f,
+ 0x1.20b470p+0f,
+ 0x1.1f7048p+0f,
+ 0x1.1e2ef4p+0f,
+ 0x1.1cf06ap+0f,
+ 0x1.1bb4a4p+0f,
+ 0x1.1a7b96p+0f,
+ 0x1.194538p+0f,
+ 0x1.181182p+0f,
+ 0x1.16e068p+0f,
+ 0x1.15b1e6p+0f,
+ 0x1.1485f0p+0f,
+ 0x1.135c82p+0f,
+ 0x1.12358ep+0f,
+ 0x1.111112p+0f,
+ 0x1.0fef02p+0f,
+ 0x1.0ecf56p+0f,
+ 0x1.0db20ap+0f,
+ 0x1.0c9714p+0f,
+ 0x1.0b7e6ep+0f,
+ 0x1.0a6810p+0f,
+ 0x1.0953f4p+0f,
+ 0x1.084210p+0f,
+ 0x1.073260p+0f,
+ 0x1.0624dep+0f,
+ 0x1.051980p+0f,
+ 0x1.041042p+0f,
+ 0x1.03091cp+0f,
+ 0x1.020408p+0f,
+ 0x1.010102p+0f,
+ 0x1.000000p+0f, // i = 128: 1.0
+};
+
+DECLARE_TABLE(float2, LOG2_TBL, 129) = { // base-2 log table: entry i (0..128) is log2(1 + i/128) as (head, tail); head keeps only leading mantissa bits, head + tail gives the value
+ (float2)(0x0.000000p+0f, 0x0.000000p+0f), // i = 0: log2(1) = 0
+ (float2)(0x1.6f8000p-7f, 0x1.942dbap-17f),
+ (float2)(0x1.6e0000p-6f, 0x1.e5a170p-16f),
+ (float2)(0x1.118000p-5f, 0x1.347544p-15f),
+ (float2)(0x1.6b8000p-5f, 0x1.69bac6p-16f),
+ (float2)(0x1.c48000p-5f, 0x1.7eae42p-15f),
+ (float2)(0x1.0e8000p-4f, 0x1.9c4fd0p-15f),
+ (float2)(0x1.3a8000p-4f, 0x1.17ee92p-15f),
+ (float2)(0x1.660000p-4f, 0x1.fb7d64p-15f),
+ (float2)(0x1.918000p-4f, 0x1.42dc8cp-17f),
+ (float2)(0x1.bc8000p-4f, 0x1.0902b6p-18f),
+ (float2)(0x1.e70000p-4f, 0x1.7608bep-15f),
+ (float2)(0x1.088000p-3f, 0x1.162336p-13f),
+ (float2)(0x1.1d8000p-3f, 0x1.3465d4p-13f),
+ (float2)(0x1.328000p-3f, 0x1.74f13cp-14f),
+ (float2)(0x1.470000p-3f, 0x1.aa7e60p-13f),
+ (float2)(0x1.5c0000p-3f, 0x1.a39fbcp-19f),
+ (float2)(0x1.700000p-3f, 0x1.d0b53ap-13f),
+ (float2)(0x1.848000p-3f, 0x1.0af40ap-13f),
+ (float2)(0x1.988000p-3f, 0x1.b741dep-13f),
+ (float2)(0x1.ac8000p-3f, 0x1.d78b6cp-13f),
+ (float2)(0x1.c08000p-3f, 0x1.6db376p-13f),
+ (float2)(0x1.d48000p-3f, 0x1.ee4c32p-15f),
+ (float2)(0x1.e80000p-3f, 0x1.02f9d2p-13f),
+ (float2)(0x1.fb8000p-3f, 0x1.05ae40p-13f),
+ (float2)(0x1.078000p-2f, 0x1.0adbb0p-14f),
+ (float2)(0x1.110000p-2f, 0x1.83ed68p-13f),
+ (float2)(0x1.1a8000p-2f, 0x1.016ca4p-12f),
+ (float2)(0x1.240000p-2f, 0x1.01eac2p-12f),
+ (float2)(0x1.2d8000p-2f, 0x1.887e26p-13f),
+ (float2)(0x1.370000p-2f, 0x1.24cea4p-14f),
+ (float2)(0x1.400000p-2f, 0x1.918ec6p-12f),
+ (float2)(0x1.498000p-2f, 0x1.3c25e6p-13f),
+ (float2)(0x1.528000p-2f, 0x1.6f7f12p-12f),
+ (float2)(0x1.5c0000p-2f, 0x1.a39fbcp-18f),
+ (float2)(0x1.650000p-2f, 0x1.8fe466p-14f),
+ (float2)(0x1.6e0000p-2f, 0x1.10e6cep-13f),
+ (float2)(0x1.770000p-2f, 0x1.d2ba7ep-14f),
+ (float2)(0x1.800000p-2f, 0x1.4ac62cp-15f),
+ (float2)(0x1.888000p-2f, 0x1.a71cb8p-12f),
+ (float2)(0x1.918000p-2f, 0x1.dd448ep-13f),
+ (float2)(0x1.9a8000p-2f, 0x1.1c8f10p-21f),
+ (float2)(0x1.a30000p-2f, 0x1.bb053ep-13f),
+ (float2)(0x1.ab8000p-2f, 0x1.861e5ep-12f),
+ (float2)(0x1.b40000p-2f, 0x1.fafdcep-12f),
+ (float2)(0x1.bd0000p-2f, 0x1.e5d3cep-15f),
+ (float2)(0x1.c58000p-2f, 0x1.2fad28p-14f),
+ (float2)(0x1.ce0000p-2f, 0x1.492474p-15f),
+ (float2)(0x1.d60000p-2f, 0x1.d4f80cp-12f),
+ (float2)(0x1.de8000p-2f, 0x1.4ff510p-12f),
+ (float2)(0x1.e70000p-2f, 0x1.3550f2p-13f),
+ (float2)(0x1.ef0000p-2f, 0x1.b59ccap-12f),
+ (float2)(0x1.f78000p-2f, 0x1.42b464p-13f),
+ (float2)(0x1.ff8000p-2f, 0x1.5e66a0p-12f),
+ (float2)(0x1.038000p-1f, 0x1.f6a2e4p-11f),
+ (float2)(0x1.080000p-1f, 0x1.39e4fep-14f),
+ (float2)(0x1.0c0000p-1f, 0x1.0500d6p-13f),
+ (float2)(0x1.100000p-1f, 0x1.13b152p-13f),
+ (float2)(0x1.140000p-1f, 0x1.93f542p-14f),
+ (float2)(0x1.180000p-1f, 0x1.467b94p-16f),
+ (float2)(0x1.1b8000p-1f, 0x1.cc47a4p-11f),
+ (float2)(0x1.1f8000p-1f, 0x1.78f4c2p-11f),
+ (float2)(0x1.238000p-1f, 0x1.107508p-11f),
+ (float2)(0x1.278000p-1f, 0x1.2602c2p-12f),
+ (float2)(0x1.2b8000p-1f, 0x1.a39fbcp-20f),
+ (float2)(0x1.2f0000p-1f, 0x1.5a1d7ap-11f),
+ (float2)(0x1.330000p-1f, 0x1.3e355ap-12f),
+ (float2)(0x1.368000p-1f, 0x1.cffedap-11f),
+ (float2)(0x1.3a8000p-1f, 0x1.d9fd50p-12f),
+ (float2)(0x1.3e0000p-1f, 0x1.f64de6p-11f),
+ (float2)(0x1.420000p-1f, 0x1.d83f4cp-12f),
+ (float2)(0x1.458000p-1f, 0x1.cea628p-11f),
+ (float2)(0x1.498000p-1f, 0x1.3c25e6p-12f),
+ (float2)(0x1.4d0000p-1f, 0x1.5a96ccp-11f),
+ (float2)(0x1.510000p-1f, 0x1.18708ap-17f),
+ (float2)(0x1.548000p-1f, 0x1.374652p-12f),
+ (float2)(0x1.580000p-1f, 0x1.2089a6p-11f),
+ (float2)(0x1.5b8000p-1f, 0x1.93432cp-11f),
+ (float2)(0x1.5f0000p-1f, 0x1.f3fd06p-11f),
+ (float2)(0x1.630000p-1f, 0x1.0b8f54p-13f),
+ (float2)(0x1.668000p-1f, 0x1.004722p-12f),
+ (float2)(0x1.6a0000p-1f, 0x1.57cf2cp-12f),
+ (float2)(0x1.6d8000p-1f, 0x1.8cb53ap-12f),
+ (float2)(0x1.710000p-1f, 0x1.9f4d8ap-12f),
+ (float2)(0x1.748000p-1f, 0x1.8feb26p-12f),
+ (float2)(0x1.780000p-1f, 0x1.5edfeep-12f),
+ (float2)(0x1.7b8000p-1f, 0x1.0c7c9ap-12f),
+ (float2)(0x1.7f0000p-1f, 0x1.322182p-13f),
+ (float2)(0x1.828000p-1f, 0x1.3ab7cep-18f),
+ (float2)(0x1.858000p-1f, 0x1.a82c2cp-11f),
+ (float2)(0x1.890000p-1f, 0x1.3dd2c0p-11f),
+ (float2)(0x1.8c8000p-1f, 0x1.871da4p-12f),
+ (float2)(0x1.900000p-1f, 0x1.cc2c00p-14f),
+ (float2)(0x1.930000p-1f, 0x1.9fdb68p-11f),
+ (float2)(0x1.968000p-1f, 0x1.ed6956p-12f),
+ (float2)(0x1.9a0000p-1f, 0x1.f1a760p-14f),
+ (float2)(0x1.9d0000p-1f, 0x1.767f54p-11f),
+ (float2)(0x1.a08000p-1f, 0x1.3f6d26p-12f),
+ (float2)(0x1.a38000p-1f, 0x1.b9fce2p-11f),
+ (float2)(0x1.a70000p-1f, 0x1.8ae816p-12f),
+ (float2)(0x1.aa0000p-1f, 0x1.c23d60p-11f),
+ (float2)(0x1.ad8000p-1f, 0x1.60f388p-12f),
+ (float2)(0x1.b08000p-1f, 0x1.9049aep-11f),
+ (float2)(0x1.b40000p-1f, 0x1.8734a8p-13f),
+ (float2)(0x1.b70000p-1f, 0x1.2523d4p-11f),
+ (float2)(0x1.ba0000p-1f, 0x1.da6ce6p-11f),
+ (float2)(0x1.bd8000p-1f, 0x1.038e62p-12f),
+ (float2)(0x1.c08000p-1f, 0x1.1b511ep-11f),
+ (float2)(0x1.c38000p-1f, 0x1.a728b8p-11f),
+ (float2)(0x1.c70000p-1f, 0x1.2b5d22p-14f),
+ (float2)(0x1.ca0000p-1f, 0x1.2c6e54p-12f),
+ (float2)(0x1.cd0000p-1f, 0x1.f35064p-12f),
+ (float2)(0x1.d00000p-1f, 0x1.4fdb48p-11f),
+ (float2)(0x1.d30000p-1f, 0x1.98ec9ep-11f),
+ (float2)(0x1.d60000p-1f, 0x1.d4f80cp-11f),
+ (float2)(0x1.d98000p-1f, 0x1.0643d6p-17f),
+ (float2)(0x1.dc8000p-1f, 0x1.33567ep-14f),
+ (float2)(0x1.df8000p-1f, 0x1.e0410cp-14f),
+ (float2)(0x1.e28000p-1f, 0x1.142e0ep-13f),
+ (float2)(0x1.e58000p-1f, 0x1.063c88p-13f),
+ (float2)(0x1.e88000p-1f, 0x1.8d66c4p-14f),
+ (float2)(0x1.eb8000p-1f, 0x1.57e32ap-15f),
+ (float2)(0x1.ee0000p-1f, 0x1.ed1c6cp-11f),
+ (float2)(0x1.f10000p-1f, 0x1.b8a076p-11f),
+ (float2)(0x1.f40000p-1f, 0x1.7822f2p-11f),
+ (float2)(0x1.f70000p-1f, 0x1.2bbc3ap-11f),
+ (float2)(0x1.fa0000p-1f, 0x1.a708bap-12f),
+ (float2)(0x1.fd0000p-1f, 0x1.be4c7ep-13f),
+ (float2)(0x1.000000p+0f, 0x0.000000p+0f) // i = 128: log2(2) = 1 exactly, so the tail is zero
+};
+
+DECLARE_TABLE(uchar, PIBITS_TBL, ) = { // binary digits of 1/(2*pi) (the same bit stream as 2/pi), least-significant byte first; length argument left empty, so the initializer presumably sizes the array - confirm in tables.h
+ 224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175,
+ 169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31,
+ 235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38,
+ 44, 159, 60, 251, 217, 180, 125, 180, 41, 104, 45, 70, 188,
+ 188, 63, 96, 22, 120, 255, 95, 226, 127, 236, 160, 228, 247,
+ 46, 126, 17, 114, 210, 231, 76, 13, 230, 88, 71, 230, 4, 249,
+ 125, 209, 154, 192, 113, 166, 19, 18, 237, 186, 212, 215, 8,
+ 162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70, 39,
+ 168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21,
+ 239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109,
+ 3, 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13,
+ 230, 139, 2, 0, 0, 0, 0, 0, 0, 0 // read backwards from the 2: 0x02 8B E6 0D B9 39 ... = 0.28BE60DB9391...(hex); the seven zero bytes pad the most-significant end
+};
+
+TABLE_FUNCTION(float2, LOGE_TBL, loge_tbl); // NOTE(review): presumably emits an index -> entry accessor named via TABLE_MANGLE(loge_tbl); macro is not defined in this file - confirm in tables.h
+TABLE_FUNCTION(float, LOG_INV_TBL, log_inv_tbl); // same pattern for the reciprocal table (float entries)
+TABLE_FUNCTION(float2, LOG2_TBL, log2_tbl); // same pattern for the base-2 log table
+
+uint4 TABLE_MANGLE(pibits_tbl)(size_t idx) { // returns the 16 bytes PIBITS_TBL[idx .. idx+15] reinterpreted as a uint4; idx is a byte offset and idx + 16 must stay within the table
+ return as_uint4(vload16(0, PIBITS_TBL + idx)); // vload16 on a uchar pointer needs only byte alignment; casting the address to __constant uint4 * would assert 16-byte alignment that an arbitrary idx does not provide
+}
+
+#ifdef cl_khr_fp64
+
+DECLARE_TABLE(double2, LN_TBL, 65) = { // double-precision natural-log table: entry i (0..64) is ln(1 + i/64) as (head, tail); head keeps only leading mantissa bits, head + tail gives the value
+ (double2)(0x0.0000000000000p+0, 0x0.0000000000000p+0), // i = 0: ln(1) = 0
+ (double2)(0x1.fc0a800000000p-7, 0x1.61f807c79f3dbp-28),
+ (double2)(0x1.f829800000000p-6, 0x1.873c1980267c8p-25),
+ (double2)(0x1.7745800000000p-5, 0x1.ec65b9f88c69ep-26),
+ (double2)(0x1.f0a3000000000p-5, 0x1.8022c54cc2f99p-26),
+ (double2)(0x1.341d700000000p-4, 0x1.2c37a3a125330p-25),
+ (double2)(0x1.6f0d200000000p-4, 0x1.15cad69737c93p-25),
+ (double2)(0x1.a926d00000000p-4, 0x1.d256ab1b285e9p-27),
+ (double2)(0x1.e270700000000p-4, 0x1.b8abcb97a7aa2p-26),
+ (double2)(0x1.0d77e00000000p-3, 0x1.f34239659a5dcp-25),
+ (double2)(0x1.2955280000000p-3, 0x1.e07fd48d30177p-25),
+ (double2)(0x1.44d2b00000000p-3, 0x1.b32df4799f4f6p-25),
+ (double2)(0x1.5ff3000000000p-3, 0x1.c29e4f4f21cf8p-25),
+ (double2)(0x1.7ab8900000000p-3, 0x1.086c848df1b59p-30),
+ (double2)(0x1.9525a80000000p-3, 0x1.cf456b4764130p-27),
+ (double2)(0x1.af3c900000000p-3, 0x1.3a02ffcb63398p-25),
+ (double2)(0x1.c8ff780000000p-3, 0x1.1e6a6886b0976p-25),
+ (double2)(0x1.e270700000000p-3, 0x1.b8abcb97a7aa2p-25),
+ (double2)(0x1.fb91800000000p-3, 0x1.b578f8aa35552p-25),
+ (double2)(0x1.0a324c0000000p-2, 0x1.139c871afb9fcp-25),
+ (double2)(0x1.1675c80000000p-2, 0x1.5d5d30701ce64p-25),
+ (double2)(0x1.22941c0000000p-2, 0x1.de7bcb2d12142p-25),
+ (double2)(0x1.2e8e280000000p-2, 0x1.d708e984e1664p-25),
+ (double2)(0x1.3a64c40000000p-2, 0x1.56945e9c72f36p-26),
+ (double2)(0x1.4618bc0000000p-2, 0x1.0e2f613e85bdap-29),
+ (double2)(0x1.51aad80000000p-2, 0x1.cb7e0b42724f6p-28),
+ (double2)(0x1.5d1bd80000000p-2, 0x1.fac04e52846c7p-25),
+ (double2)(0x1.686c800000000p-2, 0x1.e9b14aec442bep-26),
+ (double2)(0x1.739d7c0000000p-2, 0x1.b5de8034e7126p-25),
+ (double2)(0x1.7eaf800000000p-2, 0x1.dc157e1b259d3p-25),
+ (double2)(0x1.89a3380000000p-2, 0x1.b05096ad69c62p-28),
+ (double2)(0x1.9479400000000p-2, 0x1.c2116faba4cddp-26),
+ (double2)(0x1.9f323c0000000p-2, 0x1.65fcc25f95b47p-25),
+ (double2)(0x1.a9cec80000000p-2, 0x1.a9a08498d4850p-26),
+ (double2)(0x1.b44f740000000p-2, 0x1.de647b1465f77p-25),
+ (double2)(0x1.beb4d80000000p-2, 0x1.da71b7bf7861dp-26),
+ (double2)(0x1.c8ff7c0000000p-2, 0x1.e6a6886b09760p-28),
+ (double2)(0x1.d32fe40000000p-2, 0x1.f0075eab0ef64p-25),
+ (double2)(0x1.dd46a00000000p-2, 0x1.3071282fb989bp-28),
+ (double2)(0x1.e744240000000p-2, 0x1.0eb43c3f1bed2p-25),
+ (double2)(0x1.f128f40000000p-2, 0x1.faf06ecb35c84p-26),
+ (double2)(0x1.faf5880000000p-2, 0x1.ef1e63db35f68p-27),
+ (double2)(0x1.02552a0000000p-1, 0x1.69743fb1a71a5p-27),
+ (double2)(0x1.0723e40000000p-1, 0x1.c1cdf404e5796p-25),
+ (double2)(0x1.0be72e0000000p-1, 0x1.094aa0ada625ep-27),
+ (double2)(0x1.109f380000000p-1, 0x1.e2d4c96fde3ecp-25),
+ (double2)(0x1.154c3c0000000p-1, 0x1.2f4d5e9a98f34p-25),
+ (double2)(0x1.19ee6a0000000p-1, 0x1.467c96ecc5cbep-25),
+ (double2)(0x1.1e85f40000000p-1, 0x1.e7040d03dec5ap-25),
+ (double2)(0x1.23130c0000000p-1, 0x1.7bebf4282de36p-25),
+ (double2)(0x1.2795e00000000p-1, 0x1.289b11aeb783fp-25),
+ (double2)(0x1.2c0e9e0000000p-1, 0x1.a891d1772f538p-26),
+ (double2)(0x1.307d720000000p-1, 0x1.34f10be1fb591p-25),
+ (double2)(0x1.34e2880000000p-1, 0x1.d9ce1d316eb93p-25),
+ (double2)(0x1.393e0c0000000p-1, 0x1.3562a19a9c442p-25),
+ (double2)(0x1.3d90260000000p-1, 0x1.4e2adf548084cp-26),
+ (double2)(0x1.41d8fe0000000p-1, 0x1.08ce55cc8c97ap-26),
+ (double2)(0x1.4618bc0000000p-1, 0x1.0e2f613e85bdap-28),
+ (double2)(0x1.4a4f840000000p-1, 0x1.db03ebb0227bfp-25),
+ (double2)(0x1.4e7d800000000p-1, 0x1.1b75bb09cb098p-25),
+ (double2)(0x1.52a2d20000000p-1, 0x1.96f16abb9df22p-27),
+ (double2)(0x1.56bf9c0000000p-1, 0x1.5b3f399411c62p-25),
+ (double2)(0x1.5ad4040000000p-1, 0x1.86b3e59f65355p-26),
+ (double2)(0x1.5ee02a0000000p-1, 0x1.2482ceae1ac12p-26),
+ (double2)(0x1.62e42e0000000p-1, 0x1.efa39ef35793cp-25), // i = 64: ln(2)
+};
+
+TABLE_FUNCTION(double2, LN_TBL, ln_tbl); // NOTE(review): presumably emits an index -> entry accessor for LN_TBL named via TABLE_MANGLE(ln_tbl); macro is not defined in this file - confirm in tables.h
+
+
+// ATAN_JBY256_TBL contains, as (lead, tail) pairs, the leading
+// and trailing parts respectively of precomputed values of
+// atan(j/256), for j = 16, 17, ..., 256 (241 entries).
+// The lead component (atan_jby256_lead) contains the first 21 bits
+// of precision, and the tail (atan_jby256_tail) a further 53 bits.
+
+DECLARE_TABLE(double2, ATAN_JBY256_TBL, 241) = {
+ (double2)(0x1.ff55b00000000p-5, 0x1.6e59fbd38db2cp-26),
+ (double2)(0x1.0f99e00000000p-4, 0x1.4e3aa54dedf96p-25),
+ (double2)(0x1.1f86d00000000p-4, 0x1.7e105ab1bda88p-25),
+ (double2)(0x1.2f71900000000p-4, 0x1.8c5254d013fd0p-27),
+ (double2)(0x1.3f59f00000000p-4, 0x1.cf8ab3ad62670p-29),
+ (double2)(0x1.4f3fd00000000p-4, 0x1.9dca4bec80468p-26),
+ (double2)(0x1.5f23200000000p-4, 0x1.3f4b5ec98a8dap-26),
+ (double2)(0x1.6f03b00000000p-4, 0x1.b9d49619d81fep-25),
+ (double2)(0x1.7ee1800000000p-4, 0x1.3017887460934p-27),
+ (double2)(0x1.8ebc500000000p-4, 0x1.11e3eca0b9944p-26),
+ (double2)(0x1.9e94100000000p-4, 0x1.4f3f73c5a332ep-26),
+ (double2)(0x1.ae68a00000000p-4, 0x1.c71c8ae0e00a6p-26),
+ (double2)(0x1.be39e00000000p-4, 0x1.7cde0f86fbdc7p-25),
+ (double2)(0x1.ce07c00000000p-4, 0x1.70f328c889c72p-26),
+ (double2)(0x1.ddd2100000000p-4, 0x1.c07ae9b994efep-26),
+ (double2)(0x1.ed98c00000000p-4, 0x1.0c8021d7b1698p-27),
+ (double2)(0x1.fd5ba00000000p-4, 0x1.35585edb8cb22p-25),
+ (double2)(0x1.068d500000000p-3, 0x1.0842567b30e96p-24),
+ (double2)(0x1.0e6ad00000000p-3, 0x1.99e811031472ep-24),
+ (double2)(0x1.1646500000000p-3, 0x1.041821416bceep-25),
+ (double2)(0x1.1e1fa00000000p-3, 0x1.f6086e4dc96f4p-24),
+ (double2)(0x1.25f6e00000000p-3, 0x1.71a535c5f1b58p-27),
+ (double2)(0x1.2dcbd00000000p-3, 0x1.65f743fe63ca1p-24),
+ (double2)(0x1.359e800000000p-3, 0x1.dbd733472d014p-24),
+ (double2)(0x1.3d6ee00000000p-3, 0x1.d18cc4d8b0d1dp-24),
+ (double2)(0x1.453ce00000000p-3, 0x1.8c12553c8fb29p-24),
+ (double2)(0x1.4d08700000000p-3, 0x1.53b49e2e8f991p-24),
+ (double2)(0x1.54d1800000000p-3, 0x1.7422ae148c141p-24),
+ (double2)(0x1.5c98100000000p-3, 0x1.e3ec269df56a8p-27),
+ (double2)(0x1.645bf00000000p-3, 0x1.ff6754e7e0ac9p-24),
+ (double2)(0x1.6c1d400000000p-3, 0x1.131267b1b5aadp-24),
+ (double2)(0x1.73dbd00000000p-3, 0x1.d14fa403a94bcp-24),
+ (double2)(0x1.7b97b00000000p-3, 0x1.2f396c089a3d8p-25),
+ (double2)(0x1.8350b00000000p-3, 0x1.c731d78fa95bbp-24),
+ (double2)(0x1.8b06e00000000p-3, 0x1.c50f385177399p-24),
+ (double2)(0x1.92ba300000000p-3, 0x1.f41409c6f2c20p-25),
+ (double2)(0x1.9a6a800000000p-3, 0x1.d2d90c4c39ec0p-24),
+ (double2)(0x1.a217e00000000p-3, 0x1.80420696f2106p-25),
+ (double2)(0x1.a9c2300000000p-3, 0x1.b40327943a2e8p-27),
+ (double2)(0x1.b169600000000p-3, 0x1.5d35e02f3d2a2p-25),
+ (double2)(0x1.b90d700000000p-3, 0x1.4a498288117b0p-25),
+ (double2)(0x1.c0ae500000000p-3, 0x1.35da119afb324p-25),
+ (double2)(0x1.c84bf00000000p-3, 0x1.14e85cdb9a908p-24),
+ (double2)(0x1.cfe6500000000p-3, 0x1.38754e5547b9ap-25),
+ (double2)(0x1.d77d500000000p-3, 0x1.be40ae6ce3246p-24),
+ (double2)(0x1.df11000000000p-3, 0x1.0c993b3bea7e7p-24),
+ (double2)(0x1.e6a1400000000p-3, 0x1.1d2dd89ac3359p-24),
+ (double2)(0x1.ee2e100000000p-3, 0x1.1476603332c46p-25),
+ (double2)(0x1.f5b7500000000p-3, 0x1.f25901bac55b7p-24),
+ (double2)(0x1.fd3d100000000p-3, 0x1.f881b7c826e28p-24),
+ (double2)(0x1.025fa00000000p-2, 0x1.441996d698d20p-24),
+ (double2)(0x1.061ee00000000p-2, 0x1.407ac521ea089p-23),
+ (double2)(0x1.09dc500000000p-2, 0x1.2fb0c6c4b1723p-23),
+ (double2)(0x1.0d97e00000000p-2, 0x1.ca135966a3e18p-23),
+ (double2)(0x1.1151a00000000p-2, 0x1.b1218e4d646e4p-25),
+ (double2)(0x1.1509700000000p-2, 0x1.d4e72a350d288p-25),
+ (double2)(0x1.18bf500000000p-2, 0x1.4617e2f04c329p-23),
+ (double2)(0x1.1c73500000000p-2, 0x1.096ec41e82650p-25),
+ (double2)(0x1.2025500000000p-2, 0x1.9f91f25773e6ep-24),
+ (double2)(0x1.23d5600000000p-2, 0x1.59c0820f1d674p-25),
+ (double2)(0x1.2783700000000p-2, 0x1.02bf7a2df1064p-25),
+ (double2)(0x1.2b2f700000000p-2, 0x1.fb36bfc40508fp-23),
+ (double2)(0x1.2ed9800000000p-2, 0x1.ea08f3f8dc892p-24),
+ (double2)(0x1.3281800000000p-2, 0x1.3ed6254656a0ep-24),
+ (double2)(0x1.3627700000000p-2, 0x1.b83f5e5e69c58p-25),
+ (double2)(0x1.39cb400000000p-2, 0x1.d6ec2af768592p-23),
+ (double2)(0x1.3d6d100000000p-2, 0x1.493889a226f94p-25),
+ (double2)(0x1.410cb00000000p-2, 0x1.5ad8fa65279bap-23),
+ (double2)(0x1.44aa400000000p-2, 0x1.b615784d45434p-25),
+ (double2)(0x1.4845a00000000p-2, 0x1.09a184368f145p-23),
+ (double2)(0x1.4bdee00000000p-2, 0x1.61a2439b0d91cp-24),
+ (double2)(0x1.4f75f00000000p-2, 0x1.ce1a65e39a978p-24),
+ (double2)(0x1.530ad00000000p-2, 0x1.32a39a93b6a66p-23),
+ (double2)(0x1.569d800000000p-2, 0x1.1c3699af804e7p-23),
+ (double2)(0x1.5a2e000000000p-2, 0x1.75e0f4e44ede8p-26),
+ (double2)(0x1.5dbc300000000p-2, 0x1.f77ced1a7a83bp-23),
+ (double2)(0x1.6148400000000p-2, 0x1.84e7f0cb1b500p-29),
+ (double2)(0x1.64d1f00000000p-2, 0x1.ec6b838b02dfep-23),
+ (double2)(0x1.6859700000000p-2, 0x1.3ebf4dfbeda87p-23),
+ (double2)(0x1.6bdea00000000p-2, 0x1.9397aed9cb475p-23),
+ (double2)(0x1.6f61900000000p-2, 0x1.07937bc239c54p-24),
+ (double2)(0x1.72e2200000000p-2, 0x1.aa754553131b6p-23),
+ (double2)(0x1.7660700000000p-2, 0x1.4a05d407c45dcp-24),
+ (double2)(0x1.79dc600000000p-2, 0x1.132231a206dd0p-23),
+ (double2)(0x1.7d56000000000p-2, 0x1.2d8ecfdd69c88p-24),
+ (double2)(0x1.80cd400000000p-2, 0x1.a852c74218606p-24),
+ (double2)(0x1.8442200000000p-2, 0x1.71bf2baeebb50p-23),
+ (double2)(0x1.87b4b00000000p-2, 0x1.83d7db7491820p-27),
+ (double2)(0x1.8b24d00000000p-2, 0x1.ca50d92b6da14p-25),
+ (double2)(0x1.8e92900000000p-2, 0x1.6f5cde8530298p-26),
+ (double2)(0x1.91fde00000000p-2, 0x1.f343198910740p-24),
+ (double2)(0x1.9566d00000000p-2, 0x1.0e8d241ccd80ap-24),
+ (double2)(0x1.98cd500000000p-2, 0x1.1535ac619e6c8p-24),
+ (double2)(0x1.9c31600000000p-2, 0x1.7316041c36cd2p-24),
+ (double2)(0x1.9f93000000000p-2, 0x1.985a000637d8ep-24),
+ (double2)(0x1.a2f2300000000p-2, 0x1.f2f29858c0a68p-25),
+ (double2)(0x1.a64ee00000000p-2, 0x1.879847f96d909p-23),
+ (double2)(0x1.a9a9200000000p-2, 0x1.ab3d319e12e42p-23),
+ (double2)(0x1.ad00f00000000p-2, 0x1.5088162dfc4c2p-24),
+ (double2)(0x1.b056400000000p-2, 0x1.05749a1cd9d8cp-25),
+ (double2)(0x1.b3a9100000000p-2, 0x1.da65c6c6b8618p-26),
+ (double2)(0x1.b6f9600000000p-2, 0x1.739bf7df1ad64p-25),
+ (double2)(0x1.ba47300000000p-2, 0x1.bc31252aa3340p-25),
+ (double2)(0x1.bd92800000000p-2, 0x1.e528191ad3aa8p-26),
+ (double2)(0x1.c0db400000000p-2, 0x1.929d93df19f18p-23),
+ (double2)(0x1.c421900000000p-2, 0x1.ff11eb693a080p-26),
+ (double2)(0x1.c765500000000p-2, 0x1.55ae3f145a3a0p-27),
+ (double2)(0x1.caa6800000000p-2, 0x1.cbcd8c6c0ca82p-24),
+ (double2)(0x1.cde5300000000p-2, 0x1.0cb04d425d304p-24),
+ (double2)(0x1.d121500000000p-2, 0x1.9adfcab5be678p-24),
+ (double2)(0x1.d45ae00000000p-2, 0x1.93d90c5662508p-23),
+ (double2)(0x1.d791f00000000p-2, 0x1.68489bd35ff40p-24),
+ (double2)(0x1.dac6700000000p-2, 0x1.586ed3da2b7e0p-28),
+ (double2)(0x1.ddf8500000000p-2, 0x1.7604d2e850eeep-23),
+ (double2)(0x1.e127b00000000p-2, 0x1.ac1d12bfb53d8p-24),
+ (double2)(0x1.e454800000000p-2, 0x1.9b3d468274740p-28),
+ (double2)(0x1.e77eb00000000p-2, 0x1.fc5d68d10e53cp-24),
+ (double2)(0x1.eaa6500000000p-2, 0x1.8f9e51884becbp-23),
+ (double2)(0x1.edcb600000000p-2, 0x1.a87f0869c06d1p-23),
+ (double2)(0x1.f0ede00000000p-2, 0x1.31e7279f685fap-23),
+ (double2)(0x1.f40dd00000000p-2, 0x1.6a8282f9719b0p-27),
+ (double2)(0x1.f72b200000000p-2, 0x1.0d2724a8a44e0p-25),
+ (double2)(0x1.fa45d00000000p-2, 0x1.a60524b11ad4ep-23),
+ (double2)(0x1.fd5e000000000p-2, 0x1.75fdf832750f0p-26),
+ (double2)(0x1.0039c00000000p-1, 0x1.cf06902e4cd36p-23),
+ (double2)(0x1.01c3400000000p-1, 0x1.e82422d4f6d10p-25),
+ (double2)(0x1.034b700000000p-1, 0x1.24a091063e6c0p-26),
+ (double2)(0x1.04d2500000000p-1, 0x1.8a1a172dc6f38p-24),
+ (double2)(0x1.0657e00000000p-1, 0x1.29b6619f8a92dp-22),
+ (double2)(0x1.07dc300000000p-1, 0x1.9274d9c1b70c8p-24),
+ (double2)(0x1.095f300000000p-1, 0x1.0c34b1fbb7930p-26),
+ (double2)(0x1.0ae0e00000000p-1, 0x1.639866c20eb50p-25),
+ (double2)(0x1.0c61400000000p-1, 0x1.6d6d0f6832e9ep-23),
+ (double2)(0x1.0de0500000000p-1, 0x1.af54def99f25ep-22),
+ (double2)(0x1.0f5e200000000p-1, 0x1.16cfc52a00262p-22),
+ (double2)(0x1.10daa00000000p-1, 0x1.dcc1e83569c32p-23),
+ (double2)(0x1.1255d00000000p-1, 0x1.37f7a551ed425p-22),
+ (double2)(0x1.13cfb00000000p-1, 0x1.f6360adc98887p-22),
+ (double2)(0x1.1548500000000p-1, 0x1.2c6ec8d35a2c1p-22),
+ (double2)(0x1.16bfa00000000p-1, 0x1.bd44df84cb036p-23),
+ (double2)(0x1.1835a00000000p-1, 0x1.117cf826e310ep-22),
+ (double2)(0x1.19aa500000000p-1, 0x1.ca533f332cfc9p-22),
+ (double2)(0x1.1b1dc00000000p-1, 0x1.0f208509dbc2ep-22),
+ (double2)(0x1.1c8fe00000000p-1, 0x1.cd07d93c945dep-23),
+ (double2)(0x1.1e00b00000000p-1, 0x1.57bdfd67e6d72p-22),
+ (double2)(0x1.1f70400000000p-1, 0x1.aab89c516c658p-24),
+ (double2)(0x1.20de800000000p-1, 0x1.3e823b1a1b8a0p-25),
+ (double2)(0x1.224b700000000p-1, 0x1.307464a9d6d3cp-23),
+ (double2)(0x1.23b7100000000p-1, 0x1.c5993cd438843p-22),
+ (double2)(0x1.2521700000000p-1, 0x1.ba2fca02ab554p-22),
+ (double2)(0x1.268a900000000p-1, 0x1.01a5b6983a268p-23),
+ (double2)(0x1.27f2600000000p-1, 0x1.273d1b350efc8p-25),
+ (double2)(0x1.2958e00000000p-1, 0x1.64c238c37b0c6p-23),
+ (double2)(0x1.2abe200000000p-1, 0x1.aded07370a300p-25),
+ (double2)(0x1.2c22100000000p-1, 0x1.78091197eb47ep-23),
+ (double2)(0x1.2d84c00000000p-1, 0x1.4b0f245e0dabcp-24),
+ (double2)(0x1.2ee6200000000p-1, 0x1.080d9794e2eafp-22),
+ (double2)(0x1.3046400000000p-1, 0x1.d4ec242b60c76p-23),
+ (double2)(0x1.31a5200000000p-1, 0x1.221d2f940caa0p-27),
+ (double2)(0x1.3302b00000000p-1, 0x1.cdbc42b2bba5cp-24),
+ (double2)(0x1.345f000000000p-1, 0x1.cce37bb440840p-25),
+ (double2)(0x1.35ba000000000p-1, 0x1.6c1d999cf1dd0p-22),
+ (double2)(0x1.3713d00000000p-1, 0x1.bed8a07eb0870p-26),
+ (double2)(0x1.386c500000000p-1, 0x1.69ed88f490e3cp-24),
+ (double2)(0x1.39c3900000000p-1, 0x1.cd41719b73ef0p-25),
+ (double2)(0x1.3b19800000000p-1, 0x1.cbc4ac95b41b7p-22),
+ (double2)(0x1.3c6e400000000p-1, 0x1.238f1b890f5d7p-22),
+ (double2)(0x1.3dc1c00000000p-1, 0x1.50c4282259cc4p-24),
+ (double2)(0x1.3f13f00000000p-1, 0x1.713d2de87b3e2p-22),
+ (double2)(0x1.4064f00000000p-1, 0x1.1d5a7d2255276p-23),
+ (double2)(0x1.41b4a00000000p-1, 0x1.c0dfd48227ac1p-22),
+ (double2)(0x1.4303200000000p-1, 0x1.1c964dab76753p-22),
+ (double2)(0x1.4450600000000p-1, 0x1.6de56d5704496p-23),
+ (double2)(0x1.459c600000000p-1, 0x1.4aeb71fd19968p-23),
+ (double2)(0x1.46e7200000000p-1, 0x1.fbf91c57b1918p-23),
+ (double2)(0x1.4830a00000000p-1, 0x1.d6bef7fbe5d9ap-22),
+ (double2)(0x1.4978f00000000p-1, 0x1.464d3dc249066p-22),
+ (double2)(0x1.4ac0000000000p-1, 0x1.638e2ec4d9073p-22),
+ (double2)(0x1.4c05e00000000p-1, 0x1.16f4a7247ea7cp-24),
+ (double2)(0x1.4d4a800000000p-1, 0x1.1a0a740f1d440p-28),
+ (double2)(0x1.4e8de00000000p-1, 0x1.6edbb0114a33cp-23),
+ (double2)(0x1.4fd0100000000p-1, 0x1.dbee8bf1d513cp-24),
+ (double2)(0x1.5111000000000p-1, 0x1.5b8bdb0248f73p-22),
+ (double2)(0x1.5250c00000000p-1, 0x1.7de3d3f5eac64p-22),
+ (double2)(0x1.538f500000000p-1, 0x1.ee24187ae448ap-23),
+ (double2)(0x1.54cca00000000p-1, 0x1.e06c591ec5192p-22),
+ (double2)(0x1.5608d00000000p-1, 0x1.4e3861a332738p-24),
+ (double2)(0x1.5743c00000000p-1, 0x1.a9599dcc2bfe4p-24),
+ (double2)(0x1.587d800000000p-1, 0x1.f732fbad43468p-25),
+ (double2)(0x1.59b6000000000p-1, 0x1.eb9f573b727d9p-22),
+ (double2)(0x1.5aed600000000p-1, 0x1.8b212a2eb9897p-22),
+ (double2)(0x1.5c23900000000p-1, 0x1.384884c167215p-22),
+ (double2)(0x1.5d58900000000p-1, 0x1.0e2d363020051p-22),
+ (double2)(0x1.5e8c600000000p-1, 0x1.2820879fbd022p-22),
+ (double2)(0x1.5fbf000000000p-1, 0x1.a1ab9893e4b30p-22),
+ (double2)(0x1.60f0800000000p-1, 0x1.2d1b817a24478p-23),
+ (double2)(0x1.6220d00000000p-1, 0x1.15d7b8ded4878p-25),
+ (double2)(0x1.634ff00000000p-1, 0x1.8968f9db3a5e4p-24),
+ (double2)(0x1.647de00000000p-1, 0x1.71c4171fe135fp-22),
+ (double2)(0x1.65aab00000000p-1, 0x1.6d80f605d0d8cp-22),
+ (double2)(0x1.66d6600000000p-1, 0x1.c91f043691590p-24),
+ (double2)(0x1.6800e00000000p-1, 0x1.39f8a15fce2b2p-23),
+ (double2)(0x1.692a400000000p-1, 0x1.55beda9d94b80p-27),
+ (double2)(0x1.6a52700000000p-1, 0x1.b12c15d60949ap-23),
+ (double2)(0x1.6b79800000000p-1, 0x1.24167b312bfe3p-22),
+ (double2)(0x1.6c9f700000000p-1, 0x1.0ab8633070277p-22),
+ (double2)(0x1.6dc4400000000p-1, 0x1.54554ebbc80eep-23),
+ (double2)(0x1.6ee7f00000000p-1, 0x1.0204aef5a4bb8p-25),
+ (double2)(0x1.700a700000000p-1, 0x1.8af08c679cf2cp-22),
+ (double2)(0x1.712be00000000p-1, 0x1.0852a330ae6c8p-22),
+ (double2)(0x1.724c300000000p-1, 0x1.6d3eb9ec32916p-23),
+ (double2)(0x1.736b600000000p-1, 0x1.685cb7fcbbafep-23),
+ (double2)(0x1.7489700000000p-1, 0x1.1f751c1e0bd95p-22),
+ (double2)(0x1.75a6700000000p-1, 0x1.705b1b0f72560p-26),
+ (double2)(0x1.76c2400000000p-1, 0x1.b98d8d808ca92p-22),
+ (double2)(0x1.77dd100000000p-1, 0x1.2ea22c75cc980p-25),
+ (double2)(0x1.78f6b00000000p-1, 0x1.7aba62bca0350p-22),
+ (double2)(0x1.7a0f400000000p-1, 0x1.d73833442278cp-22),
+ (double2)(0x1.7b26c00000000p-1, 0x1.5a5ca1fb18bf9p-22),
+ (double2)(0x1.7c3d300000000p-1, 0x1.1a6092b6ecf28p-25),
+ (double2)(0x1.7d52800000000p-1, 0x1.44fd049aac104p-24),
+ (double2)(0x1.7e66c00000000p-1, 0x1.c114fd8df5180p-29),
+ (double2)(0x1.7f79e00000000p-1, 0x1.5972f130feae5p-22),
+ (double2)(0x1.808c000000000p-1, 0x1.ca034a55fe198p-24),
+ (double2)(0x1.819d000000000p-1, 0x1.6e2b149990227p-22),
+ (double2)(0x1.82ad000000000p-1, 0x1.b00000294592cp-24),
+ (double2)(0x1.83bbe00000000p-1, 0x1.8b9bdc442620ep-22),
+ (double2)(0x1.84c9c00000000p-1, 0x1.d94fdfabf3e4ep-23),
+ (double2)(0x1.85d6900000000p-1, 0x1.5db30b145ad9ap-23),
+ (double2)(0x1.86e2500000000p-1, 0x1.e3e1eb95022b0p-23),
+ (double2)(0x1.87ed000000000p-1, 0x1.d5b8b45442bd6p-22),
+ (double2)(0x1.88f6b00000000p-1, 0x1.7a046231ecd2ep-22),
+ (double2)(0x1.89ff500000000p-1, 0x1.feafe3ef55232p-22),
+ (double2)(0x1.8b06f00000000p-1, 0x1.839e7bfd78267p-22),
+ (double2)(0x1.8c0d900000000p-1, 0x1.45cf49d6fa900p-25),
+ (double2)(0x1.8d13200000000p-1, 0x1.be3132b27f380p-27),
+ (double2)(0x1.8e17a00000000p-1, 0x1.533980bb84f9fp-22),
+ (double2)(0x1.8f1b300000000p-1, 0x1.889e2ce3ba390p-26),
+ (double2)(0x1.901db00000000p-1, 0x1.f7778c3ad0cc8p-24),
+ (double2)(0x1.911f300000000p-1, 0x1.46660cec4eba2p-23),
+ (double2)(0x1.921fb00000000p-1, 0x1.5110b4611a626p-23),
+};
+
+DECLARE_TABLE(double2, TWO_TO_JBY64_EP, 64) = { // entry j holds 2^(j/64), j = 0..63, split as (.x = head with zeroed low mantissa bits, .y = small tail); head + tail gives the value
+ (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0),
+ (double2)(0x1.02c9a30000000p+0, 0x1.cef00c1dcdef9p-25),
+ (double2)(0x1.059b0d0000000p+0, 0x1.8ac2ba1d73e2ap-27),
+ (double2)(0x1.0874510000000p+0, 0x1.0eb37901186bep-25),
+ (double2)(0x1.0b55860000000p+0, 0x1.9f3121ec53172p-25),
+ (double2)(0x1.0e3ec30000000p+0, 0x1.69e8d10103a17p-27),
+ (double2)(0x1.11301d0000000p+0, 0x1.25b50a4ebbf1ap-32),
+ (double2)(0x1.1429aa0000000p+0, 0x1.d525bbf668203p-25),
+ (double2)(0x1.172b830000000p+0, 0x1.8faa2f5b9bef9p-25),
+ (double2)(0x1.1a35be0000000p+0, 0x1.6df96ea796d31p-25),
+ (double2)(0x1.1d48730000000p+0, 0x1.68b9aa7805b80p-28),
+ (double2)(0x1.2063b80000000p+0, 0x1.0c519ac771dd6p-25),
+ (double2)(0x1.2387a60000000p+0, 0x1.ceac470cd83f5p-25),
+ (double2)(0x1.26b4560000000p+0, 0x1.789f37495e99cp-26),
+ (double2)(0x1.29e9df0000000p+0, 0x1.47f7b84b09745p-26),
+ (double2)(0x1.2d285a0000000p+0, 0x1.b900c2d002475p-26),
+ (double2)(0x1.306fe00000000p+0, 0x1.4636e2a5bd1abp-25),
+ (double2)(0x1.33c08b0000000p+0, 0x1.320b7fa64e430p-27),
+ (double2)(0x1.371a730000000p+0, 0x1.ceaa72a9c5154p-26),
+ (double2)(0x1.3a7db30000000p+0, 0x1.3967fdba86f24p-26),
+ (double2)(0x1.3dea640000000p+0, 0x1.82468446b6824p-25),
+ (double2)(0x1.4160a20000000p+0, 0x1.f72e29f84325bp-28),
+ (double2)(0x1.44e0860000000p+0, 0x1.8624b40c4dbd0p-30),
+ (double2)(0x1.486a2b0000000p+0, 0x1.704f3404f068ep-26),
+ (double2)(0x1.4bfdad0000000p+0, 0x1.4d8a89c750e5ep-26),
+ (double2)(0x1.4f9b270000000p+0, 0x1.a74b29ab4cf62p-26),
+ (double2)(0x1.5342b50000000p+0, 0x1.a753e077c2a0fp-26),
+ (double2)(0x1.56f4730000000p+0, 0x1.ad49f699bb2c0p-26),
+ (double2)(0x1.5ab07d0000000p+0, 0x1.a90a852b19260p-25),
+ (double2)(0x1.5e76f10000000p+0, 0x1.6b48521ba6f93p-26),
+ (double2)(0x1.6247eb0000000p+0, 0x1.d2ac258f87d03p-31),
+ (double2)(0x1.6623880000000p+0, 0x1.2a91124893ecfp-27),
+ (double2)(0x1.6a09e60000000p+0, 0x1.9fcef32422cbep-26),
+ (double2)(0x1.6dfb230000000p+0, 0x1.8ca345de441c5p-25),
+ (double2)(0x1.71f75e0000000p+0, 0x1.1d8bee7ba46e1p-25),
+ (double2)(0x1.75feb50000000p+0, 0x1.9099f22fdba6ap-26),
+ (double2)(0x1.7a11470000000p+0, 0x1.f580c36bea881p-27),
+ (double2)(0x1.7e2f330000000p+0, 0x1.b3d398841740ap-26),
+ (double2)(0x1.8258990000000p+0, 0x1.2999c25159f11p-25),
+ (double2)(0x1.868d990000000p+0, 0x1.68925d901c83bp-25),
+ (double2)(0x1.8ace540000000p+0, 0x1.15506dadd3e2ap-27),
+ (double2)(0x1.8f1ae90000000p+0, 0x1.22aee6c57304ep-25),
+ (double2)(0x1.93737b0000000p+0, 0x1.9b8bc9e8a0387p-29),
+ (double2)(0x1.97d8290000000p+0, 0x1.fbc9c9f173d24p-25),
+ (double2)(0x1.9c49180000000p+0, 0x1.51f8480e3e235p-27),
+ (double2)(0x1.a0c6670000000p+0, 0x1.6bbcac96535b5p-25),
+ (double2)(0x1.a5503b0000000p+0, 0x1.1f12ae45a1224p-27),
+ (double2)(0x1.a9e6b50000000p+0, 0x1.5e7f6fd0fac90p-26),
+ (double2)(0x1.ae89f90000000p+0, 0x1.2b5a75abd0e69p-25),
+ (double2)(0x1.b33a2b0000000p+0, 0x1.09e2bf5ed7fa1p-25),
+ (double2)(0x1.b7f76f0000000p+0, 0x1.7daf237553d84p-27),
+ (double2)(0x1.bcc1e90000000p+0, 0x1.2f074891ee83dp-30),
+ (double2)(0x1.c199bd0000000p+0, 0x1.b0aa538444196p-25),
+ (double2)(0x1.c67f120000000p+0, 0x1.cafa29694426fp-25),
+ (double2)(0x1.cb720d0000000p+0, 0x1.9df20d22a0797p-25),
+ (double2)(0x1.d072d40000000p+0, 0x1.40f12f71a1e45p-25),
+ (double2)(0x1.d5818d0000000p+0, 0x1.9f7490e4bb40bp-25),
+ (double2)(0x1.da9e600000000p+0, 0x1.ed9942b84600dp-27),
+ (double2)(0x1.dfc9730000000p+0, 0x1.bdcdaf5cb4656p-27),
+ (double2)(0x1.e502ee0000000p+0, 0x1.e2cffd89cf44cp-26),
+ (double2)(0x1.ea4afa0000000p+0, 0x1.52486cc2c7b9dp-27),
+ (double2)(0x1.efa1be0000000p+0, 0x1.cc2b44eee3fa4p-25),
+ (double2)(0x1.f507650000000p+0, 0x1.6dc8a80ce9f09p-25),
+ (double2)(0x1.fa7c180000000p+0, 0x1.9e90d82e90a7ep-28)
+
+};
+
+
+TABLE_FUNCTION(double2, ATAN_JBY256_TBL, atan_jby256_tbl); // defines accessor __clc_atan_jby256_tbl(idx) returning ATAN_JBY256_TBL[idx]
+TABLE_FUNCTION(double2, TWO_TO_JBY64_EP, two_to_jby64_ep_tbl); // defines accessor __clc_two_to_jby64_ep_tbl(idx) returning TWO_TO_JBY64_EP[idx]
+
+#endif // cl_khr_fp64
diff --git a/libclc/generic/lib/math/tables.h b/libclc/generic/lib/math/tables.h
new file mode 100644
index 0000000..1348fe1
--- /dev/null
+++ b/libclc/generic/lib/math/tables.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define TABLE_SPACE __constant // address-space qualifier put on every table declared with DECLARE_TABLE
+
+#define TABLE_MANGLE(NAME) __clc_##NAME // accessor symbol name: __clc_<NAME>
+
+#define DECLARE_TABLE(TYPE,NAME,LENGTH) /* declares TYPE NAME[LENGTH] in TABLE_SPACE; the caller appends "= { ... };" */ \
+ TABLE_SPACE TYPE NAME [ LENGTH ]
+
+#define TABLE_FUNCTION(TYPE,TABLE,NAME) /* defines __clc_<NAME>(idx) returning TABLE[idx]; idx is not range-checked */ \
+ TYPE TABLE_MANGLE(NAME)(size_t idx) { \
+ return TABLE[idx]; \
+ }
+
+#define TABLE_FUNCTION_DECL(TYPE, NAME) /* prototype matching the accessor that TABLE_FUNCTION defines */ \
+ TYPE TABLE_MANGLE(NAME)(size_t idx);
+
+#define USE_TABLE(NAME, IDX) /* call-site helper: expands to __clc_<NAME>(IDX) */ \
+ TABLE_MANGLE(NAME)(IDX)
+
+TABLE_FUNCTION_DECL(float2, loge_tbl); // float2 __clc_loge_tbl(size_t idx)
+TABLE_FUNCTION_DECL(float, log_inv_tbl); // float __clc_log_inv_tbl(size_t idx)
+TABLE_FUNCTION_DECL(float2, log2_tbl); // float2 __clc_log2_tbl(size_t idx)
+TABLE_FUNCTION_DECL(uint4, pibits_tbl); // uint4 __clc_pibits_tbl(size_t idx)
+
+#ifdef cl_khr_fp64 // double2 accessors are declared only when the fp64 extension macro is defined
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+TABLE_FUNCTION_DECL(double2, ln_tbl); // double2 __clc_ln_tbl(size_t idx)
+TABLE_FUNCTION_DECL(double2, atan_jby256_tbl); // double2 __clc_atan_jby256_tbl(size_t idx)
+TABLE_FUNCTION_DECL(double2, two_to_jby64_ep_tbl); // double2 __clc_two_to_jby64_ep_tbl(size_t idx)
+#endif // cl_khr_fp64
diff --git a/libclc/generic/lib/math/tan.cl b/libclc/generic/lib/math/tan.cl
new file mode 100644
index 0000000..a447999
--- /dev/null
+++ b/libclc/generic/lib/math/tan.cl
@@ -0,0 +1,8 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable // enable double when the fp64 extension macro is defined
+#endif
+
+#define __CLC_BODY <tan.inc> // template body holding the tan() definition written against __CLC_GENTYPE
+#include <clc/math/gentype.inc> // NOTE(review): presumably includes __CLC_BODY once per floating-point gentype - confirm against gentype.inc
diff --git a/libclc/generic/lib/math/tan.inc b/libclc/generic/lib/math/tan.inc
new file mode 100644
index 0000000..b9ce33e
--- /dev/null
+++ b/libclc/generic/lib/math/tan.inc
@@ -0,0 +1,17 @@
+/*
+ * tan(x) = sin(x)/cos(x). sin(x)/sqrt(1 - sin(x)^2) is shorter on R600 but
+ * evaluates sin(x)/|cos(x)|, giving the wrong sign whenever cos(x) < 0.
+ */
+
+#if __CLC_FPSIZE == 32
+#define __CLC_CONST(x) x ## f
+#else
+#define __CLC_CONST(x) x
+#endif
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE tan(__CLC_GENTYPE x) {
+ // Divide by cos(x) itself so the result keeps the correct sign in every quadrant.
+ return sin(x) / cos(x);
+}
+
+#undef __CLC_CONST
diff --git a/libclc/generic/lib/math/tanh.cl b/libclc/generic/lib/math/tanh.cl
new file mode 100644
index 0000000..e9c4079
--- /dev/null
+++ b/libclc/generic/lib/math/tanh.cl
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float tanh(float x)
+{
+ // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent
+ // to the following three formulae:
+ // 1. (exp(x) - exp(-x))/(exp(x) + exp(-x))
+ // 2. (1 - (2/(exp(2*x) + 1 )))
+ // 3. (exp(2*x) - 1)/(exp(2*x) + 1)
+ // but computationally, some formulae are better on some ranges.
+
+ const float large_threshold = 0x1.0a2b24p+3f; // ~8.3178 (= 12*ln 2); for larger |x| the result is set to +/-1 below
+
+ uint ux = as_uint(x);
+ uint aux = ux & EXSIGNBIT_SP32; // NOTE(review): EXSIGNBIT_SP32 comes from math.h; presumably it keeps every bit except the sign, so aux = bits of |x|
+ uint xs = ux ^ aux; // whatever bits the mask removed, i.e. the sign of x
+
+ float y = as_float(aux); // y = |x| under the assumption above
+ float y2 = y*y;
+
+ float a1 = mad(y2, // numerator polynomial in y^2, selected when y < 0.9
+ mad(y2, 0.4891631088530669873e-4F, -0.14628356048797849e-2F),
+ -0.28192806108402678e0F);
+ float b1 = mad(y2, 0.3427017942262751343e0F, 0.845784192581041099e0F); // denominator paired with a1
+
+ float a2 = mad(y2, // numerator polynomial in y^2, selected when y >= 0.9
+ mad(y2, 0.3827534993599483396e-4F, -0.12325644183611929e-2F),
+ -0.24069858695196524e0F);
+ float b2 = mad(y2, 0.292529068698052819e0F, 0.72209738473684982e0F); // denominator paired with a2
+
+ int c = y < 0.9f;
+ float a = c ? a1 : a2;
+ float b = c ? b1 : b2;
+ float zlo = mad(MATH_DIVIDE(a, b), y*y2, y); // y + y^3 * MATH_DIVIDE(a, b)
+
+ float p = exp(2.0f * y) + 1.0f;
+ float zhi = 1.0F - MATH_DIVIDE(2.0F, p); // formula 2 above, evaluated at y
+
+ float z = y <= 1.0f ? zlo : zhi; // rational form up to y = 1, exp-based form beyond
+ z = as_float(xs | as_uint(z)); // OR the sign bits of x back in
+
+ // Edge cases
+ float sone = as_float(0x3f800000U | xs); // bit pattern of 1.0f combined with the sign of x
+ z = y > large_threshold ? sone : z;
+ z = aux < 0x39000000 | aux > 0x7f800000 ? x : z; // bits below 2^-13 or above +inf (NaN): return x unchanged
+
+ return z;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, tanh, float); // NOTE(review): macro is in ../clcmacro.h (not visible); presumably emits the float vector overloads from the scalar one
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double tanh(double x)
+{
+ // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent
+ // to the following three formulae:
+ // 1. (exp(x) - exp(-x))/(exp(x) + exp(-x))
+ // 2. (1 - (2/(exp(2*x) + 1 )))
+ // 3. (exp(2*x) - 1)/(exp(2*x) + 1)
+ // but computationally, some formulae are better on some ranges.
+
+ // The point at which e^-x is insignificant compared to e^x = ln(2^27)
+ const double large_threshold = 0x1.2b708872320e2p+4;
+
+ ulong ux = as_ulong(x);
+ ulong ax = ux & ~SIGNBIT_DP64; // NOTE(review): SIGNBIT_DP64 comes from math.h; presumably the sign-bit mask, so ax = bits of |x|
+ ulong sx = ux ^ ax; // the bits just cleared, i.e. the sign of x
+ double y = as_double(ax); // y = |x| under the assumption above
+ double y2 = y * y;
+
+ // y < 0.9
+ double znl = fma(y2, // numerator polynomial in y^2
+ fma(y2,
+ fma(y2, -0.142077926378834722618091e-7, -0.200047621071909498730453e-3),
+ -0.176016349003044679402273e-1),
+ -0.274030424656179760118928e0);
+
+ double zdl = fma(y2, // denominator polynomial in y^2
+ fma(y2,
+ fma(y2, 0.2091140262529164482568557e-3, 0.201562166026937652780575e-1),
+ 0.381641414288328849317962e0),
+ 0.822091273968539282568011e0);
+
+ // 0.9 <= y <= 1
+ double znm = fma(y2, // numerator polynomial in y^2
+ fma(y2,
+ fma(y2, -0.115475878996143396378318e-7, -0.165597043903549960486816e-3),
+ -0.146173047288731678404066e-1),
+ -0.227793870659088295252442e0);
+
+ double zdm = fma(y2, // denominator polynomial in y^2
+ fma(y2,
+ fma(y2, 0.173076050126225961768710e-3, 0.167358775461896562588695e-1),
+ 0.317204558977294374244770e0),
+ 0.683381611977295894959554e0);
+
+ int c = y < 0.9;
+ double zn = c ? znl : znm;
+ double zd = c ? zdl : zdm;
+ double z = y + y*y2 * MATH_DIVIDE(zn, zd); // y + y^3 * MATH_DIVIDE(zn, zd)
+
+ // y > 1
+ double p = exp(2.0 * y) + 1.0;
+ double zg = 1.0 - 2.0 / p; // formula 2 above, evaluated at y
+
+ z = y > 1.0 ? zg : z;
+
+ // Other cases
+ z = y < 0x1.0p-28 | ax > PINFBITPATT_DP64 ? x : z; // tiny y, or bits above PINFBITPATT_DP64 (presumably NaN): pass x through
+
+ z = y > large_threshold ? 1.0 : z; // saturate to 1; the sign is restored on the next line
+
+ return as_double(sx | as_ulong(z)); // OR the sign bits of x back in
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, tanh, double); // NOTE(review): macro is in ../clcmacro.h (not visible); presumably emits the double vector overloads from the scalar one
+
+#endif // cl_khr_fp64
diff --git a/libclc/generic/lib/relational/all.cl b/libclc/generic/lib/relational/all.cl
new file mode 100644
index 0000000..607d7a9
--- /dev/null
+++ b/libclc/generic/lib/relational/all.cl
@@ -0,0 +1,29 @@
+#include <clc/clc.h>
+
+#define _CLC_ALL(v) (((v) >> ((sizeof(v) * 8) - 1)) & 0x1) // 1 if the most significant bit of v is set, else 0
+#define _CLC_ALL2(v) (_CLC_ALL((v).s0) & _CLC_ALL((v).s1)) // each wider variant ANDs the MSB test of every component
+#define _CLC_ALL3(v) (_CLC_ALL2((v)) & _CLC_ALL((v).s2))
+#define _CLC_ALL4(v) (_CLC_ALL3((v)) & _CLC_ALL((v).s3))
+#define _CLC_ALL8(v) (_CLC_ALL4((v)) & _CLC_ALL((v).s4) & _CLC_ALL((v).s5) \
+ & _CLC_ALL((v).s6) & _CLC_ALL((v).s7))
+#define _CLC_ALL16(v) (_CLC_ALL8((v)) & _CLC_ALL((v).s8) & _CLC_ALL((v).s9) \
+ & _CLC_ALL((v).sA) & _CLC_ALL((v).sB) \
+ & _CLC_ALL((v).sC) & _CLC_ALL((v).sD) \
+ & _CLC_ALL((v).sE) & _CLC_ALL((v).sf))
+
+
+#define ALL_ID(TYPE) /* signature of the all() overload taking TYPE */ \
+ _CLC_OVERLOAD _CLC_DEF int all(TYPE v)
+
+#define ALL_VECTORIZE(TYPE) /* defines all() for TYPE and for its 2/3/4/8/16-component vectors */ \
+ ALL_ID(TYPE) { return _CLC_ALL(v); } \
+ ALL_ID(TYPE##2) { return _CLC_ALL2(v); } \
+ ALL_ID(TYPE##3) { return _CLC_ALL3(v); } \
+ ALL_ID(TYPE##4) { return _CLC_ALL4(v); } \
+ ALL_ID(TYPE##8) { return _CLC_ALL8(v); } \
+ ALL_ID(TYPE##16) { return _CLC_ALL16(v); }
+
+ALL_VECTORIZE(char) // instantiated for the signed integer types only
+ALL_VECTORIZE(short)
+ALL_VECTORIZE(int)
+ALL_VECTORIZE(long)
diff --git a/libclc/generic/lib/relational/any.cl b/libclc/generic/lib/relational/any.cl
new file mode 100644
index 0000000..4d37210
--- /dev/null
+++ b/libclc/generic/lib/relational/any.cl
@@ -0,0 +1,30 @@
+#include <clc/clc.h>
+
+#define _CLC_ANY(v) (((v) >> ((sizeof(v) * 8) - 1)) & 0x1) // 1 if the most significant bit of v is set, else 0
+#define _CLC_ANY2(v) (_CLC_ANY((v).s0) | _CLC_ANY((v).s1)) // each wider variant ORs the MSB test of every component
+#define _CLC_ANY3(v) (_CLC_ANY2((v)) | _CLC_ANY((v).s2))
+#define _CLC_ANY4(v) (_CLC_ANY3((v)) | _CLC_ANY((v).s3))
+#define _CLC_ANY8(v) (_CLC_ANY4((v)) | _CLC_ANY((v).s4) | _CLC_ANY((v).s5) \
+ | _CLC_ANY((v).s6) | _CLC_ANY((v).s7))
+#define _CLC_ANY16(v) (_CLC_ANY8((v)) | _CLC_ANY((v).s8) | _CLC_ANY((v).s9) \
+ | _CLC_ANY((v).sA) | _CLC_ANY((v).sB) \
+ | _CLC_ANY((v).sC) | _CLC_ANY((v).sD) \
+ | _CLC_ANY((v).sE) | _CLC_ANY((v).sf))
+
+
+#define ANY_ID(TYPE) /* signature of the any() overload taking TYPE */ \
+ _CLC_OVERLOAD _CLC_DEF int any(TYPE v)
+
+#define ANY_VECTORIZE(TYPE) /* defines any() for TYPE and for its 2/3/4/8/16-component vectors */ \
+ ANY_ID(TYPE) { return _CLC_ANY(v); } \
+ ANY_ID(TYPE##2) { return _CLC_ANY2(v); } \
+ ANY_ID(TYPE##3) { return _CLC_ANY3(v); } \
+ ANY_ID(TYPE##4) { return _CLC_ANY4(v); } \
+ ANY_ID(TYPE##8) { return _CLC_ANY8(v); } \
+ ANY_ID(TYPE##16) { return _CLC_ANY16(v); }
+
+ANY_VECTORIZE(char) // instantiated for the signed integer types only
+ANY_VECTORIZE(short)
+ANY_VECTORIZE(int)
+ANY_VECTORIZE(long)
+
diff --git a/libclc/generic/lib/relational/bitselect.cl b/libclc/generic/lib/relational/bitselect.cl
new file mode 100644
index 0000000..af4e70c
--- /dev/null
+++ b/libclc/generic/lib/relational/bitselect.cl
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "../clcmacro.h"
+
+#define __CLC_BODY <bitselect.inc> // generic bitselect body; NOTE(review): presumably instantiated per integer gentype by gentype.inc - confirm
+#include <clc/integer/gentype.inc>
+#undef __CLC_BODY
+
+#define FLOAT_BITSELECT(f_type, i_type, width) /* floating-point overload: reinterpret operands as i_type, bitselect, reinterpret back */ \
+ _CLC_OVERLOAD _CLC_DEF f_type##width bitselect(f_type##width x, f_type##width y, f_type##width z) { \
+ return as_##f_type##width(bitselect(as_##i_type##width(x), as_##i_type##width(y), as_##i_type##width(z))); \
+}
+
+FLOAT_BITSELECT(float, uint, ) // empty width argument = scalar overload
+FLOAT_BITSELECT(float, uint, 2)
+FLOAT_BITSELECT(float, uint, 3)
+FLOAT_BITSELECT(float, uint, 4)
+FLOAT_BITSELECT(float, uint, 8)
+FLOAT_BITSELECT(float, uint, 16)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+FLOAT_BITSELECT(double, ulong, ) // double overloads go through the same-width ulong type
+FLOAT_BITSELECT(double, ulong, 2)
+FLOAT_BITSELECT(double, ulong, 3)
+FLOAT_BITSELECT(double, ulong, 4)
+FLOAT_BITSELECT(double, ulong, 8)
+FLOAT_BITSELECT(double, ulong, 16)
+
+#endif
diff --git a/libclc/generic/lib/relational/bitselect.inc b/libclc/generic/lib/relational/bitselect.inc
new file mode 100644
index 0000000..3a78a8c
--- /dev/null
+++ b/libclc/generic/lib/relational/bitselect.inc
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE bitselect(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z) {
+ return (y & z) | (x & ~z); // per bit: take y where z is 1, x where z is 0
+}
diff --git a/libclc/generic/lib/relational/isequal.cl b/libclc/generic/lib/relational/isequal.cl
new file mode 100644
index 0000000..9d79ba6
--- /dev/null
+++ b/libclc/generic/lib/relational/isequal.cl
@@ -0,0 +1,30 @@
+#include <clc/clc.h>
+
+#define _CLC_DEFINE_ISEQUAL(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) /* defines FUNCTION(x, y) as the plain == comparison */ \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \
+ return (x == y); \
+} \
+
+_CLC_DEFINE_ISEQUAL(int, isequal, float, float)
+_CLC_DEFINE_ISEQUAL(int2, isequal, float2, float2) // vector overloads apply == to the whole vector
+_CLC_DEFINE_ISEQUAL(int3, isequal, float3, float3)
+_CLC_DEFINE_ISEQUAL(int4, isequal, float4, float4)
+_CLC_DEFINE_ISEQUAL(int8, isequal, float8, float8)
+_CLC_DEFINE_ISEQUAL(int16, isequal, float16, float16)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of isequal(double) returns an int, but the vector versions
+// return long.
+_CLC_DEFINE_ISEQUAL(int, isequal, double, double)
+_CLC_DEFINE_ISEQUAL(long2, isequal, double2, double2)
+_CLC_DEFINE_ISEQUAL(long3, isequal, double3, double3)
+_CLC_DEFINE_ISEQUAL(long4, isequal, double4, double4)
+_CLC_DEFINE_ISEQUAL(long8, isequal, double8, double8)
+_CLC_DEFINE_ISEQUAL(long16, isequal, double16, double16)
+
+#endif
+
+#undef _CLC_DEFINE_ISEQUAL
\ No newline at end of file
diff --git a/libclc/generic/lib/relational/isfinite.cl b/libclc/generic/lib/relational/isfinite.cl
new file mode 100644
index 0000000..d0658c0
--- /dev/null
+++ b/libclc/generic/lib/relational/isfinite.cl
@@ -0,0 +1,18 @@
+#include <clc/clc.h>
+#include "relational.h"
+
+_CLC_DEFINE_RELATIONAL_UNARY(int, isfinite, __builtin_isfinite, float) // NOTE(review): macro body is outside this view; presumably emits the float scalar and vector overloads - confirm
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of isfinite(double) returns an int, but the vector versions
+// return long.
+_CLC_DEF _CLC_OVERLOAD int isfinite(double x) {
+ return __builtin_isfinite(x); // scalar double case forwards straight to the compiler builtin
+}
+
+_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, isfinite, double) // vector double overloads returning longN, built per component from the scalar overload
+
+#endif
diff --git a/libclc/generic/lib/relational/isgreater.cl b/libclc/generic/lib/relational/isgreater.cl
new file mode 100644
index 0000000..79456e5
--- /dev/null
+++ b/libclc/generic/lib/relational/isgreater.cl
@@ -0,0 +1,22 @@
+#include <clc/clc.h>
+#include "relational.h"
+
+// Note: It would be nice to use __builtin_isgreater with vector inputs, but it seems to only take scalar values as
+// input, which will produce incorrect output for vector input types.
+
+_CLC_DEFINE_RELATIONAL_BINARY(int, isgreater, __builtin_isgreater, float, float) // NOTE(review): macro body is outside this view; presumably emits the float scalar and vector overloads - confirm
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of isgreater(double, double) returns an int, but the vector versions
+// return long.
+
+_CLC_DEF _CLC_OVERLOAD int isgreater(double x, double y){
+ return __builtin_isgreater(x, y); // scalar double case forwards straight to the compiler builtin
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isgreater, double, double) // NOTE(review): presumably the double vector overloads returning longN; macro body not visible here
+
+#endif
diff --git a/libclc/generic/lib/relational/isgreaterequal.cl b/libclc/generic/lib/relational/isgreaterequal.cl
new file mode 100644
index 0000000..2d5ebe57
--- /dev/null
+++ b/libclc/generic/lib/relational/isgreaterequal.cl
@@ -0,0 +1,22 @@
+#include <clc/clc.h>
+#include "relational.h"
+
+// Note: It would be nice to use __builtin_isgreaterequal with vector inputs, but it seems to only take scalar values as
+// input, which will produce incorrect output for vector input types.
+
+_CLC_DEFINE_RELATIONAL_BINARY(int, isgreaterequal, __builtin_isgreaterequal, float, float) // NOTE(review): macro body is outside this view; presumably emits the float scalar and vector overloads - confirm
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of isgreaterequal(double, double) returns an int, but the vector versions
+// return long.
+
+_CLC_DEF _CLC_OVERLOAD int isgreaterequal(double x, double y){
+ return __builtin_isgreaterequal(x, y); // scalar double case forwards straight to the compiler builtin
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isgreaterequal, double, double) // NOTE(review): presumably the double vector overloads returning longN; macro body not visible here
+
+#endif
diff --git a/libclc/generic/lib/relational/isinf.cl b/libclc/generic/lib/relational/isinf.cl
new file mode 100644
index 0000000..1452d91
--- /dev/null
+++ b/libclc/generic/lib/relational/isinf.cl
@@ -0,0 +1,18 @@
+#include <clc/clc.h>
+#include "relational.h"
+
+_CLC_DEFINE_RELATIONAL_UNARY(int, isinf, __builtin_isinf, float) // NOTE(review): macro body is outside this view; presumably emits the float scalar and vector overloads - confirm
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of isinf(double) returns an int, but the vector versions
+// return long.
+_CLC_DEF _CLC_OVERLOAD int isinf(double x) {
+ return __builtin_isinf(x); // scalar double case forwards straight to the compiler builtin
+}
+
+_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, isinf, double) // vector double overloads returning longN, built per component from the scalar overload
+
+#endif
diff --git a/libclc/generic/lib/relational/isless.cl b/libclc/generic/lib/relational/isless.cl
new file mode 100644
index 0000000..56a3e13
--- /dev/null
+++ b/libclc/generic/lib/relational/isless.cl
@@ -0,0 +1,22 @@
+#include <clc/clc.h>
+#include "relational.h"
+
+// Note: It would be nice to use __builtin_isless with vector inputs, but it seems to only take scalar values as
+// input, which will produce incorrect output for vector input types.
+
+_CLC_DEFINE_RELATIONAL_BINARY(int, isless, __builtin_isless, float, float) // NOTE(review): macro body is outside this view; presumably emits the float scalar and vector overloads - confirm
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of isless(double, double) returns an int, but the vector versions
+// return long.
+
+_CLC_DEF _CLC_OVERLOAD int isless(double x, double y){
+ return __builtin_isless(x, y); // scalar double case forwards straight to the compiler builtin
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isless, double, double) // NOTE(review): presumably the double vector overloads returning longN; macro body not visible here
+
+#endif
diff --git a/libclc/generic/lib/relational/islessequal.cl b/libclc/generic/lib/relational/islessequal.cl
new file mode 100644
index 0000000..259c307
--- /dev/null
+++ b/libclc/generic/lib/relational/islessequal.cl
@@ -0,0 +1,22 @@
+#include <clc/clc.h>
+#include "relational.h"
+
+// Note: It would be nice to use __builtin_islessequal with vector inputs, but it seems to only take scalar values as
+// input, which will produce incorrect output for vector input types.
+
+_CLC_DEFINE_RELATIONAL_BINARY(int, islessequal, __builtin_islessequal, float, float) // NOTE(review): macro body is outside this view; presumably emits the float scalar and vector overloads - confirm
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of islessequal(double, double) returns an int, but the vector versions
+// return long.
+
+_CLC_DEF _CLC_OVERLOAD int islessequal(double x, double y){
+ return __builtin_islessequal(x, y); // scalar double case forwards straight to the compiler builtin
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, islessequal, double, double) // NOTE(review): presumably the double vector overloads returning longN; macro body not visible here
+
+#endif
diff --git a/libclc/generic/lib/relational/islessgreater.cl b/libclc/generic/lib/relational/islessgreater.cl
new file mode 100644
index 0000000..fc029f3
--- /dev/null
+++ b/libclc/generic/lib/relational/islessgreater.cl
@@ -0,0 +1,22 @@
+#include <clc/clc.h>
+#include "relational.h"
+
+// Note: It would be nice to use __builtin_islessgreater with vector inputs, but it seems to only take scalar values as
+// input, which will produce incorrect output for vector input types.
+
+_CLC_DEFINE_RELATIONAL_BINARY(int, islessgreater, __builtin_islessgreater, float, float) // NOTE(review): macro body is outside this view; presumably emits the float scalar and vector overloads - confirm
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of islessgreater(double, double) returns an int, but the vector versions
+// return long.
+
+_CLC_DEF _CLC_OVERLOAD int islessgreater(double x, double y){
+ return __builtin_islessgreater(x, y); // scalar double case forwards straight to the compiler builtin
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, islessgreater, double, double) // NOTE(review): presumably the double vector overloads returning longN; macro body not visible here
+
+#endif
diff --git a/libclc/generic/lib/relational/isnan.cl b/libclc/generic/lib/relational/isnan.cl
new file mode 100644
index 0000000..f82dc5d
--- /dev/null
+++ b/libclc/generic/lib/relational/isnan.cl
@@ -0,0 +1,18 @@
+#include <clc/clc.h>
+#include "relational.h"
+
+_CLC_DEFINE_RELATIONAL_UNARY(int, isnan, __builtin_isnan, float) // NOTE(review): macro body is outside this view; presumably emits the float scalar and vector overloads - confirm
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of isnan(double) returns an int, but the vector versions
+// return long.
+_CLC_DEF _CLC_OVERLOAD int isnan(double x) {
+ return __builtin_isnan(x); // scalar double case forwards straight to the compiler builtin
+}
+
+_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, isnan, double) // vector double overloads returning longN, built per component from the scalar overload
+
+#endif
diff --git a/libclc/generic/lib/relational/isnormal.cl b/libclc/generic/lib/relational/isnormal.cl
new file mode 100644
index 0000000..2e6b42d
--- /dev/null
+++ b/libclc/generic/lib/relational/isnormal.cl
@@ -0,0 +1,18 @@
+#include <clc/clc.h>
+#include "relational.h"
+
+_CLC_DEFINE_RELATIONAL_UNARY(int, isnormal, __builtin_isnormal, float) // NOTE(review): macro body is outside this view; presumably emits the float scalar and vector overloads - confirm
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of isnormal(double) returns an int, but the vector versions
+// return long.
+_CLC_DEF _CLC_OVERLOAD int isnormal(double x) {
+ return __builtin_isnormal(x); // scalar double case forwards straight to the compiler builtin
+}
+
+_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, isnormal, double) // vector double overloads returning longN, built per component from the scalar overload
+
+#endif
diff --git a/libclc/generic/lib/relational/isnotequal.cl b/libclc/generic/lib/relational/isnotequal.cl
new file mode 100644
index 0000000..787fd8d
--- /dev/null
+++ b/libclc/generic/lib/relational/isnotequal.cl
@@ -0,0 +1,23 @@
+#include <clc/clc.h>
+#include "relational.h"
+
+#define _CLC_DEFINE_ISNOTEQUAL(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) /* defines FUNCTION(x, y) as the plain != comparison */ \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \
+ return (x != y); \
+} \
+
+_CLC_DEFINE_ISNOTEQUAL(int, isnotequal, float, float) // scalar float overload
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(int, isnotequal, float, float) // NOTE(review): macro body not visible here; presumably builds the float vector overloads from the scalar one
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of isnotequal(double, double) returns an int, but the vector versions
+// return long.
+
+_CLC_DEFINE_ISNOTEQUAL(int, isnotequal, double, double) // scalar double overload
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isnotequal, double, double) // NOTE(review): presumably the double vector overloads returning longN
+
+#endif
+
+#undef _CLC_DEFINE_ISNOTEQUAL
diff --git a/libclc/generic/lib/relational/isordered.cl b/libclc/generic/lib/relational/isordered.cl
new file mode 100644
index 0000000..ebda2eb
--- /dev/null
+++ b/libclc/generic/lib/relational/isordered.cl
@@ -0,0 +1,23 @@
+#include <clc/clc.h>
+#include "relational.h"
+
+#define _CLC_DEFINE_ISORDERED(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) /* defines FUNCTION(x, y): true when x and y each compare equal to themselves */ \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \
+ return isequal(x, x) && isequal(y, y); \
+} \
+
+_CLC_DEFINE_ISORDERED(int, isordered, float, float) // scalar float overload
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(int, isordered, float, float) // NOTE(review): macro body not visible here; presumably builds the float vector overloads from the scalar one
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of isordered(double, double) returns an int, but the vector versions
+// return long.
+
+_CLC_DEFINE_ISORDERED(int, isordered, double, double) // scalar double overload
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isordered, double, double) // NOTE(review): presumably the double vector overloads returning longN
+
+#endif
+
+#undef _CLC_DEFINE_ISORDERED
diff --git a/libclc/generic/lib/relational/isunordered.cl b/libclc/generic/lib/relational/isunordered.cl
new file mode 100644
index 0000000..8bc5e3f
--- /dev/null
+++ b/libclc/generic/lib/relational/isunordered.cl
@@ -0,0 +1,22 @@
+#include <clc/clc.h>
+#include "relational.h"
+
+// Note: It would be nice to use __builtin_isunordered with vector inputs, but it seems to only take scalar values as
+// input, which will produce incorrect output for vector input types.
+
+_CLC_DEFINE_RELATIONAL_BINARY(int, isunordered, __builtin_isunordered, float, float) // NOTE(review): macro body is outside this view; presumably emits the float scalar and vector overloads - confirm
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar version of isunordered(double, double) returns an int, but the vector versions
+// return long.
+
+_CLC_DEF _CLC_OVERLOAD int isunordered(double x, double y){
+ return __builtin_isunordered(x, y); // scalar double case forwards straight to the compiler builtin
+}
+
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isunordered, double, double) // NOTE(review): presumably the double vector overloads returning longN; macro body not visible here
+
+#endif
diff --git a/libclc/generic/lib/relational/relational.h b/libclc/generic/lib/relational/relational.h
new file mode 100644
index 0000000..e492750
--- /dev/null
+++ b/libclc/generic/lib/relational/relational.h
@@ -0,0 +1,117 @@
+/*
+ * Contains relational macros that have to return 1 for scalar and -1 for vector
+ * when the result is true.
+ */
+
+#define _CLC_DEFINE_RELATIONAL_UNARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_NAME, ARG_TYPE) /* scalar overload of FUNCTION */ \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x){ \
+ return BUILTIN_NAME(x); /* the builtin's result is returned unchanged */ \
+}
+
+#define _CLC_DEFINE_RELATIONAL_UNARY_VEC2(RET_TYPE, FUNCTION, ARG_TYPE) /* 2-component overload of FUNCTION */ \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
+ return (RET_TYPE)( (RET_TYPE){FUNCTION(x.lo), FUNCTION(x.hi)} != (RET_TYPE)0); /* scalar call per half, then a lane-wise test against zero */ \
+}
+
+#define _CLC_DEFINE_RELATIONAL_UNARY_VEC3(RET_TYPE, FUNCTION, ARG_TYPE) /* 3-component overload of FUNCTION */ \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
+ return (RET_TYPE)( (RET_TYPE){FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2)} != (RET_TYPE)0); /* scalar call per component, then a lane-wise test against zero */ \
+}
+
+#define _CLC_DEFINE_RELATIONAL_UNARY_VEC4(RET_TYPE, FUNCTION, ARG_TYPE) /* 4-component overload of FUNCTION */ \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
+ return (RET_TYPE)( \
+ (RET_TYPE){ /* one scalar FUNCTION call per component */ \
+ FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3) \
+ } != (RET_TYPE)0); /* lane-wise test against zero */ \
+}
+
+#define _CLC_DEFINE_RELATIONAL_UNARY_VEC8(RET_TYPE, FUNCTION, ARG_TYPE) /* 8-component overload of FUNCTION */ \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
+ return (RET_TYPE)( \
+ (RET_TYPE){ /* one scalar FUNCTION call per component */ \
+ FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3), \
+ FUNCTION(x.s4), FUNCTION(x.s5), FUNCTION(x.s6), FUNCTION(x.s7) \
+ } != (RET_TYPE)0); /* lane-wise test against zero */ \
+}
+
+#define _CLC_DEFINE_RELATIONAL_UNARY_VEC16(RET_TYPE, FUNCTION, ARG_TYPE) /* 16-component overload of FUNCTION */ \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
+ return (RET_TYPE)( \
+ (RET_TYPE){ /* one scalar FUNCTION call per component, s0 through sf */ \
+ FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3), \
+ FUNCTION(x.s4), FUNCTION(x.s5), FUNCTION(x.s6), FUNCTION(x.s7), \
+ FUNCTION(x.s8), FUNCTION(x.s9), FUNCTION(x.sa), FUNCTION(x.sb), \
+ FUNCTION(x.sc), FUNCTION(x.sd), FUNCTION(x.se), FUNCTION(x.sf) \
+ } != (RET_TYPE)0); /* lane-wise test against zero */ \
+}
+
+#define _CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(RET_TYPE, FUNCTION, ARG_TYPE) /* emits the 2-, 3-, 4-, 8- and 16-wide overloads */ \
+_CLC_DEFINE_RELATIONAL_UNARY_VEC2(RET_TYPE##2, FUNCTION, ARG_TYPE##2) /* the width is pasted onto both type names */ \
+_CLC_DEFINE_RELATIONAL_UNARY_VEC3(RET_TYPE##3, FUNCTION, ARG_TYPE##3) \
+_CLC_DEFINE_RELATIONAL_UNARY_VEC4(RET_TYPE##4, FUNCTION, ARG_TYPE##4) \
+_CLC_DEFINE_RELATIONAL_UNARY_VEC8(RET_TYPE##8, FUNCTION, ARG_TYPE##8) \
+_CLC_DEFINE_RELATIONAL_UNARY_VEC16(RET_TYPE##16, FUNCTION, ARG_TYPE##16)
+
+#define _CLC_DEFINE_RELATIONAL_UNARY(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, ARG_TYPE) /* scalar overload plus every vector width */ \
+_CLC_DEFINE_RELATIONAL_UNARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, ARG_TYPE) /* only the scalar form uses the builtin */ \
+_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(RET_TYPE, FUNCTION, ARG_TYPE) \
+
+#define _CLC_DEFINE_RELATIONAL_BINARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_NAME, ARG0_TYPE, ARG1_TYPE) /* scalar two-argument overload of FUNCTION */ \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y){ \
+ return BUILTIN_NAME(x, y); /* the builtin's result is returned unchanged */ \
+}
+
+#define _CLC_DEFINE_RELATIONAL_BINARY_VEC(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) /* NOTE(review): body matches _CLC_DEFINE_RELATIONAL_BINARY_VEC2 and no use is visible here - confirm it is still needed */ \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \
+ return (RET_TYPE)( (RET_TYPE){FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)} != (RET_TYPE)0); /* scalar call per half, then a lane-wise test against zero */ \
+}
+
+#define _CLC_DEFINE_RELATIONAL_BINARY_VEC2(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) /* 2-component two-argument overload of FUNCTION */ \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \
+ return (RET_TYPE)( (RET_TYPE){FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)} != (RET_TYPE)0); /* scalar call per half, then a lane-wise test against zero */ \
+}
+
+#define _CLC_DEFINE_RELATIONAL_BINARY_VEC3(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) /* 3-component two-argument overload of FUNCTION */ \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \
+ return (RET_TYPE)( (RET_TYPE){FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), FUNCTION(x.s2, y.s2)} != (RET_TYPE)0); /* scalar call per component pair, then a lane-wise test against zero */ \
+}
+
+#define _CLC_DEFINE_RELATIONAL_BINARY_VEC4(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) /* 4-component two-argument overload of FUNCTION */ \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \
+ return (RET_TYPE)( \
+ (RET_TYPE){ /* one scalar FUNCTION call per component pair */ \
+ FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3) \
+ } != (RET_TYPE)0); /* lane-wise test against zero */ \
+}
+
+#define _CLC_DEFINE_RELATIONAL_BINARY_VEC8(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) /* 8-component two-argument overload of FUNCTION */ \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \
+ return (RET_TYPE)( \
+ (RET_TYPE){ /* one scalar FUNCTION call per component pair */ \
+ FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3), \
+ FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7) \
+ } != (RET_TYPE)0); /* lane-wise test against zero */ \
+}
+
+#define _CLC_DEFINE_RELATIONAL_BINARY_VEC16(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) /* 16-component two-argument overload of FUNCTION */ \
+_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \
+ return (RET_TYPE)( \
+ (RET_TYPE){ /* one scalar FUNCTION call per component pair, s0 through sf */ \
+ FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3), \
+ FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7), \
+ FUNCTION(x.s8, y.s8), FUNCTION(x.s9, y.s9), FUNCTION(x.sa, y.sa), FUNCTION(x.sb, y.sb), \
+ FUNCTION(x.sc, y.sc), FUNCTION(x.sd, y.sd), FUNCTION(x.se, y.se), FUNCTION(x.sf, y.sf) \
+ } != (RET_TYPE)0); /* lane-wise test against zero */ \
+}
+
+#define _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) /* emits the 2-, 3-, 4-, 8- and 16-wide overloads */ \
+_CLC_DEFINE_RELATIONAL_BINARY_VEC2(RET_TYPE##2, FUNCTION, ARG0_TYPE##2, ARG1_TYPE##2) /* the width is pasted onto all three type names */ \
+_CLC_DEFINE_RELATIONAL_BINARY_VEC3(RET_TYPE##3, FUNCTION, ARG0_TYPE##3, ARG1_TYPE##3) \
+_CLC_DEFINE_RELATIONAL_BINARY_VEC4(RET_TYPE##4, FUNCTION, ARG0_TYPE##4, ARG1_TYPE##4) \
+_CLC_DEFINE_RELATIONAL_BINARY_VEC8(RET_TYPE##8, FUNCTION, ARG0_TYPE##8, ARG1_TYPE##8) \
+_CLC_DEFINE_RELATIONAL_BINARY_VEC16(RET_TYPE##16, FUNCTION, ARG0_TYPE##16, ARG1_TYPE##16)
+
+#define _CLC_DEFINE_RELATIONAL_BINARY(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, ARG0_TYPE, ARG1_TYPE) /* scalar overload plus every vector width */ \
+_CLC_DEFINE_RELATIONAL_BINARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, ARG0_TYPE, ARG1_TYPE) /* only the scalar form uses the builtin */ \
+_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE)
diff --git a/libclc/generic/lib/relational/signbit.cl b/libclc/generic/lib/relational/signbit.cl
new file mode 100644
index 0000000..ab37d2f
--- /dev/null
+++ b/libclc/generic/lib/relational/signbit.cl
@@ -0,0 +1,19 @@
+#include <clc/clc.h>
+#include "relational.h"
+
+_CLC_DEFINE_RELATIONAL_UNARY(int, signbit, __builtin_signbitf, float) // float scalar (via __builtin_signbitf) and float2..float16 overloads
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// The scalar signbit(double) overload returns int, while the vector overloads
+// return long vectors, so the scalar case is written out by hand below.
+
+_CLC_DEF _CLC_OVERLOAD int signbit(double x){
+ return __builtin_signbit(x); // scalar result comes straight from the compiler builtin
+}
+
+_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, signbit, double) // double2..double16 overloads, built from the scalar one
+
+#endif
diff --git a/libclc/generic/lib/shared/clamp.cl b/libclc/generic/lib/shared/clamp.cl
new file mode 100644
index 0000000..c79a358
--- /dev/null
+++ b/libclc/generic/lib/shared/clamp.cl
@@ -0,0 +1,11 @@
+#include <clc/clc.h>
+
+#define __CLC_BODY <clamp.inc> // body file for the gentype include below; presumably expanded once per integer type - confirm in gentype.inc
+#include <clc/integer/gentype.inc>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <clamp.inc> // defined a second time ahead of the math gentype include
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/shared/clamp.inc b/libclc/generic/lib/shared/clamp.inc
new file mode 100644
index 0000000..c918f9c
--- /dev/null
+++ b/libclc/generic/lib/shared/clamp.inc
@@ -0,0 +1,9 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE clamp(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z) {
+ return (z < x ? z : (y > x ? y : x)); // the upper bound z is tested first, then the lower bound y
+}
+
+#ifndef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE clamp(__CLC_GENTYPE x, __CLC_SCALAR_GENTYPE y, __CLC_SCALAR_GENTYPE z) {
+ return clamp(x, (__CLC_GENTYPE)y, (__CLC_GENTYPE)z); // widen both scalar bounds, then reuse the gentype overload
+}
+#endif
diff --git a/libclc/generic/lib/shared/max.cl b/libclc/generic/lib/shared/max.cl
new file mode 100644
index 0000000..1c4457c
--- /dev/null
+++ b/libclc/generic/lib/shared/max.cl
@@ -0,0 +1,11 @@
+#include <clc/clc.h>
+
+#define __CLC_BODY <max.inc> // body file for the gentype include below; presumably expanded once per integer type - confirm in gentype.inc
+#include <clc/integer/gentype.inc>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <max.inc> // defined a second time ahead of the math gentype include
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/shared/max.inc b/libclc/generic/lib/shared/max.inc
new file mode 100644
index 0000000..75a24c0
--- /dev/null
+++ b/libclc/generic/lib/shared/max.inc
@@ -0,0 +1,9 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, __CLC_GENTYPE b) {
+ return (b < a ? a : b); // b is the result whenever the comparison does not hold
+}
+
+#ifndef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b) {
+ return max(a, (__CLC_GENTYPE)b); // widen the scalar operand, then reuse the gentype overload
+}
+#endif
diff --git a/libclc/generic/lib/shared/min.cl b/libclc/generic/lib/shared/min.cl
new file mode 100644
index 0000000..433087a
--- /dev/null
+++ b/libclc/generic/lib/shared/min.cl
@@ -0,0 +1,11 @@
+#include <clc/clc.h>
+
+#define __CLC_BODY <min.inc> // body file for the gentype include below; presumably expanded once per integer type - confirm in gentype.inc
+#include <clc/integer/gentype.inc>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <min.inc> // defined a second time ahead of the math gentype include
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/shared/min.inc b/libclc/generic/lib/shared/min.inc
new file mode 100644
index 0000000..fe42864
--- /dev/null
+++ b/libclc/generic/lib/shared/min.inc
@@ -0,0 +1,9 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_GENTYPE b) {
+ return (b > a ? a : b); // b is the result whenever the comparison does not hold
+}
+
+#ifndef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b) {
+ return min(a, (__CLC_GENTYPE)b); // widen the scalar operand, then reuse the gentype overload
+}
+#endif
diff --git a/libclc/generic/lib/shared/vload.cl b/libclc/generic/lib/shared/vload.cl
new file mode 100644
index 0000000..8897200
--- /dev/null
+++ b/libclc/generic/lib/shared/vload.cl
@@ -0,0 +1,52 @@
+#include <clc/clc.h>
+
+#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) /* vload2/3/4/8/16 overloads for one element type and one address space */ \
+ typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 __attribute__ ((aligned (sizeof(PRIM_TYPE)))); /* vector type with alignment lowered to sizeof(PRIM_TYPE) */ \
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&x[2*offset])); /* offset counts whole vectors, hence the factor 2 */ \
+ } \
+\
+ typedef PRIM_TYPE##3 less_aligned_##ADDR_SPACE##PRIM_TYPE##3 __attribute__ ((aligned (sizeof(PRIM_TYPE)))); /* NOTE(review): not referenced by vload3 below */ \
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ PRIM_TYPE##2 vec = *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&x[3*offset])); /* first two elements through the 2-wide type */ \
+ return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset*3+2]); /* third element is read on its own */ \
+ } \
+\
+ typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4*) (&x[4*offset])); \
+ } \
+\
+ typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8*) (&x[8*offset])); \
+ } \
+\
+ typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16*) (&x[16*offset])); \
+ } \
+
+#define VLOAD_ADDR_SPACES(SCALAR_TYPE) /* vload overloads for SCALAR_TYPE in each address space listed */ \
+ VLOAD_VECTORIZE(SCALAR_TYPE, __private) \
+ VLOAD_VECTORIZE(SCALAR_TYPE, __local) \
+ VLOAD_VECTORIZE(SCALAR_TYPE, __constant) \
+ VLOAD_VECTORIZE(SCALAR_TYPE, __global)
+
+#define VLOAD_TYPES() /* element types that get vload overloads unconditionally */ \
+ VLOAD_ADDR_SPACES(char) \
+ VLOAD_ADDR_SPACES(uchar) \
+ VLOAD_ADDR_SPACES(short) \
+ VLOAD_ADDR_SPACES(ushort) \
+ VLOAD_ADDR_SPACES(int) \
+ VLOAD_ADDR_SPACES(uint) \
+ VLOAD_ADDR_SPACES(long) \
+ VLOAD_ADDR_SPACES(ulong) \
+ VLOAD_ADDR_SPACES(float) \
+
+VLOAD_TYPES() // emit the overloads for every type listed in VLOAD_TYPES
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+ VLOAD_ADDR_SPACES(double) // double overloads are emitted only when cl_khr_fp64 is defined
+#endif
diff --git a/libclc/generic/lib/shared/vstore.cl b/libclc/generic/lib/shared/vstore.cl
new file mode 100644
index 0000000..4777b7e
--- /dev/null
+++ b/libclc/generic/lib/shared/vstore.cl
@@ -0,0 +1,52 @@
+#include <clc/clc.h>
+
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+
+#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) /* vstore2/3/4/8/16 overloads for one element type and one address space */ \
+ typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 __attribute__ ((aligned (sizeof(PRIM_TYPE)))); /* vector type with alignment lowered to sizeof(PRIM_TYPE) */ \
+ _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+ *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&mem[2*offset])) = vec; /* offset counts whole vectors, hence the factor 2 */ \
+ } \
+\
+ _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+ *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&mem[3*offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1); /* first two elements through the 2-wide type */ \
+ mem[3 * offset + 2] = vec.s2; /* third element is written on its own */ \
+ } \
+\
+ typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
+ _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+ *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4*) (&mem[4*offset])) = vec; \
+ } \
+\
+ typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
+ _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+ *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8*) (&mem[8*offset])) = vec; \
+ } \
+\
+ typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
+ _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+ *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16*) (&mem[16*offset])) = vec; \
+ } \
+
+#define VSTORE_ADDR_SPACES(SCALAR_TYPE) /* vstore overloads for SCALAR_TYPE in each writable address space listed */ \
+ VSTORE_VECTORIZE(SCALAR_TYPE, __private) \
+ VSTORE_VECTORIZE(SCALAR_TYPE, __local) \
+ VSTORE_VECTORIZE(SCALAR_TYPE, __global)
+
+#define VSTORE_TYPES() /* element types that get vstore overloads unconditionally */ \
+ VSTORE_ADDR_SPACES(char) \
+ VSTORE_ADDR_SPACES(uchar) \
+ VSTORE_ADDR_SPACES(short) \
+ VSTORE_ADDR_SPACES(ushort) \
+ VSTORE_ADDR_SPACES(int) \
+ VSTORE_ADDR_SPACES(uint) \
+ VSTORE_ADDR_SPACES(long) \
+ VSTORE_ADDR_SPACES(ulong) \
+ VSTORE_ADDR_SPACES(float) \
+
+VSTORE_TYPES() // emit the overloads for every type listed in VSTORE_TYPES
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+ VSTORE_ADDR_SPACES(double) // double overloads are emitted only when cl_khr_fp64 is defined
+#endif
diff --git a/libclc/generic/lib/subnormal_config.cl b/libclc/generic/lib/subnormal_config.cl
new file mode 100644
index 0000000..4bcecfd
--- /dev/null
+++ b/libclc/generic/lib/subnormal_config.cl
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "config.h"
+
+_CLC_DEF bool __clc_fp16_subnormals_supported(void) {
+ return false; // half-precision subnormals are always reported as unsupported here
+}
+
+_CLC_DEF bool __clc_fp32_subnormals_supported(void) {
+ return false; // single-precision subnormals are always reported as unsupported here
+}
+
+_CLC_DEF bool __clc_fp64_subnormals_supported(void) {
+ return !__clc_subnormals_disabled(); // supported unless the helper reports subnormals as disabled
+}
diff --git a/libclc/generic/lib/subnormal_disable.ll b/libclc/generic/lib/subnormal_disable.ll
new file mode 100644
index 0000000..b935583
--- /dev/null
+++ b/libclc/generic/lib/subnormal_disable.ll
@@ -0,0 +1 @@
+@__CLC_SUBNORMAL_DISABLE = unnamed_addr constant i1 true ; flag definition with value true; __clc_subnormals_disabled() loads this symbol
diff --git a/libclc/generic/lib/subnormal_helper_func.ll b/libclc/generic/lib/subnormal_helper_func.ll
new file mode 100644
index 0000000..fb1b5d2
--- /dev/null
+++ b/libclc/generic/lib/subnormal_helper_func.ll
@@ -0,0 +1,8 @@
+@__CLC_SUBNORMAL_DISABLE = external global i1 ; declared only; the value is defined in another module
+
+define i1 @__clc_subnormals_disabled() #0 { ; returns the current value of the flag
+ %disable = load i1, i1* @__CLC_SUBNORMAL_DISABLE
+ ret i1 %disable
+}
+
+attributes #0 = { alwaysinline } ; attribute group #0 is attached to the function above
diff --git a/libclc/generic/lib/subnormal_use_default.ll b/libclc/generic/lib/subnormal_use_default.ll
new file mode 100644
index 0000000..d70c63b
--- /dev/null
+++ b/libclc/generic/lib/subnormal_use_default.ll
@@ -0,0 +1 @@
+@__CLC_SUBNORMAL_DISABLE = unnamed_addr constant i1 false ; flag definition with value false; __clc_subnormals_disabled() loads this symbol
diff --git a/libclc/generic/lib/workitem/get_global_id.cl b/libclc/generic/lib/workitem/get_global_id.cl
new file mode 100644
index 0000000..fdd83d2
--- /dev/null
+++ b/libclc/generic/lib/workitem/get_global_id.cl
@@ -0,0 +1,5 @@
+#include <clc/clc.h>
+
+_CLC_DEF size_t get_global_id(uint dim) {
+ return get_local_id(dim) + get_local_size(dim)*get_group_id(dim); // NOTE(review): no global-offset term is added here - confirm offsets are handled elsewhere
+}
diff --git a/libclc/generic/lib/workitem/get_global_size.cl b/libclc/generic/lib/workitem/get_global_size.cl
new file mode 100644
index 0000000..5ae649e
--- /dev/null
+++ b/libclc/generic/lib/workitem/get_global_size.cl
@@ -0,0 +1,5 @@
+#include <clc/clc.h>
+
+_CLC_DEF size_t get_global_size(uint dim) {
+ return get_local_size(dim)*get_num_groups(dim); // work-group size times number of groups in dimension dim
+}