diff options
Diffstat (limited to 'libclc/generic/lib')
190 files changed, 9948 insertions, 0 deletions
diff --git a/libclc/generic/lib/SOURCES b/libclc/generic/lib/SOURCES new file mode 100644 index 0000000..c3a5a8a --- /dev/null +++ b/libclc/generic/lib/SOURCES @@ -0,0 +1,139 @@ +subnormal_config.cl +subnormal_helper_func.ll +async/async_work_group_copy.cl +async/async_work_group_strided_copy.cl +async/prefetch.cl +async/wait_group_events.cl +atomic/atomic_xchg.cl +atomic/atomic_impl.ll +cl_khr_global_int32_base_atomics/atom_add.cl +cl_khr_global_int32_base_atomics/atom_cmpxchg.cl +cl_khr_global_int32_base_atomics/atom_dec.cl +cl_khr_global_int32_base_atomics/atom_inc.cl +cl_khr_global_int32_base_atomics/atom_sub.cl +cl_khr_global_int32_base_atomics/atom_xchg.cl +cl_khr_global_int32_extended_atomics/atom_and.cl +cl_khr_global_int32_extended_atomics/atom_max.cl +cl_khr_global_int32_extended_atomics/atom_min.cl +cl_khr_global_int32_extended_atomics/atom_or.cl +cl_khr_global_int32_extended_atomics/atom_xor.cl +cl_khr_local_int32_base_atomics/atom_add.cl +cl_khr_local_int32_base_atomics/atom_cmpxchg.cl +cl_khr_local_int32_base_atomics/atom_dec.cl +cl_khr_local_int32_base_atomics/atom_inc.cl +cl_khr_local_int32_base_atomics/atom_sub.cl +cl_khr_local_int32_base_atomics/atom_xchg.cl +cl_khr_local_int32_extended_atomics/atom_and.cl +cl_khr_local_int32_extended_atomics/atom_max.cl +cl_khr_local_int32_extended_atomics/atom_min.cl +cl_khr_local_int32_extended_atomics/atom_or.cl +cl_khr_local_int32_extended_atomics/atom_xor.cl +convert.cl +common/degrees.cl +common/mix.cl +common/radians.cl +common/sign.cl +common/smoothstep.cl +common/step.cl +geometric/cross.cl +geometric/distance.cl +geometric/dot.cl +geometric/fast_distance.cl +geometric/fast_length.cl +geometric/fast_normalize.cl +geometric/length.cl +geometric/normalize.cl +integer/abs.cl +integer/abs_diff.cl +integer/add_sat.cl +integer/add_sat_if.ll +integer/add_sat_impl.ll +integer/clz.cl +integer/clz_if.ll +integer/clz_impl.ll +integer/hadd.cl +integer/mad24.cl +integer/mad_sat.cl +integer/mul24.cl +integer/mul_hi.cl 
+integer/rhadd.cl +integer/rotate.cl +integer/sub_sat.cl +integer/sub_sat_if.ll +integer/sub_sat_impl.ll +integer/upsample.cl +math/acos.cl +math/acosh.cl +math/acospi.cl +math/asin.cl +math/asinh.cl +math/asinpi.cl +math/atan.cl +math/atan2.cl +math/atan2pi.cl +math/atanh.cl +math/atanpi.cl +math/copysign.cl +math/cos.cl +math/cospi.cl +math/ep_log.cl +math/erfc.cl +math/exp.cl +math/exp_helper.cl +math/exp2.cl +math/exp10.cl +math/fmax.cl +math/fmin.cl +math/fmod.cl +math/fract.cl +math/frexp.cl +math/half_rsqrt.cl +math/half_sqrt.cl +math/hypot.cl +math/clc_ldexp.cl +math/ldexp.cl +math/log.cl +math/log10.cl +math/log1p.cl +math/log2.cl +math/mad.cl +math/modf.cl +math/native_log.cl +math/native_log2.cl +math/tables.cl +math/clc_nextafter.cl +math/nextafter.cl +math/pown.cl +math/sin.cl +math/sincos.cl +math/sincos_helpers.cl +math/sinpi.cl +math/clc_sqrt.cl +math/sqrt.cl +math/tan.cl +math/tanh.cl +relational/all.cl +relational/any.cl +relational/bitselect.cl +relational/isequal.cl +relational/isfinite.cl +relational/isgreater.cl +relational/isgreaterequal.cl +relational/isinf.cl +relational/isless.cl +relational/islessequal.cl +relational/islessgreater.cl +relational/isnan.cl +relational/isnormal.cl +relational/isnotequal.cl +relational/isordered.cl +relational/isunordered.cl +relational/signbit.cl +shared/clamp.cl +shared/max.cl +shared/min.cl +shared/vload.cl +shared/vstore.cl +workitem/get_global_id.cl +workitem/get_global_size.cl +image/get_image_dim.cl diff --git a/libclc/generic/lib/async/async_work_group_copy.cl b/libclc/generic/lib/async/async_work_group_copy.cl new file mode 100644 index 0000000..fe20ecf --- /dev/null +++ b/libclc/generic/lib/async/async_work_group_copy.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY <async_work_group_copy.inc> +#include <clc/async/gentype.inc> +#undef __CLC_BODY diff --git a/libclc/generic/lib/async/async_work_group_copy.inc 
b/libclc/generic/lib/async/async_work_group_copy.inc new file mode 100644 index 0000000..a143ddf --- /dev/null +++ b/libclc/generic/lib/async/async_work_group_copy.inc @@ -0,0 +1,17 @@ +_CLC_OVERLOAD _CLC_DEF event_t async_work_group_copy( + local __CLC_GENTYPE *dst, + const global __CLC_GENTYPE *src, + size_t num_gentypes, + event_t event) { + + return async_work_group_strided_copy(dst, src, num_gentypes, 1, event); +} + +_CLC_OVERLOAD _CLC_DEF event_t async_work_group_copy( + global __CLC_GENTYPE *dst, + const local __CLC_GENTYPE *src, + size_t num_gentypes, + event_t event) { + + return async_work_group_strided_copy(dst, src, num_gentypes, 1, event); +} diff --git a/libclc/generic/lib/async/async_work_group_strided_copy.cl b/libclc/generic/lib/async/async_work_group_strided_copy.cl new file mode 100644 index 0000000..61b8898 --- /dev/null +++ b/libclc/generic/lib/async/async_work_group_strided_copy.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY <async_work_group_strided_copy.inc> +#include <clc/async/gentype.inc> +#undef __CLC_BODY diff --git a/libclc/generic/lib/async/async_work_group_strided_copy.inc b/libclc/generic/lib/async/async_work_group_strided_copy.inc new file mode 100644 index 0000000..d81a8b7 --- /dev/null +++ b/libclc/generic/lib/async/async_work_group_strided_copy.inc @@ -0,0 +1,34 @@ + +#define STRIDED_COPY(dst, src, num_gentypes, dst_stride, src_stride) \ + size_t size = get_local_size(0) * get_local_size(1) * get_local_size(2); \ + size_t id = (get_local_size(1) * get_local_size(2) * get_local_id(0)) + \ + (get_local_size(2) * get_local_id(1)) + \ + get_local_id(2); \ + size_t i; \ + \ + for (i = id; i < num_gentypes; i += size) { \ + dst[i * dst_stride] = src[i * src_stride]; \ + } + + +_CLC_OVERLOAD _CLC_DEF event_t async_work_group_strided_copy( + local __CLC_GENTYPE *dst, + const global __CLC_GENTYPE *src, + size_t num_gentypes, + size_t src_stride, 
+ event_t event) { + + STRIDED_COPY(dst, src, num_gentypes, 1, src_stride); + return event; +} + +_CLC_OVERLOAD _CLC_DEF event_t async_work_group_strided_copy( + global __CLC_GENTYPE *dst, + const local __CLC_GENTYPE *src, + size_t num_gentypes, + size_t dst_stride, + event_t event) { + + STRIDED_COPY(dst, src, num_gentypes, dst_stride, 1); + return event; +} diff --git a/libclc/generic/lib/async/prefetch.cl b/libclc/generic/lib/async/prefetch.cl new file mode 100644 index 0000000..45af21b --- /dev/null +++ b/libclc/generic/lib/async/prefetch.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY <prefetch.inc> +#include <clc/async/gentype.inc> +#undef __CLC_BODY diff --git a/libclc/generic/lib/async/prefetch.inc b/libclc/generic/lib/async/prefetch.inc new file mode 100644 index 0000000..6747e4c --- /dev/null +++ b/libclc/generic/lib/async/prefetch.inc @@ -0,0 +1 @@ +_CLC_OVERLOAD _CLC_DEF void prefetch(const global __CLC_GENTYPE *p, size_t num_gentypes) { } diff --git a/libclc/generic/lib/async/wait_group_events.cl b/libclc/generic/lib/async/wait_group_events.cl new file mode 100644 index 0000000..05c9d58 --- /dev/null +++ b/libclc/generic/lib/async/wait_group_events.cl @@ -0,0 +1,5 @@ +#include <clc/clc.h> + +_CLC_DEF void wait_group_events(int num_events, event_t *event_list) { + barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); +} diff --git a/libclc/generic/lib/atomic/atomic_impl.ll b/libclc/generic/lib/atomic/atomic_impl.ll new file mode 100644 index 0000000..019147f --- /dev/null +++ b/libclc/generic/lib/atomic/atomic_impl.ll @@ -0,0 +1,133 @@ +define i32 @__clc_atomic_add_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_add_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = 
atomicrmw volatile add i32 addrspace(3)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_and_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_and_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile and i32 addrspace(3)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_cmpxchg_addr1(i32 addrspace(1)* nocapture %ptr, i32 %compare, i32 %value) nounwind alwaysinline { +entry: + %0 = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %compare, i32 %value seq_cst seq_cst + %1 = extractvalue { i32, i1 } %0, 0 + ret i32 %1 +} + +define i32 @__clc_atomic_cmpxchg_addr3(i32 addrspace(3)* nocapture %ptr, i32 %compare, i32 %value) nounwind alwaysinline { +entry: + %0 = cmpxchg volatile i32 addrspace(3)* %ptr, i32 %compare, i32 %value seq_cst seq_cst + %1 = extractvalue { i32, i1 } %0, 0 + ret i32 %1 +} + +define i32 @__clc_atomic_max_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_max_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile max i32 addrspace(3)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_min_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_min_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile min i32 addrspace(3)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_or_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw 
volatile or i32 addrspace(1)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_or_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile or i32 addrspace(3)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_umax_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_umax_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile umax i32 addrspace(3)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_umin_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_umin_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile umin i32 addrspace(3)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_sub_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_sub_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile sub i32 addrspace(3)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_xchg_addr1(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_xchg_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile xchg i32 addrspace(3)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_xor_addr1(i32 addrspace(1)* nocapture %ptr, 
i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__clc_atomic_xor_addr3(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile xor i32 addrspace(3)* %ptr, i32 %value seq_cst + ret i32 %0 +} diff --git a/libclc/generic/lib/atomic/atomic_xchg.cl b/libclc/generic/lib/atomic/atomic_xchg.cl new file mode 100644 index 0000000..9aee595 --- /dev/null +++ b/libclc/generic/lib/atomic/atomic_xchg.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +_CLC_OVERLOAD _CLC_DEF float atomic_xchg(volatile global float *p, float val) { + return as_float(atomic_xchg((volatile global int *)p, as_int(val))); +} + +_CLC_OVERLOAD _CLC_DEF float atomic_xchg(volatile local float *p, float val) { + return as_float(atomic_xchg((volatile local int *)p, as_int(val))); +} diff --git a/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_add.cl b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_add.cl new file mode 100644 index 0000000..9151b0c --- /dev/null +++ b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_add.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_add(global TYPE *p, TYPE val) { \ + return atomic_add(p, val); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_cmpxchg.cl b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_cmpxchg.cl new file mode 100644 index 0000000..7647740 --- /dev/null +++ b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_cmpxchg.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_cmpxchg(global TYPE *p, TYPE cmp, TYPE val) { \ + return atomic_cmpxchg(p, cmp, val); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_dec.cl 
b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_dec.cl new file mode 100644 index 0000000..a74158d --- /dev/null +++ b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_dec.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_dec(global TYPE *p) { \ + return atom_sub(p, 1); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_inc.cl b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_inc.cl new file mode 100644 index 0000000..1404b5a --- /dev/null +++ b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_inc.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_inc(global TYPE *p) { \ + return atom_add(p, 1); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_sub.cl b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_sub.cl new file mode 100644 index 0000000..7faa3cc --- /dev/null +++ b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_sub.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_sub(global TYPE *p, TYPE val) { \ + return atomic_sub(p, val); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_xchg.cl b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_xchg.cl new file mode 100644 index 0000000..9c77db1 --- /dev/null +++ b/libclc/generic/lib/cl_khr_global_int32_base_atomics/atom_xchg.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_xchg(global TYPE *p, TYPE val) { \ + return atomic_xchg(p, val); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_and.cl b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_and.cl new file mode 100644 index 0000000..e587969 --- /dev/null +++ 
b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_and.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_and(global TYPE *p, TYPE val) { \ + return atomic_and(p, val); \ +} + +IMPL(int) +IMPL(unsigned int)
\ No newline at end of file diff --git a/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_max.cl b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_max.cl new file mode 100644 index 0000000..09177ed --- /dev/null +++ b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_max.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_max(global TYPE *p, TYPE val) { \ + return atomic_max(p, val); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_min.cl b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_min.cl new file mode 100644 index 0000000..277c41b --- /dev/null +++ b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_min.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_min(global TYPE *p, TYPE val) { \ + return atomic_min(p, val); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_or.cl b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_or.cl new file mode 100644 index 0000000..a936a8e --- /dev/null +++ b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_or.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_or(global TYPE *p, TYPE val) { \ + return atomic_or(p, val); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_xor.cl b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_xor.cl new file mode 100644 index 0000000..1a8e350 --- /dev/null +++ b/libclc/generic/lib/cl_khr_global_int32_extended_atomics/atom_xor.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_xor(global TYPE *p, TYPE val) { \ + return atomic_xor(p, val); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git 
a/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_add.cl b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_add.cl new file mode 100644 index 0000000..a5dea18 --- /dev/null +++ b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_add.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_add(local TYPE *p, TYPE val) { \ + return atomic_add(p, val); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_cmpxchg.cl b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_cmpxchg.cl new file mode 100644 index 0000000..16e9579 --- /dev/null +++ b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_cmpxchg.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_cmpxchg(local TYPE *p, TYPE cmp, TYPE val) { \ + return atomic_cmpxchg(p, cmp, val); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_dec.cl b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_dec.cl new file mode 100644 index 0000000..d22c333 --- /dev/null +++ b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_dec.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_dec(local TYPE *p) { \ + return atom_sub(p, 1); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_inc.cl b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_inc.cl new file mode 100644 index 0000000..4ba0d06 --- /dev/null +++ b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_inc.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_inc(local TYPE *p) { \ + return atom_add(p, 1); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_sub.cl b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_sub.cl new 
file mode 100644 index 0000000..c96696a --- /dev/null +++ b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_sub.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_sub(local TYPE *p, TYPE val) { \ + return atomic_sub(p, val); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_xchg.cl b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_xchg.cl new file mode 100644 index 0000000..7d4bcca --- /dev/null +++ b/libclc/generic/lib/cl_khr_local_int32_base_atomics/atom_xchg.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_xchg(local TYPE *p, TYPE val) { \ + return atomic_xchg(p, val); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_and.cl b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_and.cl new file mode 100644 index 0000000..180103a --- /dev/null +++ b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_and.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_and(local TYPE *p, TYPE val) { \ + return atomic_and(p, val); \ +} + +IMPL(int) +IMPL(unsigned int)
\ No newline at end of file diff --git a/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_max.cl b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_max.cl new file mode 100644 index 0000000..b90301b --- /dev/null +++ b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_max.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_max(local TYPE *p, TYPE val) { \ + return atomic_max(p, val); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_min.cl b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_min.cl new file mode 100644 index 0000000..3acedd8 --- /dev/null +++ b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_min.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_min(local TYPE *p, TYPE val) { \ + return atomic_min(p, val); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_or.cl b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_or.cl new file mode 100644 index 0000000..338ff2c --- /dev/null +++ b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_or.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_or(local TYPE *p, TYPE val) { \ + return atomic_or(p, val); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_xor.cl b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_xor.cl new file mode 100644 index 0000000..51ae3c0 --- /dev/null +++ b/libclc/generic/lib/cl_khr_local_int32_extended_atomics/atom_xor.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +#define IMPL(TYPE) \ +_CLC_OVERLOAD _CLC_DEF TYPE atom_xor(local TYPE *p, TYPE val) { \ + return atomic_xor(p, val); \ +} + +IMPL(int) +IMPL(unsigned int) diff --git a/libclc/generic/lib/clcmacro.h 
b/libclc/generic/lib/clcmacro.h new file mode 100644 index 0000000..88f3b2a --- /dev/null +++ b/libclc/generic/lib/clcmacro.h @@ -0,0 +1,148 @@ +#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \ + return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y)); \ + } \ +\ + DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x) { \ + return (RET_TYPE##3)(FUNCTION(x.x), FUNCTION(x.y), FUNCTION(x.z)); \ + } \ +\ + DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x) { \ + return (RET_TYPE##4)(FUNCTION(x.lo), FUNCTION(x.hi)); \ + } \ +\ + DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x) { \ + return (RET_TYPE##8)(FUNCTION(x.lo), FUNCTION(x.hi)); \ + } \ +\ + DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x) { \ + return (RET_TYPE##16)(FUNCTION(x.lo), FUNCTION(x.hi)); \ + } + +#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) { \ + return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y)); \ + } \ +\ + DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y) { \ + return (RET_TYPE##3)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y), \ + FUNCTION(x.z, y.z)); \ + } \ +\ + DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y) { \ + return (RET_TYPE##4)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \ + } \ +\ + DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y) { \ + return (RET_TYPE##8)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \ + } \ +\ + DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y) { \ + return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \ + } + +#define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) { \ + return (RET_TYPE##2)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \ + } \ +\ + DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE x, ARG2_TYPE##3 y) { \ + return (RET_TYPE##3)(FUNCTION(x, y.x), FUNCTION(x, 
y.y), \ + FUNCTION(x, y.z)); \ + } \ +\ + DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE x, ARG2_TYPE##4 y) { \ + return (RET_TYPE##4)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \ + } \ +\ + DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE x, ARG2_TYPE##8 y) { \ + return (RET_TYPE##8)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \ + } \ +\ + DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE x, ARG2_TYPE##16 y) { \ + return (RET_TYPE##16)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \ + } \ +\ + +#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE, ARG3_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, ARG3_TYPE##2 z) { \ + return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y)); \ + } \ +\ + DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, ARG3_TYPE##3 z) { \ + return (RET_TYPE##3)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y), \ + FUNCTION(x.z, y.z, z.z)); \ + } \ +\ + DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y, ARG3_TYPE##4 z) { \ + return (RET_TYPE##4)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \ + } \ +\ + DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y, ARG3_TYPE##8 z) { \ + return (RET_TYPE##8)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \ + } \ +\ + DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y, ARG3_TYPE##16 z) { \ + return (RET_TYPE##16)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \ + } + +#define _CLC_V_S_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE, ARG3_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##2 z) { \ + return (RET_TYPE##2)(FUNCTION(x, y, z.lo), FUNCTION(x, y, z.hi)); \ + } \ +\ + DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##3 z) { \ + return (RET_TYPE##3)(FUNCTION(x, y, z.x), FUNCTION(x, y, z.y), \ + FUNCTION(x, y, z.z)); \ + } \ +\ + DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##4 z) { \ + return (RET_TYPE##4)(FUNCTION(x, y, 
z.lo), FUNCTION(x, y, z.hi)); \ + } \ +\ + DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##8 z) { \ + return (RET_TYPE##8)(FUNCTION(x, y, z.lo), FUNCTION(x, y, z.hi)); \ + } \ +\ + DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##16 z) { \ + return (RET_TYPE##16)(FUNCTION(x, y, z.lo), FUNCTION(x, y, z.hi)); \ + } \ +\ + +#define _CLC_V_V_VP_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 *y) { \ + return (RET_TYPE##2)(FUNCTION(x.x, (ARG2_TYPE*)y), FUNCTION(x.y, (ARG2_TYPE*)y+1)); \ + } \ +\ + DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 *y) { \ + return (RET_TYPE##3)(FUNCTION(x.x, (ARG2_TYPE*)y), FUNCTION(x.y, (ARG2_TYPE*)y+1), \ + FUNCTION(x.z, (ARG2_TYPE*)y+2)); \ + } \ +\ + DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 *y) { \ + return (RET_TYPE##4)(FUNCTION(x.lo, (ARG2_TYPE##2*)y), FUNCTION(x.hi, (ARG2_TYPE##2*)((ARG2_TYPE*)y+2))); \ + } \ +\ + DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 *y) { \ + return (RET_TYPE##8)(FUNCTION(x.lo, (ARG2_TYPE##4*)y), FUNCTION(x.hi, (ARG2_TYPE##4*)((ARG2_TYPE*)y+4))); \ + } \ +\ + DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 *y) { \ + return (RET_TYPE##16)(FUNCTION(x.lo, (ARG2_TYPE##8*)y), FUNCTION(x.hi, (ARG2_TYPE##8*)((ARG2_TYPE*)y+8))); \ + } + +#define _CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ + return BUILTIN(x, y); \ +} \ +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) + +#define _CLC_DEFINE_BINARY_BUILTIN_WITH_SCALAR_SECOND_ARG(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \ +_CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \ +_CLC_BINARY_VECTORIZE_SCALAR_SECOND_ARG(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) + +#define 
_CLC_DEFINE_UNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x) { \ + return BUILTIN(x); \ +} \ +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE) diff --git a/libclc/generic/lib/common/degrees.cl b/libclc/generic/lib/common/degrees.cl new file mode 100644 index 0000000..5de56f8 --- /dev/null +++ b/libclc/generic/lib/common/degrees.cl @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include <clc/clc.h> + +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float degrees(float radians) { + // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F + return 0x1.ca5dc2p+5F * radians; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, degrees, float); + + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double degrees(double radians) { + // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F + return 0x1.ca5dc1a63c1f8p+5 * radians; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, degrees, double); + +#endif diff --git a/libclc/generic/lib/common/mix.cl b/libclc/generic/lib/common/mix.cl new file mode 100644 index 0000000..294f332e --- /dev/null +++ b/libclc/generic/lib/common/mix.cl @@ -0,0 +1,8 @@ +#include <clc/clc.h> + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY <mix.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/common/mix.inc b/libclc/generic/lib/common/mix.inc new file mode 100644 index 0000000..1e8b936 --- /dev/null +++ b/libclc/generic/lib/common/mix.inc @@ -0,0 +1,9 @@ +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE a) { + return mad( y - x, a, x ); +} + +#ifndef __CLC_SCALAR +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_SCALAR_GENTYPE a) { + return mix(x, y, (__CLC_GENTYPE)a); +} +#endif diff --git a/libclc/generic/lib/common/radians.cl b/libclc/generic/lib/common/radians.cl new file mode 100644 index 0000000..3838dd6 --- /dev/null +++ b/libclc/generic/lib/common/radians.cl @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include <clc/clc.h> + +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float radians(float degrees) { + // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F + return 0x1.1df46ap-6F * degrees; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, radians, float); + + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double radians(double degrees) { + // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F + return 0x1.1df46a2529d39p-6 * degrees; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, radians, double); + +#endif diff --git a/libclc/generic/lib/common/sign.cl b/libclc/generic/lib/common/sign.cl new file mode 100644 index 0000000..25832e0 --- /dev/null +++ b/libclc/generic/lib/common/sign.cl @@ -0,0 +1,28 @@ +#include <clc/clc.h> +#include "../clcmacro.h" + +#define SIGN(TYPE, F) \ +_CLC_DEF _CLC_OVERLOAD TYPE sign(TYPE x) { \ + if (isnan(x)) { \ + return 0.0F; \ + } \ + if (x > 0.0F) { \ + return 1.0F; \ + } \ + if (x < 0.0F) { \ + return -1.0F; \ + } \ + return x; /* -0.0 or +0.0 */ \ +} + +SIGN(float, f) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, sign, float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +SIGN(double, ) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sign, double) + +#endif diff --git a/libclc/generic/lib/common/smoothstep.cl b/libclc/generic/lib/common/smoothstep.cl new file mode 100644 index 0000000..68d1a13 --- /dev/null +++ b/libclc/generic/lib/common/smoothstep.cl @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include <clc/clc.h> + +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float smoothstep(float edge0, float edge1, float x) { + float t = clamp((x - edge0) / (edge1 - edge0), 0.0f, 1.0f); + return t * t * (3.0f - 2.0f * t); +} + +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, smoothstep, float, float, float); + +_CLC_V_S_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, smoothstep, float, float, float); + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +#define SMOOTH_STEP_DEF(edge_type, x_type, impl) \ + _CLC_OVERLOAD _CLC_DEF x_type smoothstep(edge_type edge0, edge_type edge1, x_type x) { \ + double t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0); \ + return t * t * (3.0 - 2.0 * t); \ + } + +SMOOTH_STEP_DEF(double, double, SMOOTH_STEP_IMPL_D); + +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, smoothstep, double, double, double); + +SMOOTH_STEP_DEF(float, double, SMOOTH_STEP_IMPL_D); +SMOOTH_STEP_DEF(double, float, SMOOTH_STEP_IMPL_D); + +_CLC_V_S_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, smoothstep, float, float, double); +_CLC_V_S_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, smoothstep, double, double, float); + +#endif diff --git a/libclc/generic/lib/common/step.cl b/libclc/generic/lib/common/step.cl new file mode 100644 index 0000000..4b022f1 --- /dev/null +++ b/libclc/generic/lib/common/step.cl @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float step(float edge, float x) { + return x < edge ? 0.0f : 1.0f; +} + +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, step, float, float); + +_CLC_V_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, step, float, float); + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +#define STEP_DEF(edge_type, x_type) \ + _CLC_OVERLOAD _CLC_DEF x_type step(edge_type edge, x_type x) { \ + return x < edge ? 
0.0 : 1.0; \ + } + +STEP_DEF(double, double); + +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, step, double, double); +_CLC_V_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, step, double, double); + +STEP_DEF(float, double); +STEP_DEF(double, float); + +_CLC_V_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, step, float, double); +_CLC_V_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, step, double, float); + +#endif diff --git a/libclc/generic/lib/gen_convert.py b/libclc/generic/lib/gen_convert.py new file mode 100644 index 0000000..f91a89a --- /dev/null +++ b/libclc/generic/lib/gen_convert.py @@ -0,0 +1,388 @@ +#!/usr/bin/env python3 + +# OpenCL built-in library: type conversion functions +# +# Copyright (c) 2013 Victor Oliveira <victormatheus@gmail.com> +# Copyright (c) 2013 Jesse Towner <jessetowner@lavabit.com> +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
# This script generates the file convert_type.cl, which contains all of the
# OpenCL functions in the form:
#
# convert_<destTypen><_sat><_roundingMode>(<sourceTypen>)
#
# NOTE(review): the comment above and the banner printed below both refer to
# convert_type.py / convert_type.cl -- confirm those names are still current.

# Scalar type names the generators iterate over, and the subsets they test
# membership against.
types = ['char', 'uchar', 'short', 'ushort', 'int', 'uint', 'long', 'ulong',
         'float', 'double']
int_types = ['char', 'uchar', 'short', 'ushort', 'int', 'uint', 'long', 'ulong']
unsigned_types = ['uchar', 'ushort', 'uint', 'ulong']
float_types = ['float', 'double']
int64_types = ['long', 'ulong']
float64_types = ['double']

# Vector-width suffixes appended to a type name; '' is the scalar form.
vector_sizes = ['', '2', '3', '4', '8', '16']
# (width, half width) suffix pairs.
half_sizes = [('2', ''), ('4', '2'), ('8', '4'), ('16', '8')]

# Function-name suffixes for the saturation and rounding-mode variants.
saturation = ['', '_sat']
rounding_modes = ['_rtz', '_rte', '_rtp', '_rtn']

float_prefix = {'float': 'FLT_', 'double': 'DBL_'}
float_suffix = {'float': 'f', 'double': ''}

# Signed integer type with the same sizeof_type entry as each key.
bool_type = {
    'char': 'char',
    'uchar': 'char',
    'short': 'short',
    'ushort': 'short',
    'int': 'int',
    'uint': 'int',
    'long': 'long',
    'ulong': 'long',
    'float': 'int',
    'double': 'long',
}

# Unsigned counterpart of each integer type.
unsigned_type = {
    'char': 'uchar',
    'uchar': 'uchar',
    'short': 'ushort',
    'ushort': 'ushort',
    'int': 'uint',
    'uint': 'uint',
    'long': 'ulong',
    'ulong': 'ulong',
}

# Size of each scalar type in bytes.
sizeof_type = {
    'char': 1, 'uchar': 1,
    'short': 2, 'ushort': 2,
    'int': 4, 'uint': 4,
    'long': 8, 'ulong': 8,
    'float': 4, 'double': 8,
}

# Spelling of the largest / smallest value of each integer type, as it is
# pasted into the generated source.
limit_max = {
    'char': 'CHAR_MAX',
    'uchar': 'UCHAR_MAX',
    'short': 'SHRT_MAX',
    'ushort': 'USHRT_MAX',
    'int': 'INT_MAX',
    'uint': 'UINT_MAX',
    'long': 'LONG_MAX',
    'ulong': 'ULONG_MAX',
}

limit_min = {
    'char': 'CHAR_MIN',
    'uchar': '0',
    'short': 'SHRT_MIN',
    'ushort': '0',
    'int': 'INT_MIN',
    'uint': '0',
    'long': 'LONG_MIN',
    'ulong': '0',
}


def conditional_guard(src, dst):
    """Print the preprocessor guard a src -> dst conversion needs, if any.

    A guard is printed when either type is listed in int64_types or
    float64_types.  Returns True when a guard line was printed (the caller
    then has to print the matching "#endif"), False otherwise.
    """
    operands = (src, dst)
    uses_int64 = any(t in int64_types for t in operands)
    uses_fp64 = any(t in float64_types for t in operands)

    if uses_fp64 and uses_int64:
        print("#if defined(cl_khr_fp64) && defined(cles_khr_int64)")
    elif uses_fp64:
        print("#ifdef cl_khr_fp64")
    elif uses_int64:
        print("#ifdef cles_khr_int64")
    else:
        return False
    return True


# Banner, common include and fp64 pragma placed at the top of the output.
print("""/* !!!! AUTOGENERATED FILE generated by convert_type.py !!!!!

   DON'T CHANGE THIS FILE. MAKE YOUR CHANGES TO convert_type.py AND RUN:
   $ ./generate-conversion-type-cl.sh

   OpenCL type conversion functions

   Copyright (c) 2013 Victor Oliveira <victormatheus@gmail.com>
   Copyright (c) 2013 Jesse Towner <jessetowner@lavabit.com>

   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
   in the Software without restriction, including without limitation the rights
   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   copies of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be included in
   all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
   THE SOFTWARE.
*/

#include <clc/clc.h>

#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif

""")

#
# Default Conversions
#
# All conversions are in accordance with the OpenCL specification,
# which cites the C99 conversion rules.
#
# Casting from floating point to integer results in conversions
# with truncation, so it should be suitable for the default convert
# functions.
#
# Conversions from integer to floating-point, and floating-point to
# floating-point through casting is done with the default rounding
# mode. While C99 allows dynamically changing the rounding mode
# during runtime, it is not a supported feature in OpenCL according
# to Section 7.1 - Rounding Modes in the OpenCL 1.2 specification.
#
# Therefore, we can assume for optimization purposes that the
# rounding mode is fixed to round-to-nearest-even. Platform target
# authors should ensure that the rounding-control registers remain
# in this state, and that this invariant holds.
#
# Also note, even though the OpenCL specification isn't entirely
# clear on this matter, we implement all rounding mode combinations
# even for integer-to-integer conversions. When such a conversion
# is used, the rounding mode is ignored.
#

def generate_default_conversion(src, dst, mode):
    """Print the cast-based convert_<dst><n><mode>(<src><n>) overloads.

    Emits the scalar overload, the 2/4/8/16-wide overloads (each built from
    two half-width conversions of x.lo and x.hi) and the 3-wide overload,
    wrapped in whatever guard conditional_guard() prints for the type pair.
    The half-width calls carry no mode suffix.
    """
    close_conditional = conditional_guard(src, dst)

    # scalar conversions
    print("""_CLC_DEF _CLC_OVERLOAD
{DST} convert_{DST}{M}({SRC} x)
{{
  return ({DST})x;
}}
""".format(SRC=src, DST=dst, M=mode))

    # vector conversions, done through decomposition to components
    for size, half_size in half_sizes:
        print("""_CLC_DEF _CLC_OVERLOAD
{DST}{N} convert_{DST}{N}{M}({SRC}{N} x)
{{
  return ({DST}{N})(convert_{DST}{H}(x.lo), convert_{DST}{H}(x.hi));
}}
""".format(SRC=src, DST=dst, N=size, H=half_size, M=mode))

    # 3-component vector conversions
    print("""_CLC_DEF _CLC_OVERLOAD
{DST}3 convert_{DST}3{M}({SRC}3 x)
{{
  return ({DST}3)(convert_{DST}2(x.s01), convert_{DST}(x.s2));
}}""".format(SRC=src, DST=dst, M=mode))

    if close_conditional:
        print("#endif")


for src in types:
    for dst in types:
        generate_default_conversion(src, dst, '')

for src in int_types:
    for dst in int_types:
        for mode in rounding_modes:
            generate_default_conversion(src, dst, mode)

#
# Saturated Conversions To Integers
#
# These functions are dependent on the unsaturated conversion functions
# generated above, and use clamp, max, min, and select to eliminate
# branching and vectorize the conversions.
#
# Again, as above, we allow all rounding modes for integer-to-integer
# conversions with saturation.
#

def generate_saturated_conversion(src, dst, size):
    """Print convert_<dst><size>_sat(<src><size>) for an integer dst.

    src -- source scalar type name (integer or floating point)
    dst -- destination integer type name
    size -- vector-width suffix ('' for scalar)

    The emitted body limits x to the destination range and then calls the
    unsaturated convert_<dst><size>().
    """
    # Header
    close_conditional = conditional_guard(src, dst)
    print("""_CLC_DEF _CLC_OVERLOAD
{DST}{N} convert_{DST}{N}_sat({SRC}{N} x)
{{""".format(DST=dst, SRC=src, N=size))

    # FIXME: This is a work around for lack of select function with
    # signed third argument when the first two arguments are unsigned types.
    # We cast to the signed type for sign-extension, then do a bitcast to
    # the unsigned type.
    if dst in unsigned_types:
        bool_prefix = "as_{DST}{N}(convert_{BOOL}{N}".format(
            DST=dst, BOOL=bool_type[dst], N=size)
        bool_suffix = ")"
    else:
        bool_prefix = "convert_{BOOL}{N}".format(BOOL=bool_type[dst], N=size)
        bool_suffix = ""

    # Body
    if src == dst:

        # Conversion between same types
        print("  return x;")

    elif src in float_types:

        # Conversion from float to int.
        #
        # The upper bound is tested with ">=" rather than ">": when DST_MAX
        # is not exactly representable in SRC (e.g. (float)INT_MAX == 2^31)
        # the cast rounds up, and an x equal to that rounded value is already
        # out of range for the plain conversion, so it must saturate too.
        # The lower bounds (0 or a negated power of two) are exact, so "<"
        # is sufficient there.
        print("""  {DST}{N} y = convert_{DST}{N}(x);
  y = select(y, ({DST}{N}){DST_MIN}, {BP}(x < ({SRC}{N}){DST_MIN}){BS});
  y = select(y, ({DST}{N}){DST_MAX}, {BP}(x >= ({SRC}{N}){DST_MAX}){BS});
  return y;""".format(SRC=src, DST=dst, N=size,
                      DST_MIN=limit_min[dst], DST_MAX=limit_max[dst],
                      BP=bool_prefix, BS=bool_suffix))

    else:

        # Integer to integer conversion with sizeof(src) == sizeof(dst)
        if sizeof_type[src] == sizeof_type[dst]:
            if src in unsigned_types:
                print("  x = min(x, ({SRC}){DST_MAX});".format(SRC=src, DST_MAX=limit_max[dst]))
            else:
                print("  x = max(x, ({SRC})0);".format(SRC=src))

        # Integer to integer conversion where sizeof(src) > sizeof(dst)
        elif sizeof_type[src] > sizeof_type[dst]:
            if src in unsigned_types:
                print("  x = min(x, ({SRC}){DST_MAX});".format(SRC=src, DST_MAX=limit_max[dst]))
            else:
                print("  x = clamp(x, ({SRC}){DST_MIN}, ({SRC}){DST_MAX});"
                      .format(SRC=src, DST_MIN=limit_min[dst], DST_MAX=limit_max[dst]))

        # Integer to integer conversion where sizeof(src) < sizeof(dst)
        elif src not in unsigned_types and dst in unsigned_types:
            print("  x = max(x, ({SRC})0);".format(SRC=src))

        print("  return convert_{DST}{N}(x);".format(DST=dst, N=size))

    # Footer
    print("}")
    if close_conditional:
        print("#endif")


for src in types:
    for dst in int_types:
        for size in vector_sizes:
            generate_saturated_conversion(src, dst, size)


def generate_saturated_conversion_with_rounding(src, dst, size, mode):
    """Print convert_<dst><size>_sat<mode>(<src><size>).

    The emitted function simply forwards to convert_<dst><size>_sat(); the
    rounding-mode suffix only affects the function name.
    """
    # Header
    close_conditional = conditional_guard(src, dst)

    # Body
    print("""_CLC_DEF _CLC_OVERLOAD
{DST}{N} convert_{DST}{N}_sat{M}({SRC}{N} x)
{{
  return convert_{DST}{N}_sat(x);
}}
""".format(DST=dst, SRC=src, N=size, M=mode))

    # Footer
    if close_conditional:
        print("#endif")


for src in int_types:
    for dst in int_types:
        for size in vector_sizes:
            for mode in rounding_modes:
                generate_saturated_conversion_with_rounding(src, dst, size, mode)

#
# Conversions To/From Floating-Point With Rounding
#
# Note that we assume as above that casts from floating-point to
# integer are done with truncation, and that the default rounding
# mode is fixed to round-to-nearest-even, as per C99 and OpenCL
# rounding rules.
#
# These functions rely on the use of abs, ceil, fabs, floor,
# nextafter, sign, rint and the above generated conversion functions.
#
# Only conversions to integers can have saturation.
#

def generate_float_conversion(src, dst, size, mode, sat):
    """Print convert_<dst><size><sat><mode>(<src><size>) with explicit rounding.

    src, dst -- scalar type names; at least one is a floating-point type in
                every call made below
    size -- vector-width suffix ('' for scalar)
    mode -- one of rounding_modes
    sat -- '' or '_sat' (only passed as '_sat' for integer destinations)

    Integer destination: x is first rounded with rint/ceil/floor (nothing
    for _rtz) and then handed to convert_<dst><size><sat>().

    Floating-point destination: _rte is the plain conversion.  For the other
    modes the emitted code converts x to r, converts r back to the source
    type as y, and compares y with x; when the comparison shows r was rounded
    in the wrong direction for the mode, r is moved one step with nextafter().
    """
    # Header
    close_conditional = conditional_guard(src, dst)
    print("""_CLC_DEF _CLC_OVERLOAD
{DST}{N} convert_{DST}{N}{S}{M}({SRC}{N} x)
{{""".format(SRC=src, DST=dst, N=size, M=mode, S=sat))

    # Perform conversion
    if dst in int_types:
        if mode == '_rte':
            print("  x = rint(x);")
        elif mode == '_rtp':
            print("  x = ceil(x);")
        elif mode == '_rtn':
            print("  x = floor(x);")
        print("  return convert_{DST}{N}{S}(x);".format(DST=dst, N=size, S=sat))
    elif mode == '_rte':
        print("  return convert_{DST}{N}(x);".format(DST=dst, N=size))
    else:
        print("  {DST}{N} r = convert_{DST}{N}(x);".format(DST=dst, N=size))
        # y is r converted back to the source type.  The generator used to
        # emit convert_<src>(y) here, which initialised y from itself.
        print("  {SRC}{N} y = convert_{SRC}{N}(r);".format(SRC=src, N=size))
        if mode == '_rtz':
            if src in int_types:
                print("  {USRC}{N} abs_x = abs(x);".format(USRC=unsigned_type[src], N=size))
                print("  {USRC}{N} abs_y = abs(y);".format(USRC=unsigned_type[src], N=size))
            else:
                print("  {SRC}{N} abs_x = fabs(x);".format(SRC=src, N=size))
                print("  {SRC}{N} abs_y = fabs(y);".format(SRC=src, N=size))
            print("  return select(r, nextafter(r, sign(r) * ({DST}{N})-INFINITY), convert_{BOOL}{N}(abs_y > abs_x));"
                  .format(DST=dst, N=size, BOOL=bool_type[dst]))
        elif mode == '_rtp':
            print("  return select(r, nextafter(r, ({DST}{N})INFINITY), convert_{BOOL}{N}(y < x));"
                  .format(DST=dst, N=size, BOOL=bool_type[dst]))
        elif mode == '_rtn':
            print("  return select(r, nextafter(r, ({DST}{N})-INFINITY), convert_{BOOL}{N}(y > x));"
                  .format(DST=dst, N=size, BOOL=bool_type[dst]))

    # Footer
    print("}")
    if close_conditional:
        print("#endif")


for src in float_types:
    for dst in int_types:
        for size in vector_sizes:
            for mode in rounding_modes:
                for sat in saturation:
                    generate_float_conversion(src, dst, size, mode, sat)


for src in types:
    for dst in float_types:
        for size in vector_sizes:
            for mode in rounding_modes:
                generate_float_conversion(src, dst, size, mode, '')
--git a/libclc/generic/lib/geometric/cross.cl b/libclc/generic/lib/geometric/cross.cl new file mode 100644 index 0000000..3b4ca6c --- /dev/null +++ b/libclc/generic/lib/geometric/cross.cl @@ -0,0 +1,25 @@ +#include <clc/clc.h> + +_CLC_OVERLOAD _CLC_DEF float3 cross(float3 p0, float3 p1) { + return (float3)(p0.y*p1.z - p0.z*p1.y, p0.z*p1.x - p0.x*p1.z, + p0.x*p1.y - p0.y*p1.x); +} + +_CLC_OVERLOAD _CLC_DEF float4 cross(float4 p0, float4 p1) { + return (float4)(p0.y*p1.z - p0.z*p1.y, p0.z*p1.x - p0.x*p1.z, + p0.x*p1.y - p0.y*p1.x, 0.f); +} + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double3 cross(double3 p0, double3 p1) { + return (double3)(p0.y*p1.z - p0.z*p1.y, p0.z*p1.x - p0.x*p1.z, + p0.x*p1.y - p0.y*p1.x); +} + +_CLC_OVERLOAD _CLC_DEF double4 cross(double4 p0, double4 p1) { + return (double4)(p0.y*p1.z - p0.z*p1.y, p0.z*p1.x - p0.x*p1.z, + p0.x*p1.y - p0.y*p1.x, 0.f); +} +#endif diff --git a/libclc/generic/lib/geometric/distance.cl b/libclc/generic/lib/geometric/distance.cl new file mode 100644 index 0000000..4a5b8e25 --- /dev/null +++ b/libclc/generic/lib/geometric/distance.cl @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY <distance.inc> +#include <clc/geometric/floatn.inc> diff --git a/libclc/generic/lib/geometric/distance.inc b/libclc/generic/lib/geometric/distance.inc new file mode 100644 index 0000000..193665e --- /dev/null +++ b/libclc/generic/lib/geometric/distance.inc @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +_CLC_OVERLOAD _CLC_DEF __CLC_FLOAT distance(__CLC_FLOATN p0, __CLC_FLOATN p1) { + return length(p0 - p1); +} diff --git a/libclc/generic/lib/geometric/dot.cl b/libclc/generic/lib/geometric/dot.cl new file mode 100644 index 0000000..0d6fe6c --- /dev/null +++ b/libclc/generic/lib/geometric/dot.cl @@ -0,0 +1,39 @@ +#include <clc/clc.h> + +_CLC_OVERLOAD _CLC_DEF float dot(float p0, float p1) { + return p0*p1; +} + +_CLC_OVERLOAD _CLC_DEF float dot(float2 p0, float2 p1) { + return p0.x*p1.x + p0.y*p1.y; +} + +_CLC_OVERLOAD _CLC_DEF float dot(float3 p0, float3 p1) { + return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z; +} + +_CLC_OVERLOAD _CLC_DEF float dot(float4 p0, float4 p1) { + return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z + p0.w*p1.w; +} + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double dot(double p0, double p1) { + return p0*p1; +} + +_CLC_OVERLOAD _CLC_DEF double dot(double2 p0, double2 p1) { + return p0.x*p1.x + p0.y*p1.y; +} + +_CLC_OVERLOAD _CLC_DEF double dot(double3 p0, double3 p1) { + return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z; +} + +_CLC_OVERLOAD _CLC_DEF double dot(double4 p0, double4 p1) { + return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z + p0.w*p1.w; +} + +#endif diff --git a/libclc/generic/lib/geometric/fast_distance.cl b/libclc/generic/lib/geometric/fast_distance.cl new file mode 100644 index 0000000..0a4f82c --- /dev/null +++ b/libclc/generic/lib/geometric/fast_distance.cl @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#define __CLC_BODY <fast_distance.inc> +#define __FLOAT_ONLY +#include <clc/geometric/floatn.inc> +#undef __FLOAT_ONLY diff --git a/libclc/generic/lib/geometric/fast_distance.inc b/libclc/generic/lib/geometric/fast_distance.inc new file mode 100644 index 0000000..d8fe3e0 --- /dev/null +++ b/libclc/generic/lib/geometric/fast_distance.inc @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +_CLC_OVERLOAD _CLC_DEF __CLC_FLOAT fast_distance(__CLC_FLOATN p0, __CLC_FLOATN p1) { + return fast_length(p0 - p1); +} diff --git a/libclc/generic/lib/geometric/fast_length.cl b/libclc/generic/lib/geometric/fast_length.cl new file mode 100644 index 0000000..8f6ffc6 --- /dev/null +++ b/libclc/generic/lib/geometric/fast_length.cl @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +_CLC_OVERLOAD _CLC_DEF float fast_length(float p) { + return fabs(p); +} + +_CLC_OVERLOAD _CLC_DEF float fast_length(float2 p) { + return half_sqrt(dot(p, p)); +} + +_CLC_OVERLOAD _CLC_DEF float fast_length(float3 p) { + return half_sqrt(dot(p, p)); +} + +_CLC_OVERLOAD _CLC_DEF float fast_length(float4 p) { + return half_sqrt(dot(p, p)); +} diff --git a/libclc/generic/lib/geometric/fast_normalize.cl b/libclc/generic/lib/geometric/fast_normalize.cl new file mode 100644 index 0000000..af5f994 --- /dev/null +++ b/libclc/generic/lib/geometric/fast_normalize.cl @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +_CLC_OVERLOAD _CLC_DEF float fast_normalize(float p) { + return normalize(p); +} + +#define __CLC_BODY <fast_normalize.inc> +#define __FLOAT_ONLY +#include <clc/geometric/floatn.inc> +#undef __FLOAT_ONLY diff --git a/libclc/generic/lib/geometric/fast_normalize.inc b/libclc/generic/lib/geometric/fast_normalize.inc new file mode 100644 index 0000000..c1be2b8 --- /dev/null +++ b/libclc/generic/lib/geometric/fast_normalize.inc @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef __CLC_SCALAR + +// Only handle vector implementations +_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN fast_normalize(__CLC_FLOATN p) { + __CLC_FLOAT l2 = dot(p, p); + return l2 == 0.0f ? p : p * half_rsqrt(l2); +} + +#endif diff --git a/libclc/generic/lib/geometric/length.cl b/libclc/generic/lib/geometric/length.cl new file mode 100644 index 0000000..e7f31b4 --- /dev/null +++ b/libclc/generic/lib/geometric/length.cl @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include <clc/clc.h> + +_CLC_OVERLOAD _CLC_DEF float length(float p) { + return fabs(p); +} + +#define V_FLENGTH(p) \ + float l2 = dot(p, p); \ + \ + if (l2 < FLT_MIN) { \ + p *= 0x1.0p+86F; \ + return sqrt(dot(p, p)) * 0x1.0p-86F; \ + } else if (l2 == INFINITY) { \ + p *= 0x1.0p-65F; \ + return sqrt(dot(p, p)) * 0x1.0p+65F; \ + } \ + \ + return sqrt(l2); + +_CLC_OVERLOAD _CLC_DEF float length(float2 p) { + V_FLENGTH(p); +} + +_CLC_OVERLOAD _CLC_DEF float length(float3 p) { + V_FLENGTH(p); +} + +_CLC_OVERLOAD _CLC_DEF float length(float4 p) { + V_FLENGTH(p); +} + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double length(double p){ + return fabs(p); +} + +#define V_DLENGTH(p) \ + double l2 = dot(p, p); \ + \ + if (l2 < DBL_MIN) { \ + p *= 0x1.0p+563; \ + return sqrt(dot(p, p)) * 0x1.0p-563; \ + } else if (l2 == INFINITY) { \ + p *= 0x1.0p-513; \ + return sqrt(dot(p, p)) * 0x1.0p+513; \ + } \ + \ + return sqrt(l2); + +_CLC_OVERLOAD _CLC_DEF double length(double2 p) { + V_DLENGTH(p); +} + +_CLC_OVERLOAD _CLC_DEF double length(double3 p) { + V_DLENGTH(p); +} + +_CLC_OVERLOAD _CLC_DEF double +length(double4 p) { + V_DLENGTH(p); +} + +#endif diff --git a/libclc/generic/lib/geometric/normalize.cl b/libclc/generic/lib/geometric/normalize.cl new file mode 100644 index 0000000..f61ac94 --- /dev/null +++ b/libclc/generic/lib/geometric/normalize.cl @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include <clc/clc.h> + +_CLC_OVERLOAD _CLC_DEF float normalize(float p) { + return sign(p); +} + +_CLC_OVERLOAD _CLC_DEF float2 normalize(float2 p) { + if (all(p == (float2)0.0F)) + return p; + + float l2 = dot(p, p); + + if (l2 < FLT_MIN) { + p *= 0x1.0p+86F; + l2 = dot(p, p); + } else if (l2 == INFINITY) { + p *= 0x1.0p-65f; + l2 = dot(p, p); + if (l2 == INFINITY) { + p = copysign(select((float2)0.0F, (float2)1.0F, isinf(p)), p); + l2 = dot(p, p); + } + } + return p * rsqrt(l2); +} + +_CLC_OVERLOAD _CLC_DEF float3 normalize(float3 p) { + if (all(p == (float3)0.0F)) + return p; + + float l2 = dot(p, p); + + if (l2 < FLT_MIN) { + p *= 0x1.0p+86F; + l2 = dot(p, p); + } else if (l2 == INFINITY) { + p *= 0x1.0p-66f; + l2 = dot(p, p); + if (l2 == INFINITY) { + p = copysign(select((float3)0.0F, (float3)1.0F, isinf(p)), p); + l2 = dot(p, p); + } + } + return p * rsqrt(l2); +} + +_CLC_OVERLOAD _CLC_DEF float4 normalize(float4 p) { + if (all(p == (float4)0.0F)) + return p; + + float l2 = dot(p, p); + + if (l2 < FLT_MIN) { + p *= 0x1.0p+86F; + l2 = dot(p, p); + } else if (l2 == INFINITY) { + p *= 0x1.0p-66f; + l2 = dot(p, p); + if (l2 == INFINITY) { + p = copysign(select((float4)0.0F, (float4)1.0F, isinf(p)), p); + l2 = dot(p, p); + } + } + return p * rsqrt(l2); +} + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double normalize(double p) { + return sign(p); +} + +_CLC_OVERLOAD _CLC_DEF double2 normalize(double2 p) { + if (all(p == (double2)0.0)) + return p; + + double l2 = dot(p, p); + + if (l2 < DBL_MIN) { + p *= 0x1.0p+563; + l2 = dot(p, p); + } else if (l2 == INFINITY) { + p *= 0x1.0p-513; + l2 = dot(p, p); + if (l2 == INFINITY) { + p = copysign(select((double2)0.0, (double2)1.0, isinf(p)), p); + l2 = dot(p, p); + } + } + return p * rsqrt(l2); +} + +_CLC_OVERLOAD _CLC_DEF double3 normalize(double3 p) { + if (all(p == (double3)0.0)) + return p; + + double l2 = dot(p, p); + + if (l2 < DBL_MIN) { + p *= 0x1.0p+563; + 
l2 = dot(p, p); + } else if (l2 == INFINITY) { + p *= 0x1.0p-514; + l2 = dot(p, p); + if (l2 == INFINITY) { + p = copysign(select((double3)0.0, (double3)1.0, isinf(p)), p); + l2 = dot(p, p); + } + } + return p * rsqrt(l2); +} + +_CLC_OVERLOAD _CLC_DEF double4 normalize(double4 p) { + if (all(p == (double4)0.0)) + return p; + + double l2 = dot(p, p); + + if (l2 < DBL_MIN) { + p *= 0x1.0p+563; + l2 = dot(p, p); + } else if (l2 == INFINITY) { + p *= 0x1.0p-514; + l2 = dot(p, p); + if (l2 == INFINITY) { + p = copysign(select((double4)0.0, (double4)1.0, isinf(p)), p); + l2 = dot(p, p); + } + } + return p * rsqrt(l2); +} + +#endif diff --git a/libclc/generic/lib/image/get_image_dim.cl b/libclc/generic/lib/image/get_image_dim.cl new file mode 100644 index 0000000..26dbd00 --- /dev/null +++ b/libclc/generic/lib/image/get_image_dim.cl @@ -0,0 +1,9 @@ +#include <clc/clc.h> + +_CLC_OVERLOAD _CLC_DEF int2 get_image_dim (image2d_t image) { + return (int2)(get_image_width(image), get_image_height(image)); +} +_CLC_OVERLOAD _CLC_DEF int4 get_image_dim (image3d_t image) { + return (int4)(get_image_width(image), get_image_height(image), + get_image_depth(image), 0); +} diff --git a/libclc/generic/lib/integer/abs.cl b/libclc/generic/lib/integer/abs.cl new file mode 100644 index 0000000..faff8d0 --- /dev/null +++ b/libclc/generic/lib/integer/abs.cl @@ -0,0 +1,4 @@ +#include <clc/clc.h> + +#define __CLC_BODY <abs.inc> +#include <clc/integer/gentype.inc> diff --git a/libclc/generic/lib/integer/abs.inc b/libclc/generic/lib/integer/abs.inc new file mode 100644 index 0000000..cfe7bfe --- /dev/null +++ b/libclc/generic/lib/integer/abs.inc @@ -0,0 +1,3 @@ +_CLC_OVERLOAD _CLC_DEF __CLC_U_GENTYPE abs(__CLC_GENTYPE x) { + return __builtin_astype((__CLC_GENTYPE)(x > (__CLC_GENTYPE)(0) ? 
x : -x), __CLC_U_GENTYPE); +} diff --git a/libclc/generic/lib/integer/abs_diff.cl b/libclc/generic/lib/integer/abs_diff.cl new file mode 100644 index 0000000..3d75105 --- /dev/null +++ b/libclc/generic/lib/integer/abs_diff.cl @@ -0,0 +1,4 @@ +#include <clc/clc.h> + +#define __CLC_BODY <abs_diff.inc> +#include <clc/integer/gentype.inc> diff --git a/libclc/generic/lib/integer/abs_diff.inc b/libclc/generic/lib/integer/abs_diff.inc new file mode 100644 index 0000000..f39c3ff --- /dev/null +++ b/libclc/generic/lib/integer/abs_diff.inc @@ -0,0 +1,7 @@ +// Subtract in the unsigned domain: the signed difference x-y can overflow +// (e.g. INT_MAX - INT_MIN), while unsigned arithmetic wraps to the exact result. +_CLC_OVERLOAD _CLC_DEF __CLC_U_GENTYPE abs_diff(__CLC_GENTYPE x, __CLC_GENTYPE y) { + __CLC_U_GENTYPE ux = __builtin_astype(x, __CLC_U_GENTYPE); + __CLC_U_GENTYPE uy = __builtin_astype(y, __CLC_U_GENTYPE); + return x > y ? ux - uy : uy - ux; +} diff --git a/libclc/generic/lib/integer/add_sat.cl b/libclc/generic/lib/integer/add_sat.cl new file mode 100644 index 0000000..d4df66d --- /dev/null +++ b/libclc/generic/lib/integer/add_sat.cl @@ -0,0 +1,53 @@ +#include <clc/clc.h> +#include "../clcmacro.h" + +// From add_sat.ll +_CLC_DECL char __clc_add_sat_s8(char, char); +_CLC_DECL uchar __clc_add_sat_u8(uchar, uchar); +_CLC_DECL short __clc_add_sat_s16(short, short); +_CLC_DECL ushort __clc_add_sat_u16(ushort, ushort); +_CLC_DECL int __clc_add_sat_s32(int, int); +_CLC_DECL uint __clc_add_sat_u32(uint, uint); +_CLC_DECL long __clc_add_sat_s64(long, long); +_CLC_DECL ulong __clc_add_sat_u64(ulong, ulong); + +_CLC_OVERLOAD _CLC_DEF char add_sat(char x, char y) { + return __clc_add_sat_s8(x, y); +} + +_CLC_OVERLOAD _CLC_DEF uchar add_sat(uchar x, uchar y) { + return __clc_add_sat_u8(x, y); +} + +_CLC_OVERLOAD _CLC_DEF short add_sat(short x, short y) { + return __clc_add_sat_s16(x, y); +} + +_CLC_OVERLOAD _CLC_DEF ushort add_sat(ushort x, ushort y) { + return __clc_add_sat_u16(x, y); +} + +_CLC_OVERLOAD _CLC_DEF int add_sat(int x, int y) { + return __clc_add_sat_s32(x, y); +} + +_CLC_OVERLOAD _CLC_DEF uint add_sat(uint x, uint y) { + return __clc_add_sat_u32(x, y); +} + +_CLC_OVERLOAD _CLC_DEF long 
add_sat(long x, long y) { + return __clc_add_sat_s64(x, y); +} + +_CLC_OVERLOAD _CLC_DEF ulong add_sat(ulong x, ulong y) { + return __clc_add_sat_u64(x, y); +} + +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, char, add_sat, char, char) +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uchar, add_sat, uchar, uchar) +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, short, add_sat, short, short) +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, add_sat, ushort, ushort) +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, add_sat, int, int) +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, add_sat, uint, uint) +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, long, add_sat, long, long) +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ulong, add_sat, ulong, ulong) diff --git a/libclc/generic/lib/integer/add_sat_if.ll b/libclc/generic/lib/integer/add_sat_if.ll new file mode 100644 index 0000000..bcbe4c0 --- /dev/null +++ b/libclc/generic/lib/integer/add_sat_if.ll @@ -0,0 +1,55 @@ +declare i8 @__clc_add_sat_impl_s8(i8 %x, i8 %y) + +define i8 @__clc_add_sat_s8(i8 %x, i8 %y) nounwind readnone alwaysinline { + %call = call i8 @__clc_add_sat_impl_s8(i8 %x, i8 %y) + ret i8 %call +} + +declare i8 @__clc_add_sat_impl_u8(i8 %x, i8 %y) + +define i8 @__clc_add_sat_u8(i8 %x, i8 %y) nounwind readnone alwaysinline { + %call = call i8 @__clc_add_sat_impl_u8(i8 %x, i8 %y) + ret i8 %call +} + +declare i16 @__clc_add_sat_impl_s16(i16 %x, i16 %y) + +define i16 @__clc_add_sat_s16(i16 %x, i16 %y) nounwind readnone alwaysinline { + %call = call i16 @__clc_add_sat_impl_s16(i16 %x, i16 %y) + ret i16 %call +} + +declare i16 @__clc_add_sat_impl_u16(i16 %x, i16 %y) + +define i16 @__clc_add_sat_u16(i16 %x, i16 %y) nounwind readnone alwaysinline { + %call = call i16 @__clc_add_sat_impl_u16(i16 %x, i16 %y) + ret i16 %call +} + +declare i32 @__clc_add_sat_impl_s32(i32 %x, i32 %y) + +define i32 @__clc_add_sat_s32(i32 %x, i32 %y) nounwind readnone alwaysinline { + %call = call i32 
@__clc_add_sat_impl_s32(i32 %x, i32 %y) + ret i32 %call +} + +declare i32 @__clc_add_sat_impl_u32(i32 %x, i32 %y) + +define i32 @__clc_add_sat_u32(i32 %x, i32 %y) nounwind readnone alwaysinline { + %call = call i32 @__clc_add_sat_impl_u32(i32 %x, i32 %y) + ret i32 %call +} + +declare i64 @__clc_add_sat_impl_s64(i64 %x, i64 %y) + +define i64 @__clc_add_sat_s64(i64 %x, i64 %y) nounwind readnone alwaysinline { + %call = call i64 @__clc_add_sat_impl_s64(i64 %x, i64 %y) + ret i64 %call +} + +declare i64 @__clc_add_sat_impl_u64(i64 %x, i64 %y) + +define i64 @__clc_add_sat_u64(i64 %x, i64 %y) nounwind readnone alwaysinline { + %call = call i64 @__clc_add_sat_impl_u64(i64 %x, i64 %y) + ret i64 %call +} diff --git a/libclc/generic/lib/integer/add_sat_impl.ll b/libclc/generic/lib/integer/add_sat_impl.ll new file mode 100644 index 0000000..c150ecb --- /dev/null +++ b/libclc/generic/lib/integer/add_sat_impl.ll @@ -0,0 +1,83 @@ +declare {i8, i1} @llvm.sadd.with.overflow.i8(i8, i8) +declare {i8, i1} @llvm.uadd.with.overflow.i8(i8, i8) + +define i8 @__clc_add_sat_impl_s8(i8 %x, i8 %y) nounwind readnone alwaysinline { + %call = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %x, i8 %y) + %res = extractvalue {i8, i1} %call, 0 + %over = extractvalue {i8, i1} %call, 1 + %x.msb = ashr i8 %x, 7 + %x.limit = xor i8 %x.msb, 127 + %sat = select i1 %over, i8 %x.limit, i8 %res + ret i8 %sat +} + +define i8 @__clc_add_sat_impl_u8(i8 %x, i8 %y) nounwind readnone alwaysinline { + %call = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %x, i8 %y) + %res = extractvalue {i8, i1} %call, 0 + %over = extractvalue {i8, i1} %call, 1 + %sat = select i1 %over, i8 -1, i8 %res + ret i8 %sat +} + +declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) +declare {i16, i1} @llvm.uadd.with.overflow.i16(i16, i16) + +define i16 @__clc_add_sat_impl_s16(i16 %x, i16 %y) nounwind readnone alwaysinline { + %call = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %x, i16 %y) + %res = extractvalue {i16, i1} %call, 0 + %over 
= extractvalue {i16, i1} %call, 1 + %x.msb = ashr i16 %x, 15 + %x.limit = xor i16 %x.msb, 32767 + %sat = select i1 %over, i16 %x.limit, i16 %res + ret i16 %sat +} + +define i16 @__clc_add_sat_impl_u16(i16 %x, i16 %y) nounwind readnone alwaysinline { + %call = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %x, i16 %y) + %res = extractvalue {i16, i1} %call, 0 + %over = extractvalue {i16, i1} %call, 1 + %sat = select i1 %over, i16 -1, i16 %res + ret i16 %sat +} + +declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) +declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) + +define i32 @__clc_add_sat_impl_s32(i32 %x, i32 %y) nounwind readnone alwaysinline { + %call = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %x, i32 %y) + %res = extractvalue {i32, i1} %call, 0 + %over = extractvalue {i32, i1} %call, 1 + %x.msb = ashr i32 %x, 31 + %x.limit = xor i32 %x.msb, 2147483647 + %sat = select i1 %over, i32 %x.limit, i32 %res + ret i32 %sat +} + +define i32 @__clc_add_sat_impl_u32(i32 %x, i32 %y) nounwind readnone alwaysinline { + %call = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y) + %res = extractvalue {i32, i1} %call, 0 + %over = extractvalue {i32, i1} %call, 1 + %sat = select i1 %over, i32 -1, i32 %res + ret i32 %sat +} + +declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) +declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) + +define i64 @__clc_add_sat_impl_s64(i64 %x, i64 %y) nounwind readnone alwaysinline { + %call = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %x, i64 %y) + %res = extractvalue {i64, i1} %call, 0 + %over = extractvalue {i64, i1} %call, 1 + %x.msb = ashr i64 %x, 63 + %x.limit = xor i64 %x.msb, 9223372036854775807 + %sat = select i1 %over, i64 %x.limit, i64 %res + ret i64 %sat +} + +define i64 @__clc_add_sat_impl_u64(i64 %x, i64 %y) nounwind readnone alwaysinline { + %call = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %x, i64 %y) + %res = extractvalue {i64, i1} %call, 0 + %over = extractvalue {i64, i1} %call, 1 + 
%sat = select i1 %over, i64 -1, i64 %res + ret i64 %sat +} diff --git a/libclc/generic/lib/integer/clz.cl b/libclc/generic/lib/integer/clz.cl new file mode 100644 index 0000000..17e3fe0 --- /dev/null +++ b/libclc/generic/lib/integer/clz.cl @@ -0,0 +1,53 @@ +#include <clc/clc.h> +#include "../clcmacro.h" + +// From clz.ll +_CLC_DECL char __clc_clz_s8(char); +_CLC_DECL uchar __clc_clz_u8(uchar); +_CLC_DECL short __clc_clz_s16(short); +_CLC_DECL ushort __clc_clz_u16(ushort); +_CLC_DECL int __clc_clz_s32(int); +_CLC_DECL uint __clc_clz_u32(uint); +_CLC_DECL long __clc_clz_s64(long); +_CLC_DECL ulong __clc_clz_u64(ulong); + +_CLC_OVERLOAD _CLC_DEF char clz(char x) { + return __clc_clz_s8(x); +} + +_CLC_OVERLOAD _CLC_DEF uchar clz(uchar x) { + return __clc_clz_u8(x); +} + +_CLC_OVERLOAD _CLC_DEF short clz(short x) { + return __clc_clz_s16(x); +} + +_CLC_OVERLOAD _CLC_DEF ushort clz(ushort x) { + return __clc_clz_u16(x); +} + +_CLC_OVERLOAD _CLC_DEF int clz(int x) { + return __clc_clz_s32(x); +} + +_CLC_OVERLOAD _CLC_DEF uint clz(uint x) { + return __clc_clz_u32(x); +} + +_CLC_OVERLOAD _CLC_DEF long clz(long x) { + return __clc_clz_s64(x); +} + +_CLC_OVERLOAD _CLC_DEF ulong clz(ulong x) { + return __clc_clz_u64(x); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, char, clz, char) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uchar, clz, uchar) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, short, clz, short) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, clz, ushort) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, clz, int) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, clz, uint) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, long, clz, long) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ulong, clz, ulong) diff --git a/libclc/generic/lib/integer/clz_if.ll b/libclc/generic/lib/integer/clz_if.ll new file mode 100644 index 0000000..23dfc74 --- /dev/null +++ b/libclc/generic/lib/integer/clz_if.ll @@ -0,0 +1,55 @@ +declare i8 @__clc_clz_impl_s8(i8 %x) + +define 
i8 @__clc_clz_s8(i8 %x) nounwind readnone alwaysinline { + %call = call i8 @__clc_clz_impl_s8(i8 %x) + ret i8 %call +} + +declare i8 @__clc_clz_impl_u8(i8 %x) + +define i8 @__clc_clz_u8(i8 %x) nounwind readnone alwaysinline { + %call = call i8 @__clc_clz_impl_u8(i8 %x) + ret i8 %call +} + +declare i16 @__clc_clz_impl_s16(i16 %x) + +define i16 @__clc_clz_s16(i16 %x) nounwind readnone alwaysinline { + %call = call i16 @__clc_clz_impl_s16(i16 %x) + ret i16 %call +} + +declare i16 @__clc_clz_impl_u16(i16 %x) + +define i16 @__clc_clz_u16(i16 %x) nounwind readnone alwaysinline { + %call = call i16 @__clc_clz_impl_u16(i16 %x) + ret i16 %call +} + +declare i32 @__clc_clz_impl_s32(i32 %x) + +define i32 @__clc_clz_s32(i32 %x) nounwind readnone alwaysinline { + %call = call i32 @__clc_clz_impl_s32(i32 %x) + ret i32 %call +} + +declare i32 @__clc_clz_impl_u32(i32 %x) + +define i32 @__clc_clz_u32(i32 %x) nounwind readnone alwaysinline { + %call = call i32 @__clc_clz_impl_u32(i32 %x) + ret i32 %call +} + +declare i64 @__clc_clz_impl_s64(i64 %x) + +define i64 @__clc_clz_s64(i64 %x) nounwind readnone alwaysinline { + %call = call i64 @__clc_clz_impl_s64(i64 %x) + ret i64 %call +} + +declare i64 @__clc_clz_impl_u64(i64 %x) + +define i64 @__clc_clz_u64(i64 %x) nounwind readnone alwaysinline { + %call = call i64 @__clc_clz_impl_u64(i64 %x) + ret i64 %call +} diff --git a/libclc/generic/lib/integer/clz_impl.ll b/libclc/generic/lib/integer/clz_impl.ll new file mode 100644 index 0000000..b5c3d98 --- /dev/null +++ b/libclc/generic/lib/integer/clz_impl.ll @@ -0,0 +1,44 @@ +declare i8 @llvm.ctlz.i8(i8, i1) +declare i16 @llvm.ctlz.i16(i16, i1) +declare i32 @llvm.ctlz.i32(i32, i1) +declare i64 @llvm.ctlz.i64(i64, i1) + +define i8 @__clc_clz_impl_s8(i8 %x) nounwind readnone alwaysinline { + %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0) + ret i8 %call +} + +define i8 @__clc_clz_impl_u8(i8 %x) nounwind readnone alwaysinline { + %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0) + ret i8 %call +} + +define 
i16 @__clc_clz_impl_s16(i16 %x) nounwind readnone alwaysinline { + %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0) + ret i16 %call +} + +define i16 @__clc_clz_impl_u16(i16 %x) nounwind readnone alwaysinline { + %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0) + ret i16 %call +} + +define i32 @__clc_clz_impl_s32(i32 %x) nounwind readnone alwaysinline { + %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0) + ret i32 %call +} + +define i32 @__clc_clz_impl_u32(i32 %x) nounwind readnone alwaysinline { + %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0) + ret i32 %call +} + +define i64 @__clc_clz_impl_s64(i64 %x) nounwind readnone alwaysinline { + %call = call i64 @llvm.ctlz.i64(i64 %x, i1 0) + ret i64 %call +} + +define i64 @__clc_clz_impl_u64(i64 %x) nounwind readnone alwaysinline { + %call = call i64 @llvm.ctlz.i64(i64 %x, i1 0) + ret i64 %call +} diff --git a/libclc/generic/lib/integer/hadd.cl b/libclc/generic/lib/integer/hadd.cl new file mode 100644 index 0000000..749026e --- /dev/null +++ b/libclc/generic/lib/integer/hadd.cl @@ -0,0 +1,4 @@ +#include <clc/clc.h> + +#define __CLC_BODY <hadd.inc> +#include <clc/integer/gentype.inc> diff --git a/libclc/generic/lib/integer/hadd.inc b/libclc/generic/lib/integer/hadd.inc new file mode 100644 index 0000000..ea59d9b --- /dev/null +++ b/libclc/generic/lib/integer/hadd.inc @@ -0,0 +1,6 @@ +//hadd = (x+y)>>1 +//This can be simplified to x>>1 + y>>1 + (1 if both x and y have the 1s bit set) +//This saves us having to do any checks for overflow in the addition sum +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE hadd(__CLC_GENTYPE x, __CLC_GENTYPE y) { + return (x>>(__CLC_GENTYPE)1)+(y>>(__CLC_GENTYPE)1)+(x&y&(__CLC_GENTYPE)1); +} diff --git a/libclc/generic/lib/integer/mad24.cl b/libclc/generic/lib/integer/mad24.cl new file mode 100644 index 0000000..e29e99f --- /dev/null +++ b/libclc/generic/lib/integer/mad24.cl @@ -0,0 +1,4 @@ +#include <clc/clc.h> + +#define __CLC_BODY <mad24.inc> +#include <clc/integer/integer-gentype.inc> diff --git 
a/libclc/generic/lib/integer/mad24.inc b/libclc/generic/lib/integer/mad24.inc new file mode 100644 index 0000000..902b0aa --- /dev/null +++ b/libclc/generic/lib/integer/mad24.inc @@ -0,0 +1,3 @@ +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mad24(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z){ + return mul24(x, y) + z; +} diff --git a/libclc/generic/lib/integer/mad_sat.cl b/libclc/generic/lib/integer/mad_sat.cl new file mode 100644 index 0000000..1708b29 --- /dev/null +++ b/libclc/generic/lib/integer/mad_sat.cl @@ -0,0 +1,72 @@ +#include <clc/clc.h> +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF char mad_sat(char x, char y, char z) { + return clamp((short)mad24((short)x, (short)y, (short)z), (short)CHAR_MIN, (short) CHAR_MAX); +} + +_CLC_OVERLOAD _CLC_DEF uchar mad_sat(uchar x, uchar y, uchar z) { + return clamp((ushort)mad24((ushort)x, (ushort)y, (ushort)z), (ushort)0, (ushort) UCHAR_MAX); +} + +_CLC_OVERLOAD _CLC_DEF short mad_sat(short x, short y, short z) { + return clamp((int)mad24((int)x, (int)y, (int)z), (int)SHRT_MIN, (int) SHRT_MAX); +} + +_CLC_OVERLOAD _CLC_DEF ushort mad_sat(ushort x, ushort y, ushort z) { + return clamp((uint)mad24((uint)x, (uint)y, (uint)z), (uint)0, (uint) USHRT_MAX); +} + +_CLC_OVERLOAD _CLC_DEF int mad_sat(int x, int y, int z) { + int mhi = mul_hi(x, y); + uint mlo = x * y; + long m = upsample(mhi, mlo); + m += z; + if (m > INT_MAX) + return INT_MAX; + if (m < INT_MIN) + return INT_MIN; + return m; +} + +_CLC_OVERLOAD _CLC_DEF uint mad_sat(uint x, uint y, uint z) { + if (mul_hi(x, y) != 0) + return UINT_MAX; + return add_sat(x * y, z); +} + +_CLC_OVERLOAD _CLC_DEF long mad_sat(long x, long y, long z) { + long hi = mul_hi(x, y); + ulong ulo = x * y; + long slo = x * y; + /* Big overflow of more than 2 bits, add can't fix this */ + if (((x < 0) == (y < 0)) && hi != 0) + return LONG_MAX; + /* Low overflow in mul and z not neg enough to correct it */ + if (hi == 0 && ulo >= LONG_MAX && (z > 0 || (ulo + z) > LONG_MAX)) + return 
LONG_MAX; + /* Big overflow of more than 2 bits, add can't fix this */ + if (((x < 0) != (y < 0)) && hi != -1) + return LONG_MIN; + /* Low overflow in mul and z not pos enough to correct it */ + if (hi == -1 && ulo <= ((ulong)LONG_MAX + 1UL) && (z < 0 || z < (LONG_MAX - ulo))) + return LONG_MIN; + /* We have checked all conditions, any overflow in addition returns + * the correct value */ + return ulo + z; +} + +_CLC_OVERLOAD _CLC_DEF ulong mad_sat(ulong x, ulong y, ulong z) { + if (mul_hi(x, y) != 0) + return ULONG_MAX; + return add_sat(x * y, z); +} + +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, char, mad_sat, char, char, char) +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uchar, mad_sat, uchar, uchar, uchar) +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, short, mad_sat, short, short, short) +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, mad_sat, ushort, ushort, ushort) +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, mad_sat, int, int, int) +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, mad_sat, uint, uint, uint) +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, long, mad_sat, long, long, long) +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ulong, mad_sat, ulong, ulong, ulong) diff --git a/libclc/generic/lib/integer/mul24.cl b/libclc/generic/lib/integer/mul24.cl new file mode 100644 index 0000000..8aedca6 --- /dev/null +++ b/libclc/generic/lib/integer/mul24.cl @@ -0,0 +1,4 @@ +#include <clc/clc.h> + +#define __CLC_BODY <mul24.inc> +#include <clc/integer/integer-gentype.inc> diff --git a/libclc/generic/lib/integer/mul24.inc b/libclc/generic/lib/integer/mul24.inc new file mode 100644 index 0000000..95a2f1d --- /dev/null +++ b/libclc/generic/lib/integer/mul24.inc @@ -0,0 +1,11 @@ + +// We need to use shifts here in order to maintain the sign bit for signed +// integers. The compiler should optimize this to (x & 0x00FFFFFF) for +// unsigned integers. 
+#define CONVERT_TO_24BIT(x) (((x) << 8) >> 8) + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mul24(__CLC_GENTYPE x, __CLC_GENTYPE y){ + return CONVERT_TO_24BIT(x) * CONVERT_TO_24BIT(y); +} + +#undef CONVERT_TO_24BIT diff --git a/libclc/generic/lib/integer/mul_hi.cl b/libclc/generic/lib/integer/mul_hi.cl new file mode 100644 index 0000000..174d893 --- /dev/null +++ b/libclc/generic/lib/integer/mul_hi.cl @@ -0,0 +1,109 @@ +#include <clc/clc.h> + +//For all types EXCEPT long, which is implemented separately +#define __CLC_MUL_HI_IMPL(BGENTYPE, GENTYPE, GENSIZE) \ + _CLC_OVERLOAD _CLC_DEF GENTYPE mul_hi(GENTYPE x, GENTYPE y){ \ + return (GENTYPE)(((BGENTYPE)x * (BGENTYPE)y) >> GENSIZE); \ + } \ + +//FOIL-based long mul_hi +// +// Summary: Treat mul_hi(long x, long y) as: +// (a+b) * (c+d) where a and c are the high-order parts of x and y respectively +// and b and d are the low-order parts of x and y. +// Thinking back to algebra, we use FOIL to do the work. + +_CLC_OVERLOAD _CLC_DEF long mul_hi(long x, long y){ + long f, o, i; + ulong l; + + //Move the high/low halves of x/y into the lower 32-bits of variables so + //that we can multiply them without worrying about overflow. + long x_hi = x >> 32; + long x_lo = x & UINT_MAX; + long y_hi = y >> 32; + long y_lo = y & UINT_MAX; + + //Multiply all of the components according to FOIL method + f = x_hi * y_hi; + o = x_hi * y_lo; + i = x_lo * y_hi; + l = x_lo * y_lo; + + //Now add the components back together in the following steps: + //F: doesn't need to be modified + //O/I: Need to be added together. + //L: Shift right by 32-bits, then add into the sum of O and I + //Once O/I/L are summed up, then shift the sum by 32-bits and add to F. 
+ // + //We use hadd to give us a bit of extra precision for the intermediate sums + //but as a result, we shift by 31 bits instead of 32 + return (long)(f + (hadd(o, (i + (long)((ulong)l>>32))) >> 31)); +} + +_CLC_OVERLOAD _CLC_DEF ulong mul_hi(ulong x, ulong y){ + ulong f, o, i; + ulong l; + + //Move the high/low halves of x/y into the lower 32-bits of variables so + //that we can multiply them without worrying about overflow. + ulong x_hi = x >> 32; + ulong x_lo = x & UINT_MAX; + ulong y_hi = y >> 32; + ulong y_lo = y & UINT_MAX; + + //Multiply all of the components according to FOIL method + f = x_hi * y_hi; + o = x_hi * y_lo; + i = x_lo * y_hi; + l = x_lo * y_lo; + + //Now add the components back together, taking care to respect the fact that: + //F: doesn't need to be modified + //O/I: Need to be added together. + //L: Shift right by 32-bits, then add into the sum of O and I + //Once O/I/L are summed up, then shift the sum by 32-bits and add to F. + // + //We use hadd to give us a bit of extra precision for the intermediate sums + //but as a result, we shift by 31 bits instead of 32 + return (f + (hadd(o, (i + (l>>32))) >> 31)); +} + +#define __CLC_MUL_HI_VEC(GENTYPE) \ + _CLC_OVERLOAD _CLC_DEF GENTYPE##2 mul_hi(GENTYPE##2 x, GENTYPE##2 y){ \ + return (GENTYPE##2){mul_hi(x.s0, y.s0), mul_hi(x.s1, y.s1)}; \ + } \ + _CLC_OVERLOAD _CLC_DEF GENTYPE##3 mul_hi(GENTYPE##3 x, GENTYPE##3 y){ \ + return (GENTYPE##3){mul_hi(x.s0, y.s0), mul_hi(x.s1, y.s1), mul_hi(x.s2, y.s2)}; \ + } \ + _CLC_OVERLOAD _CLC_DEF GENTYPE##4 mul_hi(GENTYPE##4 x, GENTYPE##4 y){ \ + return (GENTYPE##4){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \ + } \ + _CLC_OVERLOAD _CLC_DEF GENTYPE##8 mul_hi(GENTYPE##8 x, GENTYPE##8 y){ \ + return (GENTYPE##8){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \ + } \ + _CLC_OVERLOAD _CLC_DEF GENTYPE##16 mul_hi(GENTYPE##16 x, GENTYPE##16 y){ \ + return (GENTYPE##16){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \ + } \ + +#define __CLC_MUL_HI_DEC_IMPL(BTYPE, TYPE, BITS) \ + 
__CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS) \ + __CLC_MUL_HI_VEC(TYPE) + +#define __CLC_MUL_HI_TYPES() \ + __CLC_MUL_HI_DEC_IMPL(short, char, 8) \ + __CLC_MUL_HI_DEC_IMPL(ushort, uchar, 8) \ + __CLC_MUL_HI_DEC_IMPL(int, short, 16) \ + __CLC_MUL_HI_DEC_IMPL(uint, ushort, 16) \ + __CLC_MUL_HI_DEC_IMPL(long, int, 32) \ + __CLC_MUL_HI_DEC_IMPL(ulong, uint, 32) \ + __CLC_MUL_HI_VEC(long) \ + __CLC_MUL_HI_VEC(ulong) + +__CLC_MUL_HI_TYPES() + +#undef __CLC_MUL_HI_TYPES +#undef __CLC_MUL_HI_DEC_IMPL +#undef __CLC_MUL_HI_IMPL +#undef __CLC_MUL_HI_VEC +#undef __CLC_B32 diff --git a/libclc/generic/lib/integer/rhadd.cl b/libclc/generic/lib/integer/rhadd.cl new file mode 100644 index 0000000..c985870 --- /dev/null +++ b/libclc/generic/lib/integer/rhadd.cl @@ -0,0 +1,4 @@ +#include <clc/clc.h> + +#define __CLC_BODY <rhadd.inc> +#include <clc/integer/gentype.inc> diff --git a/libclc/generic/lib/integer/rhadd.inc b/libclc/generic/lib/integer/rhadd.inc new file mode 100644 index 0000000..3d60768 --- /dev/null +++ b/libclc/generic/lib/integer/rhadd.inc @@ -0,0 +1,6 @@ +//rhadd = (x+y+1)>>1 +//This can be simplified to x>>1 + y>>1 + (1 if either x or y have the 1s bit set) +//This saves us having to do any checks for overflow in the addition sums +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE rhadd(__CLC_GENTYPE x, __CLC_GENTYPE y) { + return (x>>(__CLC_GENTYPE)1)+(y>>(__CLC_GENTYPE)1)+((x&(__CLC_GENTYPE)1)|(y&(__CLC_GENTYPE)1)); +} diff --git a/libclc/generic/lib/integer/rotate.cl b/libclc/generic/lib/integer/rotate.cl new file mode 100644 index 0000000..27ce515 --- /dev/null +++ b/libclc/generic/lib/integer/rotate.cl @@ -0,0 +1,4 @@ +#include <clc/clc.h> + +#define __CLC_BODY <rotate.inc> +#include <clc/integer/gentype.inc> diff --git a/libclc/generic/lib/integer/rotate.inc b/libclc/generic/lib/integer/rotate.inc new file mode 100644 index 0000000..33bb0a8 --- /dev/null +++ b/libclc/generic/lib/integer/rotate.inc @@ -0,0 +1,42 @@ +/** + * Not necessarily optimal... 
but it produces correct results (at least for int) + * If we're lucky, LLVM will recognize the pattern and produce rotate + * instructions: + * http://llvm.1065342.n5.nabble.com/rotate-td47679.html + * + * Eventually, someone should feel free to implement an llvm-specific version + */ + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE rotate(__CLC_GENTYPE x, __CLC_GENTYPE n){ + //Try to avoid extra work if someone's spinning the value through multiple + //full rotations + n = n % (__CLC_GENTYPE)__CLC_GENSIZE; + +#ifdef __CLC_SCALAR + if (n > 0){ + return (x << n) | (((__CLC_U_GENTYPE)x) >> (__CLC_GENSIZE - n)); + } else if (n == 0){ + return x; + } else { + return ( (((__CLC_U_GENTYPE)x) >> -n) | (x << (__CLC_GENSIZE + n)) ); + } +#else + //XXX: There's a lot of __builtin_astype calls to cast everything to + // unsigned ... This should be improved so that if __CLC_GENTYPE==__CLC_U_GENTYPE, no + // casts are required. + + __CLC_U_GENTYPE x_1 = __builtin_astype(x, __CLC_U_GENTYPE); + + //XXX: Is (__CLC_U_GENTYPE >> S__CLC_GENTYPE) | (__CLC_U_GENTYPE << S__CLC_GENTYPE) legal? + // If so, then combine the amt and shifts into a single set of statements + + __CLC_U_GENTYPE amt; + amt = (n < (__CLC_GENTYPE)0 ? __builtin_astype((__CLC_GENTYPE)0-n, __CLC_U_GENTYPE) : (__CLC_U_GENTYPE)0); + x_1 = (x_1 >> amt) | (x_1 << ((__CLC_U_GENTYPE)__CLC_GENSIZE - amt)); + + amt = (n < (__CLC_GENTYPE)0 ? 
(__CLC_U_GENTYPE)0 : __builtin_astype(n, __CLC_U_GENTYPE)); + x_1 = (x_1 << amt) | (x_1 >> ((__CLC_U_GENTYPE)__CLC_GENSIZE - amt)); + + return __builtin_astype(x_1, __CLC_GENTYPE); +#endif +} diff --git a/libclc/generic/lib/integer/sub_sat.cl b/libclc/generic/lib/integer/sub_sat.cl new file mode 100644 index 0000000..6b42cc8 --- /dev/null +++ b/libclc/generic/lib/integer/sub_sat.cl @@ -0,0 +1,53 @@ +#include <clc/clc.h> +#include "../clcmacro.h" + +// From sub_sat_if.ll (thin wrappers around the implementations in sub_sat_impl.ll) +_CLC_DECL char __clc_sub_sat_s8(char, char); +_CLC_DECL uchar __clc_sub_sat_u8(uchar, uchar); +_CLC_DECL short __clc_sub_sat_s16(short, short); +_CLC_DECL ushort __clc_sub_sat_u16(ushort, ushort); +_CLC_DECL int __clc_sub_sat_s32(int, int); +_CLC_DECL uint __clc_sub_sat_u32(uint, uint); +_CLC_DECL long __clc_sub_sat_s64(long, long); +_CLC_DECL ulong __clc_sub_sat_u64(ulong, ulong); + +_CLC_OVERLOAD _CLC_DEF char sub_sat(char x, char y) { + return __clc_sub_sat_s8(x, y); +} + +_CLC_OVERLOAD _CLC_DEF uchar sub_sat(uchar x, uchar y) { + return __clc_sub_sat_u8(x, y); +} + +_CLC_OVERLOAD _CLC_DEF short sub_sat(short x, short y) { + return __clc_sub_sat_s16(x, y); +} + +_CLC_OVERLOAD _CLC_DEF ushort sub_sat(ushort x, ushort y) { + return __clc_sub_sat_u16(x, y); +} + +_CLC_OVERLOAD _CLC_DEF int sub_sat(int x, int y) { + return __clc_sub_sat_s32(x, y); +} + +_CLC_OVERLOAD _CLC_DEF uint sub_sat(uint x, uint y) { + return __clc_sub_sat_u32(x, y); +} + +_CLC_OVERLOAD _CLC_DEF long sub_sat(long x, long y) { + return __clc_sub_sat_s64(x, y); +} + +_CLC_OVERLOAD _CLC_DEF ulong sub_sat(ulong x, ulong y) { + return __clc_sub_sat_u64(x, y); +} + +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, char, sub_sat, char, char) +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uchar, sub_sat, uchar, uchar) +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, short, sub_sat, short, short) +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, sub_sat, ushort, ushort) +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, sub_sat, int, 
int) +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, sub_sat, uint, uint) +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, long, sub_sat, long, long) +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ulong, sub_sat, ulong, ulong) diff --git a/libclc/generic/lib/integer/sub_sat_if.ll b/libclc/generic/lib/integer/sub_sat_if.ll new file mode 100644 index 0000000..7252574 --- /dev/null +++ b/libclc/generic/lib/integer/sub_sat_if.ll @@ -0,0 +1,55 @@ +declare i8 @__clc_sub_sat_impl_s8(i8 %x, i8 %y) + +define i8 @__clc_sub_sat_s8(i8 %x, i8 %y) nounwind readnone alwaysinline { + %call = call i8 @__clc_sub_sat_impl_s8(i8 %x, i8 %y) + ret i8 %call +} + +declare i8 @__clc_sub_sat_impl_u8(i8 %x, i8 %y) + +define i8 @__clc_sub_sat_u8(i8 %x, i8 %y) nounwind readnone alwaysinline { + %call = call i8 @__clc_sub_sat_impl_u8(i8 %x, i8 %y) + ret i8 %call +} + +declare i16 @__clc_sub_sat_impl_s16(i16 %x, i16 %y) + +define i16 @__clc_sub_sat_s16(i16 %x, i16 %y) nounwind readnone alwaysinline { + %call = call i16 @__clc_sub_sat_impl_s16(i16 %x, i16 %y) + ret i16 %call +} + +declare i16 @__clc_sub_sat_impl_u16(i16 %x, i16 %y) + +define i16 @__clc_sub_sat_u16(i16 %x, i16 %y) nounwind readnone alwaysinline { + %call = call i16 @__clc_sub_sat_impl_u16(i16 %x, i16 %y) + ret i16 %call +} + +declare i32 @__clc_sub_sat_impl_s32(i32 %x, i32 %y) + +define i32 @__clc_sub_sat_s32(i32 %x, i32 %y) nounwind readnone alwaysinline { + %call = call i32 @__clc_sub_sat_impl_s32(i32 %x, i32 %y) + ret i32 %call +} + +declare i32 @__clc_sub_sat_impl_u32(i32 %x, i32 %y) + +define i32 @__clc_sub_sat_u32(i32 %x, i32 %y) nounwind readnone alwaysinline { + %call = call i32 @__clc_sub_sat_impl_u32(i32 %x, i32 %y) + ret i32 %call +} + +declare i64 @__clc_sub_sat_impl_s64(i64 %x, i64 %y) + +define i64 @__clc_sub_sat_s64(i64 %x, i64 %y) nounwind readnone alwaysinline { + %call = call i64 @__clc_sub_sat_impl_s64(i64 %x, i64 %y) + ret i64 %call +} + +declare i64 @__clc_sub_sat_impl_u64(i64 %x, i64 %y) + +define i64 
@__clc_sub_sat_u64(i64 %x, i64 %y) nounwind readnone alwaysinline { + %call = call i64 @__clc_sub_sat_impl_u64(i64 %x, i64 %y) + ret i64 %call +} diff --git a/libclc/generic/lib/integer/sub_sat_impl.ll b/libclc/generic/lib/integer/sub_sat_impl.ll new file mode 100644 index 0000000..e82b632 --- /dev/null +++ b/libclc/generic/lib/integer/sub_sat_impl.ll @@ -0,0 +1,83 @@ +declare {i8, i1} @llvm.ssub.with.overflow.i8(i8, i8) +declare {i8, i1} @llvm.usub.with.overflow.i8(i8, i8) + +define i8 @__clc_sub_sat_impl_s8(i8 %x, i8 %y) nounwind readnone alwaysinline { + %call = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %x, i8 %y) + %res = extractvalue {i8, i1} %call, 0 + %over = extractvalue {i8, i1} %call, 1 + %x.msb = ashr i8 %x, 7 + %x.limit = xor i8 %x.msb, 127 + %sat = select i1 %over, i8 %x.limit, i8 %res + ret i8 %sat +} + +define i8 @__clc_sub_sat_impl_u8(i8 %x, i8 %y) nounwind readnone alwaysinline { + %call = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %x, i8 %y) + %res = extractvalue {i8, i1} %call, 0 + %over = extractvalue {i8, i1} %call, 1 + %sat = select i1 %over, i8 0, i8 %res + ret i8 %sat +} + +declare {i16, i1} @llvm.ssub.with.overflow.i16(i16, i16) +declare {i16, i1} @llvm.usub.with.overflow.i16(i16, i16) + +define i16 @__clc_sub_sat_impl_s16(i16 %x, i16 %y) nounwind readnone alwaysinline { + %call = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %x, i16 %y) + %res = extractvalue {i16, i1} %call, 0 + %over = extractvalue {i16, i1} %call, 1 + %x.msb = ashr i16 %x, 15 + %x.limit = xor i16 %x.msb, 32767 + %sat = select i1 %over, i16 %x.limit, i16 %res + ret i16 %sat +} + +define i16 @__clc_sub_sat_impl_u16(i16 %x, i16 %y) nounwind readnone alwaysinline { + %call = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %x, i16 %y) + %res = extractvalue {i16, i1} %call, 0 + %over = extractvalue {i16, i1} %call, 1 + %sat = select i1 %over, i16 0, i16 %res + ret i16 %sat +} + +declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) +declare {i32, i1} 
@llvm.usub.with.overflow.i32(i32, i32) + +define i32 @__clc_sub_sat_impl_s32(i32 %x, i32 %y) nounwind readnone alwaysinline { + %call = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %x, i32 %y) + %res = extractvalue {i32, i1} %call, 0 + %over = extractvalue {i32, i1} %call, 1 + %x.msb = ashr i32 %x, 31 + %x.limit = xor i32 %x.msb, 2147483647 + %sat = select i1 %over, i32 %x.limit, i32 %res + ret i32 %sat +} + +define i32 @__clc_sub_sat_impl_u32(i32 %x, i32 %y) nounwind readnone alwaysinline { + %call = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %x, i32 %y) + %res = extractvalue {i32, i1} %call, 0 + %over = extractvalue {i32, i1} %call, 1 + %sat = select i1 %over, i32 0, i32 %res + ret i32 %sat +} + +declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) +declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) + +define i64 @__clc_sub_sat_impl_s64(i64 %x, i64 %y) nounwind readnone alwaysinline { + %call = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %x, i64 %y) + %res = extractvalue {i64, i1} %call, 0 + %over = extractvalue {i64, i1} %call, 1 + %x.msb = ashr i64 %x, 63 + %x.limit = xor i64 %x.msb, 9223372036854775807 + %sat = select i1 %over, i64 %x.limit, i64 %res + ret i64 %sat +} + +define i64 @__clc_sub_sat_impl_u64(i64 %x, i64 %y) nounwind readnone alwaysinline { + %call = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %x, i64 %y) + %res = extractvalue {i64, i1} %call, 0 + %over = extractvalue {i64, i1} %call, 1 + %sat = select i1 %over, i64 0, i64 %res + ret i64 %sat +} diff --git a/libclc/generic/lib/integer/upsample.cl b/libclc/generic/lib/integer/upsample.cl new file mode 100644 index 0000000..da77315 --- /dev/null +++ b/libclc/generic/lib/integer/upsample.cl @@ -0,0 +1,34 @@ +#include <clc/clc.h> + +#define __CLC_UPSAMPLE_IMPL(BGENTYPE, GENTYPE, UGENTYPE, GENSIZE) \ + _CLC_OVERLOAD _CLC_DEF BGENTYPE upsample(GENTYPE hi, UGENTYPE lo){ \ + return ((BGENTYPE)hi << GENSIZE) | lo; \ + } \ + _CLC_OVERLOAD _CLC_DEF BGENTYPE##2 upsample(GENTYPE##2 
hi, UGENTYPE##2 lo){ \ + return (BGENTYPE##2){upsample(hi.s0, lo.s0), upsample(hi.s1, lo.s1)}; \ + } \ + _CLC_OVERLOAD _CLC_DEF BGENTYPE##3 upsample(GENTYPE##3 hi, UGENTYPE##3 lo){ \ + return (BGENTYPE##3){upsample(hi.s0, lo.s0), upsample(hi.s1, lo.s1), upsample(hi.s2, lo.s2)}; \ + } \ + _CLC_OVERLOAD _CLC_DEF BGENTYPE##4 upsample(GENTYPE##4 hi, UGENTYPE##4 lo){ \ + return (BGENTYPE##4){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \ + } \ + _CLC_OVERLOAD _CLC_DEF BGENTYPE##8 upsample(GENTYPE##8 hi, UGENTYPE##8 lo){ \ + return (BGENTYPE##8){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \ + } \ + _CLC_OVERLOAD _CLC_DEF BGENTYPE##16 upsample(GENTYPE##16 hi, UGENTYPE##16 lo){ \ + return (BGENTYPE##16){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \ + } \ + +#define __CLC_UPSAMPLE_TYPES() \ + __CLC_UPSAMPLE_IMPL(short, char, uchar, 8) \ + __CLC_UPSAMPLE_IMPL(ushort, uchar, uchar, 8) \ + __CLC_UPSAMPLE_IMPL(int, short, ushort, 16) \ + __CLC_UPSAMPLE_IMPL(uint, ushort, ushort, 16) \ + __CLC_UPSAMPLE_IMPL(long, int, uint, 32) \ + __CLC_UPSAMPLE_IMPL(ulong, uint, uint, 32) \ + +__CLC_UPSAMPLE_TYPES() + +#undef __CLC_UPSAMPLE_TYPES +#undef __CLC_UPSAMPLE_IMPL diff --git a/libclc/generic/lib/math/acos.cl b/libclc/generic/lib/math/acos.cl new file mode 100644 index 0000000..3ce9655 --- /dev/null +++ b/libclc/generic/lib/math/acos.cl @@ -0,0 +1,8 @@ +#include <clc/clc.h> + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY <acos.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/acos.inc b/libclc/generic/lib/math/acos.inc new file mode 100644 index 0000000..cac94992 --- /dev/null +++ b/libclc/generic/lib/math/acos.inc @@ -0,0 +1,29 @@ +/* + * There are multiple formulas for calculating arccosine of x: + * 1) acos(x) = (1/2*pi) + i * ln(i*x + sqrt(1-x^2)) (notice the 'i'...) 
+ * 2) acos(x) = pi/2 + asin(-x) (asin isn't implemented yet) + * 3) acos(x) = pi/2 - asin(x) (ditto) + * 4) acos(x) = 2*atan2(sqrt(1-x), sqrt(1+x)) + * 5) acos(x) = pi/2 - atan2(x, ( sqrt(1-x^2) ) ) + * + * Options 1-3 are not currently usable, #5 generates more concise radeonsi + * bitcode and assembly than #4 (134 vs 132 instructions on radeonsi), but + * precision of #4 may be better. + */ + +#if __CLC_FPSIZE == 32 +#define __CLC_CONST(x) x ## f +#else +#define __CLC_CONST(x) x +#endif + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE acos(__CLC_GENTYPE x) { + return ( + (__CLC_GENTYPE) __CLC_CONST(2.0) * atan2( + sqrt((__CLC_GENTYPE) __CLC_CONST(1.0) - x), + sqrt((__CLC_GENTYPE) __CLC_CONST(1.0) + x) + ) + ); +} + +#undef __CLC_CONST diff --git a/libclc/generic/lib/math/acosh.cl b/libclc/generic/lib/math/acosh.cl new file mode 100644 index 0000000..cc10dd4 --- /dev/null +++ b/libclc/generic/lib/math/acosh.cl @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#include "ep_log.h" +#include "math.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float acosh(float x) { + uint ux = as_uint(x); + + // Arguments greater than 1/sqrt(epsilon) in magnitude are + // approximated by acosh(x) = ln(2) + ln(x) + // For 2.0 <= x <= 1/sqrt(epsilon) the approximation is + // acosh(x) = ln(x + sqrt(x*x-1)), evaluated through log1p below + int high = ux > 0x46000000U; + int med = ux > 0x40000000U; + + float w = x - 1.0f; + float s = w*w + 2.0f*w; + float t = x*x - 1.0f; + float r = sqrt(med ? t : s) + (med ? x : w); + float v = (high ? x : r) - (med ? 1.0f : 0.0f); + float z = log1p(v) + (high ? 0x1.62e430p-1f : 0.0f); + + z = ux >= PINFBITPATT_SP32 ? x : z; + z = x < 1.0f ? as_float(QNANBITPATT_SP32) : z; + + return z; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, acosh, float) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double acosh(double x) { + const double recrteps = 0x1.6a09e667f3bcdp+26; // 1/sqrt(eps) = 9.49062656242515593767e+07 + //log2_lead and log2_tail sum to an extra-precise version of log(2) + const double log2_lead = 0x1.62e42ep-1; + const double log2_tail = 0x1.efa39ef35793cp-25; + + // Handle x >= 128 here + int xlarge = x > recrteps; + double r = x + sqrt(fma(x, x, -1.0)); + r = xlarge ? x : r; + + int xexp; + double r1, r2; + __clc_ep_log(r, &xexp, &r1, &r2); + + double dxexp = xexp + xlarge; + r1 = fma(dxexp, log2_lead, r1); + r2 = fma(dxexp, log2_tail, r2); + + double ret1 = r1 + r2; + + // Handle 1 < x < 128 here + // We compute the value + // t = x - 1.0 + sqrt(2.0*(x - 1.0) + (x - 1.0)*(x - 1.0)) + // using simulated quad precision. 
+ double t = x - 1.0; + double u1 = t * 2.0; + + // (t,0) * (t,0) -> (v1, v2) + double v1 = t * t; + double v2 = fma(t, t, -v1); + + // (u1,0) + (v1,v2) -> (w1,w2) + r = u1 + v1; + double s = (((u1 - r) + v1) + v2); + double w1 = r + s; + double w2 = (r - w1) + s; + + // sqrt(w1,w2) -> (u1,u2) + double p1 = sqrt(w1); + double a1 = p1*p1; + double a2 = fma(p1, p1, -a1); + double temp = (((w1 - a1) - a2) + w2); + double p2 = MATH_DIVIDE(temp * 0.5, p1); + u1 = p1 + p2; + double u2 = (p1 - u1) + p2; + + // (u1,u2) + (t,0) -> (r1,r2) + r = u1 + t; + s = ((u1 - r) + t) + u2; + // r1 = r + s; + // r2 = (r - r1) + s; + // t = r1 + r2; + t = r + s; + + // For arguments 1.13 <= x <= 1.5 the log1p function is good enough + double ret2 = log1p(t); + + ulong ux = as_ulong(x); + double ret = x >= 128.0 ? ret1 : ret2; + + ret = ux >= 0x7FF0000000000000 ? x : ret; + ret = x == 1.0 ? 0.0 : ret; + ret = (ux & SIGNBIT_DP64) != 0UL | x < 1.0 ? as_double(QNANBITPATT_DP64) : ret; + + return ret; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, acosh, double) + +#endif diff --git a/libclc/generic/lib/math/acospi.cl b/libclc/generic/lib/math/acospi.cl new file mode 100644 index 0000000..c91fc41 --- /dev/null +++ b/libclc/generic/lib/math/acospi.cl @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#include "math.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float acospi(float x) { + // Computes arccos(x) / pi; every branch below divides its result by pi. + // The argument is first reduced by noting that arccos(x) + // is invalid for abs(x) > 1. For denormal and small + // arguments arccos(x) = pi/2 to machine accuracy. + // Remaining argument ranges are handled as follows. + // For abs(x) <= 0.5 use + // arccos(x) = pi/2 - arcsin(x) + // = pi/2 - (x + x^3*R(x^2)) + // where R(x^2) is a rational minimax approximation to + // (arcsin(x) - x)/x^3. + // For abs(x) > 0.5 exploit the identity: + // arccos(|x|) = 2*arcsin(sqrt((1-|x|)/2)), arccos(-|x|) = pi - arccos(|x|) + // together with the above rational approximation, and + // reconstruct the terms carefully. + + + // Some constants and split constants. + const float pi = 3.1415926535897933e+00f; + const float piby2_head = 1.5707963267948965580e+00f; /* 0x3ff921fb54442d18 */ + const float piby2_tail = 6.12323399573676603587e-17f; /* 0x3c91a62633145c07 */ + + uint ux = as_uint(x); + uint aux = ux & ~SIGNBIT_SP32; + int xneg = ux != aux; + int xexp = (int)(aux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + + float y = as_float(aux); + + // transform if |x| >= 0.5 + int transform = xexp >= -1; + + float y2 = y * y; + float yt = 0.5f * (1.0f - y); + float r = transform ? 
yt : y2; + + // Use a rational approximation for [0.0, 0.5] + float a = mad(r, mad(r, mad(r, -0.00396137437848476485201154797087F, -0.0133819288943925804214011424456F), + -0.0565298683201845211985026327361F), + 0.184161606965100694821398249421F); + float b = mad(r, -0.836411276854206731913362287293F, 1.10496961524520294485512696706F); + float u = r * MATH_DIVIDE(a, b); + + float s = MATH_SQRT(r); + y = s; + float s1 = as_float(as_uint(s) & 0xffff0000); + float c = MATH_DIVIDE(r - s1 * s1, s + s1); + // float rettn = 1.0f - MATH_DIVIDE(2.0f * (s + (y * u - piby2_tail)), pi); + float rettn = 1.0f - MATH_DIVIDE(2.0f * (s + mad(y, u, -piby2_tail)), pi); + // float rettp = MATH_DIVIDE(2.0F * s1 + (2.0F * c + 2.0F * y * u), pi); + float rettp = MATH_DIVIDE(2.0f*(s1 + mad(y, u, c)), pi); + float rett = xneg ? rettn : rettp; + // float ret = MATH_DIVIDE(piby2_head - (x - (piby2_tail - x * u)), pi); + float ret = MATH_DIVIDE(piby2_head - (x - mad(x, -u, piby2_tail)), pi); + + ret = transform ? rett : ret; + ret = aux > 0x3f800000U ? as_float(QNANBITPATT_SP32) : ret; + ret = ux == 0x3f800000U ? 0.0f : ret; + ret = ux == 0xbf800000U ? 1.0f : ret; + ret = xexp < -26 ? 0.5f : ret; + return ret; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, acospi, float) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double acospi(double x) { + // Computes arccos(x). + // The argument is first reduced by noting that arccos(x) + // is invalid for abs(x) > 1. For denormal and small + // arguments arccos(x) = pi/2 to machine accuracy. + // Remaining argument ranges are handled as follows. + // For abs(x) <= 0.5 use + // arccos(x) = pi/2 - arcsin(x) + // = pi/2 - (x + x^3*R(x^2)) + // where R(x^2) is a rational minimax approximation to + // (arcsin(x) - x)/x^3. 
+ // For abs(x) > 0.5 exploit the identity: + // arccos(x) = pi - 2*arcsin(sqrt(1-x)/2) + // together with the above rational approximation, and + // reconstruct the terms carefully. + + const double pi = 0x1.921fb54442d18p+1; + const double piby2_tail = 6.12323399573676603587e-17; /* 0x3c91a62633145c07 */ + + double y = fabs(x); + int xneg = as_int2(x).hi < 0; + int xexp = (as_int2(y).hi >> 20) - EXPBIAS_DP64; + + // abs(x) >= 0.5 + int transform = xexp >= -1; + + // Transform y into the range [0,0.5) + double r1 = 0.5 * (1.0 - y); + double s = sqrt(r1); + double r = y * y; + r = transform ? r1 : r; + y = transform ? s : y; + + // Use a rational approximation for [0.0, 0.5] + double un = fma(r, + fma(r, + fma(r, + fma(r, + fma(r, 0.0000482901920344786991880522822991, + 0.00109242697235074662306043804220), + -0.0549989809235685841612020091328), + 0.275558175256937652532686256258), + -0.445017216867635649900123110649), + 0.227485835556935010735943483075); + + double ud = fma(r, + fma(r, + fma(r, + fma(r, 0.105869422087204370341222318533, + -0.943639137032492685763471240072), + 2.76568859157270989520376345954), + -3.28431505720958658909889444194), + 1.36491501334161032038194214209); + + double u = r * MATH_DIVIDE(un, ud); + + // Reconstruct acos carefully in transformed region + double res1 = fma(-2.0, MATH_DIVIDE(s + fma(y, u, -piby2_tail), pi), 1.0); + double s1 = as_double(as_ulong(s) & 0xffffffff00000000UL); + double c = MATH_DIVIDE(fma(-s1, s1, r), s + s1); + double res2 = MATH_DIVIDE(fma(2.0, s1, fma(2.0, c, 2.0 * y * u)), pi); + res1 = xneg ? res1 : res2; + res2 = 0.5 - fma(x, u, x) / pi; + res1 = transform ? res1 : res2; + + const double qnan = as_double(QNANBITPATT_DP64); + res2 = x == 1.0 ? 0.0 : qnan; + res2 = x == -1.0 ? 1.0 : res2; + res1 = xexp >= 0 ? res2 : res1; + res1 = xexp < -56 ? 
0.5 : res1; + + return res1; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, acospi, double) + +#endif diff --git a/libclc/generic/lib/math/asin.cl b/libclc/generic/lib/math/asin.cl new file mode 100644 index 0000000..d56dbd7 --- /dev/null +++ b/libclc/generic/lib/math/asin.cl @@ -0,0 +1,8 @@ +#include <clc/clc.h> + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY <asin.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/asin.inc b/libclc/generic/lib/math/asin.inc new file mode 100644 index 0000000..4643cf8 --- /dev/null +++ b/libclc/generic/lib/math/asin.inc @@ -0,0 +1,12 @@ + +#if __CLC_FPSIZE == 32 +#define __CLC_CONST(x) x ## f +#else +#define __CLC_CONST(x) x +#endif + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE asin(__CLC_GENTYPE x) { + return atan2(x, sqrt( (__CLC_GENTYPE)__CLC_CONST(1.0) -(x*x) )); +} + +#undef __CLC_CONST diff --git a/libclc/generic/lib/math/asinh.cl b/libclc/generic/lib/math/asinh.cl new file mode 100644 index 0000000..cfddb31c --- /dev/null +++ b/libclc/generic/lib/math/asinh.cl @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#include "math.h" +#include "ep_log.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float asinh(float x) { + uint ux = as_uint(x); + uint ax = ux & EXSIGNBIT_SP32; + uint xsgn = ax ^ ux; + + // |x| <= 2 + float t = x * x; + float a = mad(t, + mad(t, + mad(t, + mad(t, -1.177198915954942694e-4f, -4.162727710583425360e-2f), + -5.063201055468483248e-1f), + -1.480204186473758321f), + -1.152965835871758072f); + float b = mad(t, + mad(t, + mad(t, + mad(t, 6.284381367285534560e-2f, 1.260024978680227945f), + 6.582362487198468066f), + 11.99423176003939087f), + 6.917795026025976739f); + + float q = MATH_DIVIDE(a, b); + float z1 = mad(x*t, q, x); + + // |x| > 2 + + // Arguments greater than 1/sqrt(epsilon) in magnitude are + // approximated by asinh(x) = ln(2) + ln(abs(x)), with sign of x + // Arguments such that 4.0 <= abs(x) <= 1/sqrt(epsilon) are + // approximated by asinhf(x) = ln(abs(x) + sqrt(x*x+1)) + // with the sign of x (see Abramowitz and Stegun 4.6.20) + + float absx = as_float(ax); + int hi = ax > 0x46000000U; + float y = MATH_SQRT(absx * absx + 1.0f) + absx; + y = hi ? absx : y; + float r = log(y) + (hi ? 0x1.62e430p-1f : 0.0f); + float z2 = as_float(xsgn | as_uint(r)); + + float z = ax <= 0x40000000 ? z1 : z2; + z = ax < 0x39800000U | ax >= PINFBITPATT_SP32 ? 
x : z; + + return z; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, asinh, float) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +#define NA0 -0.12845379283524906084997e0 +#define NA1 -0.21060688498409799700819e0 +#define NA2 -0.10188951822578188309186e0 +#define NA3 -0.13891765817243625541799e-1 +#define NA4 -0.10324604871728082428024e-3 + +#define DA0 0.77072275701149440164511e0 +#define DA1 0.16104665505597338100747e1 +#define DA2 0.11296034614816689554875e1 +#define DA3 0.30079351943799465092429e0 +#define DA4 0.235224464765951442265117e-1 + +#define NB0 -0.12186605129448852495563e0 +#define NB1 -0.19777978436593069928318e0 +#define NB2 -0.94379072395062374824320e-1 +#define NB3 -0.12620141363821680162036e-1 +#define NB4 -0.903396794842691998748349e-4 + +#define DB0 0.73119630776696495279434e0 +#define DB1 0.15157170446881616648338e1 +#define DB2 0.10524909506981282725413e1 +#define DB3 0.27663713103600182193817e0 +#define DB4 0.21263492900663656707646e-1 + +#define NC0 -0.81210026327726247622500e-1 +#define NC1 -0.12327355080668808750232e0 +#define NC2 -0.53704925162784720405664e-1 +#define NC3 -0.63106739048128554465450e-2 +#define NC4 -0.35326896180771371053534e-4 + +#define DC0 0.48726015805581794231182e0 +#define DC1 0.95890837357081041150936e0 +#define DC2 0.62322223426940387752480e0 +#define DC3 0.15028684818508081155141e0 +#define DC4 0.10302171620320141529445e-1 + +#define ND0 -0.4638179204422665073e-1 +#define ND1 -0.7162729496035415183e-1 +#define ND2 -0.3247795155696775148e-1 +#define ND3 -0.4225785421291932164e-2 +#define ND4 -0.3808984717603160127e-4 +#define ND5 0.8023464184964125826e-6 + +#define DD0 0.2782907534642231184e0 +#define DD1 0.5549945896829343308e0 +#define DD2 0.3700732511330698879e0 +#define DD3 0.9395783438240780722e-1 +#define DD4 0.7200057974217143034e-2 + +#define NE0 -0.121224194072430701e-4 +#define NE1 -0.273145455834305218e-3 +#define NE2 -0.152866982560895737e-2 +#define NE3 
-0.292231744584913045e-2 +#define NE4 -0.174670900236060220e-2 +#define NE5 -0.891754209521081538e-12 + +#define DE0 0.499426632161317606e-4 +#define DE1 0.139591210395547054e-2 +#define DE2 0.107665231109108629e-1 +#define DE3 0.325809818749873406e-1 +#define DE4 0.415222526655158363e-1 +#define DE5 0.186315628774716763e-1 + +#define NF0 -0.195436610112717345e-4 +#define NF1 -0.233315515113382977e-3 +#define NF2 -0.645380957611087587e-3 +#define NF3 -0.478948863920281252e-3 +#define NF4 -0.805234112224091742e-12 +#define NF5 0.246428598194879283e-13 + +#define DF0 0.822166621698664729e-4 +#define DF1 0.135346265620413852e-2 +#define DF2 0.602739242861830658e-2 +#define DF3 0.972227795510722956e-2 +#define DF4 0.510878800983771167e-2 + +#define NG0 -0.209689451648100728e-6 +#define NG1 -0.219252358028695992e-5 +#define NG2 -0.551641756327550939e-5 +#define NG3 -0.382300259826830258e-5 +#define NG4 -0.421182121910667329e-17 +#define NG5 0.492236019998237684e-19 + +#define DG0 0.889178444424237735e-6 +#define DG1 0.131152171690011152e-4 +#define DG2 0.537955850185616847e-4 +#define DG3 0.814966175170941864e-4 +#define DG4 0.407786943832260752e-4 + +#define NH0 -0.178284193496441400e-6 +#define NH1 -0.928734186616614974e-6 +#define NH2 -0.923318925566302615e-6 +#define NH3 -0.776417026702577552e-19 +#define NH4 0.290845644810826014e-21 + +#define DH0 0.786694697277890964e-6 +#define DH1 0.685435665630965488e-5 +#define DH2 0.153780175436788329e-4 +#define DH3 0.984873520613417917e-5 + +#define NI0 -0.538003743384069117e-10 +#define NI1 -0.273698654196756169e-9 +#define NI2 -0.268129826956403568e-9 +#define NI3 -0.804163374628432850e-29 + +#define DI0 0.238083376363471960e-9 +#define DI1 0.203579344621125934e-8 +#define DI2 0.450836980450693209e-8 +#define DI3 0.286005148753497156e-8 + +_CLC_OVERLOAD _CLC_DEF double asinh(double x) { + const double rteps = 0x1.6a09e667f3bcdp-27; + const double recrteps = 0x1.6a09e667f3bcdp+26; + + // log2_lead and log2_tail sum to an 
extra-precise version of log(2) + const double log2_lead = 0x1.62e42ep-1; + const double log2_tail = 0x1.efa39ef35793cp-25; + + ulong ux = as_ulong(x); + ulong ax = ux & ~SIGNBIT_DP64; + double absx = as_double(ax); + + double t = x * x; + double pn, tn, pd, td; + + // XXX we are betting here that we can evaluate 8 pairs of + // polys faster than we can grab 12 coefficients from a table + // This also uses fewer registers + + // |x| >= 8 + pn = fma(t, fma(t, fma(t, NI3, NI2), NI1), NI0); + pd = fma(t, fma(t, fma(t, DI3, DI2), DI1), DI0); + + tn = fma(t, fma(t, fma(t, fma(t, NH4, NH3), NH2), NH1), NH0); + td = fma(t, fma(t, fma(t, DH3, DH2), DH1), DH0); + pn = absx < 8.0 ? tn : pn; + pd = absx < 8.0 ? td : pd; + + tn = fma(t, fma(t, fma(t, fma(t, fma(t, NG5, NG4), NG3), NG2), NG1), NG0); + td = fma(t, fma(t, fma(t, fma(t, DG4, DG3), DG2), DG1), DG0); + pn = absx < 4.0 ? tn : pn; + pd = absx < 4.0 ? td : pd; + + tn = fma(t, fma(t, fma(t, fma(t, fma(t, NF5, NF4), NF3), NF2), NF1), NF0); + td = fma(t, fma(t, fma(t, fma(t, DF4, DF3), DF2), DF1), DF0); + pn = absx < 2.0 ? tn : pn; + pd = absx < 2.0 ? td : pd; + + tn = fma(t, fma(t, fma(t, fma(t, fma(t, NE5, NE4), NE3), NE2), NE1), NE0); + td = fma(t, fma(t, fma(t, fma(t, fma(t, DE5, DE4), DE3), DE2), DE1), DE0); + pn = absx < 1.5 ? tn : pn; + pd = absx < 1.5 ? td : pd; + + tn = fma(t, fma(t, fma(t, fma(t, fma(t, ND5, ND4), ND3), ND2), ND1), ND0); + td = fma(t, fma(t, fma(t, fma(t, DD4, DD3), DD2), DD1), DD0); + pn = absx <= 1.0 ? tn : pn; + pd = absx <= 1.0 ? td : pd; + + tn = fma(t, fma(t, fma(t, fma(t, NC4, NC3), NC2), NC1), NC0); + td = fma(t, fma(t, fma(t, fma(t, DC4, DC3), DC2), DC1), DC0); + pn = absx < 0.75 ? tn : pn; + pd = absx < 0.75 ? td : pd; + + tn = fma(t, fma(t, fma(t, fma(t, NB4, NB3), NB2), NB1), NB0); + td = fma(t, fma(t, fma(t, fma(t, DB4, DB3), DB2), DB1), DB0); + pn = absx < 0.5 ? tn : pn; + pd = absx < 0.5 ? 
td : pd; + + tn = fma(t, fma(t, fma(t, fma(t, NA4, NA3), NA2), NA1), NA0); + td = fma(t, fma(t, fma(t, fma(t, DA4, DA3), DA2), DA1), DA0); + pn = absx < 0.25 ? tn : pn; + pd = absx < 0.25 ? td : pd; + + double pq = MATH_DIVIDE(pn, pd); + + // |x| <= 1 + double result1 = fma(absx*t, pq, absx); + + // Other ranges + int xout = absx <= 32.0 | absx > recrteps; + double y = absx + sqrt(fma(absx, absx, 1.0)); + y = xout ? absx : y; + + double r1, r2; + int xexp; + __clc_ep_log(y, &xexp, &r1, &r2); + + double dxexp = (double)(xexp + xout); + r1 = fma(dxexp, log2_lead, r1); + r2 = fma(dxexp, log2_tail, r2); + + // 1 < x <= 32 + double v2 = (pq + 0.25) / t; + double r = v2 + r1; + double s = ((r1 - r) + v2) + r2; + double v1 = r + s; + v2 = (r - v1) + s; + double result2 = v1 + v2; + + // x > 32 + double result3 = r1 + r2; + + double ret = absx > 1.0 ? result2 : result1; + ret = absx > 32.0 ? result3 : ret; + ret = x < 0.0 ? -ret : ret; + + // NaN, +-Inf, or x small enough that asinh(x) = x + ret = ax >= PINFBITPATT_DP64 | absx < rteps ? x : ret; + return ret; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, asinh, double) + +#endif diff --git a/libclc/generic/lib/math/asinpi.cl b/libclc/generic/lib/math/asinpi.cl new file mode 100644 index 0000000..511d74e --- /dev/null +++ b/libclc/generic/lib/math/asinpi.cl @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#include "math.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float asinpi(float x) { + // Computes arcsin(x) / pi; arcsin(x) is evaluated first, then divided by pi. + // The argument is first reduced by noting that arcsin(x) + // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x). + // For denormal and small arguments arcsin(x) = x to machine + // accuracy. Remaining argument ranges are handled as follows. + // For abs(x) <= 0.5 use + // arcsin(x) = x + x^3*R(x^2) + // where R(x^2) is a rational minimax approximation to + // (arcsin(x) - x)/x^3. + // For abs(x) > 0.5 exploit the identity: + // arcsin(x) = pi/2 - 2*arcsin(sqrt((1-x)/2)) + // together with the above rational approximation, and + // reconstruct the terms carefully.
+ + + const float pi = 3.1415926535897933e+00f; + const float piby2_tail = 7.5497894159e-08F; /* 0x33a22168 */ + const float hpiby2_head = 7.8539812565e-01F; /* 0x3f490fda */ + + uint ux = as_uint(x); + uint aux = ux & EXSIGNBIT_SP32; + uint xs = ux ^ aux; + float shalf = as_float(xs | as_uint(0.5f)); + + int xexp = (int)(aux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + + float y = as_float(aux); + + // abs(x) >= 0.5 + int transform = xexp >= -1; + + float y2 = y * y; + float rt = 0.5f * (1.0f - y); + float r = transform ? rt : y2; + + // Use a rational approximation for [0.0, 0.5] + float a = mad(r, + mad(r, + mad(r, -0.00396137437848476485201154797087F, -0.0133819288943925804214011424456F), + -0.0565298683201845211985026327361F), + 0.184161606965100694821398249421F); + float b = mad(r, -0.836411276854206731913362287293F, 1.10496961524520294485512696706F); + float u = r * MATH_DIVIDE(a, b); + + float s = MATH_SQRT(r); + float s1 = as_float(as_uint(s) & 0xffff0000); + float c = MATH_DIVIDE(mad(-s1, s1, r), s + s1); + float p = mad(2.0f*s, u, -mad(c, -2.0f, piby2_tail)); + float q = mad(s1, -2.0f, hpiby2_head); + float vt = hpiby2_head - (p - q); + float v = mad(y, u, y); + v = transform ? vt : v; + v = MATH_DIVIDE(v, pi); // arcsin(|x|) -> arcsin(|x|) / pi + float xbypi = MATH_DIVIDE(x, pi); + + float ret = as_float(xs | as_uint(v)); + ret = aux > 0x3f800000U ? as_float(QNANBITPATT_SP32) : ret; // abs(x) > 1 or NaN + ret = aux == 0x3f800000U ? shalf : ret; // abs(x) == 1 gives 0.5 with the sign of x + ret = xexp < -14 ? xbypi : ret; // tiny abs(x): arcsin(x) = x, so return x / pi + + return ret; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, asinpi, float) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double asinpi(double x) { + // Computes arcsin(x) / pi; arcsin(x) is evaluated first, then divided by pi. + // The argument is first reduced by noting that arcsin(x) + // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x). + // For denormal and small arguments arcsin(x) = x to machine + // accuracy. Remaining argument ranges are handled as follows.
+ // For abs(x) <= 0.5 use + // arcsin(x) = x + x^3*R(x^2) + // where R(x^2) is a rational minimax approximation to + // (arcsin(x) - x)/x^3. + // For abs(x) > 0.5 exploit the identity: + // arcsin(x) = pi/2 - 2*arcsin(sqrt((1-x)/2)) + // together with the above rational approximation, and + // reconstruct the terms carefully. + + const double pi = 0x1.921fb54442d18p+1; + const double piby2_tail = 6.1232339957367660e-17; /* 0x3c91a62633145c07 */ + const double hpiby2_head = 7.8539816339744831e-01; /* 0x3fe921fb54442d18 */ + + double y = fabs(x); + int xneg = as_int2(x).hi < 0; + int xexp = (as_int2(y).hi >> 20) - EXPBIAS_DP64; + + // abs(x) >= 0.5 + int transform = xexp >= -1; + + double rt = 0.5 * (1.0 - y); + double y2 = y * y; + double r = transform ? rt : y2; + + // Use a rational approximation for [0.0, 0.5] + double un = fma(r, + fma(r, + fma(r, + fma(r, + fma(r, 0.0000482901920344786991880522822991, + 0.00109242697235074662306043804220), + -0.0549989809235685841612020091328), + 0.275558175256937652532686256258), + -0.445017216867635649900123110649), + 0.227485835556935010735943483075); + + double ud = fma(r, + fma(r, + fma(r, + fma(r, 0.105869422087204370341222318533, + -0.943639137032492685763471240072), + 2.76568859157270989520376345954), + -3.28431505720958658909889444194), + 1.36491501334161032038194214209); + + double u = r * MATH_DIVIDE(un, ud); + + + // Reconstruct asin carefully in transformed region + double s = sqrt(r); + double sh = as_double(as_ulong(s) & 0xffffffff00000000UL); + double c = MATH_DIVIDE(fma(-sh, sh, r), s + sh); + double p = fma(2.0*s, u, -fma(-2.0, c, piby2_tail)); + double q = fma(-2.0, sh, hpiby2_head); + double vt = hpiby2_head - (p - q); + double v = fma(y, u, y); + v = transform ? vt : v; + + v = xexp < -28 ? y : v; // tiny abs(x): arcsin(x) = x + v = MATH_DIVIDE(v, pi); // arcsin(|x|) -> arcsin(|x|) / pi + v = xexp >= 0 ? as_double(QNANBITPATT_DP64) : v; // abs(x) >= 1, Inf or NaN; abs(x) == 1 is fixed up next + v = y == 1.0 ? 0.5 : v; + return xneg ?
-v : v; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, asinpi, double) + +#endif diff --git a/libclc/generic/lib/math/atan.cl b/libclc/generic/lib/math/atan.cl new file mode 100644 index 0000000..fa3633c --- /dev/null +++ b/libclc/generic/lib/math/atan.cl @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "math.h" +#include "../clcmacro.h" + +#include <clc/clc.h> + +_CLC_OVERLOAD _CLC_DEF float atan(float x) +{ + const float piby2 = 1.5707963267948966f; // 0x3ff921fb54442d18 + + uint ux = as_uint(x); + uint aux = ux & EXSIGNBIT_SP32; + uint sx = ux ^ aux; + + float spiby2 = as_float(sx | as_uint(piby2)); + + float v = as_float(aux); + + // Return for NaN + float ret = x; + + // 2^26 <= |x| <= Inf => atan(x) is close to piby2 + ret = aux <= PINFBITPATT_SP32 ? 
spiby2 : ret; + + // Reduce arguments 2^-19 <= |x| < 2^26 + + // 39/16 <= x < 2^26 + x = -MATH_RECIP(v); + float c = 1.57079632679489655800f; // atan(infinity) + + // 19/16 <= x < 39/16 + int l = aux < 0x401c0000; + float xx = MATH_DIVIDE(v - 1.5f, mad(v, 1.5f, 1.0f)); + x = l ? xx : x; + c = l ? 9.82793723247329054082e-01f : c; // atan(1.5) + + // 11/16 <= x < 19/16 + l = aux < 0x3f980000U; + xx = MATH_DIVIDE(v - 1.0f, 1.0f + v); + x = l ? xx : x; + c = l ? 7.85398163397448278999e-01f : c; // atan(1) + + // 7/16 <= x < 11/16 + l = aux < 0x3f300000; + xx = MATH_DIVIDE(mad(v, 2.0f, -1.0f), 2.0f + v); + x = l ? xx : x; + c = l ? 4.63647609000806093515e-01f : c; // atan(0.5) + + // 2^-19 <= x < 7/16 + l = aux < 0x3ee00000; + x = l ? v : x; + c = l ? 0.0f : c; + + // Core approximation: Remez(2,2) on [-7/16,7/16] + + float s = x * x; + float a = mad(s, + mad(s, 0.470677934286149214138357545549e-2f, 0.192324546402108583211697690500f), + 0.296528598819239217902158651186f); + + float b = mad(s, + mad(s, 0.299309699959659728404442796915f, 0.111072499995399550138837673349e1f), + 0.889585796862432286486651434570f); + + float q = x * s * MATH_DIVIDE(a, b); + + float z = c - (q - x); + float zs = as_float(sx | as_uint(z)); + + ret = aux < 0x4c800000 ? zs : ret; + + // |x| < 2^-19 + ret = aux < 0x36000000 ? as_float(ux) : ret; + return ret; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, atan, float); + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + + +_CLC_OVERLOAD _CLC_DEF double atan(double x) +{ + const double piby2 = 1.5707963267948966e+00; // 0x3ff921fb54442d18 + + double v = fabs(x); + + // 2^56 > v > 39/16 + double a = -1.0; + double b = v; + // (chi + clo) = arctan(infinity) + double chi = 1.57079632679489655800e+00; + double clo = 6.12323399573676480327e-17; + + double ta = v - 1.5; + double tb = 1.0 + 1.5 * v; + int l = v <= 0x1.38p+1; // 39/16 > v > 19/16 + a = l ? ta : a; + b = l ? tb : b; + // (chi + clo) = arctan(1.5) + chi = l ? 
9.82793723247329054082e-01 : chi; + clo = l ? 1.39033110312309953701e-17 : clo; + + ta = v - 1.0; + tb = 1.0 + v; + l = v <= 0x1.3p+0; // 19/16 > v > 11/16 + a = l ? ta : a; + b = l ? tb : b; + // (chi + clo) = arctan(1.) + chi = l ? 7.85398163397448278999e-01 : chi; + clo = l ? 3.06161699786838240164e-17 : clo; + + ta = 2.0 * v - 1.0; + tb = 2.0 + v; + l = v <= 0x1.6p-1; // 11/16 > v > 7/16 + a = l ? ta : a; + b = l ? tb : b; + // (chi + clo) = arctan(0.5) + chi = l ? 4.63647609000806093515e-01 : chi; + clo = l ? 2.26987774529616809294e-17 : clo; + + l = v <= 0x1.cp-2; // v < 7/16 + a = l ? v : a; + b = l ? 1.0 : b; + chi = l ? 0.0 : chi; + clo = l ? 0.0 : clo; + + // Core approximation: Remez(4,4) on [-7/16,7/16] + double r = a / b; + double s = r * r; + double qn = fma(s, + fma(s, + fma(s, + fma(s, 0.142316903342317766e-3, + 0.304455919504853031e-1), + 0.220638780716667420e0), + 0.447677206805497472e0), + 0.268297920532545909e0); + + double qd = fma(s, + fma(s, + fma(s, + fma(s, 0.389525873944742195e-1, + 0.424602594203847109e0), + 0.141254259931958921e1), + 0.182596787737507063e1), + 0.804893761597637733e0); + + double q = r * s * qn / qd; + r = chi - ((q - clo) - r); + + double z = isnan(x) ? x : piby2; + z = v <= 0x1.0p+56 ? r : z; + z = v < 0x1.0p-26 ? v : z; + return copysign(z, x); // z >= 0 or NaN; sign taken from x so atan(-0.0) is -0.0 +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atan, double); + +#endif // cl_khr_fp64 diff --git a/libclc/generic/lib/math/atan2.cl b/libclc/generic/lib/math/atan2.cl new file mode 100644 index 0000000..a2f104f --- /dev/null +++ b/libclc/generic/lib/math/atan2.cl @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#include "math.h" +#include "tables.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float atan2(float y, float x) +{ + const float pi = 0x1.921fb6p+1f; + const float piby2 = 0x1.921fb6p+0f; + const float piby4 = 0x1.921fb6p-1f; + const float threepiby4 = 0x1.2d97c8p+1f; + + float ax = fabs(x); + float ay = fabs(y); + float v = min(ax, ay); + float u = max(ax, ay); + + // Scale since u could be large, as in "regular" divide + float s = u > 0x1.0p+96f ? 
0x1.0p-32f : 1.0f; + float vbyu = s * MATH_DIVIDE(v, s*u); + + float vbyu2 = vbyu * vbyu; + +#define USE_2_2_APPROXIMATION +#if defined USE_2_2_APPROXIMATION + float p = mad(vbyu2, mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * vbyu2 * vbyu; + float q = mad(vbyu2, mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f); +#else + float p = mad(vbyu2, mad(vbyu2, -0x1.55cd22p-5f, -0x1.26cf76p-2f), -0x1.55554ep-2f) * vbyu2 * vbyu; + float q = mad(vbyu2, mad(vbyu2, mad(vbyu2, 0x1.9f1304p-5f, 0x1.2656fap-1f), 0x1.76b4b8p+0f), 1.0f); +#endif + + // Octant 0 result + float a = mad(p, MATH_RECIP(q), vbyu); + + // Fix up 3 other octants + float at = piby2 - a; + a = ay > ax ? at : a; + at = pi - a; + a = x < 0.0F ? at : a; + + // y == 0 => 0 for x >= 0, pi for x < 0 + at = as_int(x) < 0 ? pi : 0.0f; + a = y == 0.0f ? at : a; + + // if (!FINITE_ONLY()) { + // x and y are +- Inf + at = x > 0.0f ? piby4 : threepiby4; + a = ax == INFINITY & ay == INFINITY ? at : a; + + // x or y is NaN + a = isnan(x) | isnan(y) ? 
as_float(QNANBITPATT_SP32) : a; + // } + + // Fixup sign and return + return copysign(a, y); +} + +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, atan2, float, float); + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double atan2(double y, double x) +{ + const double pi = 3.1415926535897932e+00; /* 0x400921fb54442d18 */ + const double piby2 = 1.5707963267948966e+00; /* 0x3ff921fb54442d18 */ + const double piby4 = 7.8539816339744831e-01; /* 0x3fe921fb54442d18 */ + const double three_piby4 = 2.3561944901923449e+00; /* 0x4002d97c7f3321d2 */ + const double pi_head = 3.1415926218032836e+00; /* 0x400921fb50000000 */ + const double pi_tail = 3.1786509547056392e-08; /* 0x3e6110b4611a6263 */ + const double piby2_head = 1.5707963267948965e+00; /* 0x3ff921fb54442d18 */ + const double piby2_tail = 6.1232339957367660e-17; /* 0x3c91a62633145c07 */ + + double x2 = x; + int xneg = as_int2(x).hi < 0; + int xexp = (as_int2(x).hi >> 20) & 0x7ff; + + double y2 = y; + int yneg = as_int2(y).hi < 0; + int yexp = (as_int2(y).hi >> 20) & 0x7ff; + + int cond2 = (xexp < 1021) & (yexp < 1021); + int diffexp = yexp - xexp; + + // Scale up both x and y if they are both below 1/4 + double x1 = ldexp(x, 1024); + int xexp1 = (as_int2(x1).hi >> 20) & 0x7ff; + double y1 = ldexp(y, 1024); + int yexp1 = (as_int2(y1).hi >> 20) & 0x7ff; + int diffexp1 = yexp1 - xexp1; + + diffexp = cond2 ? diffexp1 : diffexp; + x = cond2 ? x1 : x; + y = cond2 ? y1 : y; + + // General case: take absolute values of arguments + double u = fabs(x); + double v = fabs(y); + + // Swap u and v if necessary to obtain 0 < v < u. Compute v/u. + int swap_vu = u < v; + double uu = u; + u = swap_vu ? v : u; + v = swap_vu ? uu : v; + + double vbyu = v / u; + double q1, q2; + + // General values of v/u. Use a look-up table and series expansion. + + { + double val = vbyu > 0.0625 ? 
vbyu : 0.063; + int index = convert_int(fma(256.0, val, 0.5)); + double2 tv = USE_TABLE(atan_jby256_tbl, index - 16); + q1 = tv.s0; + q2 = tv.s1; + double c = (double)index * 0x1.0p-8; + + // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1 + // u_exponent could be EMAX so we have to do it in 2 steps + int m = -((int)(as_ulong(u) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64); + //double um = __amdil_ldexp_f64(u, m); + //double vm = __amdil_ldexp_f64(v, m); + double um = ldexp(u, m); + double vm = ldexp(v, m); + + // 26 leading bits of u + double u1 = as_double(as_ulong(um) & 0xfffffffff8000000UL); + double u2 = um - u1; + + double r = MATH_DIVIDE(fma(-c, u2, fma(-c, u1, vm)), fma(c, vm, um)); + + // Polynomial approximation to atan(r) + double s = r * r; + q2 = q2 + fma((s * fma(-s, 0.19999918038989143496, 0.33333333333224095522)), -r, r); + } + + + double q3, q4; + { + q3 = 0.0; + q4 = vbyu; + } + + double q5, q6; + { + double u1 = as_double(as_ulong(u) & 0xffffffff00000000UL); + double u2 = u - u1; + double vu1 = as_double(as_ulong(vbyu) & 0xffffffff00000000UL); + double vu2 = vbyu - vu1; + + q5 = 0.0; + double s = vbyu * vbyu; + q6 = vbyu + fma(-vbyu * s, + fma(-s, + fma(-s, + fma(-s, + fma(-s, 0.90029810285449784439E-01, + 0.11110736283514525407), + 0.14285713561807169030), + 0.19999999999393223405), + 0.33333333333333170500), + MATH_DIVIDE(fma(-u, vu2, fma(-u2, vu1, fma(-u1, vu1, v))), u)); + } + + + q3 = vbyu < 0x1.d12ed0af1a27fp-27 ? q3 : q5; + q4 = vbyu < 0x1.d12ed0af1a27fp-27 ? q4 : q6; + + q1 = vbyu > 0.0625 ? q1 : q3; + q2 = vbyu > 0.0625 ? q2 : q4; + + // Tidy-up according to which quadrant the arguments lie in + double res1, res2, res3, res4; + q1 = swap_vu ? piby2_head - q1 : q1; + q2 = swap_vu ? piby2_tail - q2 : q2; + q1 = xneg ? pi_head - q1 : q1; + q2 = xneg ? pi_tail - q2 : q2; + q1 = q1 + q2; + res4 = yneg ? -q1 : q1; + + res1 = yneg ? -three_piby4 : three_piby4; + res2 = yneg ? -piby4 : piby4; + res3 = xneg ? 
res1 : res2; + + res3 = isinf(x2) & isinf(y2) ? res3 : res4; + res1 = yneg ? -pi : pi; + + // abs(x)/abs(y) > 2^56 and x < 0 + res3 = (diffexp < -56 && xneg) ? res1 : res3; + + res4 = MATH_DIVIDE(y, x); + // x positive and dominant over y by a factor of 2^28 + res3 = diffexp < -28 & xneg == 0 ? res4 : res3; + + // abs(y)/abs(x) > 2^56 + res4 = yneg ? -piby2 : piby2; // atan(y/x) is insignificant compared to piby2 + res3 = diffexp > 56 ? res4 : res3; + + res3 = x2 == 0.0 ? res4 : res3; // Zero x gives +- pi/2 depending on sign of y + res4 = xneg ? res1 : y2; + + res3 = y2 == 0.0 ? res4 : res3; // Zero y gives +-0 for positive x and +-pi for negative x + res3 = isnan(y2) ? y2 : res3; + res3 = isnan(x2) ? x2 : res3; + + return res3; +} + +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atan2, double, double); + +#endif diff --git a/libclc/generic/lib/math/atan2pi.cl b/libclc/generic/lib/math/atan2pi.cl new file mode 100644 index 0000000..a15b14f --- /dev/null +++ b/libclc/generic/lib/math/atan2pi.cl @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#include "math.h" +#include "tables.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float atan2pi(float y, float x) { + const float pi = 0x1.921fb6p+1f; + + float ax = fabs(x); + float ay = fabs(y); + float v = min(ax, ay); + float u = max(ax, ay); + + // Scale since u could be large, as in "regular" divide + float s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f; + float vbyu = s * MATH_DIVIDE(v, s*u); + + float vbyu2 = vbyu * vbyu; + + float p = mad(vbyu2, mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * vbyu2 * vbyu; + float q = mad(vbyu2, mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f); + + // Octant 0 result, already divided by pi + float a = MATH_DIVIDE(mad(p, MATH_RECIP(q), vbyu), pi); + + // Fix up 3 other octants (in units of pi: 0.5 is pi/2, 1.0 is pi) + float at = 0.5f - a; + a = ay > ax ? at : a; + at = 1.0f - a; + a = x < 0.0F ? at : a; + + // y == 0 => 0 when the sign bit of x is clear, 1 (i.e. pi/pi) when it is set + at = as_int(x) < 0 ? 1.0f : 0.0f; + a = y == 0.0f ? at : a; + + // if (!FINITE_ONLY()) { + // x and y are +- Inf + at = x > 0.0f ? 0.25f : 0.75f; + a = ax == INFINITY & ay == INFINITY ? at : a; + + // x or y is NaN + a = isnan(x) | isnan(y) ?
as_float(QNANBITPATT_SP32) : a; + // } + + // Fixup sign and return + return copysign(a, y); +} + +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, atan2pi, float, float) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double atan2pi(double y, double x) { + const double pi = 3.1415926535897932e+00; /* 0x400921fb54442d18 */ + const double pi_head = 3.1415926218032836e+00; /* 0x400921fb50000000 */ + const double pi_tail = 3.1786509547056392e-08; /* 0x3e6110b4611a6263 */ + const double piby2_head = 1.5707963267948965e+00; /* 0x3ff921fb54442d18 */ + const double piby2_tail = 6.1232339957367660e-17; /* 0x3c91a62633145c07 */ + + double x2 = x; + int xneg = as_int2(x).hi < 0; + int xexp = (as_int2(x).hi >> 20) & 0x7ff; + + double y2 = y; + int yneg = as_int2(y).hi < 0; + int yexp = (as_int2(y).hi >> 20) & 0x7ff; + + int cond2 = (xexp < 1021) & (yexp < 1021); + int diffexp = yexp - xexp; + + // Scale up both x and y if they are both below 1/4 + double x1 = ldexp(x, 1024); + int xexp1 = (as_int2(x1).hi >> 20) & 0x7ff; + double y1 = ldexp(y, 1024); + int yexp1 = (as_int2(y1).hi >> 20) & 0x7ff; + int diffexp1 = yexp1 - xexp1; + + diffexp = cond2 ? diffexp1 : diffexp; + x = cond2 ? x1 : x; + y = cond2 ? y1 : y; + + // General case: take absolute values of arguments + double u = fabs(x); + double v = fabs(y); + + // Swap u and v if necessary to obtain 0 < v < u. Compute v/u. + int swap_vu = u < v; + double uu = u; + u = swap_vu ? v : u; + v = swap_vu ? uu : v; + + double vbyu = v / u; + double q1, q2; + + // General values of v/u. Use a look-up table and series expansion. + + { + double val = vbyu > 0.0625 ? 
vbyu : 0.063; + int index = convert_int(fma(256.0, val, 0.5)); + double2 tv = USE_TABLE(atan_jby256_tbl, (index - 16)); + q1 = tv.s0; + q2 = tv.s1; + double c = (double)index * 0x1.0p-8; + + // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1 + // u_exponent could be EMAX so we have to do it in 2 steps + int m = -((int)(as_ulong(u) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64); + double um = ldexp(u, m); + double vm = ldexp(v, m); + + // 26 leading bits of u + double u1 = as_double(as_ulong(um) & 0xfffffffff8000000UL); + double u2 = um - u1; + + double r = MATH_DIVIDE(fma(-c, u2, fma(-c, u1, vm)), fma(c, vm, um)); + + // Polynomial approximation to atan(r) + double s = r * r; + q2 = q2 + fma((s * fma(-s, 0.19999918038989143496, 0.33333333333224095522)), -r, r); + } + + + double q3, q4; + { + q3 = 0.0; + q4 = vbyu; + } + + double q5, q6; + { + double u1 = as_double(as_ulong(u) & 0xffffffff00000000UL); + double u2 = u - u1; + double vu1 = as_double(as_ulong(vbyu) & 0xffffffff00000000UL); + double vu2 = vbyu - vu1; + + q5 = 0.0; + double s = vbyu * vbyu; + q6 = vbyu + fma(-vbyu * s, + fma(-s, + fma(-s, + fma(-s, + fma(-s, 0.90029810285449784439E-01, + 0.11110736283514525407), + 0.14285713561807169030), + 0.19999999999393223405), + 0.33333333333333170500), + MATH_DIVIDE(fma(-u, vu2, fma(-u2, vu1, fma(-u1, vu1, v))), u)); + } + + + q3 = vbyu < 0x1.d12ed0af1a27fp-27 ? q3 : q5; + q4 = vbyu < 0x1.d12ed0af1a27fp-27 ? q4 : q6; + + q1 = vbyu > 0.0625 ? q1 : q3; + q2 = vbyu > 0.0625 ? q2 : q4; + + // Tidy-up according to which quadrant the arguments lie in + double res1, res2, res3, res4; + q1 = swap_vu ? piby2_head - q1 : q1; + q2 = swap_vu ? piby2_tail - q2 : q2; + q1 = xneg ? pi_head - q1 : q1; + q2 = xneg ? pi_tail - q2 : q2; + q1 = MATH_DIVIDE(q1 + q2, pi); + res4 = yneg ? -q1 : q1; + + res1 = yneg ? -0.75 : 0.75; + res2 = yneg ? -0.25 : 0.25; + res3 = xneg ? res1 : res2; + + res3 = isinf(y2) & isinf(x2) ? res3 : res4; + res1 = yneg ? 
-1.0 : 1.0; + + // abs(x)/abs(y) > 2^56 and x < 0 + res3 = (diffexp < -56 && xneg) ? res1 : res3; + + res4 = MATH_DIVIDE(MATH_DIVIDE(y, x), pi); + // x positive and dominant over y by a factor of 2^28 + res3 = diffexp < -28 & xneg == 0 ? res4 : res3; + + // abs(y)/abs(x) > 2^56 + res4 = yneg ? -0.5 : 0.5; // atan(y/x)/pi is insignificant compared to 1/2 + res3 = diffexp > 56 ? res4 : res3; + + res3 = x2 == 0.0 ? res4 : res3; // Zero x gives +- 1/2 depending on sign of y + res4 = xneg ? res1 : y2; + + res3 = y2 == 0.0 ? res4 : res3; // Zero y gives +-0 for positive x and +-1 for negative x + res3 = isnan(y2) ? y2 : res3; + res3 = isnan(x2) ? x2 : res3; + + return res3; +} + + +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atan2pi, double, double) + +#endif diff --git a/libclc/generic/lib/math/atanh.cl b/libclc/generic/lib/math/atanh.cl new file mode 100644 index 0000000..4af2f45 --- /dev/null +++ b/libclc/generic/lib/math/atanh.cl @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#include "math.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float atanh(float x) { + uint ux = as_uint(x); + uint ax = ux & EXSIGNBIT_SP32; + uint xs = ux ^ ax; + + // |x| > 1 or NaN + float z = as_float(QNANBITPATT_SP32); + + // |x| == 1 + float t = as_float(xs | PINFBITPATT_SP32); + z = ax == 0x3f800000U ? t : z; + + // 1/2 <= |x| < 1 + t = as_float(ax); + t = MATH_DIVIDE(2.0f*t, 1.0f - t); + t = 0.5f * log1p(t); + t = as_float(xs | as_uint(t)); + z = ax < 0x3f800000U ? t : z; + + // |x| < 1/2 + t = x * x; + float a = mad(mad(0.92834212715e-2f, t, -0.28120347286e0f), t, 0.39453629046e0f); + float b = mad(mad(0.45281890445e0f, t, -0.15537744551e1f), t, 0.11836088638e1f); + float p = MATH_DIVIDE(a, b); + t = mad(x*t, p, x); + z = ax < 0x3f000000 ? t : z; + + // |x| < 2^-13 + z = ax < 0x39000000U ? x : z; + + return z; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, atanh, float) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double atanh(double x) { + double absx = fabs(x); + + double ret = absx == 1.0 ? as_double(PINFBITPATT_DP64) : as_double(QNANBITPATT_DP64); + + // |x| >= 0.5 + // Note that atanh(x) = 0.5 * ln((1+x)/(1-x)) + // For greater accuracy we use + // ln((1+x)/(1-x)) = ln(1 + 2x/(1-x)) = log1p(2x/(1-x)). + double r = 0.5 * log1p(2.0 * absx / (1.0 - absx)); + ret = absx < 1.0 ? r : ret; + + r = -ret; + ret = x < 0.0 ? 
r : ret; + + // Arguments up to 0.5 in magnitude are + // approximated by a [5,5] minimax polynomial + double t = x * x; + + double pn = fma(t, + fma(t, + fma(t, + fma(t, + fma(t, -0.10468158892753136958e-3, 0.28728638600548514553e-1), + -0.28180210961780814148e0), + 0.88468142536501647470e0), + -0.11028356797846341457e1), + 0.47482573589747356373e0); + + double pd = fma(t, + fma(t, + fma(t, + fma(t, + fma(t, -0.35861554370169537512e-1, 0.49561196555503101989e0), + -0.22608883748988489342e1), + 0.45414700626084508355e1), + -0.41631933639693546274e1), + 0.14244772076924206909e1); + + r = fma(x*t, pn/pd, x); + ret = absx < 0.5 ? r : ret; + + return ret; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atanh, double) + +#endif diff --git a/libclc/generic/lib/math/atanpi.cl b/libclc/generic/lib/math/atanpi.cl new file mode 100644 index 0000000..2e2f032 --- /dev/null +++ b/libclc/generic/lib/math/atanpi.cl @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#include "math.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float atanpi(float x) { + const float pi = 3.1415926535897932f; + + uint ux = as_uint(x); + uint aux = ux & EXSIGNBIT_SP32; + uint sx = ux ^ aux; + + float xbypi = MATH_DIVIDE(x, pi); + float shalf = as_float(sx | as_uint(0.5f)); + + float v = as_float(aux); + + // Return for NaN + float ret = x; + + // 2^26 <= |x| <= Inf => atan(x) is close to piby2 + ret = aux <= PINFBITPATT_SP32 ? shalf : ret; + + // Reduce arguments 2^-19 <= |x| < 2^26 + + // 39/16 <= x < 2^26 + x = -MATH_RECIP(v); + float c = 1.57079632679489655800f; // atan(infinity) + + // 19/16 <= x < 39/16 + int l = aux < 0x401c0000; + float xx = MATH_DIVIDE(v - 1.5f, mad(v, 1.5f, 1.0f)); + x = l ? xx : x; + c = l ? 9.82793723247329054082e-01f : c; // atan(1.5) + + // 11/16 <= x < 19/16 + l = aux < 0x3f980000U; + xx = MATH_DIVIDE(v - 1.0f, 1.0f + v); + x = l ? xx : x; + c = l ? 7.85398163397448278999e-01f : c; // atan(1) + + // 7/16 <= x < 11/16 + l = aux < 0x3f300000; + xx = MATH_DIVIDE(mad(v, 2.0f, -1.0f), 2.0f + v); + x = l ? xx : x; + c = l ? 4.63647609000806093515e-01f : c; // atan(0.5) + + // 2^-19 <= x < 7/16 + l = aux < 0x3ee00000; + x = l ? v : x; + c = l ? 
0.0f : c; + + // Core approximation: Remez(2,2) on [-7/16,7/16] + + float s = x * x; + float a = mad(s, + mad(s, 0.470677934286149214138357545549e-2f, 0.192324546402108583211697690500f), + 0.296528598819239217902158651186f); + + float b = mad(s, + mad(s, 0.299309699959659728404442796915f, 0.111072499995399550138837673349e1f), + 0.889585796862432286486651434570f); + + float q = x * s * MATH_DIVIDE(a, b); + + float z = c - (q - x); + z = MATH_DIVIDE(z, pi); + float zs = as_float(sx | as_uint(z)); + + ret = aux < 0x4c800000 ? zs : ret; + + // |x| < 2^-19 + ret = aux < 0x36000000 ? xbypi : ret; + return ret; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, atanpi, float) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double atanpi(double x) { + const double pi = 0x1.921fb54442d18p+1; + + double v = fabs(x); + + // 2^56 > v > 39/16 + double a = -1.0; + double b = v; + // (chi + clo) = arctan(infinity) + double chi = 1.57079632679489655800e+00; + double clo = 6.12323399573676480327e-17; + + double ta = v - 1.5; + double tb = 1.0 + 1.5 * v; + int l = v <= 0x1.38p+1; // 39/16 > v > 19/16 + a = l ? ta : a; + b = l ? tb : b; + // (chi + clo) = arctan(1.5) + chi = l ? 9.82793723247329054082e-01 : chi; + clo = l ? 1.39033110312309953701e-17 : clo; + + ta = v - 1.0; + tb = 1.0 + v; + l = v <= 0x1.3p+0; // 19/16 > v > 11/16 + a = l ? ta : a; + b = l ? tb : b; + // (chi + clo) = arctan(1.) + chi = l ? 7.85398163397448278999e-01 : chi; + clo = l ? 3.06161699786838240164e-17 : clo; + + ta = 2.0 * v - 1.0; + tb = 2.0 + v; + l = v <= 0x1.6p-1; // 11/16 > v > 7/16 + a = l ? ta : a; + b = l ? tb : b; + // (chi + clo) = arctan(0.5) + chi = l ? 4.63647609000806093515e-01 : chi; + clo = l ? 2.26987774529616809294e-17 : clo; + + l = v <= 0x1.cp-2; // v < 7/16 + a = l ? v : a; + b = l ? 1.0 : b;; + chi = l ? 0.0 : chi; + clo = l ? 
0.0 : clo; + + // Core approximation: Remez(4,4) on [-7/16,7/16] + double r = a / b; + double s = r * r; + double qn = fma(s, + fma(s, + fma(s, + fma(s, 0.142316903342317766e-3, + 0.304455919504853031e-1), + 0.220638780716667420e0), + 0.447677206805497472e0), + 0.268297920532545909e0); + + double qd = fma(s, + fma(s, + fma(s, + fma(s, 0.389525873944742195e-1, + 0.424602594203847109e0), + 0.141254259931958921e1), + 0.182596787737507063e1), + 0.804893761597637733e0); + + double q = r * s * qn / qd; + r = (chi - ((q - clo) - r)) / pi; + double vp = v / pi; + + double z = isnan(x) ? x : 0.5; + z = v <= 0x1.0p+56 ? r : z; + z = v < 0x1.0p-26 ? vp : z; + return x == v ? z : -z; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atanpi, double) + +#endif diff --git a/libclc/generic/lib/math/binary_impl.inc b/libclc/generic/lib/math/binary_impl.inc new file mode 100644 index 0000000..c9bf972 --- /dev/null +++ b/libclc/generic/lib/math/binary_impl.inc @@ -0,0 +1,22 @@ + +#ifndef __CLC_SCALAR + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, __CLC_GENTYPE y) { + return FUNCTION_IMPL(x, y); +} + +#endif + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, float y) { + __CLC_GENTYPE vec_y = (__CLC_GENTYPE) (y); + return FUNCTION_IMPL(x, vec_y); +} + +#ifdef cl_khr_fp64 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, double y) { + __CLC_GENTYPE vec_y = (__CLC_GENTYPE) (y); + return FUNCTION_IMPL(x, vec_y); +} + +#endif diff --git a/libclc/generic/lib/math/clc_ldexp.cl b/libclc/generic/lib/math/clc_ldexp.cl new file mode 100644 index 0000000..61e34a5 --- /dev/null +++ b/libclc/generic/lib/math/clc_ldexp.cl @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> +#include "config.h" +#include "../clcmacro.h" +#include "math.h" + +_CLC_DEF _CLC_OVERLOAD float __clc_ldexp(float x, int n) { + + if (!__clc_fp32_subnormals_supported()) { + + // This treats subnormals as zeros + int i = as_int(x); + int e = (i >> 23) & 0xff; + int m = i & 0x007fffff; + int s = i & 0x80000000; + int v = add_sat(e, n); + v = clamp(v, 0, 0xff); + int mr = e == 0 | v == 0 | v == 0xff ? 0 : m; + int c = e == 0xff; + mr = c ? m : mr; + int er = c ? e : v; + er = e ? 
er : e; + return as_float( s | (er << 23) | mr ); + } + + /* supports denormal values */ + const int multiplier = 24; + float val_f; + uint val_ui; + uint sign; + int exponent; + val_ui = as_uint(x); + sign = val_ui & 0x80000000; + val_ui = val_ui & 0x7fffffff;/* remove the sign bit */ + int val_x = val_ui; + + exponent = val_ui >> 23; /* get the exponent */ + int dexp = exponent; + + /* denormal support */ + int fbh = 127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0f)) >> 23); + int dexponent = 25 - fbh; + uint dval_ui = (( (val_ui << fbh) & 0x007fffff) | (dexponent << 23)); + int ex = dexponent + n - multiplier; + dexponent = ex; + uint val = sign | (ex << 23) | (dval_ui & 0x007fffff); + int ex1 = dexponent + multiplier; + ex1 = -ex1 +25; + dval_ui = (((dval_ui & 0x007fffff )| 0x800000) >> ex1); + dval_ui = dexponent > 0 ? val :dval_ui; + dval_ui = dexponent > 254 ? 0x7f800000 :dval_ui; /*overflow*/ + dval_ui = dexponent < -multiplier ? 0 : dval_ui; /*underflow*/ + dval_ui = dval_ui | sign; + val_f = as_float(dval_ui); + + exponent += n; + + val = sign | (exponent << 23) | (val_ui & 0x007fffff); + ex1 = exponent + multiplier; + ex1 = -ex1 +25; + val_ui = (((val_ui & 0x007fffff )| 0x800000) >> ex1); + val_ui = exponent > 0 ? val :val_ui; + val_ui = exponent > 254 ? 0x7f800000 :val_ui; /*overflow*/ + val_ui = exponent < -multiplier ? 0 : val_ui; /*underflow*/ + val_ui = val_ui | sign; + + val_ui = dexp == 0? dval_ui : val_ui; + val_f = as_float(val_ui); + + val_f = isnan(x) | isinf(x) | val_x == 0 ? x : val_f; + return val_f; +} + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_DEF _CLC_OVERLOAD double __clc_ldexp(double x, int n) { + long l = as_ulong(x); + int e = (l >> 52) & 0x7ff; + long s = l & 0x8000000000000000; + + ulong ux = as_ulong(x * 0x1.0p+53); + int de = ((int)(ux >> 52) & 0x7ff) - 53; + int c = e == 0; + e = c ? de: e; + + ux = c ? 
ux : l; + + int v = e + n; + v = clamp(v, -0x7ff, 0x7ff); + + ux &= ~EXPBITS_DP64; + + double mr = as_double(ux | ((ulong)(v+53) << 52)); + mr = mr * 0x1.0p-53; + + mr = v > 0 ? as_double(ux | ((ulong)v << 52)) : mr; + + mr = v == 0x7ff ? as_double(s | PINFBITPATT_DP64) : mr; + mr = v < -53 ? as_double(s) : mr; + + mr = ((n == 0) | isinf(x) | (x == 0) ) ? x : mr; + return mr; +} + +#endif diff --git a/libclc/generic/lib/math/clc_nextafter.cl b/libclc/generic/lib/math/clc_nextafter.cl new file mode 100644 index 0000000..e53837d --- /dev/null +++ b/libclc/generic/lib/math/clc_nextafter.cl @@ -0,0 +1,43 @@ +#include <clc/clc.h> +#include "../clcmacro.h" + +// This file provides OpenCL C implementations of nextafter for targets that +// don't support the clang builtin. + +#define FLT_NAN 0.0f/0.0f + +#define NEXTAFTER(FLOAT_TYPE, UINT_TYPE, NAN, ZERO, NEXTAFTER_ZERO) \ +_CLC_OVERLOAD _CLC_DEF FLOAT_TYPE __clc_nextafter(FLOAT_TYPE x, FLOAT_TYPE y) { \ + union { \ + FLOAT_TYPE f; \ + UINT_TYPE i; \ + } next; \ + if (isnan(x) || isnan(y)) { \ + return NAN; \ + } \ + if (x == y) { \ + return y; \ + } \ + next.f = x; \ + if (x < y) { \ + next.i++; \ + } else { \ + if (next.f == ZERO) { \ + next.i = NEXTAFTER_ZERO; \ + } else { \ + next.i--; \ + } \ + } \ + return next.f; \ +} + +NEXTAFTER(float, uint, FLT_NAN, 0.0f, 0x80000001) +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_nextafter, float, float) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#define DBL_NAN 0.0/0.0 + +NEXTAFTER(double, ulong, DBL_NAN, 0.0, 0x8000000000000001) +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __clc_nextafter, double, double) +#endif diff --git a/libclc/generic/lib/math/clc_sqrt.cl b/libclc/generic/lib/math/clc_sqrt.cl new file mode 100644 index 0000000..86a874d --- /dev/null +++ b/libclc/generic/lib/math/clc_sqrt.cl @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +// Map the llvm sqrt intrinsic to an OpenCL function. +#define __CLC_FUNCTION __clc_llvm_intr_sqrt +#define __CLC_INTRINSIC "llvm.sqrt" +#include <clc/math/unary_intrin.inc> +#undef __CLC_FUNCTION +#undef __CLC_INTRINSIC + +#define __CLC_BODY <clc_sqrt_impl.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/clc_sqrt_impl.inc b/libclc/generic/lib/math/clc_sqrt_impl.inc new file mode 100644 index 0000000..e97b540 --- /dev/null +++ b/libclc/generic/lib/math/clc_sqrt_impl.inc @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Pick the NaN and zero constants matching the width of __CLC_GENTYPE.
 * __CLC_FPSIZE and __CLC_GENTYPE are supplied by the including file. */
#if __CLC_FPSIZE == 32
#define __CLC_NAN NAN
#define ZERO 0.0f
#elif __CLC_FPSIZE == 64
#define __CLC_NAN __builtin_nan("")
#define ZERO 0.0
#else
#error "Invalid value for __CLC_FPSIZE"
#endif

/*
 * Square root with an explicit domain check: returns NaN when val < 0 and
 * otherwise forwards to __clc_llvm_intr_sqrt. -0.0 does not compare less
 * than zero, so it is forwarded as well.
 */
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sqrt(__CLC_GENTYPE val) {
  return val < ZERO ? __CLC_NAN : __clc_llvm_intr_sqrt(val);
}

/* Do not leak the helper macros into the next instantiation. */
#undef __CLC_NAN
#undef ZERO
diff --git a/libclc/generic/lib/math/copysign.cl b/libclc/generic/lib/math/copysign.cl
new file mode 100644
index 0000000..4e0c51b
--- /dev/null
+++ b/libclc/generic/lib/math/copysign.cl
@@ -0,0 +1,12 @@
#include <clc/clc.h>
#include "../clcmacro.h"

/* copysign for float, instantiated from __builtin_copysignf via the
 * _CLC_DEFINE_BINARY_BUILTIN macro (defined outside this file). */
_CLC_DEFINE_BINARY_BUILTIN(float, copysign, __builtin_copysignf, float, float)

#ifdef cl_khr_fp64

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

/* copysign for double, instantiated from __builtin_copysign. */
_CLC_DEFINE_BINARY_BUILTIN(double, copysign, __builtin_copysign, double, double)

#endif
diff --git a/libclc/generic/lib/math/cos.cl b/libclc/generic/lib/math/cos.cl
new file mode 100644
index 0000000..157447f
--- /dev/null
+++ b/libclc/generic/lib/math/cos.cl
@@ -0,0 +1,77 @@
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
+ */ + +#include <clc/clc.h> + +#include "math.h" +#include "sincos_helpers.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float cos(float x) +{ + int ix = as_int(x); + int ax = ix & 0x7fffffff; + float dx = as_float(ax); + + float r0, r1; + int regn = __clc_argReductionS(&r0, &r1, dx); + + float ss = -__clc_sinf_piby4(r0, r1); + float cc = __clc_cosf_piby4(r0, r1); + + float c = (regn & 1) != 0 ? ss : cc; + c = as_float(as_int(c) ^ ((regn > 1) << 31)); + + c = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : c; + + return c; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cos, float); + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double cos(double x) { + x = fabs(x); + + double r, rr; + int regn; + + if (x < 0x1.0p+47) + __clc_remainder_piby2_medium(x, &r, &rr, ®n); + else + __clc_remainder_piby2_large(x, &r, &rr, ®n); + + double2 sc = __clc_sincos_piby4(r, rr); + sc.lo = -sc.lo; + + int2 c = as_int2(regn & 1 ? sc.lo : sc.hi); + c.hi ^= (regn > 1) << 31; + + return isnan(x) | isinf(x) ? as_double(QNANBITPATT_DP64) : as_double(c); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cos, double); + +#endif diff --git a/libclc/generic/lib/math/cospi.cl b/libclc/generic/lib/math/cospi.cl new file mode 100644 index 0000000..108b637 --- /dev/null +++ b/libclc/generic/lib/math/cospi.cl @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. 
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include <clc/clc.h>

#include "math.h"
#include "sincos_helpers.h"
#include "sincospiF_piby4.h"
#include "../clcmacro.h"
#ifdef cl_khr_fp64
#include "sincosD_piby4.h"
#endif

/*
 * Single-precision cospi: returns cos(pi * x).
 *
 * Works on |x| (the sign bit is masked off). |x| is split into an integer
 * part iax and a fraction r in [0, 1). The parity of iax gives the base sign
 * (xodd); the quarter of [0, 1) that r falls into decides the reduced
 * argument `a`, whether the sign is flipped (s), and which component of the
 * helper's result is used (e).
 *
 * The result is assembled as an integer bit pattern and selected by
 * magnitude, so the order of the selects below matters:
 *   Inf / NaN            -> quiet NaN (0x7fc00000)
 *   2^24 <= |x| < Inf    -> 1.0
 *   2^23 <= |x| < 2^24   -> +/-1.0 depending on parity
 *   |x| < 2^23           -> kernel result
 */
_CLC_OVERLOAD _CLC_DEF float cospi(float x)
{
    int ix = as_int(x) & 0x7fffffff;
    float ax = as_float(ix);
    int iax = (int)ax;
    float r = ax - iax;
    int xodd = iax & 0x1 ? 0x80000000 : 0;   // sign-bit mask: set when iax is odd

    // Initialize with return for +-Inf and NaN
    int ir = 0x7fc00000;

    // 2^24 <= |x| < Inf, the result is always even integer
    ir = ix < 0x7f800000 ? 0x3f800000 : ir;

    // 2^23 <= |x| < 2^24, the result is always integer
    ir = ix < 0x4b800000 ? xodd | 0x3f800000 : ir;

    // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval

    // r < 1.0
    float a = 1.0f - r;
    int e = 1;
    int s = xodd ^ 0x80000000;

    // r <= 0.75
    int c = r <= 0.75f;
    a = c ? r - 0.5f : a;
    e = c ? 0 : e;

    // r < 0.5
    c = r < 0.5f;
    a = c ? 0.5f - r : a;
    s = c ? xodd : s;

    // r <= 0.25
    c = r <= 0.25f;
    a = c ? r : a;
    e = c ? 1 : e;

    // presumably t.hi is the cosine and t.lo the sine of the argument --
    // TODO confirm against sincospiF_piby4.h
    float2 t = __libclc__sincosf_piby4(a * M_PI_F);
    int jr = s ^ as_int(e ? t.hi : t.lo);

    ir = ix < 0x4b000000 ? jr : ir;   // 0x4b000000 is 2^23

    return as_float(ir);
}


_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cospi, float);

#ifdef cl_khr_fp64

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

/*
 * Double-precision cospi: returns cos(pi * x).
 *
 * Same scheme as the float version with 64-bit patterns:
 *   Inf / NaN            -> quiet NaN (0x7ff8000000000000)
 *   2^53 <= |x| < Inf    -> 1.0
 *   2^52 <= |x| < 2^53   -> +/-1.0 depending on parity
 *   |x| < 2^52           -> kernel result
 */
_CLC_OVERLOAD _CLC_DEF double cospi(double x) {

    long ix = as_long(x) & 0x7fffffffffffffffL;
    double ax = as_double(ix);
    long iax = (long)ax;
    double r = ax - (double)iax;
    long xodd = iax & 0x1L ? 0x8000000000000000L : 0L;   // sign-bit mask: set when iax is odd

    // Initialize with return for +-Inf and NaN
    long ir = 0x7ff8000000000000L;

    // 2^53 <= |x| < Inf, the result is always even integer
    ir = ix < 0x7ff0000000000000 ? 0x3ff0000000000000L : ir;

    // 2^52 <= |x| < 2^53, the result is always integer
    ir = ax < 0x1.0p+53 ? xodd | 0x3ff0000000000000L : ir;

    // 0x1.0p-7 <= |x| < 2^52, result depends on which 0.25 interval

    // r < 1.0
    double a = 1.0 - r;
    int e = 1;
    long s = xodd ^ 0x8000000000000000L;

    // r <= 0.75
    int c = r <= 0.75;
    double t = r - 0.5;
    a = c ? t : a;
    e = c ? 0 : e;

    // r < 0.5
    c = r < 0.5;
    t = 0.5 - r;
    a = c ? t : a;
    s = c ? xodd : s;

    // r <= 0.25
    c = r <= 0.25;
    a = c ? r : a;
    e = c ? 1 : e;

    // presumably sc.hi is the cosine and sc.lo the sine of the argument --
    // TODO confirm against sincosD_piby4.h
    double2 sc = __libclc__sincos_piby4(a * M_PI, 0.0);
    long jr = s ^ as_long(e ? sc.hi : sc.lo);

    ir = ax < 0x1.0p+52 ? jr : ir;

    return as_double(ir);
}
_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cospi, double);
#endif
diff --git a/libclc/generic/lib/math/ep_log.cl b/libclc/generic/lib/math/ep_log.cl
new file mode 100644
index 0000000..3c2c62c
--- /dev/null
+++ b/libclc/generic/lib/math/ep_log.cl
@@ -0,0 +1,94 @@
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#ifdef cl_khr_fp64

#include <clc/clc.h>
#include "ep_log.h"
#include "math.h"
#include "tables.h"

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

/* Polynomial coefficients used when x is close to 1 (near_one path). */
#define LN0 8.33333333333317923934e-02
#define LN1 1.25000000037717509602e-02
#define LN2 2.23213998791944806202e-03
#define LN3 4.34887777707614552256e-04

/* Polynomial coefficients used on the table-driven path. */
#define LF0 8.33333333333333593622e-02
#define LF1 1.24999999978138668903e-02
#define LF2 2.23219810758559851206e-03

/*
 * Extended-precision building block for log(x).
 *
 * Outputs (all written through pointers):
 *   *xexp  exponent extracted from x, or 0 when x is close to 1
 *   *r1    leading part of the result: the table entry's first component,
 *          or x - 1 when x is close to 1
 *   *r2    trailing part: polynomial correction plus the table entry's
 *          second component
 * How the caller combines the three values is not visible here; presumably
 * log(x) = xexp * log(2) + r1 + r2 -- TODO confirm against callers.
 *
 * No handling of x <= 0, Inf or NaN is visible in this function.
 */
_CLC_DEF void __clc_ep_log(double x, int *xexp, double *r1, double *r2)
{
    // Computes natural log(x). Algorithm based on:
    // Ping-Tak Peter Tang
    // "Table-driven implementation of the logarithm function in IEEE
    // floating-point arithmetic"
    // ACM Transactions on Mathematical Software (TOMS)
    // Volume 16, Issue 4 (December 1990)
    int near_one = x >= 0x1.e0faap-1 & x <= 0x1.1082cp+0;

    ulong ux = as_ulong(x);
    // Subnormal input (below the implicit-bit threshold): OR in an exponent
    // of 2^-962 and subtract 2^-962, which rescales x by 2^60; the exponent
    // is compensated through expadjust.
    ulong uxs = as_ulong(as_double(0x03d0000000000000UL | ux) - 0x1.0p-962);
    int c = ux < IMPBIT_DP64;
    ux = c ? uxs : ux;
    int expadjust = c ? 60 : 0;

    // Store the exponent of x in xexp and put f into the range [0.5,1)
    int xexp1 = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64 - expadjust;
    double f = as_double(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64));
    *xexp = near_one ? 0 : xexp1;

    // near-one path: u1 = 2r / (2 + r), with ru1 = -r * r / (2 + r)
    double r = x - 1.0;
    double u1 = MATH_DIVIDE(r, 2.0 + r);
    double ru1 = -r * u1;
    u1 = u1 + u1;

    // table index: leading mantissa bits of f, rounded, in units of 2^-7
    int index = as_int2(ux).hi >> 13;
    index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1);

    double f1 = index * 0x1.0p-7;
    double f2 = f - f1;
    double u2 = MATH_DIVIDE(f2, fma(0.5, f2, f1));

    double2 tv = USE_TABLE(ln_tbl, (index - 64));
    double z1 = tv.s0;
    double q = tv.s1;

    z1 = near_one ? r : z1;
    q = near_one ? 0.0 : q;
    double u = near_one ? u1 : u2;
    double v = u*u;

    double cc = near_one ? ru1 : u2;

    double z21 = fma(v, fma(v, fma(v, LN3, LN2), LN1), LN0);
    double z22 = fma(v, fma(v, LF2, LF1), LF0);
    double z2 = near_one ? z21 : z22;
    z2 = fma(u*v, z2, cc) + q;

    *r1 = z1;
    *r2 = z2;
}

#endif
diff --git a/libclc/generic/lib/math/ep_log.h b/libclc/generic/lib/math/ep_log.h
new file mode 100644
index 0000000..7f99ac6
--- /dev/null
+++ b/libclc/generic/lib/math/ep_log.h
@@ -0,0 +1,29 @@
/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_DECL void __clc_ep_log(double x, int *xexp, double *r1, double *r2); + +#endif diff --git a/libclc/generic/lib/math/erfc.cl b/libclc/generic/lib/math/erfc.cl new file mode 100644 index 0000000..c322f86 --- /dev/null +++ b/libclc/generic/lib/math/erfc.cl @@ -0,0 +1,413 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#include "math.h" +#include "../clcmacro.h" + +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. 
 * ====================================================
 */

#define erx_f 8.4506291151e-01f /* 0x3f58560b */

// Coefficients for approximation to erf on [0, 0.84375]

#define efx 1.2837916613e-01f /* 0x3e0375d4 */
#define efx8 1.0270333290e+00f /* 0x3f8375d4 */

#define pp0 1.2837916613e-01f /* 0x3e0375d4 */
#define pp1 -3.2504209876e-01f /* 0xbea66beb */
#define pp2 -2.8481749818e-02f /* 0xbce9528f */
#define pp3 -5.7702702470e-03f /* 0xbbbd1489 */
#define pp4 -2.3763017452e-05f /* 0xb7c756b1 */
#define qq1 3.9791721106e-01f /* 0x3ecbbbce */
#define qq2 6.5022252500e-02f /* 0x3d852a63 */
#define qq3 5.0813062117e-03f /* 0x3ba68116 */
#define qq4 1.3249473704e-04f /* 0x390aee49 */
#define qq5 -3.9602282413e-06f /* 0xb684e21a */

// Coefficients for approximation to erf in [0.84375, 1.25]

#define pa0 -2.3621185683e-03f /* 0xbb1acdc6 */
#define pa1 4.1485610604e-01f /* 0x3ed46805 */
#define pa2 -3.7220788002e-01f /* 0xbebe9208 */
#define pa3 3.1834661961e-01f /* 0x3ea2fe54 */
#define pa4 -1.1089469492e-01f /* 0xbde31cc2 */
#define pa5 3.5478305072e-02f /* 0x3d1151b3 */
#define pa6 -2.1663755178e-03f /* 0xbb0df9c0 */
#define qa1 1.0642088205e-01f /* 0x3dd9f331 */
#define qa2 5.4039794207e-01f /* 0x3f0a5785 */
#define qa3 7.1828655899e-02f /* 0x3d931ae7 */
#define qa4 1.2617121637e-01f /* 0x3e013307 */
#define qa5 1.3637083583e-02f /* 0x3c5f6e13 */
#define qa6 1.1984500103e-02f /* 0x3c445aa3 */

// Coefficients for approximation to erfc in [1.25, 1/0.35]

#define ra0 -9.8649440333e-03f /* 0xbc21a093 */
#define ra1 -6.9385856390e-01f /* 0xbf31a0b7 */
#define ra2 -1.0558626175e+01f /* 0xc128f022 */
#define ra3 -6.2375331879e+01f /* 0xc2798057 */
#define ra4 -1.6239666748e+02f /* 0xc322658c */
#define ra5 -1.8460508728e+02f /* 0xc3389ae7 */
#define ra6 -8.1287437439e+01f /* 0xc2a2932b */
#define ra7 -9.8143291473e+00f /* 0xc11d077e */
#define sa1 1.9651271820e+01f /* 0x419d35ce */
#define sa2 1.3765776062e+02f /* 0x4309a863 */
#define sa3 4.3456588745e+02f /* 0x43d9486f */
#define sa4 6.4538726807e+02f /* 0x442158c9 */
#define sa5 4.2900814819e+02f /* 0x43d6810b */
#define sa6 1.0863500214e+02f /* 0x42d9451f */
#define sa7 6.5702495575e+00f /* 0x40d23f7c */
#define sa8 -6.0424413532e-02f /* 0xbd777f97 */

// Coefficients for approximation to erfc in [1/0.35, 28]

#define rb0 -9.8649431020e-03f /* 0xbc21a092 */
#define rb1 -7.9928326607e-01f /* 0xbf4c9dd4 */
#define rb2 -1.7757955551e+01f /* 0xc18e104b */
#define rb3 -1.6063638306e+02f /* 0xc320a2ea */
#define rb4 -6.3756646729e+02f /* 0xc41f6441 */
#define rb5 -1.0250950928e+03f /* 0xc480230b */
#define rb6 -4.8351919556e+02f /* 0xc3f1c275 */
#define sb1 3.0338060379e+01f /* 0x41f2b459 */
#define sb2 3.2579251099e+02f /* 0x43a2e571 */
#define sb3 1.5367296143e+03f /* 0x44c01759 */
#define sb4 3.1998581543e+03f /* 0x4547fdbb */
#define sb5 2.5530502930e+03f /* 0x451f90ce */
#define sb6 4.7452853394e+02f /* 0x43ed43a7 */
#define sb7 -2.2440952301e+01f /* 0xc1b38712 */

/*
 * Single-precision complementary error function, erfc(x) = 1 - erf(x).
 *
 * |x| is classified into four ranges (split at 0.84375, 1.25 and
 * 0x1.6db6dap+1, i.e. about 1/0.35) and a rational function u / v with the
 * matching coefficient set (pp/qq, pa/qa, ra/sa, rb/sb) is evaluated in a
 * range-dependent argument t. All four rationals are evaluated and the
 * applicable one is picked with selects, so the statement order matters.
 *
 * Results as selected at the end of the function:
 *   NaN                     -> x
 *   x < -6                  -> 2
 *   |x| >= 28 (and x >= 0)  -> 0
 *   |x| < 0.84375           -> 0.5 - (x*q + x - 0.5), i.e. 1 - x - x*q
 *   0.84375 <= |x| < 1.25   -> 1 - erx_f - q   (x >= 0),  1 + erx_f + q  (x < 0)
 *   1.25 <= |x| < 28        -> r = exp(-z*z - 0.5625) * exp((z-|x|)*(z+|x|) + q) / |x|
 *                              (x >= 0),  2 - r  (x < 0)
 */
_CLC_OVERLOAD _CLC_DEF float erfc(float x) {
    int hx = as_int(x);
    int ix = hx & 0x7fffffff;
    float absx = as_float(ix);

    // Argument for polys
    // t = 1/x^2 for |x| >= 1.25, |x| - 1 for 0.84375 <= |x| < 1.25, x^2 below
    float x2 = absx * absx;
    float t = 1.0f / x2;
    float tt = absx - 1.0f;
    t = absx < 1.25f ? tt : t;
    t = absx < 0.84375f ? x2 : t;

    // Evaluate polys
    float tu, tv, u, v;

    // default: 1/0.35 <= |x| coefficient set
    u = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, rb6, rb5), rb4), rb3), rb2), rb1), rb0);
    v = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, sb7, sb6), sb5), sb4), sb3), sb2), sb1);

    tu = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
    tv = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1);
    u = absx < 0x1.6db6dap+1f ? tu : u;
    v = absx < 0x1.6db6dap+1f ? tv : v;

    tu = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
    tv = mad(t, mad(t, mad(t, mad(t, mad(t, qa6, qa5), qa4), qa3), qa2), qa1);
    u = absx < 1.25f ? tu : u;
    v = absx < 1.25f ? tv : v;

    tu = mad(t, mad(t, mad(t, mad(t, pp4, pp3), pp2), pp1), pp0);
    tv = mad(t, mad(t, mad(t, mad(t, qq5, qq4), qq3), qq2), qq1);
    u = absx < 0.84375f ? tu : u;
    v = absx < 0.84375f ? tv : v;

    // denominator polynomials have an implicit constant term of 1
    v = mad(t, v, 1.0f);

    float q = MATH_DIVIDE(u, v);

    float ret = 0.0f;

    // z is |x| with the low 12 mantissa bits cleared, so that
    // -x*x = -z*z + (z-|x|)*(z+|x|) can be split across two exp() calls
    float z = as_float(ix & 0xfffff000);
    float r = exp(mad(-z, z, -0.5625f)) * exp(mad(z - absx, z + absx, q));
    r = MATH_DIVIDE(r, absx);
    t = 2.0f - r;
    r = x < 0.0f ? t : r;
    ret = absx < 28.0f ? r : ret;

    r = 1.0f - erx_f - q;
    t = erx_f + q + 1.0f;
    r = x < 0.0f ? t : r;
    ret = absx < 1.25f ? r : ret;

    r = 0.5f - mad(x, q, x - 0.5f);
    ret = absx < 0.84375f ? r : ret;

    ret = x < -6.0f ? 2.0f : ret;

    ret = isnan(x) ? x : ret;

    return ret;
}

_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, erfc, float);

#ifdef cl_khr_fp64

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

/*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunPro, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */

/* double erf(double x)
 * double erfc(double x)
 *                           x
 *                    2      |\
 *     erf(x)  =  ---------  | exp(-t*t)dt
 *                 sqrt(pi) \|
 *                           0
 *
 *     erfc(x) =  1-erf(x)
 *  Note that
 *              erf(-x) = -erf(x)
 *              erfc(-x) = 2 - erfc(x)
 *
 * Method:
 *      1. For |x| in [0, 0.84375]
 *          erf(x)  = x + x*R(x^2)
 *          erfc(x) = 1 - erf(x)           if x in [-.84375,0.25]
 *                  = 0.5 + ((0.5-x)-x*R)  if x in [0.25,0.84375]
 *         where R = P/Q where P is an odd poly of degree 8 and
 *         Q is an odd poly of degree 10.
 *                                               -57.90
 *                      | R - (erf(x)-x)/x | <= 2
 *
 *
 *         Remark. The formula is derived by noting
 *          erf(x) = (2/sqrt(pi))*(x - x^3/3 + x^5/10 - x^7/42 + ....)
 *         and that
 *          2/sqrt(pi) = 1.128379167095512573896158903121545171688
 *         is close to one. The interval is chosen because the fix
 *         point of erf(x) is near 0.6174 (i.e., erf(x)=x when x is
 *         near 0.6174), and by some experiment, 0.84375 is chosen to
 *         guarantee the error is less than one ulp for erf.
 *
 *      2. For |x| in [0.84375,1.25], let s = |x| - 1, and
 *         c = 0.84506291151 rounded to single (24 bits)
 *              erf(x)  = sign(x) * (c + P1(s)/Q1(s))
 *              erfc(x) = (1-c) - P1(s)/Q1(s) if x > 0
 *                        1+(c+P1(s)/Q1(s))   if x < 0
 *              |P1/Q1 - (erf(|x|)-c)| <= 2**-59.06
 *         Remark: here we use the taylor series expansion at x=1.
 *              erf(1+s) = erf(1) + s*Poly(s)
 *                       = 0.845.. + P1(s)/Q1(s)
 *         That is, we use rational approximation to approximate
 *                      erf(1+s) - (c = (single)0.84506291151)
 *         Note that |P1/Q1|< 0.078 for x in [0.84375,1.25]
 *         where
 *              P1(s) = degree 6 poly in s
 *              Q1(s) = degree 6 poly in s
 *
 *      3. For x in [1.25,1/0.35(~2.857143)],
 *              erfc(x) = (1/x)*exp(-x*x-0.5625+R1/S1)
 *              erf(x)  = 1 - erfc(x)
 *         where
 *              R1(z) = degree 7 poly in z, (z=1/x^2)
 *              S1(z) = degree 8 poly in z
 *
 *      4. For x in [1/0.35,28]
 *              erfc(x) = (1/x)*exp(-x*x-0.5625+R2/S2) if x > 0
 *                      = 2.0 - (1/x)*exp(-x*x-0.5625+R2/S2) if -6<x<0
 *                      = 2.0 - tiny (if x <= -6)
 *              erf(x)  = sign(x)*(1.0 - erfc(x)) if x < 6, else
 *              erf(x)  = sign(x)*(1.0 - tiny)
 *         where
 *              R2(z) = degree 6 poly in z, (z=1/x^2)
 *              S2(z) = degree 7 poly in z
 *
 *      Note1:
 *         To compute exp(-x*x-0.5625+R/S), let s be a single
 *         precision number and s := x; then
 *              -x*x = -s*s + (s-x)*(s+x)
 *              exp(-x*x-0.5625+R/S) =
 *                      exp(-s*s-0.5625)*exp((s-x)*(s+x)+R/S);
 *      Note2:
 *         Here 3 and 4 make use of the asymptotic series
 *                        exp(-x*x)
 *              erfc(x) ~ ---------- * ( 1 + Poly(1/x^2) )
 *                        x*sqrt(pi)
 *         We use rational approximation to approximate
 *              g(s)=f(1/x^2) = log(erfc(x)*x) - x*x + 0.5625
 *         Here is the error bound for R1/S1 and R2/S2
 *              |R1/S1 - f(x)| < 2**(-62.57)
 *              |R2/S2 - f(x)| < 2**(-61.52)
 *
 *      5. For inf > x >= 28
 *              erf(x)  = sign(x) *(1 - tiny) (raise inexact)
 *              erfc(x) = tiny*tiny (raise underflow) if x > 0
 *                      = 2 - tiny if x<0
 *
 *      7. 
Special case: + * erf(0) = 0, erf(inf) = 1, erf(-inf) = -1, + * erfc(0) = 1, erfc(inf) = 0, erfc(-inf) = 2, + * erfc/erf(NaN) is NaN + */ + +#define AU0 -9.86494292470009928597e-03 +#define AU1 -7.99283237680523006574e-01 +#define AU2 -1.77579549177547519889e+01 +#define AU3 -1.60636384855821916062e+02 +#define AU4 -6.37566443368389627722e+02 +#define AU5 -1.02509513161107724954e+03 +#define AU6 -4.83519191608651397019e+02 + +#define AV0 3.03380607434824582924e+01 +#define AV1 3.25792512996573918826e+02 +#define AV2 1.53672958608443695994e+03 +#define AV3 3.19985821950859553908e+03 +#define AV4 2.55305040643316442583e+03 +#define AV5 4.74528541206955367215e+02 +#define AV6 -2.24409524465858183362e+01 + +#define BU0 -9.86494403484714822705e-03 +#define BU1 -6.93858572707181764372e-01 +#define BU2 -1.05586262253232909814e+01 +#define BU3 -6.23753324503260060396e+01 +#define BU4 -1.62396669462573470355e+02 +#define BU5 -1.84605092906711035994e+02 +#define BU6 -8.12874355063065934246e+01 +#define BU7 -9.81432934416914548592e+00 + +#define BV0 1.96512716674392571292e+01 +#define BV1 1.37657754143519042600e+02 +#define BV2 4.34565877475229228821e+02 +#define BV3 6.45387271733267880336e+02 +#define BV4 4.29008140027567833386e+02 +#define BV5 1.08635005541779435134e+02 +#define BV6 6.57024977031928170135e+00 +#define BV7 -6.04244152148580987438e-02 + +#define CU0 -2.36211856075265944077e-03 +#define CU1 4.14856118683748331666e-01 +#define CU2 -3.72207876035701323847e-01 +#define CU3 3.18346619901161753674e-01 +#define CU4 -1.10894694282396677476e-01 +#define CU5 3.54783043256182359371e-02 +#define CU6 -2.16637559486879084300e-03 + +#define CV0 1.06420880400844228286e-01 +#define CV1 5.40397917702171048937e-01 +#define CV2 7.18286544141962662868e-02 +#define CV3 1.26171219808761642112e-01 +#define CV4 1.36370839120290507362e-02 +#define CV5 1.19844998467991074170e-02 + +#define DU0 1.28379167095512558561e-01 +#define DU1 -3.25042107247001499370e-01 +#define DU2 
-2.84817495755985104766e-02
#define DU3 -5.77027029648944159157e-03
#define DU4 -2.37630166566501626084e-05

#define DV0 3.97917223959155352819e-01
#define DV1 6.50222499887672944485e-02
#define DV2 5.08130628187576562776e-03
#define DV3 1.32494738004321644526e-04
#define DV4 -3.96022827877536812320e-06

/*
 * Double-precision complementary error function.
 *
 * The routine is branch-free: a rational approximation num/den is evaluated
 * for each of four |x| intervals and the matching one is picked with selects.
 * Coefficient sets (selected by the thresholds in the code below):
 *   D*: |x| <  0.84375
 *   C*: |x| <  1.25
 *   B*: |x| <  0x1.6db6dp+1   (about 1/0.35)
 *   A*: everything else
 * Out-of-range and NaN inputs are patched in at the very end.
 */
_CLC_OVERLOAD _CLC_DEF double erfc(double x) {
    const long xbits = as_long(x);
    const long abits = xbits & 0x7fffffffffffffffL;  // sign bit cleared
    const double mag = as_double(abits);             // |x|
    const int negative = xbits != abits;             // non-zero when the sign bit is set

    // Argument fed to the rational approximation:
    //   x*x for |x| < 0.84375, |x| - 1 for |x| < 1.25, 1/(x*x) otherwise.
    const double xsq = x * x;
    double arg = 1.0 / xsq;
    arg = mag < 1.25 ? mag - 1.0 : arg;
    arg = mag < 0.84375 ? xsq : arg;

    // Numerator / denominator polynomials for every interval (Horner form).
    // XXX Need to evaluate if we can grab the 14 coefficients from a
    // table faster than evaluating 3 pairs of polys
    const double num_a = fma(arg, fma(arg, fma(arg, fma(arg, fma(arg, fma(arg, AU6, AU5), AU4), AU3), AU2), AU1), AU0);
    const double den_a = fma(arg, fma(arg, fma(arg, fma(arg, fma(arg, fma(arg, AV6, AV5), AV4), AV3), AV2), AV1), AV0);

    const double num_b = fma(arg, fma(arg, fma(arg, fma(arg, fma(arg, fma(arg, fma(arg, BU7, BU6), BU5), BU4), BU3), BU2), BU1), BU0);
    const double den_b = fma(arg, fma(arg, fma(arg, fma(arg, fma(arg, fma(arg, fma(arg, BV7, BV6), BV5), BV4), BV3), BV2), BV1), BV0);

    const double num_c = fma(arg, fma(arg, fma(arg, fma(arg, fma(arg, fma(arg, CU6, CU5), CU4), CU3), CU2), CU1), CU0);
    const double den_c = fma(arg, fma(arg, fma(arg, fma(arg, fma(arg, CV5, CV4), CV3), CV2), CV1), CV0);

    const double num_d = fma(arg, fma(arg, fma(arg, fma(arg, DU4, DU3), DU2), DU1), DU0);
    const double den_d = fma(arg, fma(arg, fma(arg, fma(arg, DV4, DV3), DV2), DV1), DV0);

    // Later selects override earlier ones, so the smallest interval wins.
    double num = mag < 0x1.6db6dp+1 ? num_b : num_a;
    double den = mag < 0x1.6db6dp+1 ? den_b : den_a;
    num = mag < 1.25 ? num_c : num;
    den = mag < 1.25 ? den_c : den;
    num = mag < 0.84375 ? num_d : num;
    den = mag < 0.84375 ? den_d : den;

    // Every denominator has an implicit leading 1.
    den = fma(arg, den, 1.0);
    const double q = num / den;

    // |x| >= 1.25: exp(-x*x - 0.5625 + q) / |x|, with |x| split into a head
    // that keeps only the upper 32 bits of the representation and the
    // remainder, so that -x*x is formed as -head*head + (head-|x|)*(head+|x|).
    const double head = as_double(abits & 0xffffffff00000000UL);
    const double far = exp(-head * head - 0.5625) * exp((head - mag) * (head + mag) + q) / mag;
    double ret = negative ? 2.0 - far : far;

    // 0.84375 <= |x| < 1.25
    const double erx = 8.45062911510467529297e-01;
    const double mid = negative ? erx + q + 1.0 : 1.0 - erx - q;
    ret = mag < 1.25 ? mid : ret;

    // |x| < 0.84375: 1 - (x + x*q)
    const double near = fma(-x, q, 1.0 - x);
    ret = mag < 0.84375 ? near : ret;

    // Saturated tails, then NaN propagation (bit pattern above +Inf).
    ret = x >= 28.0 ? 0.0 : ret;
    ret = x <= -6.0 ? 2.0 : ret;
    ret = abits > 0x7ff0000000000000UL ? x : ret;

    return ret;
}

_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, erfc, double);

#endif

/* ---- libclc/generic/lib/math/exp.cl ---- */
/*
 * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
+ */ + +#include <clc/clc.h> + +#include "math.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float exp(float x) { + + // Reduce x + const float ln2HI = 0x1.62e300p-1f; + const float ln2LO = 0x1.2fefa2p-17f; + const float invln2 = 0x1.715476p+0f; + + float fhalF = x < 0.0f ? -0.5f : 0.5f; + int p = mad(x, invln2, fhalF); + float fp = (float)p; + float hi = mad(fp, -ln2HI, x); // t*ln2HI is exact here + float lo = -fp*ln2LO; + + // Evaluate poly + float t = hi + lo; + float tt = t*t; + float v = mad(tt, + -mad(tt, + mad(tt, + mad(tt, + mad(tt, 0x1.637698p-25f, -0x1.bbd41cp-20f), + 0x1.1566aap-14f), + -0x1.6c16c2p-9f), + 0x1.555556p-3f), + t); + + float y = 1.0f - (((-lo) - MATH_DIVIDE(t * v, 2.0f - v)) - hi); + + // Scale by 2^p + float r = as_float(as_int(y) + (p << 23)); + + const float ulim = 0x1.62e430p+6f; // ln(largest_normal) = 88.72283905206835305366 + const float llim = -0x1.5d589ep+6f; // ln(smallest_normal) = -87.33654475055310898657 + + r = x < llim ? 0.0f : r; + r = x < ulim ? r : as_float(0x7f800000); + return isnan(x) ? 
x : r; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, exp, float) + +#ifdef cl_khr_fp64 + +#include "exp_helper.h" + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double exp(double x) { + + const double X_MIN = -0x1.74910d52d3051p+9; // -1075*ln(2) + const double X_MAX = 0x1.62e42fefa39efp+9; // 1024*ln(2) + const double R_64_BY_LOG2 = 0x1.71547652b82fep+6; // 64/ln(2) + const double R_LOG2_BY_64_LD = 0x1.62e42fefa0000p-7; // head ln(2)/64 + const double R_LOG2_BY_64_TL = 0x1.cf79abc9e3b39p-46; // tail ln(2)/64 + + int n = convert_int(x * R_64_BY_LOG2); + double r = fma(-R_LOG2_BY_64_TL, (double)n, fma(-R_LOG2_BY_64_LD, (double)n, x)); + return __clc_exp_helper(x, X_MIN, X_MAX, r, n); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, exp, double) + +#endif diff --git a/libclc/generic/lib/math/exp10.cl b/libclc/generic/lib/math/exp10.cl new file mode 100644 index 0000000..c8039cb --- /dev/null +++ b/libclc/generic/lib/math/exp10.cl @@ -0,0 +1,8 @@ +#include <clc/clc.h> + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY <exp10.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/exp10.inc b/libclc/generic/lib/math/exp10.inc new file mode 100644 index 0000000..a592c19 --- /dev/null +++ b/libclc/generic/lib/math/exp10.inc @@ -0,0 +1,10 @@ +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE exp10(__CLC_GENTYPE val) { + // exp10(x) = exp2(x * log2(10)) +#if __CLC_FPSIZE == 32 + return exp2(val * log2(10.0f)); +#elif __CLC_FPSIZE == 64 + return exp2(val * log2(10.0)); +#else +#error unknown _CLC_FPSIZE +#endif +} diff --git a/libclc/generic/lib/math/exp2.cl b/libclc/generic/lib/math/exp2.cl new file mode 100644 index 0000000..1ddccbd --- /dev/null +++ b/libclc/generic/lib/math/exp2.cl @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#include "math.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float exp2(float x) { + + // Reduce x + const float ln2HI = 0x1.62e300p-1f; + const float ln2LO = 0x1.2fefa2p-17f; + + float t = rint(x); + int p = (int)t; + float tt = x - t; + float hi = tt * ln2HI; + float lo = tt * ln2LO; + + // Evaluate poly + t = hi + lo; + tt = t*t; + float v = mad(tt, + -mad(tt, + mad(tt, + mad(tt, + mad(tt, 0x1.637698p-25f, -0x1.bbd41cp-20f), + 0x1.1566aap-14f), + -0x1.6c16c2p-9f), + 0x1.555556p-3f), + t); + + float y = 1.0f - (((-lo) - MATH_DIVIDE(t * v, 2.0f - v)) - hi); + + // Scale by 2^p + float r = as_float(as_int(y) + (p << 23)); + + const float ulim = 128.0f; + const float llim = -126.0f; + + r = x < llim ? 0.0f : r; + r = x < ulim ? r : as_float(0x7f800000); + return isnan(x) ? 
x : r; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, exp2, float) + +#ifdef cl_khr_fp64 + +#include "exp_helper.h" + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double exp2(double x) { + const double R_LN2 = 0x1.62e42fefa39efp-1; // ln(2) + const double R_1_BY_64 = 1.0 / 64.0; + + int n = convert_int(x * 64.0); + double r = R_LN2 * fma(-R_1_BY_64, (double)n, x); + return __clc_exp_helper(x, -1074.0, 1024.0, r, n); +} + + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, exp2, double) + +#endif diff --git a/libclc/generic/lib/math/exp_helper.cl b/libclc/generic/lib/math/exp_helper.cl new file mode 100644 index 0000000..046f306 --- /dev/null +++ b/libclc/generic/lib/math/exp_helper.cl @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2014, 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include <clc/clc.h> + +#include "math.h" +#include "tables.h" + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_DEF double __clc_exp_helper(double x, double x_min, double x_max, double r, int n) { + + int j = n & 0x3f; + int m = n >> 6; + + // 6 term tail of Taylor expansion of e^r + double z2 = r * fma(r, + fma(r, + fma(r, + fma(r, + fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7), + 0x1.5555555555555p-5), + 0x1.5555555555555p-3), + 0x1.0000000000000p-1), + 1.0); + + double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j); + z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0; + + int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0)); + + int n1 = m >> 2; + int n2 = m-n1; + double z3= z2 * as_double(((long)n1 + 1023) << 52); + z3 *= as_double(((long)n2 + 1023) << 52); + + z2 = ldexp(z2, m); + z2 = small_value ? z3: z2; + + z2 = isnan(x) ? x : z2; + + z2 = x > x_max ? as_double(PINFBITPATT_DP64) : z2; + z2 = x < x_min ? 0.0 : z2; + + return z2; +} + +#endif // cl_khr_fp64 diff --git a/libclc/generic/lib/math/exp_helper.h b/libclc/generic/lib/math/exp_helper.h new file mode 100644 index 0000000..e6df2fd --- /dev/null +++ b/libclc/generic/lib/math/exp_helper.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2014, 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +_CLC_DECL double __clc_exp_helper(double x, double x_min, double x_max, double r, int n); + +#endif diff --git a/libclc/generic/lib/math/fmax.cl b/libclc/generic/lib/math/fmax.cl new file mode 100644 index 0000000..239da3d --- /dev/null +++ b/libclc/generic/lib/math/fmax.cl @@ -0,0 +1,16 @@ +#include <clc/clc.h> + +#include "../clcmacro.h" + +_CLC_DEFINE_BINARY_BUILTIN(float, fmax, __builtin_fmaxf, float, float); + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_DEFINE_BINARY_BUILTIN(double, fmax, __builtin_fmax, double, double); + +#endif + +#define __CLC_BODY <fmax.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/fmax.inc b/libclc/generic/lib/math/fmax.inc new file mode 100644 index 0000000..8315c5f --- /dev/null +++ b/libclc/generic/lib/math/fmax.inc @@ -0,0 +1,18 @@ + +#if !defined(__CLC_SCALAR) + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fmax(__CLC_GENTYPE x, float y) { + return fmax(x, (__CLC_GENTYPE)((__CLC_SCALAR_GENTYPE)y)); +} + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fmax(__CLC_GENTYPE x, double y) { + return fmax(x, (__CLC_GENTYPE)((__CLC_SCALAR_GENTYPE)y)); +} + +#endif // ifdef cl_khr_fp64 + +#endif // !defined(__CLC_SCALAR) diff --git a/libclc/generic/lib/math/fmin.cl b/libclc/generic/lib/math/fmin.cl new file mode 100644 index 0000000..28c7d01 --- /dev/null +++ 
b/libclc/generic/lib/math/fmin.cl @@ -0,0 +1,16 @@ +#include <clc/clc.h> + +#include "../clcmacro.h" + +_CLC_DEFINE_BINARY_BUILTIN(float, fmin, __builtin_fminf, float, float); + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_DEFINE_BINARY_BUILTIN(double, fmin, __builtin_fmin, double, double); + +#endif + +#define __CLC_BODY <fmin.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/fmin.inc b/libclc/generic/lib/math/fmin.inc new file mode 100644 index 0000000..d4b5ac2 --- /dev/null +++ b/libclc/generic/lib/math/fmin.inc @@ -0,0 +1,18 @@ + +#if !defined(__CLC_SCALAR) + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fmin(__CLC_GENTYPE x, float y) { + return fmin(x, (__CLC_GENTYPE)((__CLC_SCALAR_GENTYPE)y)); +} + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fmin(__CLC_GENTYPE x, double y) { + return fmin(x, (__CLC_GENTYPE)((__CLC_SCALAR_GENTYPE)y)); +} + +#endif // ifdef cl_khr_fp64 + +#endif // !defined(__CLC_SCALAR) diff --git a/libclc/generic/lib/math/fmod.cl b/libclc/generic/lib/math/fmod.cl new file mode 100644 index 0000000..f9a4e31 --- /dev/null +++ b/libclc/generic/lib/math/fmod.cl @@ -0,0 +1,12 @@ +#include <clc/clc.h> +#include "../clcmacro.h" + +_CLC_DEFINE_BINARY_BUILTIN(float, fmod, __builtin_fmodf, float, float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_DEFINE_BINARY_BUILTIN(double, fmod, __builtin_fmod, double, double) + +#endif diff --git a/libclc/generic/lib/math/fract.cl b/libclc/generic/lib/math/fract.cl new file mode 100644 index 0000000..b434071 --- /dev/null +++ b/libclc/generic/lib/math/fract.cl @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY <fract.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/fract.inc b/libclc/generic/lib/math/fract.inc new file mode 100644 index 0000000..8d2a4d7 --- /dev/null +++ b/libclc/generic/lib/math/fract.inc @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#if __CLC_FPSIZE == 32 +#define MIN_CONSTANT 0x1.fffffep-1f +#else +#define MIN_CONSTANT 0x1.fffffffffffffp-1 +#endif + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fract(__CLC_GENTYPE x, private __CLC_GENTYPE *iptr) { + *iptr = floor(x); + __CLC_GENTYPE r = fmin(x - *iptr, MIN_CONSTANT); + r = isinf(x) ? 0.0f : r; + r = isnan(x) ? 
x : r; + return r; +} + + +#define FRACT_DEF(addrspace) \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fract(__CLC_GENTYPE x, addrspace __CLC_GENTYPE *iptr) { \ + __CLC_GENTYPE private_iptr; \ + __CLC_GENTYPE ret = fract(x, &private_iptr); \ + *iptr = private_iptr; \ + return ret; \ + } + +FRACT_DEF(local); +FRACT_DEF(global); + +#undef MIN_CONSTANT diff --git a/libclc/generic/lib/math/frexp.cl b/libclc/generic/lib/math/frexp.cl new file mode 100644 index 0000000..acd5d93 --- /dev/null +++ b/libclc/generic/lib/math/frexp.cl @@ -0,0 +1,10 @@ +#include <clc/clc.h> + +#include "math.h" + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY <frexp.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/frexp.inc b/libclc/generic/lib/math/frexp.inc new file mode 100644 index 0000000..0f5ddea --- /dev/null +++ b/libclc/generic/lib/math/frexp.inc @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * Copyright (c) 2016 Aaron Watry + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/*
 * frexp(x, ep): split x into a mantissa with magnitude in [0.5, 1) and an
 * exponent stored through ep, so that x == mantissa * 2^(*ep).
 * Zero, Inf and NaN are returned unchanged with *ep == 0.
 *
 * The scalar and vector bodies follow the same steps:
 *   ai  magnitude bits of x
 *   d   x is a non-zero subnormal
 *   s   subnormal rescaled into the normal range (exact, no multiply)
 *   e   unbiased exponent of the result
 *   t   x is zero, Inf or NaN (exponent field all ones: e == 255 - 126 == 129
 *       for float, e == 2047 - 1022 == 1025 for double)
 */
#if __CLC_FPSIZE == 32
#ifdef __CLC_SCALAR
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(float x, private int *ep) {
    int i = as_int(x);
    int ai = i & 0x7fffffff;
    int d = ai > 0 & ai < 0x00800000;
    // scale subnormal by 2^26 without multiplying
    float s = as_float(ai | 0x0d800000) - 0x1.0p-100F;
    ai = d ? as_int(s) : ai;
    int e = (ai >> 23) - 126 - (d ? 26 : 0);
    int t = ai == 0 | e == 129;
    // keep sign and fraction, force the exponent field to that of 0.5
    i = (i & 0x80000000) | 0x3f000000 | (ai & 0x007fffff);
    *ep = t ? 0 : e;
    return t ? x : as_float(i);
}
#define __CLC_FREXP_VEC(width) \
_CLC_OVERLOAD _CLC_DEF float##width frexp(float##width x, private int##width *ep) { \
    int##width i = as_int##width(x); \
    int##width ai = i & 0x7fffffff; \
    int##width d = ai > 0 & ai < 0x00800000; \
    /* scale subnormal by 2^26 without multiplying */ \
    float##width s = as_float##width(ai | 0x0d800000) - 0x1.0p-100F; \
    ai = bitselect(ai, as_int##width(s), d); \
    int##width e = (ai >> 23) - 126 - bitselect((int##width)0, (int##width)26, d); \
    int##width t = ai == (int##width)0 | e == (int##width)129; \
    i = (i & (int##width)0x80000000) | (int##width)0x3f000000 | (ai & 0x007fffff); \
    *ep = bitselect(e, (int##width)0, t); \
    return bitselect(as_float##width(i), x, as_float##width(t)); \
}
__CLC_FREXP_VEC(2)
__CLC_FREXP_VEC(3)
__CLC_FREXP_VEC(4)
__CLC_FREXP_VEC(8)
__CLC_FREXP_VEC(16)
#undef __CLC_FREXP_VEC
#endif
#endif

#if __CLC_FPSIZE == 64
#ifdef __CLC_SCALAR
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(__CLC_GENTYPE x, private __CLC_INTN *ep) {
    long i = as_long(x);
    long ai = i & 0x7fffffffffffffffL;
    int d = ai > 0 & ai < 0x0010000000000000L;
    // scale subnormal by 2^54 without multiplying
    double s = as_double(ai | 0x0370000000000000L) - 0x1.0p-968;
    ai = d ? as_long(s) : ai;
    int e = (int)(ai >> 52) - 1022 - (d ? 54 : 0);
    int t = ai == 0 | e == 1025;
    // keep sign and fraction, force the exponent field to that of 0.5
    i = (i & 0x8000000000000000L) | 0x3fe0000000000000L | (ai & 0x000fffffffffffffL);
    *ep = t ? 0 : e;
    return t ? x : as_double(i);
}
#define __CLC_FREXP_VEC(width) \
_CLC_OVERLOAD _CLC_DEF double##width frexp(double##width x, private int##width *ep) { \
    long##width i = as_long##width(x); \
    long##width ai = i & 0x7fffffffffffffffL; \
    long##width d = ai > 0 & ai < 0x0010000000000000L; \
    /* scale subnormal by 2^54 without multiplying */ \
    double##width s = as_double##width(ai | 0x0370000000000000L) - 0x1.0p-968; \
    ai = bitselect(ai, as_long##width(s), d); \
    int##width e = convert_int##width(ai >> 52) - 1022 - bitselect((int##width)0, (int##width)54, convert_int##width(d)); \
    /* Inf/NaN sentinel for double is 1025 (2047 - 1022), same as the scalar */ \
    /* version above; 129 is the float value and must not be used here.      */ \
    int##width t = convert_int##width(ai == (long##width)0) | (e == (int##width)1025); \
    i = (i & (long##width)0x8000000000000000L) | (long##width)0x3fe0000000000000L | (ai & 0x000fffffffffffffL); \
    *ep = bitselect(e, (int##width)0, t); \
    return bitselect(as_double##width(i), x, as_double##width(convert_long##width(t))); \
}
__CLC_FREXP_VEC(2)
__CLC_FREXP_VEC(3)
__CLC_FREXP_VEC(4)
__CLC_FREXP_VEC(8)
__CLC_FREXP_VEC(16)
#undef __CLC_FREXP_VEC
#endif
#endif

/* local/global pointer variants: compute into a private temporary, then
 * copy the exponent out. */
#define __CLC_FREXP_DEF(addrspace) \
  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(__CLC_GENTYPE x, addrspace __CLC_INTN *iptr) { \
    __CLC_INTN private_iptr; \
    __CLC_GENTYPE ret = frexp(x, &private_iptr); \
    *iptr = private_iptr; \
    return ret; \
}

__CLC_FREXP_DEF(local);
__CLC_FREXP_DEF(global);

#undef __CLC_FREXP_DEF

/* ---- libclc/generic/lib/math/half_rsqrt.cl ---- */
/*
 * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#define __CLC_BODY <half_rsqrt.inc> +#define __FLOAT_ONLY +#include <clc/math/gentype.inc> +#undef __FLOAT_ONLY diff --git a/libclc/generic/lib/math/half_rsqrt.inc b/libclc/generic/lib/math/half_rsqrt.inc new file mode 100644 index 0000000..33ce6c2 --- /dev/null +++ b/libclc/generic/lib/math/half_rsqrt.inc @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE half_rsqrt(__CLC_GENTYPE val) { + return rsqrt(val); +} diff --git a/libclc/generic/lib/math/half_sqrt.cl b/libclc/generic/lib/math/half_sqrt.cl new file mode 100644 index 0000000..a02896a --- /dev/null +++ b/libclc/generic/lib/math/half_sqrt.cl @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#define __CLC_BODY <half_sqrt.inc> +#define __FLOAT_ONLY +#include <clc/math/gentype.inc> +#undef __FLOAT_ONLY diff --git a/libclc/generic/lib/math/half_sqrt.inc b/libclc/generic/lib/math/half_sqrt.inc new file mode 100644 index 0000000..bcdc6a9 --- /dev/null +++ b/libclc/generic/lib/math/half_sqrt.inc @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE half_sqrt(__CLC_GENTYPE val) {
+  return sqrt(val); // half_sqrt forwards directly to sqrt()
+}
diff --git a/libclc/generic/lib/math/hypot.cl b/libclc/generic/lib/math/hypot.cl
new file mode 100644
index 0000000..eca042c
--- /dev/null
+++ b/libclc/generic/lib/math/hypot.cl
@@ -0,0 +1,8 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <hypot.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/hypot.inc b/libclc/generic/lib/math/hypot.inc
new file mode 100644
index 0000000..036cee7
--- /dev/null
+++ b/libclc/generic/lib/math/hypot.inc
@@ -0,0 +1,19 @@
+// hypot(x, y) = sqrt(x^2 + y^2), evaluated without undue overflow or
+// underflow: x*x + y*y leaves the representable range long before the true
+// result does, so the larger magnitude is factored out first:
+//   hypot = vmax * sqrt(1 + (vmin / vmax)^2),  with vmin / vmax <= 1.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE hypot(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+  __CLC_GENTYPE ax = fabs(x);
+  __CLC_GENTYPE ay = fabs(y);
+  // Ordered comparisons (not fmax/fmin) so a NaN operand still propagates.
+  __CLC_GENTYPE vmax = ax < ay ? ay : ax;
+  __CLC_GENTYPE vmin = ax < ay ? ax : ay;
+  __CLC_GENTYPE ratio = vmin / vmax;
+  __CLC_GENTYPE result = vmax * sqrt(ratio * ratio + (__CLC_GENTYPE)1.0f);
+  // Both inputs zero: ratio is 0/0 = NaN, but the result is exactly 0.
+  result = vmax == (__CLC_GENTYPE)0.0f ? vmax : result;
+  // An infinite operand gives +inf even if the other operand is NaN.
+  result = isinf(ax) ? ax : result;
+  result = isinf(ay) ? ay : result;
+  return result;
+}
diff --git a/libclc/generic/lib/math/ldexp.cl b/libclc/generic/lib/math/ldexp.cl
new file mode 100644
index 0000000..9be3127
--- /dev/null
+++ b/libclc/generic/lib/math/ldexp.cl
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> +#include "config.h" +#include "../clcmacro.h" +#include "math.h" +#include "math/clc_ldexp.h" + +_CLC_DEFINE_BINARY_BUILTIN(float, ldexp, __clc_ldexp, float, int) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_DEFINE_BINARY_BUILTIN(double, ldexp, __clc_ldexp, double, int) + +#endif + +// This defines all the ldexp(GENTYPE, int) variants +#define __CLC_BODY <ldexp.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/ldexp.inc b/libclc/generic/lib/math/ldexp.inc new file mode 100644 index 0000000..6e28fbb --- /dev/null +++ b/libclc/generic/lib/math/ldexp.inc @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __CLC_SCALAR
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE ldexp(__CLC_GENTYPE x, int n) {
+  return ldexp(x, (__CLC_INTN)n); // cast the scalar exponent to __CLC_INTN and reuse that overload
+}
+
+#endif
diff --git a/libclc/generic/lib/math/log.cl b/libclc/generic/lib/math/log.cl
new file mode 100644
index 0000000..ec1faa1
--- /dev/null
+++ b/libclc/generic/lib/math/log.cl
@@ -0,0 +1,26 @@
+#include <clc/clc.h>
+#include "../clcmacro.h"
+
+/*
+ * log(x) = log2(x) * (1 / log2(e)); both overloads below rescale log2(x).
+ */
+
+_CLC_OVERLOAD _CLC_DEF float log(float x)
+{
+  return log2(x) * (1.0f / M_LOG2E_F); // natural log via single-precision log2
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, log, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double log(double x)
+{
+  return log2(x) * (1.0 / M_LOG2E); // natural log via double-precision log2
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log, double);
+
+#endif // cl_khr_fp64
diff --git a/libclc/generic/lib/math/log10.cl b/libclc/generic/lib/math/log10.cl
new file mode 100644
index 0000000..d65764a
--- /dev/null
+++ b/libclc/generic/lib/math/log10.cl
@@ -0,0 +1,8 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <log10.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/log10.inc b/libclc/generic/lib/math/log10.inc
new file mode 100644
index 0000000..423308a0
--- /dev/null
+++ b/libclc/generic/lib/math/log10.inc
@@ -0,0 +1,13 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE log10(__CLC_GENTYPE val) {
+  // log10(x) = log2(x) / log2(10) = log2(x) * log10(2)
+  // 1 / log2(10) = 0.30102999566 = log10(2)
+  // SP representation is 0.30103 (0x1.344136p-2)
+  // DP representation is 0.301029995659999993762312442414 (0x1.34413509E61D8p-2)
+#if __CLC_FPSIZE == 32
+  return log2(val) * 0x1.344136p-2f;
+#elif __CLC_FPSIZE == 64
+  return log2(val) * 0x1.34413509E61D8p-2;
+#else
+#error unknown __CLC_FPSIZE
+#endif
+}
diff --git a/libclc/generic/lib/math/log1p.cl b/libclc/generic/lib/math/log1p.cl
new file mode 100644
index 0000000..be25c64
--- /dev/null
+++ b/libclc/generic/lib/math/log1p.cl
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "tables.h"
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float log1p(float x) // single-precision log(1 + x)
+{
+    float w = x;
+    uint ux = as_uint(x);
+    uint ax = ux & EXSIGNBIT_SP32; // bit pattern of |x|
+
+    // |x| < 2^-4: series in u = 2x/(2+x); the result is selected further down
+    float u2 = MATH_DIVIDE(x, 2.0f + x);
+    float u = u2 + u2;
+    float v = u * u;
+    // 2/(5 * 2^5), 2/(3 * 2^3)
+    float zsmall = mad(-u2, x, mad(v, 0x1.99999ap-7f, 0x1.555556p-4f) * v * u) + x;
+
+    // |x| >= 2^-4: table-driven path on the bits of (1 + x)
+    ux = as_uint(x + 1.0f);
+
+    int m = (int)((ux >> EXPSHIFTBITS_SP32) & 0xff) - EXPBIAS_SP32; // unbiased exponent of 1 + x
+    float mf = (float)m;
+    uint indx = (ux & 0x007f0000) + ((ux & 0x00008000) << 1); // top 7 mantissa bits, rounded using the 8th
+    float F = as_float(indx | 0x3f000000);
+
+    // x > 2^24
+    float fg24 = F - as_float(0x3f000000 | (ux & MANTBITS_SP32));
+
+    // x <= 2^24
+    uint xhi = ux & 0xffff8000;
+    float xh = as_float(xhi);
+    float xt = (1.0f - xh) + w;
+    uint xnm = ((~(xhi & 0x7f800000)) - 0x00800000) & 0x7f800000;
+    xt = xt * as_float(xnm) * 0.5f;
+    float fl24 = F - as_float(0x3f000000 | (xhi & MANTBITS_SP32)) - xt;
+
+    float f = mf > 24.0f ? fg24 : fl24; // choose by the exponent of (1 + x)
+
+    indx = indx >> 16; // table index
+    float r = f * USE_TABLE(log_inv_tbl, indx);
+
+    // 1/3, 1/2
+    float poly = mad(mad(r, 0x1.555556p-2f, 0x1.0p-1f), r*r, r);
+
+    const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
+    const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
+
+    float2 tv = USE_TABLE(loge_tbl, indx);
+    float z1 = mad(mf, LOG2_HEAD, tv.s0);
+    float z2 = mad(mf, LOG2_TAIL, -poly) + tv.s1;
+    float z = z1 + z2;
+
+    z = ax < 0x3d800000U ? zsmall : z; // 0x3d800000 is the bit pattern of 2^-4
+
+
+
+    // Edge cases (later selects override earlier ones)
+    z = ax >= PINFBITPATT_SP32 ? w : z; // infinities and NaNs: start from x itself
+    z = w < -1.0f ? as_float(QNANBITPATT_SP32) : z;
+    z = w == -1.0f ? as_float(NINFBITPATT_SP32) : z;
+    // |x| < 2^-24 (0x33800000), which includes subnormals: return x unchanged
+    z = ax < 0x33800000 ? x : z;
+
+    return z;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, log1p, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double log1p(double x)
+{
+    // Computes natural log(1+x). Algorithm based on:
+    // Ping-Tak Peter Tang
+    // "Table-driven implementation of the logarithm function in IEEE
+    // floating-point arithmetic"
+    // ACM Transactions on Mathematical Software (TOMS)
+    // Volume 16, Issue 4 (December 1990)
+    // Note that we use a lookup table of size 64 rather than 128,
+    // and compensate by having extra terms in the minimax polynomial
+    // for the kernel approximation.
+
+    // Table-driven path (result1): selected at the end when x lies outside [log1p_thresh1, log1p_thresh2]
+    ulong ux = as_ulong(1.0 + x);
+    int xexp = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64;
+    double f = as_double(ONEEXPBITS_DP64 | (ux & MANTBITS_DP64));
+
+    int j = as_int2(ux).hi >> 13;
+    j = ((0x80 | (j & 0x7e)) >> 1) + (j & 0x1);
+    double f1 = (double)j * 0x1.0p-6;
+    j -= 64; // rebase to a zero-based ln_tbl index
+
+    double f2temp = f - f1;
+    double m2 = as_double(convert_ulong(0x3ff - xexp) << EXPSHIFTBITS_DP64); // 2^(-xexp) built from its exponent field
+    double f2l = fma(m2, x, m2 - f1);
+    double f2g = fma(m2, x, -f1) + m2;
+    double f2 = xexp <= MANTLENGTH_DP64-1 ? f2l : f2g;
+    f2 = (xexp <= -2) | (xexp >= MANTLENGTH_DP64+8) ? f2temp : f2;
+
+    double2 tv = USE_TABLE(ln_tbl, j);
+    double z1 = tv.s0;
+    double q = tv.s1;
+
+    double u = MATH_DIVIDE(f2, fma(0.5, f2, f1));
+    double v = u * u;
+
+    double poly = v * fma(v,
+                          fma(v, 2.23219810758559851206e-03, 1.24999999978138668903e-02),
+                          8.33333333333333593622e-02);
+
+    // log2_lead and log2_tail sum to an extra-precise version of log(2)
+    const double log2_lead = 6.93147122859954833984e-01; /* 0x3fe62e42e0000000 */
+    const double log2_tail = 5.76999904754328540596e-08; /* 0x3e6efa39ef35793c */
+
+    double z2 = q + fma(u, poly, u);
+    double dxexp = (double)xexp;
+    double r1 = fma(dxexp, log2_lead, z1);
+    double r2 = fma(dxexp, log2_tail, z2);
+    double result1 = r1 + r2;
+
+    // Series path (result2): kept when x lies within [log1p_thresh1, log1p_thresh2]
+    double r = x;
+    u = r / (2.0 + r);
+    double correction = r * u;
+    u = u + u;
+    v = u * u;
+    r1 = r;
+
+    poly = fma(v,
+               fma(v,
+                   fma(v, 4.34887777707614552256e-04, 2.23213998791944806202e-03),
+                   1.25000000037717509602e-02),
+               8.33333333333317923934e-02);
+
+    r2 = fma(u*v, poly, -correction);
+
+    // The values exp(-1/16)-1 and exp(1/16)-1
+    const double log1p_thresh1 = -0x1.f0540438fd5c3p-5;
+    const double log1p_thresh2 = 0x1.082b577d34ed8p-4;
+    double result2 = r1 + r2;
+    result2 = x < log1p_thresh1 | x > log1p_thresh2 ? result1 : result2; // relational operators bind tighter than '|'
+
+    result2 = isinf(x) ? x : result2; // -inf is overridden by the x < -1.0 case below
+    result2 = x < -1.0 ? as_double(QNANBITPATT_DP64) : result2;
+    result2 = x == -1.0 ? as_double(NINFBITPATT_DP64) : result2;
+    return result2;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log1p, double);
+
+#endif // cl_khr_fp64
diff --git a/libclc/generic/lib/math/log2.cl b/libclc/generic/lib/math/log2.cl
new file mode 100644
index 0000000..8776a80
--- /dev/null
+++ b/libclc/generic/lib/math/log2.cl
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> +#include "../clcmacro.h" +#include "tables.h" + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif // cl_khr_fp64 + +#define COMPILING_LOG2 +#include "log_base.h" +#undef COMPILING_LOG2 + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, log2, float); + +#ifdef cl_khr_fp64 +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log2, double); +#endif // cl_khr_fp64 diff --git a/libclc/generic/lib/math/log_base.h b/libclc/generic/lib/math/log_base.h new file mode 100644 index 0000000..bf2f82b --- /dev/null +++ b/libclc/generic/lib/math/log_base.h @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+
+#include "math.h"
+
+/*
+   Algorithm:
+
+   Based on:
+   Ping-Tak Peter Tang
+   "Table-driven implementation of the logarithm function in IEEE
+   floating-point arithmetic"
+   ACM Transactions on Mathematical Software (TOMS)
+   Volume 16, Issue 4 (December 1990)
+
+
+   x very close to 1.0 is handled differently, for x everywhere else
+   a brief explanation is given below
+
+   x = (2^m)*A
+   x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-8))
+   x = (2^m)*2*(G/2+g/2)
+   x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-9))
+
+   Y = (2^(-1))*(2^(-m))*(2^m)*A
+   Now, range of Y is: 0.5 <= Y < 1
+
+   F = 0x80 + (first 7 mantissa bits) + (8th mantissa bit)
+   Now, range of F is: 128 <= F <= 256
+   F = F / 256
+   Now, range of F is: 0.5 <= F <= 1
+
+   f = -(Y-F), with (f <= 2^(-9))
+
+   log(x) = m*log(2) + log(2) + log(F-f)
+   log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))
+   log(x) = m*log(2) + log(2*F) + log(1-r)
+
+   r = (f/F), with (r <= 2^(-8))
+   r = f*(1/F) with (1/F) precomputed to avoid division
+
+   log(x) = m*log(2) + log(G) - poly
+
+   log(G) is precomputed
+   poly = (r + (r^2)/2 + (r^3)/3 + (r^4)/4 + (r^5)/5)
+
+   log(2) and log(G) need to be maintained in extra precision
+   to avoid losing precision in the calculations
+
+
+   For x close to 1.0, we employ the following technique to
+   ensure faster convergence.
+
+   log(x) = log((1+s)/(1-s)) = 2*s + (2/3)*s^3 + (2/5)*s^5 + (2/7)*s^7
+   x = ((1+s)/(1-s))
+   x = 1 + r
+   s = r/(2+r)
+
+*/
+
+_CLC_OVERLOAD _CLC_DEF float
+#if defined(COMPILING_LOG2)
+log2(float x)
+#elif defined(COMPILING_LOG10)
+log10(float x)
+#else
+log(float x)
+#endif
+{
+
+#if defined(COMPILING_LOG2)
+    const float LOG2E = 0x1.715476p+0f; // 1.4426950408889634
+    const float LOG2E_HEAD = 0x1.700000p+0f; // 1.4375
+    const float LOG2E_TAIL = 0x1.547652p-8f; // 0.00519504072
+#elif defined(COMPILING_LOG10)
+    USE_TABLE(float2, p_log, LOG10_TBL); // NOTE(review): three-argument USE_TABLE, unlike the USE_TABLE(name, index) form used for log2_tbl below -- confirm against tables.h
+    const float LOG10E = 0x1.bcb7b2p-2f; // 0.43429448190325182
+    const float LOG10E_HEAD = 0x1.bc0000p-2f; // 0.43359375
+    const float LOG10E_TAIL = 0x1.6f62a4p-11f; // 0.0007007319
+    const float LOG10_2_HEAD = 0x1.340000p-2f; // 0.30078125
+    const float LOG10_2_TAIL = 0x1.04d426p-12f; // 0.000248745637
+#else
+    USE_TABLE(float2, p_log, LOGE_TBL); // NOTE(review): same three-argument form as the LOG10 branch -- confirm against tables.h
+    const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
+    const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
+#endif
+
+    uint xi = as_uint(x);
+    uint ax = xi & EXSIGNBIT_SP32; // bit pattern of |x|
+
+    // Calculations for |x-1| < 2^-4
+    float r = x - 1.0f;
+    int near1 = fabs(r) < 0x1.0p-4f;
+    float u2 = MATH_DIVIDE(r, 2.0f + r);
+    float corr = u2 * r;
+    float u = u2 + u2;
+    float v = u * u;
+    float znear1, z1, z2;
+
+    // 2/(5 * 2^5), 2/(3 * 2^3)
+    z2 = mad(u, mad(v, 0x1.99999ap-7f, 0x1.555556p-4f)*v, -corr);
+
+#if defined(COMPILING_LOG2)
+    z1 = as_float(as_int(r) & 0xffff0000); // upper 16 bits of r; the remainder is folded into z2
+    z2 = z2 + (r - z1);
+    znear1 = mad(z1, LOG2E_HEAD, mad(z2, LOG2E_HEAD, mad(z1, LOG2E_TAIL, z2*LOG2E_TAIL)));
+#elif defined(COMPILING_LOG10)
+    z1 = as_float(as_int(r) & 0xffff0000); // upper 16 bits of r; the remainder is folded into z2
+    z2 = z2 + (r - z1);
+    znear1 = mad(z1, LOG10E_HEAD, mad(z2, LOG10E_HEAD, mad(z1, LOG10E_TAIL, z2*LOG10E_TAIL)));
+#else
+    znear1 = z2 + r;
+#endif
+
+    // Calculations for x not near 1
+    int m = (int)(xi >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+
+    // Normalize subnormal
+    uint xis = as_uint(as_float(xi | 0x3f800000) - 1.0f);
+    int ms = (int)(xis >> EXPSHIFTBITS_SP32) - 253;
+    int c = m == -127; // biased exponent field is zero
+    m = c ? ms : m;
+    uint xin = c ? xis : xi;
+
+    float mf = (float)m;
+    uint indx = (xin & 0x007f0000) + ((xin & 0x00008000) << 1); // top 7 mantissa bits, rounded using the 8th
+
+    // F - Y
+    float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (xin & MANTBITS_SP32));
+
+    indx = indx >> 16; // table index
+    r = f * USE_TABLE(log_inv_tbl, indx);
+
+    // 1/3, 1/2
+    float poly = mad(mad(r, 0x1.555556p-2f, 0.5f), r*r, r);
+
+#if defined(COMPILING_LOG2)
+    float2 tv = USE_TABLE(log2_tbl, indx);
+    z1 = tv.s0 + mf;
+    z2 = mad(poly, -LOG2E, tv.s1);
+#elif defined(COMPILING_LOG10)
+    float2 tv = p_log[indx];
+    z1 = mad(mf, LOG10_2_HEAD, tv.s0);
+    z2 = mad(poly, -LOG10E, mf*LOG10_2_TAIL) + tv.s1;
+#else
+    float2 tv = p_log[indx];
+    z1 = mad(mf, LOG2_HEAD, tv.s0);
+    z2 = mad(mf, LOG2_TAIL, -poly) + tv.s1;
+#endif
+
+    float z = z1 + z2;
+    z = near1 ? znear1 : z;
+
+    // Corner cases (later selects override earlier ones)
+    z = ax >= PINFBITPATT_SP32 ? x : z; // infinities and NaNs: start from x itself
+    z = xi != ax ? as_float(QNANBITPATT_SP32) : z; // sign bit set: NaN
+    z = ax == 0 ? as_float(NINFBITPATT_SP32) : z; // +0 and -0: -inf
+
+    return z;
+}
+
+#ifdef cl_khr_fp64
+
+_CLC_OVERLOAD _CLC_DEF double
+#if defined(COMPILING_LOG2)
+log2(double x)
+#elif defined(COMPILING_LOG10)
+log10(double x)
+#else
+log(double x)
+#endif
+{
+
+#ifndef COMPILING_LOG2
+    // log2_lead and log2_tail sum to an extra-precise version of ln(2)
+    const double log2_lead = 6.93147122859954833984e-01; /* 0x3fe62e42e0000000 */
+    const double log2_tail = 5.76999904754328540596e-08; /* 0x3e6efa39ef35793c */
+#endif
+
+#if defined(COMPILING_LOG10)
+    // log10e_lead and log10e_tail sum to an extra-precision version of log10(e) (19 bits in lead)
+    const double log10e_lead = 4.34293746948242187500e-01; /* 0x3fdbcb7800000000 */
+    const double log10e_tail = 7.3495500964015109100644e-7; /* 0x3ea8a93728719535 */
+#elif defined(COMPILING_LOG2)
+    // log2e_lead and log2e_tail sum to an extra-precision version of log2(e) (19 bits in lead)
+    const double log2e_lead = 1.44269180297851562500E+00; /* 0x3FF7154400000000 */
+    const double log2e_tail = 3.23791044778235969970E-06; /* 0x3ECB295C17F0BBBE */
+#endif
+
+    // log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000
+    // log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000
+    const double log_thresh1 = 0x1.e0faap-1;
+    const double log_thresh2 = 0x1.1082cp+0;
+
+    int is_near = x >= log_thresh1 & x <= log_thresh2; // relational operators bind tighter than '&'
+
+    // Near 1 code
+    double r = x - 1.0;
+    double u = r / (2.0 + r);
+    double correction = r * u;
+    u = u + u;
+    double v = u * u;
+    double r1 = r;
+
+    const double ca_1 = 8.33333333333317923934e-02; /* 0x3fb55555555554e6 */
+    const double ca_2 = 1.25000000037717509602e-02; /* 0x3f89999999bac6d4 */
+    const double ca_3 = 2.23213998791944806202e-03; /* 0x3f62492307f1519f */
+    const double ca_4 = 4.34887777707614552256e-04; /* 0x3f3c8034c85dfff0 */
+
+    double r2 = fma(u*v, fma(v, fma(v, fma(v, ca_4, ca_3), ca_2), ca_1), -correction);
+
+#if defined(COMPILING_LOG10)
+    r = r1;
+    r1 = as_double(as_ulong(r1) & 0xffffffff00000000); // upper 32 bits of r; the remainder is folded into r2
+    r2 = r2 + (r - r1);
+    double ret_near = fma(log10e_lead, r1, fma(log10e_lead, r2, fma(log10e_tail, r1, log10e_tail * r2)));
+#elif defined(COMPILING_LOG2)
+    r = r1;
+    r1 = as_double(as_ulong(r1) & 0xffffffff00000000); // upper 32 bits of r; the remainder is folded into r2
+    r2 = r2 + (r - r1);
+    double ret_near = fma(log2e_lead, r1, fma(log2e_lead, r2, fma(log2e_tail, r1, log2e_tail*r2)));
+#else
+    double ret_near = r1 + r2;
+#endif
+
+    // This is the far from 1 code
+
+    // Deal with subnormal
+    ulong ux = as_ulong(x);
+    ulong uxs = as_ulong(as_double(0x03d0000000000000UL | ux) - 0x1.0p-962);
+    int c = ux < IMPBIT_DP64; // exponent field is zero (and sign bit clear)
+    ux = c ? uxs : ux;
+    int expadjust = c ? 60 : 0;
+
+    int xexp = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64 - expadjust;
+    double f = as_double(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64));
+    int index = as_int2(ux).hi >> 13;
+    index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1);
+
+    double2 tv = USE_TABLE(ln_tbl, index - 64); // rebase to a zero-based ln_tbl index
+    double z1 = tv.s0;
+    double q = tv.s1;
+
+    double f1 = index * 0x1.0p-7;
+    double f2 = f - f1;
+    u = f2 / fma(f2, 0.5, f1);
+    v = u * u;
+
+    const double cb_1 = 8.33333333333333593622e-02; /* 0x3fb5555555555557 */
+    const double cb_2 = 1.24999999978138668903e-02; /* 0x3f89999999865ede */
+    const double cb_3 = 2.23219810758559851206e-03; /* 0x3f6249423bd94741 */
+
+    double poly = v * fma(v, fma(v, cb_3, cb_2), cb_1);
+    double z2 = q + fma(u, poly, u);
+
+    double dxexp = (double)xexp;
+#if defined (COMPILING_LOG10)
+    // Add xexp * log(2) to z1,z2 to get log(x)
+    r1 = fma(dxexp, log2_lead, z1);
+    r2 = fma(dxexp, log2_tail, z2);
+    double ret_far = fma(log10e_lead, r1, fma(log10e_lead, r2, fma(log10e_tail, r1, log10e_tail*r2)));
+#elif defined(COMPILING_LOG2)
+    r1 = fma(log2e_lead, z1, dxexp);
+    r2 = fma(log2e_lead, z2, fma(log2e_tail, z1, log2e_tail*z2));
+    double ret_far = r1 + r2;
+#else
+    r1 = fma(dxexp, log2_lead, z1);
+    r2 = fma(dxexp, log2_tail, z2);
+    double ret_far = r1 + r2;
+#endif
+
+    double ret = is_near ? ret_near : ret_far;
+
+    ret = isinf(x) ? as_double(PINFBITPATT_DP64) : ret; // -inf is overridden by the x < 0.0 case below
+    ret = isnan(x) | (x < 0.0) ? as_double(QNANBITPATT_DP64) : ret;
+    ret = x == 0.0 ? as_double(NINFBITPATT_DP64) : ret;
+    return ret;
+}
+
+#endif // cl_khr_fp64
diff --git a/libclc/generic/lib/math/mad.cl b/libclc/generic/lib/math/mad.cl
new file mode 100644
index 0000000..6c7b90d
--- /dev/null
+++ b/libclc/generic/lib/math/mad.cl
@@ -0,0 +1,8 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <mad.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/mad.inc b/libclc/generic/lib/math/mad.inc
new file mode 100644
index 0000000..d32c783
--- /dev/null
+++ b/libclc/generic/lib/math/mad.inc
@@ -0,0 +1,3 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mad(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c) {
+  return a * b + c; // written as a plain multiply followed by an add
+}
diff --git a/libclc/generic/lib/math/math.h b/libclc/generic/lib/math/math.h
new file mode 100644
index 0000000..f46c7ea
--- /dev/null
+++ b/libclc/generic/lib/math/math.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define SNAN 0x001 /* SNAN..PINF are one-hot flags; presumably floating-point class masks -- TODO confirm against users */
+#define QNAN 0x002
+#define NINF 0x004
+#define NNOR 0x008
+#define NSUB 0x010
+#define NZER 0x020
+#define PZER 0x040
+#define PSUB 0x080
+#define PNOR 0x100
+#define PINF 0x200
+
+#define HAVE_HW_FMA32() (1)
+#define HAVE_BITALIGN() (0)
+#define HAVE_FAST_FMA32() (0)
+
+#define MATH_DIVIDE(X, Y) ((X) / (Y)) /* plain division in this generic header */
+#define MATH_RECIP(X) (1.0f / (X)) /* note: the numerator is a float literal */
+#define MATH_SQRT(X) sqrt(X)
+
+#define SIGNBIT_SP32 0x80000000 /* sign bit of a 32-bit float */
+#define EXSIGNBIT_SP32 0x7fffffff /* every bit except the sign */
+#define EXPBITS_SP32 0x7f800000 /* exponent field */
+#define MANTBITS_SP32 0x007fffff /* mantissa field */
+#define ONEEXPBITS_SP32 0x3f800000 /* bit pattern of 1.0f */
+#define TWOEXPBITS_SP32 0x40000000 /* bit pattern of 2.0f */
+#define HALFEXPBITS_SP32 0x3f000000 /* bit pattern of 0.5f */
+#define IMPBIT_SP32 0x00800000 /* implicit leading-one position (bit 23) */
+#define QNANBITPATT_SP32 0x7fc00000 /* positive quiet NaN */
+#define INDEFBITPATT_SP32 0xffc00000 /* quiet NaN with the sign bit set */
+#define PINFBITPATT_SP32 0x7f800000 /* +infinity */
+#define NINFBITPATT_SP32 0xff800000 /* -infinity */
+#define EXPBIAS_SP32 127
+#define EXPSHIFTBITS_SP32 23 /* shift that brings the exponent field to bit 0 */
+#define BIASEDEMIN_SP32 1
+#define EMIN_SP32 -126
+#define BIASEDEMAX_SP32 254
+#define EMAX_SP32 127
+#define LAMBDA_SP32 1.0e30
+#define MANTLENGTH_SP32 24
+#define BASEDIGITS_SP32 7
+
+#ifdef cl_khr_fp64
+
+#define SIGNBIT_DP64 0x8000000000000000L /* sign bit of a 64-bit double */
+#define EXSIGNBIT_DP64 0x7fffffffffffffffL /* every bit except the sign */
+#define EXPBITS_DP64 0x7ff0000000000000L /* exponent field */
+#define MANTBITS_DP64 0x000fffffffffffffL /* mantissa field */
+#define ONEEXPBITS_DP64 0x3ff0000000000000L /* bit pattern of 1.0 */
+#define TWOEXPBITS_DP64 0x4000000000000000L /* bit pattern of 2.0 */
+#define HALFEXPBITS_DP64 0x3fe0000000000000L /* bit pattern of 0.5 */
+#define IMPBIT_DP64 0x0010000000000000L /* implicit leading-one position (bit 52) */
+#define QNANBITPATT_DP64 0x7ff8000000000000L /* positive quiet NaN */
+#define INDEFBITPATT_DP64 0xfff8000000000000L /* quiet NaN with the sign bit set */
+#define PINFBITPATT_DP64 0x7ff0000000000000L /* +infinity */
+#define NINFBITPATT_DP64 0xfff0000000000000L /* -infinity */
+#define EXPBIAS_DP64 1023
+#define EXPSHIFTBITS_DP64 52 /* shift that brings the exponent field to bit 0 */
+#define BIASEDEMIN_DP64 1
+#define EMIN_DP64 -1022
+#define BIASEDEMAX_DP64 2046 /* 0x7fe */
+#define EMAX_DP64 1023 /* 0x3ff */
+#define LAMBDA_DP64 1.0e300
+#define MANTLENGTH_DP64 53
+#define BASEDIGITS_DP64 15
+
+#endif // cl_khr_fp64
+
+#define ALIGNED(x) __attribute__((aligned(x)))
diff --git a/libclc/generic/lib/math/modf.cl b/libclc/generic/lib/math/modf.cl
new file mode 100644
index 0000000..3294fbc
--- /dev/null
+++ b/libclc/generic/lib/math/modf.cl
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <modf.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/modf.inc b/libclc/generic/lib/math/modf.inc
new file mode 100644
index 0000000..1486b76
--- /dev/null
+++ b/libclc/generic/lib/math/modf.inc
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// Splits x into integral and fractional parts. trunc(x) is stored through
+// iptr; the fractional part is returned carrying the sign of x, and is a
+// (signed) zero when x is infinite.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, __CLC_GENTYPE *iptr) {
+  __CLC_GENTYPE whole = trunc(x);
+  *iptr = whole;
+  __CLC_GENTYPE frac = isinf(x) ? 0.0f : x - whole;
+  return copysign(frac, x);
+}
+
+// Overloads for pointers in other address spaces: run the overload above on
+// a local temporary, then copy the integral part out through iptr.
+#define MODF_DEF(addrspace) \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, addrspace __CLC_GENTYPE *iptr) { \
+    __CLC_GENTYPE integral; \
+    __CLC_GENTYPE fraction = modf(x, &integral); \
+    *iptr = integral; \
+    return fraction; \
+}
+
+MODF_DEF(local);
+MODF_DEF(global);
diff --git a/libclc/generic/lib/math/native_log.cl b/libclc/generic/lib/math/native_log.cl
new file mode 100644
index 0000000..f64f012
--- /dev/null
+++ b/libclc/generic/lib/math/native_log.cl
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */ + +#include <clc/clc.h> + +#define __CLC_FUNCTION __clc_native_log +#define __CLC_INTRINSIC "llvm.log" +#undef cl_khr_fp64 +#include <clc/math/unary_intrin.inc> + +#define __CLC_BODY <native_log.inc> +#define __FLOAT_ONLY +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/native_log.inc b/libclc/generic/lib/math/native_log.inc new file mode 100644 index 0000000..cb4db3f --- /dev/null +++ b/libclc/generic/lib/math/native_log.inc @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+
+// native_log for the current __CLC_GENTYPE: forwards val unchanged to
+// __clc_native_log and returns its result.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE native_log(__CLC_GENTYPE val) {
+  return __clc_native_log(val);
+}
diff --git a/libclc/generic/lib/math/native_log2.cl b/libclc/generic/lib/math/native_log2.cl
new file mode 100644
index 0000000..35ed18b
--- /dev/null
+++ b/libclc/generic/lib/math/native_log2.cl
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+// Name of the helper and the LLVM intrinsic string handed to unary_intrin.inc.
+// NOTE(review): presumably unary_intrin.inc declares __CLC_FUNCTION as a
+// binding to __CLC_INTRINSIC -- confirm against that header.
+#define __CLC_FUNCTION __clc_native_log2
+#define __CLC_INTRINSIC "llvm.log2"
+// NOTE(review): cl_khr_fp64 is undefined before the include, presumably so
+// that no double variants are emitted -- confirm.
+#undef cl_khr_fp64
+#include <clc/math/unary_intrin.inc>
+
+// Expand native_log2.inc through gentype.inc with __FLOAT_ONLY defined.
+// NOTE(review): presumably this limits the expansion to float types -- confirm
+// against gentype.inc.
+#define __CLC_BODY <native_log2.inc>
+#define __FLOAT_ONLY
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/native_log2.inc b/libclc/generic/lib/math/native_log2.inc
new file mode 100644
index 0000000..0f6a509
--- /dev/null
+++ b/libclc/generic/lib/math/native_log2.inc
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// native_log2 for the current __CLC_GENTYPE: forwards val unchanged to
+// __clc_native_log2 and returns its result.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE native_log2(__CLC_GENTYPE val) {
+  return __clc_native_log2(val);
+}
diff --git a/libclc/generic/lib/math/nextafter.cl b/libclc/generic/lib/math/nextafter.cl
new file mode 100644
index 0000000..cbe54cd
--- /dev/null
+++ b/libclc/generic/lib/math/nextafter.cl
@@ -0,0 +1,12 @@
+#include <clc/clc.h>
+#include "../clcmacro.h"
+
+// nextafter is produced by _CLC_DEFINE_BINARY_BUILTIN from the compiler
+// builtin named in the third argument (float here, double below).
+// NOTE(review): the exact set of overloads generated depends on the macro in
+// clcmacro.h -- confirm there.
+_CLC_DEFINE_BINARY_BUILTIN(float, nextafter, __builtin_nextafterf, float, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DEFINE_BINARY_BUILTIN(double, nextafter, __builtin_nextafter, double, double)
+
+#endif
diff --git a/libclc/generic/lib/math/pown.cl b/libclc/generic/lib/math/pown.cl
new file mode 100644
index 0000000..f3b27d4
--- /dev/null
+++ b/libclc/generic/lib/math/pown.cl
@@ -0,0 +1,10 @@
+#include <clc/clc.h>
+#include "../clcmacro.h"
+
+// Only the _CLC_BINARY_VECTORIZE instantiations of pown(float, int) and
+// pown(double, int) live in this file.
+// NOTE(review): no scalar pown is defined here; presumably it is provided
+// elsewhere (header or macro) -- confirm.
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, pown, float, int)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, pown, double, int)
+#endif
diff --git a/libclc/generic/lib/math/sin.cl b/libclc/generic/lib/math/sin.cl
new file mode 100644
index 0000000..3a40749
--- /dev/null
+++ b/libclc/generic/lib/math/sin.cl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#include "math.h" +#include "sincos_helpers.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float sin(float x) +{ + int ix = as_int(x); + int ax = ix & 0x7fffffff; + float dx = as_float(ax); + + float r0, r1; + int regn = __clc_argReductionS(&r0, &r1, dx); + + float ss = __clc_sinf_piby4(r0, r1); + float cc = __clc_cosf_piby4(r0, r1); + + float s = (regn & 1) != 0 ? cc : ss; + s = as_float(as_int(s) ^ ((regn > 1) << 31) ^ (ix ^ ax)); + + s = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : s; + + //Subnormals + s = x == 0.0f ? x : s; + + return s; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, sin, float); + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double sin(double x) { + double y = fabs(x); + + double r, rr; + int regn; + + if (y < 0x1.0p+47) + __clc_remainder_piby2_medium(y, &r, &rr, ®n); + else + __clc_remainder_piby2_large(y, &r, &rr, ®n); + + double2 sc = __clc_sincos_piby4(r, rr); + + int2 s = as_int2(regn & 1 ? sc.hi : sc.lo); + s.hi ^= ((regn > 1) << 31) ^ ((x < 0.0) << 31); + + return isinf(x) | isnan(x) ? 
as_double(QNANBITPATT_DP64) : as_double(s); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sin, double); + +#endif diff --git a/libclc/generic/lib/math/sincos.cl b/libclc/generic/lib/math/sincos.cl new file mode 100644 index 0000000..eace5ad --- /dev/null +++ b/libclc/generic/lib/math/sincos.cl @@ -0,0 +1,8 @@ +#include <clc/clc.h> + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY <sincos.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/sincos.inc b/libclc/generic/lib/math/sincos.inc new file mode 100644 index 0000000..e97f0f9 --- /dev/null +++ b/libclc/generic/lib/math/sincos.inc @@ -0,0 +1,11 @@ +#define __CLC_DECLARE_SINCOS(ADDRSPACE, TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE sincos (TYPE x, ADDRSPACE TYPE * cosval) { \ + *cosval = cos(x); \ + return sin(x); \ + } + +__CLC_DECLARE_SINCOS(global, __CLC_GENTYPE) +__CLC_DECLARE_SINCOS(local, __CLC_GENTYPE) +__CLC_DECLARE_SINCOS(private, __CLC_GENTYPE) + +#undef __CLC_DECLARE_SINCOS diff --git a/libclc/generic/lib/math/sincosD_piby4.h b/libclc/generic/lib/math/sincosD_piby4.h new file mode 100644 index 0000000..a00db85 --- /dev/null +++ b/libclc/generic/lib/math/sincosD_piby4.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Evaluate double-precision sin and cos of the head/tail argument x + xx.
+// Returns a double2 with the sine in .lo and the cosine in .hi.
+_CLC_INLINE double2
+__libclc__sincos_piby4(double x, double xx)
+{
+    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+    //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...)
+    //                      = x * f(w)
+    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...)
+    // We use a minimax approximation of (f(w) - 1) / w
+    // because this produces an expansion in even powers of x.
+    // If xx (the tail of x) is non-zero, we add a correction
+    // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
+    // is an approximation to cos(x)*sin(xx) valid because
+    // xx is tiny relative to x.
+
+    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+    //                      = f(w)
+    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...)
+    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+    // because this produces an expansion in even powers of x.
+    // If xx (the tail of x) is non-zero, we subtract a correction
+    // term g(x,xx) = x*xx to the result, where g(x,xx)
+    // is an approximation to sin(x)*sin(xx) valid because
+    // xx is tiny relative to x.
+
+    // Sine polynomial coefficients (sc1 multiplies x^3, sc2..sc6 feed sp).
+    const double sc1 = -0.166666666666666646259241729;
+    const double sc2 =  0.833333333333095043065222816e-2;
+    const double sc3 = -0.19841269836761125688538679e-3;
+    const double sc4 =  0.275573161037288022676895908448e-5;
+    const double sc5 = -0.25051132068021699772257377197e-7;
+    const double sc6 =  0.159181443044859136852668200e-9;
+
+    // Cosine polynomial coefficients (the polynomial is scaled by x^4).
+    const double cc1 =  0.41666666666666665390037e-1;
+    const double cc2 = -0.13888888888887398280412e-2;
+    const double cc3 =  0.248015872987670414957399e-4;
+    const double cc4 = -0.275573172723441909470836e-6;
+    const double cc5 =  0.208761463822329611076335e-8;
+    const double cc6 = -0.113826398067944859590880e-10;
+
+    double x2 = x * x;
+    double x3 = x2 * x;
+    double r = 0.5 * x2;
+    double t = 1.0 - r;
+
+    double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
+
+    // (1.0 - t) - r recovers the rounding error of t = 1 - x^2/2.
+    // NOTE(review): the comment above says the x*xx correction is subtracted,
+    // but fma(x, xx, ...) adds x*xx here -- confirm the intended sign.
+    double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1),
+                        x2*x2, fma(x, xx, (1.0 - t) - r));
+
+    double2 ret;
+    // Expands to x + sc1*x^3 + sp*x^5 + xx*(1 - x^2/2).
+    ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx));
+    ret.hi = cp;
+
+    return ret;
+}
diff --git a/libclc/generic/lib/math/sincos_helpers.cl b/libclc/generic/lib/math/sincos_helpers.cl
new file mode 100644
index 0000000..251b7f9
--- /dev/null
+++ b/libclc/generic/lib/math/sincos_helpers.cl
@@ -0,0 +1,545 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#include "math.h" +#include "tables.h" +#include "sincos_helpers.h" + +#define bitalign(hi, lo, shift) \ + ((hi) << (32 - (shift))) | ((lo) >> (shift)); + +#define bytealign(src0, src1, src2) \ + ((uint) (((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3)*8))) + +_CLC_DEF float __clc_sinf_piby4(float x, float y) { + // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... + // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... + // = x * f(w) + // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... + // We use a minimax approximation of (f(w) - 1) / w + // because this produces an expansion in even powers of x. + + const float c1 = -0.1666666666e0f; + const float c2 = 0.8333331876e-2f; + const float c3 = -0.198400874e-3f; + const float c4 = 0.272500015e-5f; + const float c5 = -2.5050759689e-08f; // 0xb2d72f34 + const float c6 = 1.5896910177e-10f; // 0x2f2ec9d3 + + float z = x * x; + float v = z * x; + float r = mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2); + float ret = x - mad(v, -c1, mad(z, mad(y, 0.5f, -v*r), -y)); + + return ret; +} + +_CLC_DEF float __clc_cosf_piby4(float x, float y) { + // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... + // = f(w) + // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... + // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) + // because this produces an expansion in even powers of x. 
+ + const float c1 = 0.416666666e-1f; + const float c2 = -0.138888876e-2f; + const float c3 = 0.248006008e-4f; + const float c4 = -0.2730101334e-6f; + const float c5 = 2.0875723372e-09f; // 0x310f74f6 + const float c6 = -1.1359647598e-11f; // 0xad47d74e + + float z = x * x; + float r = z * mad(z, mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2), c1); + + // if |x| < 0.3 + float qx = 0.0f; + + int ix = as_int(x) & EXSIGNBIT_SP32; + + // 0.78125 > |x| >= 0.3 + float xby4 = as_float(ix - 0x01000000); + qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx; + + // x > 0.78125 + qx = ix > 0x3f480000 ? 0.28125f : qx; + + float hz = mad(z, 0.5f, -qx); + float a = 1.0f - qx; + float ret = a - (hz - mad(z, r, -x*y)); + return ret; +} + +_CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh, float bt) +{ + if (HAVE_HW_FMA32()) { + float ph = a * b; + *hi = ph; + *lo = fma(a, b, -ph); + } else { + float ah = as_float(as_uint(a) & 0xfffff000U); + float at = a - ah; + float ph = a * b; + float pt = mad(at, bt, mad(at, bh, mad(ah, bt, mad(ah, bh, -ph)))); + *hi = ph; + *lo = pt; + } +} + +_CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x) +{ + // 72 bits of pi/2 + const float fpiby2_1 = (float) 0xC90FDA / 0x1.0p+23f; + const float fpiby2_1_h = (float) 0xC90 / 0x1.0p+11f; + const float fpiby2_1_t = (float) 0xFDA / 0x1.0p+23f; + + const float fpiby2_2 = (float) 0xA22168 / 0x1.0p+47f; + const float fpiby2_2_h = (float) 0xA22 / 0x1.0p+35f; + const float fpiby2_2_t = (float) 0x168 / 0x1.0p+47f; + + const float fpiby2_3 = (float) 0xC234C4 / 0x1.0p+71f; + const float fpiby2_3_h = (float) 0xC23 / 0x1.0p+59f; + const float fpiby2_3_t = (float) 0x4C4 / 0x1.0p+71f; + + const float twobypi = 0x1.45f306p-1f; + + float fnpi2 = trunc(mad(x, twobypi, 0.5f)); + + // subtract n * pi/2 from x + float rhead, rtail; + __clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t); + float v = x - rhead; + float rem = v + (((x - v) - rhead) - rtail); 
+ + float rhead2, rtail2; + __clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t); + v = rem - rhead2; + rem = v + (((rem - v) - rhead2) - rtail2); + + float rhead3, rtail3; + __clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t); + v = rem - rhead3; + + *hi = v + ((rem - v) - rhead3); + *lo = -rtail3; + return fnpi2; +} + +_CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x) +{ + float fnpi2 = __clc_removePi2S(r, rr, x); + return (int)fnpi2 & 0x3; +} + +#define FULL_MUL(A, B, HI, LO) \ + LO = A * B; \ + HI = mul_hi(A, B) + +#define FULL_MAD(A, B, C, HI, LO) \ + LO = ((A) * (B) + (C)); \ + HI = mul_hi(A, B); \ + HI += LO < C + +_CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x) +{ + int xe = (int)(as_uint(x) >> 23) - 127; + uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU); + + // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 FE5163AB + const uint b6 = 0xA2F9836EU; + const uint b5 = 0x4E441529U; + const uint b4 = 0xFC2757D1U; + const uint b3 = 0xF534DDC0U; + const uint b2 = 0xDB629599U; + const uint b1 = 0x3C439041U; + const uint b0 = 0xFE5163ABU; + + uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1; + + FULL_MUL(xm, b0, c0, p0); + FULL_MAD(xm, b1, c0, c1, p1); + FULL_MAD(xm, b2, c1, c0, p2); + FULL_MAD(xm, b3, c0, c1, p3); + FULL_MAD(xm, b4, c1, c0, p4); + FULL_MAD(xm, b5, c0, c1, p5); + FULL_MAD(xm, b6, c1, p7, p6); + + uint fbits = 224 + 23 - xe; + + // shift amount to get 2 lsb of integer part at top 2 bits + // min: 25 (xe=18) max: 134 (xe=127) + uint shift = 256U - 2 - fbits; + + // Shift by up to 134/32 = 4 words + int c = shift > 31; + p7 = c ? p6 : p7; + p6 = c ? p5 : p6; + p5 = c ? p4 : p5; + p4 = c ? p3 : p4; + p3 = c ? p2 : p3; + p2 = c ? p1 : p2; + p1 = c ? p0 : p1; + shift -= (-c) & 32; + + c = shift > 31; + p7 = c ? p6 : p7; + p6 = c ? p5 : p6; + p5 = c ? p4 : p5; + p4 = c ? p3 : p4; + p3 = c ? p2 : p3; + p2 = c ? 
p1 : p2; + shift -= (-c) & 32; + + c = shift > 31; + p7 = c ? p6 : p7; + p6 = c ? p5 : p6; + p5 = c ? p4 : p5; + p4 = c ? p3 : p4; + p3 = c ? p2 : p3; + shift -= (-c) & 32; + + c = shift > 31; + p7 = c ? p6 : p7; + p6 = c ? p5 : p6; + p5 = c ? p4 : p5; + p4 = c ? p3 : p4; + shift -= (-c) & 32; + + // bitalign cannot handle a shift of 32 + c = shift > 0; + shift = 32 - shift; + uint t7 = bitalign(p7, p6, shift); + uint t6 = bitalign(p6, p5, shift); + uint t5 = bitalign(p5, p4, shift); + p7 = c ? t7 : p7; + p6 = c ? t6 : p6; + p5 = c ? t5 : p5; + + // Get 2 lsb of int part and msb of fraction + int i = p7 >> 29; + + // Scoot up 2 more bits so only fraction remains + p7 = bitalign(p7, p6, 30); + p6 = bitalign(p6, p5, 30); + p5 = bitalign(p5, p4, 30); + + // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5 + uint flip = i & 1 ? 0xffffffffU : 0U; + uint sign = i & 1 ? 0x80000000U : 0U; + p7 = p7 ^ flip; + p6 = p6 ^ flip; + p5 = p5 ^ flip; + + // Find exponent and shift away leading zeroes and hidden bit + xe = clz(p7) + 1; + shift = 32 - xe; + p7 = bitalign(p7, p6, shift); + p6 = bitalign(p6, p5, shift); + + // Most significant part of fraction + float q1 = as_float(sign | ((127 - xe) << 23) | (p7 >> 9)); + + // Shift out bits we captured on q1 + p7 = bitalign(p7, p6, 32-23); + + // Get 24 more bits of fraction in another float, there are not long strings of zeroes here + int xxe = clz(p7) + 1; + p7 = bitalign(p7, p6, 32-xxe); + float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9)); + + // At this point, the fraction q1 + q0 is correct to at least 48 bits + // Now we need to multiply the fraction by pi/2 + // This loses us about 4 bits + // pi/2 = C90 FDA A22 168 C23 4C4 + + const float pio2h = (float)0xc90fda / 0x1.0p+23f; + const float pio2hh = (float)0xc90 / 0x1.0p+11f; + const float pio2ht = (float)0xfda / 0x1.0p+23f; + const float pio2t = (float)0xa22168 / 0x1.0p+47f; + + float rh, rt; + + if (HAVE_HW_FMA32()) { + rh = q1 * pio2h; + rt = 
fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh))); + } else { + float q1h = as_float(as_uint(q1) & 0xfffff000); + float q1t = q1 - q1h; + rh = q1 * pio2h; + rt = mad(q1t, pio2ht, mad(q1t, pio2hh, mad(q1h, pio2ht, mad(q1h, pio2hh, -rh)))); + rt = mad(q0, pio2h, mad(q1, pio2t, rt)); + } + + float t = rh + rt; + rt = rt - (t - rh); + + *r = t; + *rr = rt; + return ((i >> 1) + (i & 1)) & 0x3; +} + +_CLC_DEF int __clc_argReductionS(float *r, float *rr, float x) +{ + if (x < 0x1.0p+23f) + return __clc_argReductionSmallS(r, rr, x); + else + return __clc_argReductionLargeS(r, rr, x); +} + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// Reduction for medium sized arguments +_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn) { + // How many pi/2 is x a multiple of? + const double two_by_pi = 0x1.45f306dc9c883p-1; + double dnpi2 = trunc(fma(x, two_by_pi, 0.5)); + + const double piby2_h = -7074237752028440.0 / 0x1.0p+52; + const double piby2_m = -2483878800010755.0 / 0x1.0p+105; + const double piby2_t = -3956492004828932.0 / 0x1.0p+158; + + // Compute product of npi2 with 159 bits of 2/pi + double p_hh = piby2_h * dnpi2; + double p_ht = fma(piby2_h, dnpi2, -p_hh); + double p_mh = piby2_m * dnpi2; + double p_mt = fma(piby2_m, dnpi2, -p_mh); + double p_th = piby2_t * dnpi2; + double p_tt = fma(piby2_t, dnpi2, -p_th); + + // Reduce to 159 bits + double ph = p_hh; + double pm = p_ht + p_mh; + double t = p_mh - (pm - p_ht); + double pt = p_th + t + p_mt + p_tt; + t = ph + pm; pm = pm - (t - ph); ph = t; + t = pm + pt; pt = pt - (t - pm); pm = t; + + // Subtract from x + t = x + ph; + double qh = t + pm; + double qt = pm - (qh - t) + pt; + + *r = qh; + *rr = qt; + *regn = (int)(long)dnpi2 & 0x3; +} + +// Given positive argument x, reduce it to the range [-pi/4,pi/4] using +// extra precision, and return the result in r, rr. 
+// Return value "regn" tells how many lots of pi/2 were subtracted
+// from x to put it in the range [-pi/4,pi/4], mod 4.
+
+_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn) {
+
+    long ux = as_long(x);
+    int e = (int)(ux >> 52) - 1023;          // unbiased exponent of x
+    int i = max(23, (e >> 3) + 17);
+    int j = 150 - i;
+    int j16 = j & ~0xf;                      // j rounded down to a multiple of 16
+    double fract_temp;
+
+    // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary
+    uint4 q0 = USE_TABLE(pibits_tbl, j16);
+    uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16));
+    uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32));
+
+    // k selects which 32-bit word of q0 the window starts at; c is a
+    // per-lane mask with the lane equal to k set.
+    int k = (j >> 2) & 0x3;
+    int4 c = (int4)k == (int4)(0, 1, 2, 3);
+
+    uint u0, u1, u2, u3, u4, u5, u6;
+
+    u0 = c.s1 ? q0.s1 : q0.s0;
+    u0 = c.s2 ? q0.s2 : u0;
+    u0 = c.s3 ? q0.s3 : u0;
+
+    u1 = c.s1 ? q0.s2 : q0.s1;
+    u1 = c.s2 ? q0.s3 : u1;
+    u1 = c.s3 ? q1.s0 : u1;
+
+    u2 = c.s1 ? q0.s3 : q0.s2;
+    u2 = c.s2 ? q1.s0 : u2;
+    u2 = c.s3 ? q1.s1 : u2;
+
+    u3 = c.s1 ? q1.s0 : q0.s3;
+    u3 = c.s2 ? q1.s1 : u3;
+    u3 = c.s3 ? q1.s2 : u3;
+
+    u4 = c.s1 ? q1.s1 : q1.s0;
+    u4 = c.s2 ? q1.s2 : u4;
+    u4 = c.s3 ? q1.s3 : u4;
+
+    u5 = c.s1 ? q1.s2 : q1.s1;
+    u5 = c.s2 ? q1.s3 : u5;
+    u5 = c.s3 ? q2.s0 : u5;
+
+    u6 = c.s1 ? q1.s3 : q1.s2;
+    u6 = c.s2 ? q2.s0 : u6;
+    u6 = c.s3 ? q2.s1 : u6;
+
+    // bytealign uses only the low two bits of j as the byte offset.
+    uint v0 = bytealign(u1, u0, j);
+    uint v1 = bytealign(u2, u1, j);
+    uint v2 = bytealign(u3, u2, j);
+    uint v3 = bytealign(u4, u3, j);
+    uint v4 = bytealign(u5, u4, j);
+    uint v5 = bytealign(u6, u5, j);
+
+    // Place those 192 bits in 4 48-bit doubles along with correct exponent
+    // If i > 1018 we would get subnormals so we scale p up and x down to get the same product
+    i = 2 + 8*i;
+    x *= i > 1018 ? 0x1.0p-136 : 1.0;
+    i -= i > 1018 ? 136 : 0;
+
+    // Each p is built by OR-ing 48 bits into the mantissa of a power of two
+    // `a` and subtracting `a` again; ua advances the exponent field by 0x30
+    // (48) between consecutive pieces.
+    uint ua = (uint)(1023 + 52 - i) << 20;
+    double a = as_double((uint2)(0, ua));
+    double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a;
+    ua += 0x03000000U;
+    a = as_double((uint2)(0, ua));
+    double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a;
+    ua += 0x03000000U;
+    a = as_double((uint2)(0, ua));
+    double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a;
+    ua += 0x03000000U;
+    a = as_double((uint2)(0, ua));
+    double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a;
+
+    // Exact multiply
+    double f0h = p0 * x;
+    double f0l = fma(p0, x, -f0h);
+    double f1h = p1 * x;
+    double f1l = fma(p1, x, -f1h);
+    double f2h = p2 * x;
+    double f2l = fma(p2, x, -f2h);
+    double f3h = p3 * x;
+    double f3l = fma(p3, x, -f3h);
+
+    // Accumulate product into 4 doubles
+    double s, t;
+
+    double f3 = f3h + f2h;
+    t = f2h - (f3 - f3h);
+    s = f3l + t;
+    t = t - (s - f3l);
+
+    double f2 = s + f1h;
+    t = f1h - (f2 - s) + t;
+    s = f2l + t;
+    t = t - (s - f2l);
+
+    double f1 = s + f0h;
+    t = f0h - (f1 - s) + t;
+    s = f1l + t;
+
+    double f0 = s + f0l;
+
+    // Strip off unwanted large integer bits
+    f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp);
+    f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
+
+    // Compute least significant integer bits
+    t = f3 + f2;
+    double di = t - fract(t, &fract_temp);
+    i = (float)di;
+
+    // Shift out remaining integer part
+    f3 -= di;
+    s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t;
+    s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t;
+    f1 += f0;
+
+    // Subtract 1 if fraction is >= 0.5, and update regn
+    int g = f3 >= 0.5;
+    i += g;
+    f3 -= (float)g;
+
+    // Shift up bits
+    s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1;
+
+    // Multiply precise fraction by pi/2 to get radians
+    const double p2h = 7074237752028440.0 / 0x1.0p+52;
+    const double p2t = 4967757600021510.0 / 0x1.0p+106;
+
+    double rhi = f3 * p2h;
+    double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
+
+    *r = rhi + rlo;
+    *rr = rlo - (*r - rhi);
+    *regn = i & 0x3;
+}
+
+
+// Evaluate double-precision sin and cos of the head/tail argument x + xx.
+// Returns a double2 with the sine in .lo and the cosine in .hi.
+_CLC_DEF double2 __clc_sincos_piby4(double x, double xx) {
+    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+    //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...)
+    //                      = x * f(w)
+    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...)
+    // We use a minimax approximation of (f(w) - 1) / w
+    // because this produces an expansion in even powers of x.
+    // If xx (the tail of x) is non-zero, we add a correction
+    // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
+    // is an approximation to cos(x)*sin(xx) valid because
+    // xx is tiny relative to x.
+
+    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+    //                      = f(w)
+    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...)
+    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+    // because this produces an expansion in even powers of x.
+    // If xx (the tail of x) is non-zero, we subtract a correction
+    // term g(x,xx) = x*xx to the result, where g(x,xx)
+    // is an approximation to sin(x)*sin(xx) valid because
+    // xx is tiny relative to x.
+
+    // Sine polynomial coefficients (sc1 multiplies x^3, sc2..sc6 feed sp).
+    const double sc1 = -0.166666666666666646259241729;
+    const double sc2 =  0.833333333333095043065222816e-2;
+    const double sc3 = -0.19841269836761125688538679e-3;
+    const double sc4 =  0.275573161037288022676895908448e-5;
+    const double sc5 = -0.25051132068021699772257377197e-7;
+    const double sc6 =  0.159181443044859136852668200e-9;
+
+    // Cosine polynomial coefficients (the polynomial is scaled by x^4).
+    const double cc1 =  0.41666666666666665390037e-1;
+    const double cc2 = -0.13888888888887398280412e-2;
+    const double cc3 =  0.248015872987670414957399e-4;
+    const double cc4 = -0.275573172723441909470836e-6;
+    const double cc5 =  0.208761463822329611076335e-8;
+    const double cc6 = -0.113826398067944859590880e-10;
+
+    double x2 = x * x;
+    double x3 = x2 * x;
+    double r = 0.5 * x2;
+    double t = 1.0 - r;
+
+    double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
+
+    // (1.0 - t) - r recovers the rounding error of t = 1 - x^2/2.
+    // NOTE(review): the comment above says the x*xx correction is subtracted,
+    // but fma(x, xx, ...) adds x*xx here -- confirm the intended sign.
+    double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1),
+                        x2*x2, fma(x, xx, (1.0 - t) - r));
+
+    double2 ret;
+    // Expands to x + sc1*x^3 + sp*x^5 + xx*(1 - x^2/2).
+    ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx));
+    ret.hi = cp;
+
+    return ret;
+}
+
+#endif
diff --git a/libclc/generic/lib/math/sincos_helpers.h b/libclc/generic/lib/math/sincos_helpers.h
new file mode 100644
index 0000000..2565d44
--- /dev/null
+++ b/libclc/generic/lib/math/sincos_helpers.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// Single-precision sin/cos kernels for a reduced head/tail argument x + y.
+_CLC_DECL float __clc_sinf_piby4(float x, float y);
+_CLC_DECL float __clc_cosf_piby4(float x, float y);
+// Reduces x; writes the head/tail result to *r / *rr and returns the quadrant
+// (0..3).
+_CLC_DECL int __clc_argReductionS(float *r, float *rr, float x);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Double-precision reductions; each writes the head/tail result to *r / *rr
+// and the quadrant (0..3) to *regn.
+_CLC_DECL void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn);
+_CLC_DECL void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn);
+// Returns sin in .lo and cos in .hi for the head/tail argument x + xx.
+_CLC_DECL double2 __clc_sincos_piby4(double x, double xx);
+
+#endif
diff --git a/libclc/generic/lib/math/sincospiF_piby4.h b/libclc/generic/lib/math/sincospiF_piby4.h
new file mode 100644
index 0000000..90ecb1d
--- /dev/null
+++ b/libclc/generic/lib/math/sincospiF_piby4.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// Evaluate single precision sin and cos of value in interval [-pi/4, pi/4]
+// Returns a float2 with the sine in .x and the cosine in .y.
+_CLC_INLINE float2
+__libclc__sincosf_piby4(float x)
+{
+    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+    //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...)
+    //                      = x * f(w)
+    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...)
+    // We use a minimax approximation of (f(w) - 1) / w
+    // because this produces an expansion in even powers of x.
+
+    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+    //                      = f(w)
+    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...)
+    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+    // because this produces an expansion in even powers of x.
+
+    // Sine polynomial coefficients, applied to x^3.
+    const float sc1 = -0.166666666638608441788607926e0F;
+    const float sc2 =  0.833333187633086262120839299e-2F;
+    const float sc3 = -0.198400874359527693921333720e-3F;
+    const float sc4 =  0.272500015145584081596826911e-5F;
+
+    // Cosine polynomial coefficients, applied to x^4.
+    const float cc1 =  0.41666666664325175238031e-1F;
+    const float cc2 = -0.13888887673175665567647e-2F;
+    const float cc3 =  0.24800600878112441958053e-4F;
+    const float cc4 = -0.27301013343179832472841e-6F;
+
+    float x2 = x * x;
+
+    float2 ret;
+    // sin: x + x^3 * P(x^2);  cos: (1 - x^2/2) + x^4 * Q(x^2)
+    ret.x = mad(x*x2, mad(x2, mad(x2, mad(x2, sc4, sc3), sc2), sc1), x);
+    ret.y = mad(x2*x2, mad(x2, mad(x2, mad(x2, cc4, cc3), cc2), cc1), mad(x2, -0.5f, 1.0f));
+    return ret;
+}
diff --git a/libclc/generic/lib/math/sinpi.cl b/libclc/generic/lib/math/sinpi.cl
new file mode 100644
index 0000000..dbb995f
--- /dev/null
+++ b/libclc/generic/lib/math/sinpi.cl
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */ + +#include <clc/clc.h> + +#include "math.h" +#include "sincospiF_piby4.h" +#include "../clcmacro.h" +#ifdef cl_khr_fp64 +#include "sincosD_piby4.h" +#endif + /* sinpi(x) for float: computes sin(pi * x). The sign bit and magnitude of x are separated with integer bit operations, |x| is split into an integer part iax and a fraction r, r is folded into [0, 0.25], and __libclc__sincosf_piby4 supplies the sin or cos of the folded argument. All case selection is done with ternaries rather than branches. */ +_CLC_OVERLOAD _CLC_DEF float sinpi(float x) +{ + int ix = as_int(x); + int xsgn = ix & 0x80000000; + ix ^= xsgn; + float ax = as_float(ix); + int iax = (int)ax; + float r = ax - iax; + int xodd = xsgn ^ (iax & 0x1 ? 0x80000000 : 0); + + // Initialize with the return value (a quiet NaN bit pattern) for +-Inf and NaN + int ir = 0x7fc00000; + + // 2^23 <= |x| < Inf: x is always an integer, so the result is a zero carrying the sign of x + ir = ix < 0x7f800000 ? xsgn : ir; + + // |x| < 2^23: the result depends on which 0.25-wide interval the fraction r falls in + + // 0.75 < r < 1.0: sin(pi * r) = sin(pi * (1 - r)); e = 0 selects the sin result + float a = 1.0f - r; + int e = 0; + + // 0.5 <= r <= 0.75: sin(pi * r) = cos(pi * (r - 0.5)); e = 1 selects the cos result + int c = r <= 0.75f; + a = c ? r - 0.5f : a; + e = c ? 1 : e; + + // 0.25 < r < 0.5: sin(pi * r) = cos(pi * (0.5 - r)); e stays 1 + c = r < 0.5f; + a = c ? 0.5f - r : a; + + // 0 <= r <= 0.25: use sin(pi * r) directly; e goes back to 0 + c = r <= 0.25f; + a = c ? r : a; + e = c ? 0 : e; + // t.lo is the sin and t.hi the cos of the folded argument; xodd applies the sign of x and flips it when the integer part is odd + float2 t = __libclc__sincosf_piby4(a * M_PI_F); + int jr = xodd ^ as_int(e ? t.hi : t.lo); + // 0x4b000000 is the bit pattern of 2^23: below it use the computed value, otherwise keep the zero/NaN chosen above + ir = ix < 0x4b000000 ? jr : ir; + + return as_float(ir); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, sinpi, float); + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + // sinpi(x) for double: same scheme as the float overload, with 2^52 as the threshold above which every finite x is an integer +_CLC_OVERLOAD _CLC_DEF double sinpi(double x) +{ + long ix = as_long(x); + long xsgn = ix & 0x8000000000000000L; + ix ^= xsgn; + double ax = as_double(ix); + long iax = (long)ax; + double r = ax - (double)iax; + long xodd = xsgn ^ (iax & 0x1L ? 0x8000000000000000L : 0L); + + // Initialize with the return value (a quiet NaN bit pattern) for +-Inf and NaN + long ir = 0x7ff8000000000000L; + + // 2^52 <= |x| < Inf: x is always an integer, so the result is a zero carrying the sign of x + ir = ix < 0x7ff0000000000000 ? xsgn : ir; + + // |x| < 2^52: the result depends on which 0.25-wide interval the fraction r falls in + + // 0.75 < r < 1.0: sin(pi * r) = sin(pi * (1 - r)); e = 0 selects the sin result + double a = 1.0 - r; + int e = 0; + + // 0.5 <= r <= 0.75: sin(pi * r) = cos(pi * (r - 0.5)); e = 1 selects the cos result + int c = r <= 0.75; + double t = r - 0.5; + a = c ? t : a; + e = c ? 1 : e; + + // 0.25 < r < 0.5: sin(pi * r) = cos(pi * (0.5 - r)); e stays 1 + c = r < 0.5; + t = 0.5 - r; + a = c ? t : a; + + // 0 <= r <= 0.25: use sin(pi * r) directly; e goes back to 0 + c = r <= 0.25; + a = c ? r : a; + e = c ? 
0 : e; + /* NOTE(review): presumably sc.lo is the sin and sc.hi the cos of api, mirroring the float helper, and the second argument (0.0) is a low-order tail of the input - confirm against sincosD_piby4.h. */ + double api = a * M_PI; + double2 sc = __libclc__sincos_piby4(api, 0.0); + long jr = xodd ^ as_long(e ? sc.hi : sc.lo); + /* Below 2^52 use the computed value; otherwise keep the zero/NaN chosen above. */ + ir = ax < 0x1.0p+52 ? jr : ir; + + return as_double(ir); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sinpi, double) + +#endif diff --git a/libclc/generic/lib/math/sqrt.cl b/libclc/generic/lib/math/sqrt.cl new file mode 100644 index 0000000..300e274 --- /dev/null +++ b/libclc/generic/lib/math/sqrt.cl @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include <clc/clc.h> +#include "../clcmacro.h" +#include "math/clc_sqrt.h" + /* Public float sqrt, instantiated through _CLC_DEFINE_UNARY_BUILTIN with __clc_sqrt named as the implementation. NOTE(review): the macro is not defined in this file; presumably its arguments are (return type, public name, implementation, argument type) and it also emits the vector overloads - confirm against clcmacro.h. */ +_CLC_DEFINE_UNARY_BUILTIN(float, sqrt, __clc_sqrt, float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + /* Double-precision instantiation; only compiled when cl_khr_fp64 is defined. */ +_CLC_DEFINE_UNARY_BUILTIN(double, sqrt, __clc_sqrt, double) + +#endif diff --git a/libclc/generic/lib/math/tables.cl b/libclc/generic/lib/math/tables.cl new file mode 100644 index 0000000..8286efb --- /dev/null +++ b/libclc/generic/lib/math/tables.cl @@ -0,0 +1,841 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include <clc/clc.h> + +#include "tables.h" + +DECLARE_TABLE(float2, LOGE_TBL, 129) = { + (float2)(0x0.000000p+0f, 0x0.000000p+0f), + (float2)(0x1.fe0000p-8f, 0x1.535882p-23f), + (float2)(0x1.fc0000p-7f, 0x1.5161f8p-20f), + (float2)(0x1.7b8000p-6f, 0x1.1b07d4p-18f), + (float2)(0x1.f82000p-6f, 0x1.361cf0p-19f), + (float2)(0x1.39e000p-5f, 0x1.0f73fcp-18f), + (float2)(0x1.774000p-5f, 0x1.63d8cap-19f), + (float2)(0x1.b42000p-5f, 0x1.bae232p-18f), + (float2)(0x1.f0a000p-5f, 0x1.86008ap-20f), + (float2)(0x1.164000p-4f, 0x1.36eea2p-16f), + (float2)(0x1.340000p-4f, 0x1.d7961ap-16f), + (float2)(0x1.51a000p-4f, 0x1.073f06p-16f), + (float2)(0x1.6f0000p-4f, 0x1.a515cap-17f), + (float2)(0x1.8c2000p-4f, 0x1.45d630p-16f), + (float2)(0x1.a92000p-4f, 0x1.b4e92ap-18f), + (float2)(0x1.c5e000p-4f, 0x1.523d6ep-18f), + (float2)(0x1.e26000p-4f, 0x1.076e2ap-16f), + (float2)(0x1.fec000p-4f, 0x1.2263b6p-17f), + (float2)(0x1.0d6000p-3f, 0x1.7e7cd0p-15f), + (float2)(0x1.1b6000p-3f, 0x1.2ad52ep-15f), + (float2)(0x1.294000p-3f, 0x1.52f81ep-15f), + (float2)(0x1.370000p-3f, 0x1.fc201ep-15f), + (float2)(0x1.44c000p-3f, 0x1.2b6ccap-15f), + (float2)(0x1.526000p-3f, 0x1.cbc742p-16f), + (float2)(0x1.5fe000p-3f, 0x1.3070a6p-15f), + (float2)(0x1.6d6000p-3f, 0x1.fce33ap-20f), + (float2)(0x1.7aa000p-3f, 0x1.890210p-15f), + (float2)(0x1.87e000p-3f, 0x1.a06520p-15f), + (float2)(0x1.952000p-3f, 0x1.6a73d0p-17f), + (float2)(0x1.a22000p-3f, 0x1.bc1fe2p-15f), + (float2)(0x1.af2000p-3f, 0x1.c94e80p-15f), + (float2)(0x1.bc2000p-3f, 0x1.0ce85ap-16f), + (float2)(0x1.c8e000p-3f, 0x1.f7c79ap-15f), + (float2)(0x1.d5c000p-3f, 0x1.0b5a7cp-18f), + (float2)(0x1.e26000p-3f, 0x1.076e2ap-15f), + (float2)(0x1.ef0000p-3f, 0x1.5b97b8p-16f), + (float2)(0x1.fb8000p-3f, 0x1.186d5ep-15f), + (float2)(0x1.040000p-2f, 0x1.2ca5a6p-17f), + (float2)(0x1.0a2000p-2f, 0x1.24e272p-14f), + (float2)(0x1.104000p-2f, 0x1.8bf9aep-14f), + (float2)(0x1.166000p-2f, 0x1.5cabaap-14f), + (float2)(0x1.1c8000p-2f, 0x1.3182d2p-15f), + 
(float2)(0x1.228000p-2f, 0x1.41fbcep-14f), + (float2)(0x1.288000p-2f, 0x1.5a13dep-14f), + (float2)(0x1.2e8000p-2f, 0x1.c575c2p-15f), + (float2)(0x1.346000p-2f, 0x1.dd9a98p-14f), + (float2)(0x1.3a6000p-2f, 0x1.3155a4p-16f), + (float2)(0x1.404000p-2f, 0x1.843434p-17f), + (float2)(0x1.460000p-2f, 0x1.8bc21cp-14f), + (float2)(0x1.4be000p-2f, 0x1.7e55dcp-16f), + (float2)(0x1.51a000p-2f, 0x1.5b0e5ap-15f), + (float2)(0x1.576000p-2f, 0x1.dc5d14p-16f), + (float2)(0x1.5d0000p-2f, 0x1.bdbf58p-14f), + (float2)(0x1.62c000p-2f, 0x1.05e572p-15f), + (float2)(0x1.686000p-2f, 0x1.903d36p-15f), + (float2)(0x1.6e0000p-2f, 0x1.1d5456p-15f), + (float2)(0x1.738000p-2f, 0x1.d7f6bap-14f), + (float2)(0x1.792000p-2f, 0x1.4abfbap-15f), + (float2)(0x1.7ea000p-2f, 0x1.f07704p-15f), + (float2)(0x1.842000p-2f, 0x1.a3b43cp-15f), + (float2)(0x1.89a000p-2f, 0x1.9c360ap-17f), + (float2)(0x1.8f0000p-2f, 0x1.1e8736p-14f), + (float2)(0x1.946000p-2f, 0x1.941c20p-14f), + (float2)(0x1.99c000p-2f, 0x1.958116p-14f), + (float2)(0x1.9f2000p-2f, 0x1.23ecbep-14f), + (float2)(0x1.a48000p-2f, 0x1.024396p-16f), + (float2)(0x1.a9c000p-2f, 0x1.d93534p-15f), + (float2)(0x1.af0000p-2f, 0x1.293246p-14f), + (float2)(0x1.b44000p-2f, 0x1.eef798p-15f), + (float2)(0x1.b98000p-2f, 0x1.625a4cp-16f), + (float2)(0x1.bea000p-2f, 0x1.4d9da6p-14f), + (float2)(0x1.c3c000p-2f, 0x1.d7a7ccp-14f), + (float2)(0x1.c8e000p-2f, 0x1.f7c79ap-14f), + (float2)(0x1.ce0000p-2f, 0x1.af0b84p-14f), + (float2)(0x1.d32000p-2f, 0x1.fcfc00p-15f), + (float2)(0x1.d82000p-2f, 0x1.e7258ap-14f), + (float2)(0x1.dd4000p-2f, 0x1.a81306p-16f), + (float2)(0x1.e24000p-2f, 0x1.1034f8p-15f), + (float2)(0x1.e74000p-2f, 0x1.09875ap-16f), + (float2)(0x1.ec2000p-2f, 0x1.99d246p-14f), + (float2)(0x1.f12000p-2f, 0x1.1ebf5ep-15f), + (float2)(0x1.f60000p-2f, 0x1.23fa70p-14f), + (float2)(0x1.fae000p-2f, 0x1.588f78p-14f), + (float2)(0x1.ffc000p-2f, 0x1.2e0856p-14f), + (float2)(0x1.024000p-1f, 0x1.52a5a4p-13f), + (float2)(0x1.04a000p-1f, 0x1.df9da8p-13f), + 
(float2)(0x1.072000p-1f, 0x1.f2e0e6p-16f), + (float2)(0x1.098000p-1f, 0x1.bd3d5cp-15f), + (float2)(0x1.0be000p-1f, 0x1.cb9094p-15f), + (float2)(0x1.0e4000p-1f, 0x1.261746p-15f), + (float2)(0x1.108000p-1f, 0x1.f39e2cp-13f), + (float2)(0x1.12e000p-1f, 0x1.719592p-13f), + (float2)(0x1.154000p-1f, 0x1.87a5e8p-14f), + (float2)(0x1.178000p-1f, 0x1.eabbd8p-13f), + (float2)(0x1.19e000p-1f, 0x1.cd68cep-14f), + (float2)(0x1.1c2000p-1f, 0x1.b81f70p-13f), + (float2)(0x1.1e8000p-1f, 0x1.7d79c0p-15f), + (float2)(0x1.20c000p-1f, 0x1.b9a324p-14f), + (float2)(0x1.230000p-1f, 0x1.30d7bep-13f), + (float2)(0x1.254000p-1f, 0x1.5bce98p-13f), + (float2)(0x1.278000p-1f, 0x1.5e1288p-13f), + (float2)(0x1.29c000p-1f, 0x1.37fec2p-13f), + (float2)(0x1.2c0000p-1f, 0x1.d3da88p-14f), + (float2)(0x1.2e4000p-1f, 0x1.d0db90p-15f), + (float2)(0x1.306000p-1f, 0x1.d7334ep-13f), + (float2)(0x1.32a000p-1f, 0x1.133912p-13f), + (float2)(0x1.34e000p-1f, 0x1.44ece6p-16f), + (float2)(0x1.370000p-1f, 0x1.17b546p-13f), + (float2)(0x1.392000p-1f, 0x1.e0d356p-13f), + (float2)(0x1.3b6000p-1f, 0x1.0893fep-14f), + (float2)(0x1.3d8000p-1f, 0x1.026a70p-13f), + (float2)(0x1.3fa000p-1f, 0x1.5b84d0p-13f), + (float2)(0x1.41c000p-1f, 0x1.8fe846p-13f), + (float2)(0x1.43e000p-1f, 0x1.9fe2f8p-13f), + (float2)(0x1.460000p-1f, 0x1.8bc21cp-13f), + (float2)(0x1.482000p-1f, 0x1.53d1eap-13f), + (float2)(0x1.4a4000p-1f, 0x1.f0bb60p-14f), + (float2)(0x1.4c6000p-1f, 0x1.e6bf32p-15f), + (float2)(0x1.4e6000p-1f, 0x1.d811b6p-13f), + (float2)(0x1.508000p-1f, 0x1.13cc00p-13f), + (float2)(0x1.52a000p-1f, 0x1.6932dep-16f), + (float2)(0x1.54a000p-1f, 0x1.246798p-13f), + (float2)(0x1.56a000p-1f, 0x1.f9d5b2p-13f), + (float2)(0x1.58c000p-1f, 0x1.5b6b9ap-14f), + (float2)(0x1.5ac000p-1f, 0x1.404c34p-13f), + (float2)(0x1.5cc000p-1f, 0x1.b1dc6cp-13f), + (float2)(0x1.5ee000p-1f, 0x1.54920ap-20f), + (float2)(0x1.60e000p-1f, 0x1.97a23cp-16f), + (float2)(0x1.62e000p-1f, 0x1.0bfbe8p-15f), +}; + +DECLARE_TABLE(float, LOG_INV_TBL, 129) = { + 
0x1.000000p+1f, + 0x1.fc07f0p+0f, + 0x1.f81f82p+0f, + 0x1.f4465ap+0f, + 0x1.f07c20p+0f, + 0x1.ecc07cp+0f, + 0x1.e9131ap+0f, + 0x1.e573acp+0f, + 0x1.e1e1e2p+0f, + 0x1.de5d6ep+0f, + 0x1.dae608p+0f, + 0x1.d77b66p+0f, + 0x1.d41d42p+0f, + 0x1.d0cb58p+0f, + 0x1.cd8568p+0f, + 0x1.ca4b30p+0f, + 0x1.c71c72p+0f, + 0x1.c3f8f0p+0f, + 0x1.c0e070p+0f, + 0x1.bdd2b8p+0f, + 0x1.bacf92p+0f, + 0x1.b7d6c4p+0f, + 0x1.b4e81cp+0f, + 0x1.b20364p+0f, + 0x1.af286cp+0f, + 0x1.ac5702p+0f, + 0x1.a98ef6p+0f, + 0x1.a6d01ap+0f, + 0x1.a41a42p+0f, + 0x1.a16d40p+0f, + 0x1.9ec8eap+0f, + 0x1.9c2d14p+0f, + 0x1.99999ap+0f, + 0x1.970e50p+0f, + 0x1.948b10p+0f, + 0x1.920fb4p+0f, + 0x1.8f9c18p+0f, + 0x1.8d3018p+0f, + 0x1.8acb90p+0f, + 0x1.886e60p+0f, + 0x1.861862p+0f, + 0x1.83c978p+0f, + 0x1.818182p+0f, + 0x1.7f4060p+0f, + 0x1.7d05f4p+0f, + 0x1.7ad220p+0f, + 0x1.78a4c8p+0f, + 0x1.767dcep+0f, + 0x1.745d18p+0f, + 0x1.724288p+0f, + 0x1.702e06p+0f, + 0x1.6e1f76p+0f, + 0x1.6c16c2p+0f, + 0x1.6a13cep+0f, + 0x1.681682p+0f, + 0x1.661ec6p+0f, + 0x1.642c86p+0f, + 0x1.623fa8p+0f, + 0x1.605816p+0f, + 0x1.5e75bcp+0f, + 0x1.5c9882p+0f, + 0x1.5ac056p+0f, + 0x1.58ed24p+0f, + 0x1.571ed4p+0f, + 0x1.555556p+0f, + 0x1.539094p+0f, + 0x1.51d07ep+0f, + 0x1.501502p+0f, + 0x1.4e5e0ap+0f, + 0x1.4cab88p+0f, + 0x1.4afd6ap+0f, + 0x1.49539ep+0f, + 0x1.47ae14p+0f, + 0x1.460cbcp+0f, + 0x1.446f86p+0f, + 0x1.42d662p+0f, + 0x1.414142p+0f, + 0x1.3fb014p+0f, + 0x1.3e22ccp+0f, + 0x1.3c995ap+0f, + 0x1.3b13b2p+0f, + 0x1.3991c2p+0f, + 0x1.381382p+0f, + 0x1.3698e0p+0f, + 0x1.3521d0p+0f, + 0x1.33ae46p+0f, + 0x1.323e34p+0f, + 0x1.30d190p+0f, + 0x1.2f684cp+0f, + 0x1.2e025cp+0f, + 0x1.2c9fb4p+0f, + 0x1.2b404ap+0f, + 0x1.29e412p+0f, + 0x1.288b02p+0f, + 0x1.27350cp+0f, + 0x1.25e228p+0f, + 0x1.24924ap+0f, + 0x1.234568p+0f, + 0x1.21fb78p+0f, + 0x1.20b470p+0f, + 0x1.1f7048p+0f, + 0x1.1e2ef4p+0f, + 0x1.1cf06ap+0f, + 0x1.1bb4a4p+0f, + 0x1.1a7b96p+0f, + 0x1.194538p+0f, + 0x1.181182p+0f, + 0x1.16e068p+0f, + 0x1.15b1e6p+0f, + 0x1.1485f0p+0f, + 0x1.135c82p+0f, + 
0x1.12358ep+0f, + 0x1.111112p+0f, + 0x1.0fef02p+0f, + 0x1.0ecf56p+0f, + 0x1.0db20ap+0f, + 0x1.0c9714p+0f, + 0x1.0b7e6ep+0f, + 0x1.0a6810p+0f, + 0x1.0953f4p+0f, + 0x1.084210p+0f, + 0x1.073260p+0f, + 0x1.0624dep+0f, + 0x1.051980p+0f, + 0x1.041042p+0f, + 0x1.03091cp+0f, + 0x1.020408p+0f, + 0x1.010102p+0f, + 0x1.000000p+0f, +}; + +DECLARE_TABLE(float2, LOG2_TBL, 129) = { + (float2)(0x0.000000p+0f, 0x0.000000p+0f), + (float2)(0x1.6f8000p-7f, 0x1.942dbap-17f), + (float2)(0x1.6e0000p-6f, 0x1.e5a170p-16f), + (float2)(0x1.118000p-5f, 0x1.347544p-15f), + (float2)(0x1.6b8000p-5f, 0x1.69bac6p-16f), + (float2)(0x1.c48000p-5f, 0x1.7eae42p-15f), + (float2)(0x1.0e8000p-4f, 0x1.9c4fd0p-15f), + (float2)(0x1.3a8000p-4f, 0x1.17ee92p-15f), + (float2)(0x1.660000p-4f, 0x1.fb7d64p-15f), + (float2)(0x1.918000p-4f, 0x1.42dc8cp-17f), + (float2)(0x1.bc8000p-4f, 0x1.0902b6p-18f), + (float2)(0x1.e70000p-4f, 0x1.7608bep-15f), + (float2)(0x1.088000p-3f, 0x1.162336p-13f), + (float2)(0x1.1d8000p-3f, 0x1.3465d4p-13f), + (float2)(0x1.328000p-3f, 0x1.74f13cp-14f), + (float2)(0x1.470000p-3f, 0x1.aa7e60p-13f), + (float2)(0x1.5c0000p-3f, 0x1.a39fbcp-19f), + (float2)(0x1.700000p-3f, 0x1.d0b53ap-13f), + (float2)(0x1.848000p-3f, 0x1.0af40ap-13f), + (float2)(0x1.988000p-3f, 0x1.b741dep-13f), + (float2)(0x1.ac8000p-3f, 0x1.d78b6cp-13f), + (float2)(0x1.c08000p-3f, 0x1.6db376p-13f), + (float2)(0x1.d48000p-3f, 0x1.ee4c32p-15f), + (float2)(0x1.e80000p-3f, 0x1.02f9d2p-13f), + (float2)(0x1.fb8000p-3f, 0x1.05ae40p-13f), + (float2)(0x1.078000p-2f, 0x1.0adbb0p-14f), + (float2)(0x1.110000p-2f, 0x1.83ed68p-13f), + (float2)(0x1.1a8000p-2f, 0x1.016ca4p-12f), + (float2)(0x1.240000p-2f, 0x1.01eac2p-12f), + (float2)(0x1.2d8000p-2f, 0x1.887e26p-13f), + (float2)(0x1.370000p-2f, 0x1.24cea4p-14f), + (float2)(0x1.400000p-2f, 0x1.918ec6p-12f), + (float2)(0x1.498000p-2f, 0x1.3c25e6p-13f), + (float2)(0x1.528000p-2f, 0x1.6f7f12p-12f), + (float2)(0x1.5c0000p-2f, 0x1.a39fbcp-18f), + (float2)(0x1.650000p-2f, 0x1.8fe466p-14f), + 
(float2)(0x1.6e0000p-2f, 0x1.10e6cep-13f), + (float2)(0x1.770000p-2f, 0x1.d2ba7ep-14f), + (float2)(0x1.800000p-2f, 0x1.4ac62cp-15f), + (float2)(0x1.888000p-2f, 0x1.a71cb8p-12f), + (float2)(0x1.918000p-2f, 0x1.dd448ep-13f), + (float2)(0x1.9a8000p-2f, 0x1.1c8f10p-21f), + (float2)(0x1.a30000p-2f, 0x1.bb053ep-13f), + (float2)(0x1.ab8000p-2f, 0x1.861e5ep-12f), + (float2)(0x1.b40000p-2f, 0x1.fafdcep-12f), + (float2)(0x1.bd0000p-2f, 0x1.e5d3cep-15f), + (float2)(0x1.c58000p-2f, 0x1.2fad28p-14f), + (float2)(0x1.ce0000p-2f, 0x1.492474p-15f), + (float2)(0x1.d60000p-2f, 0x1.d4f80cp-12f), + (float2)(0x1.de8000p-2f, 0x1.4ff510p-12f), + (float2)(0x1.e70000p-2f, 0x1.3550f2p-13f), + (float2)(0x1.ef0000p-2f, 0x1.b59ccap-12f), + (float2)(0x1.f78000p-2f, 0x1.42b464p-13f), + (float2)(0x1.ff8000p-2f, 0x1.5e66a0p-12f), + (float2)(0x1.038000p-1f, 0x1.f6a2e4p-11f), + (float2)(0x1.080000p-1f, 0x1.39e4fep-14f), + (float2)(0x1.0c0000p-1f, 0x1.0500d6p-13f), + (float2)(0x1.100000p-1f, 0x1.13b152p-13f), + (float2)(0x1.140000p-1f, 0x1.93f542p-14f), + (float2)(0x1.180000p-1f, 0x1.467b94p-16f), + (float2)(0x1.1b8000p-1f, 0x1.cc47a4p-11f), + (float2)(0x1.1f8000p-1f, 0x1.78f4c2p-11f), + (float2)(0x1.238000p-1f, 0x1.107508p-11f), + (float2)(0x1.278000p-1f, 0x1.2602c2p-12f), + (float2)(0x1.2b8000p-1f, 0x1.a39fbcp-20f), + (float2)(0x1.2f0000p-1f, 0x1.5a1d7ap-11f), + (float2)(0x1.330000p-1f, 0x1.3e355ap-12f), + (float2)(0x1.368000p-1f, 0x1.cffedap-11f), + (float2)(0x1.3a8000p-1f, 0x1.d9fd50p-12f), + (float2)(0x1.3e0000p-1f, 0x1.f64de6p-11f), + (float2)(0x1.420000p-1f, 0x1.d83f4cp-12f), + (float2)(0x1.458000p-1f, 0x1.cea628p-11f), + (float2)(0x1.498000p-1f, 0x1.3c25e6p-12f), + (float2)(0x1.4d0000p-1f, 0x1.5a96ccp-11f), + (float2)(0x1.510000p-1f, 0x1.18708ap-17f), + (float2)(0x1.548000p-1f, 0x1.374652p-12f), + (float2)(0x1.580000p-1f, 0x1.2089a6p-11f), + (float2)(0x1.5b8000p-1f, 0x1.93432cp-11f), + (float2)(0x1.5f0000p-1f, 0x1.f3fd06p-11f), + (float2)(0x1.630000p-1f, 0x1.0b8f54p-13f), + 
(float2)(0x1.668000p-1f, 0x1.004722p-12f), + (float2)(0x1.6a0000p-1f, 0x1.57cf2cp-12f), + (float2)(0x1.6d8000p-1f, 0x1.8cb53ap-12f), + (float2)(0x1.710000p-1f, 0x1.9f4d8ap-12f), + (float2)(0x1.748000p-1f, 0x1.8feb26p-12f), + (float2)(0x1.780000p-1f, 0x1.5edfeep-12f), + (float2)(0x1.7b8000p-1f, 0x1.0c7c9ap-12f), + (float2)(0x1.7f0000p-1f, 0x1.322182p-13f), + (float2)(0x1.828000p-1f, 0x1.3ab7cep-18f), + (float2)(0x1.858000p-1f, 0x1.a82c2cp-11f), + (float2)(0x1.890000p-1f, 0x1.3dd2c0p-11f), + (float2)(0x1.8c8000p-1f, 0x1.871da4p-12f), + (float2)(0x1.900000p-1f, 0x1.cc2c00p-14f), + (float2)(0x1.930000p-1f, 0x1.9fdb68p-11f), + (float2)(0x1.968000p-1f, 0x1.ed6956p-12f), + (float2)(0x1.9a0000p-1f, 0x1.f1a760p-14f), + (float2)(0x1.9d0000p-1f, 0x1.767f54p-11f), + (float2)(0x1.a08000p-1f, 0x1.3f6d26p-12f), + (float2)(0x1.a38000p-1f, 0x1.b9fce2p-11f), + (float2)(0x1.a70000p-1f, 0x1.8ae816p-12f), + (float2)(0x1.aa0000p-1f, 0x1.c23d60p-11f), + (float2)(0x1.ad8000p-1f, 0x1.60f388p-12f), + (float2)(0x1.b08000p-1f, 0x1.9049aep-11f), + (float2)(0x1.b40000p-1f, 0x1.8734a8p-13f), + (float2)(0x1.b70000p-1f, 0x1.2523d4p-11f), + (float2)(0x1.ba0000p-1f, 0x1.da6ce6p-11f), + (float2)(0x1.bd8000p-1f, 0x1.038e62p-12f), + (float2)(0x1.c08000p-1f, 0x1.1b511ep-11f), + (float2)(0x1.c38000p-1f, 0x1.a728b8p-11f), + (float2)(0x1.c70000p-1f, 0x1.2b5d22p-14f), + (float2)(0x1.ca0000p-1f, 0x1.2c6e54p-12f), + (float2)(0x1.cd0000p-1f, 0x1.f35064p-12f), + (float2)(0x1.d00000p-1f, 0x1.4fdb48p-11f), + (float2)(0x1.d30000p-1f, 0x1.98ec9ep-11f), + (float2)(0x1.d60000p-1f, 0x1.d4f80cp-11f), + (float2)(0x1.d98000p-1f, 0x1.0643d6p-17f), + (float2)(0x1.dc8000p-1f, 0x1.33567ep-14f), + (float2)(0x1.df8000p-1f, 0x1.e0410cp-14f), + (float2)(0x1.e28000p-1f, 0x1.142e0ep-13f), + (float2)(0x1.e58000p-1f, 0x1.063c88p-13f), + (float2)(0x1.e88000p-1f, 0x1.8d66c4p-14f), + (float2)(0x1.eb8000p-1f, 0x1.57e32ap-15f), + (float2)(0x1.ee0000p-1f, 0x1.ed1c6cp-11f), + (float2)(0x1.f10000p-1f, 0x1.b8a076p-11f), + 
(float2)(0x1.f40000p-1f, 0x1.7822f2p-11f), + (float2)(0x1.f70000p-1f, 0x1.2bbc3ap-11f), + (float2)(0x1.fa0000p-1f, 0x1.a708bap-12f), + (float2)(0x1.fd0000p-1f, 0x1.be4c7ep-13f), + (float2)(0x1.000000p+0f, 0x0.000000p+0f) +}; + /* PIBITS_TBL is a byte table declared without an explicit element count. NOTE(review): the name suggests these are bits of a pi-related constant used for argument reduction, and the run of zeros at the end looks like padding for the wide read in pibits_tbl below - confirm with the callers. */ +DECLARE_TABLE(uchar, PIBITS_TBL, ) = { + 224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175, + 169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31, + 235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38, + 44, 159, 60, 251, 217, 180, 125, 180, 41, 104, 45, 70, 188, + 188, 63, 96, 22, 120, 255, 95, 226, 127, 236, 160, 228, 247, + 46, 126, 17, 114, 210, 231, 76, 13, 230, 88, 71, 230, 4, 249, + 125, 209, 154, 192, 113, 166, 19, 18, 237, 186, 212, 215, 8, + 162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70, 39, + 168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21, + 239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109, + 3, 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13, + 230, 139, 2, 0, 0, 0, 0, 0, 0, 0 +}; + /* Accessors for the float tables, generated by TABLE_FUNCTION(type, table, accessor name); the macro is presumably defined in tables.h - confirm. */ +TABLE_FUNCTION(float2, LOGE_TBL, loge_tbl); +TABLE_FUNCTION(float, LOG_INV_TBL, log_inv_tbl); +TABLE_FUNCTION(float2, LOG2_TBL, log2_tbl); + /* Hand-written accessor for PIBITS_TBL: returns the uint4 read from the table storage starting idx elements in. NOTE(review): the cast from the uchar table to __constant uint4 * assumes the address PIBITS_TBL + idx is acceptably aligned for a uint4 load on the target and that a full uint4 starting at idx lies inside the table - verify both against the callers. */ +uint4 TABLE_MANGLE(pibits_tbl)(size_t idx) { + return *(__constant uint4 *)(PIBITS_TBL + idx); +} + +#ifdef cl_khr_fp64 + +DECLARE_TABLE(double2, LN_TBL, 65) = { + (double2)(0x0.0000000000000p+0, 0x0.0000000000000p+0), + (double2)(0x1.fc0a800000000p-7, 0x1.61f807c79f3dbp-28), + (double2)(0x1.f829800000000p-6, 0x1.873c1980267c8p-25), + (double2)(0x1.7745800000000p-5, 0x1.ec65b9f88c69ep-26), + (double2)(0x1.f0a3000000000p-5, 0x1.8022c54cc2f99p-26), + (double2)(0x1.341d700000000p-4, 0x1.2c37a3a125330p-25), + (double2)(0x1.6f0d200000000p-4, 0x1.15cad69737c93p-25), + (double2)(0x1.a926d00000000p-4, 0x1.d256ab1b285e9p-27), + (double2)(0x1.e270700000000p-4, 0x1.b8abcb97a7aa2p-26), + (double2)(0x1.0d77e00000000p-3, 0x1.f34239659a5dcp-25), + (double2)(0x1.2955280000000p-3, 0x1.e07fd48d30177p-25), + (double2)(0x1.44d2b00000000p-3, 
0x1.b32df4799f4f6p-25), + (double2)(0x1.5ff3000000000p-3, 0x1.c29e4f4f21cf8p-25), + (double2)(0x1.7ab8900000000p-3, 0x1.086c848df1b59p-30), + (double2)(0x1.9525a80000000p-3, 0x1.cf456b4764130p-27), + (double2)(0x1.af3c900000000p-3, 0x1.3a02ffcb63398p-25), + (double2)(0x1.c8ff780000000p-3, 0x1.1e6a6886b0976p-25), + (double2)(0x1.e270700000000p-3, 0x1.b8abcb97a7aa2p-25), + (double2)(0x1.fb91800000000p-3, 0x1.b578f8aa35552p-25), + (double2)(0x1.0a324c0000000p-2, 0x1.139c871afb9fcp-25), + (double2)(0x1.1675c80000000p-2, 0x1.5d5d30701ce64p-25), + (double2)(0x1.22941c0000000p-2, 0x1.de7bcb2d12142p-25), + (double2)(0x1.2e8e280000000p-2, 0x1.d708e984e1664p-25), + (double2)(0x1.3a64c40000000p-2, 0x1.56945e9c72f36p-26), + (double2)(0x1.4618bc0000000p-2, 0x1.0e2f613e85bdap-29), + (double2)(0x1.51aad80000000p-2, 0x1.cb7e0b42724f6p-28), + (double2)(0x1.5d1bd80000000p-2, 0x1.fac04e52846c7p-25), + (double2)(0x1.686c800000000p-2, 0x1.e9b14aec442bep-26), + (double2)(0x1.739d7c0000000p-2, 0x1.b5de8034e7126p-25), + (double2)(0x1.7eaf800000000p-2, 0x1.dc157e1b259d3p-25), + (double2)(0x1.89a3380000000p-2, 0x1.b05096ad69c62p-28), + (double2)(0x1.9479400000000p-2, 0x1.c2116faba4cddp-26), + (double2)(0x1.9f323c0000000p-2, 0x1.65fcc25f95b47p-25), + (double2)(0x1.a9cec80000000p-2, 0x1.a9a08498d4850p-26), + (double2)(0x1.b44f740000000p-2, 0x1.de647b1465f77p-25), + (double2)(0x1.beb4d80000000p-2, 0x1.da71b7bf7861dp-26), + (double2)(0x1.c8ff7c0000000p-2, 0x1.e6a6886b09760p-28), + (double2)(0x1.d32fe40000000p-2, 0x1.f0075eab0ef64p-25), + (double2)(0x1.dd46a00000000p-2, 0x1.3071282fb989bp-28), + (double2)(0x1.e744240000000p-2, 0x1.0eb43c3f1bed2p-25), + (double2)(0x1.f128f40000000p-2, 0x1.faf06ecb35c84p-26), + (double2)(0x1.faf5880000000p-2, 0x1.ef1e63db35f68p-27), + (double2)(0x1.02552a0000000p-1, 0x1.69743fb1a71a5p-27), + (double2)(0x1.0723e40000000p-1, 0x1.c1cdf404e5796p-25), + (double2)(0x1.0be72e0000000p-1, 0x1.094aa0ada625ep-27), + (double2)(0x1.109f380000000p-1, 0x1.e2d4c96fde3ecp-25), + 
(double2)(0x1.154c3c0000000p-1, 0x1.2f4d5e9a98f34p-25), + (double2)(0x1.19ee6a0000000p-1, 0x1.467c96ecc5cbep-25), + (double2)(0x1.1e85f40000000p-1, 0x1.e7040d03dec5ap-25), + (double2)(0x1.23130c0000000p-1, 0x1.7bebf4282de36p-25), + (double2)(0x1.2795e00000000p-1, 0x1.289b11aeb783fp-25), + (double2)(0x1.2c0e9e0000000p-1, 0x1.a891d1772f538p-26), + (double2)(0x1.307d720000000p-1, 0x1.34f10be1fb591p-25), + (double2)(0x1.34e2880000000p-1, 0x1.d9ce1d316eb93p-25), + (double2)(0x1.393e0c0000000p-1, 0x1.3562a19a9c442p-25), + (double2)(0x1.3d90260000000p-1, 0x1.4e2adf548084cp-26), + (double2)(0x1.41d8fe0000000p-1, 0x1.08ce55cc8c97ap-26), + (double2)(0x1.4618bc0000000p-1, 0x1.0e2f613e85bdap-28), + (double2)(0x1.4a4f840000000p-1, 0x1.db03ebb0227bfp-25), + (double2)(0x1.4e7d800000000p-1, 0x1.1b75bb09cb098p-25), + (double2)(0x1.52a2d20000000p-1, 0x1.96f16abb9df22p-27), + (double2)(0x1.56bf9c0000000p-1, 0x1.5b3f399411c62p-25), + (double2)(0x1.5ad4040000000p-1, 0x1.86b3e59f65355p-26), + (double2)(0x1.5ee02a0000000p-1, 0x1.2482ceae1ac12p-26), + (double2)(0x1.62e42e0000000p-1, 0x1.efa39ef35793cp-25), +}; + +TABLE_FUNCTION(double2, LN_TBL, ln_tbl); + + +// Arrays atan_jby256_lead and atan_jby256_tail contain +// leading and trailing parts respectively of precomputed +// values of atan(j/256), for j = 16, 17, ..., 256. +// atan_jby256_lead contains the first 21 bits of precision, +// and atan_jby256_tail contains a further 53 bits precision. 
+ +DECLARE_TABLE(double2, ATAN_JBY256_TBL, 241) = { + (double2)(0x1.ff55b00000000p-5, 0x1.6e59fbd38db2cp-26), + (double2)(0x1.0f99e00000000p-4, 0x1.4e3aa54dedf96p-25), + (double2)(0x1.1f86d00000000p-4, 0x1.7e105ab1bda88p-25), + (double2)(0x1.2f71900000000p-4, 0x1.8c5254d013fd0p-27), + (double2)(0x1.3f59f00000000p-4, 0x1.cf8ab3ad62670p-29), + (double2)(0x1.4f3fd00000000p-4, 0x1.9dca4bec80468p-26), + (double2)(0x1.5f23200000000p-4, 0x1.3f4b5ec98a8dap-26), + (double2)(0x1.6f03b00000000p-4, 0x1.b9d49619d81fep-25), + (double2)(0x1.7ee1800000000p-4, 0x1.3017887460934p-27), + (double2)(0x1.8ebc500000000p-4, 0x1.11e3eca0b9944p-26), + (double2)(0x1.9e94100000000p-4, 0x1.4f3f73c5a332ep-26), + (double2)(0x1.ae68a00000000p-4, 0x1.c71c8ae0e00a6p-26), + (double2)(0x1.be39e00000000p-4, 0x1.7cde0f86fbdc7p-25), + (double2)(0x1.ce07c00000000p-4, 0x1.70f328c889c72p-26), + (double2)(0x1.ddd2100000000p-4, 0x1.c07ae9b994efep-26), + (double2)(0x1.ed98c00000000p-4, 0x1.0c8021d7b1698p-27), + (double2)(0x1.fd5ba00000000p-4, 0x1.35585edb8cb22p-25), + (double2)(0x1.068d500000000p-3, 0x1.0842567b30e96p-24), + (double2)(0x1.0e6ad00000000p-3, 0x1.99e811031472ep-24), + (double2)(0x1.1646500000000p-3, 0x1.041821416bceep-25), + (double2)(0x1.1e1fa00000000p-3, 0x1.f6086e4dc96f4p-24), + (double2)(0x1.25f6e00000000p-3, 0x1.71a535c5f1b58p-27), + (double2)(0x1.2dcbd00000000p-3, 0x1.65f743fe63ca1p-24), + (double2)(0x1.359e800000000p-3, 0x1.dbd733472d014p-24), + (double2)(0x1.3d6ee00000000p-3, 0x1.d18cc4d8b0d1dp-24), + (double2)(0x1.453ce00000000p-3, 0x1.8c12553c8fb29p-24), + (double2)(0x1.4d08700000000p-3, 0x1.53b49e2e8f991p-24), + (double2)(0x1.54d1800000000p-3, 0x1.7422ae148c141p-24), + (double2)(0x1.5c98100000000p-3, 0x1.e3ec269df56a8p-27), + (double2)(0x1.645bf00000000p-3, 0x1.ff6754e7e0ac9p-24), + (double2)(0x1.6c1d400000000p-3, 0x1.131267b1b5aadp-24), + (double2)(0x1.73dbd00000000p-3, 0x1.d14fa403a94bcp-24), + (double2)(0x1.7b97b00000000p-3, 0x1.2f396c089a3d8p-25), + (double2)(0x1.8350b00000000p-3, 
0x1.c731d78fa95bbp-24), + (double2)(0x1.8b06e00000000p-3, 0x1.c50f385177399p-24), + (double2)(0x1.92ba300000000p-3, 0x1.f41409c6f2c20p-25), + (double2)(0x1.9a6a800000000p-3, 0x1.d2d90c4c39ec0p-24), + (double2)(0x1.a217e00000000p-3, 0x1.80420696f2106p-25), + (double2)(0x1.a9c2300000000p-3, 0x1.b40327943a2e8p-27), + (double2)(0x1.b169600000000p-3, 0x1.5d35e02f3d2a2p-25), + (double2)(0x1.b90d700000000p-3, 0x1.4a498288117b0p-25), + (double2)(0x1.c0ae500000000p-3, 0x1.35da119afb324p-25), + (double2)(0x1.c84bf00000000p-3, 0x1.14e85cdb9a908p-24), + (double2)(0x1.cfe6500000000p-3, 0x1.38754e5547b9ap-25), + (double2)(0x1.d77d500000000p-3, 0x1.be40ae6ce3246p-24), + (double2)(0x1.df11000000000p-3, 0x1.0c993b3bea7e7p-24), + (double2)(0x1.e6a1400000000p-3, 0x1.1d2dd89ac3359p-24), + (double2)(0x1.ee2e100000000p-3, 0x1.1476603332c46p-25), + (double2)(0x1.f5b7500000000p-3, 0x1.f25901bac55b7p-24), + (double2)(0x1.fd3d100000000p-3, 0x1.f881b7c826e28p-24), + (double2)(0x1.025fa00000000p-2, 0x1.441996d698d20p-24), + (double2)(0x1.061ee00000000p-2, 0x1.407ac521ea089p-23), + (double2)(0x1.09dc500000000p-2, 0x1.2fb0c6c4b1723p-23), + (double2)(0x1.0d97e00000000p-2, 0x1.ca135966a3e18p-23), + (double2)(0x1.1151a00000000p-2, 0x1.b1218e4d646e4p-25), + (double2)(0x1.1509700000000p-2, 0x1.d4e72a350d288p-25), + (double2)(0x1.18bf500000000p-2, 0x1.4617e2f04c329p-23), + (double2)(0x1.1c73500000000p-2, 0x1.096ec41e82650p-25), + (double2)(0x1.2025500000000p-2, 0x1.9f91f25773e6ep-24), + (double2)(0x1.23d5600000000p-2, 0x1.59c0820f1d674p-25), + (double2)(0x1.2783700000000p-2, 0x1.02bf7a2df1064p-25), + (double2)(0x1.2b2f700000000p-2, 0x1.fb36bfc40508fp-23), + (double2)(0x1.2ed9800000000p-2, 0x1.ea08f3f8dc892p-24), + (double2)(0x1.3281800000000p-2, 0x1.3ed6254656a0ep-24), + (double2)(0x1.3627700000000p-2, 0x1.b83f5e5e69c58p-25), + (double2)(0x1.39cb400000000p-2, 0x1.d6ec2af768592p-23), + (double2)(0x1.3d6d100000000p-2, 0x1.493889a226f94p-25), + (double2)(0x1.410cb00000000p-2, 0x1.5ad8fa65279bap-23), + 
(double2)(0x1.44aa400000000p-2, 0x1.b615784d45434p-25), + (double2)(0x1.4845a00000000p-2, 0x1.09a184368f145p-23), + (double2)(0x1.4bdee00000000p-2, 0x1.61a2439b0d91cp-24), + (double2)(0x1.4f75f00000000p-2, 0x1.ce1a65e39a978p-24), + (double2)(0x1.530ad00000000p-2, 0x1.32a39a93b6a66p-23), + (double2)(0x1.569d800000000p-2, 0x1.1c3699af804e7p-23), + (double2)(0x1.5a2e000000000p-2, 0x1.75e0f4e44ede8p-26), + (double2)(0x1.5dbc300000000p-2, 0x1.f77ced1a7a83bp-23), + (double2)(0x1.6148400000000p-2, 0x1.84e7f0cb1b500p-29), + (double2)(0x1.64d1f00000000p-2, 0x1.ec6b838b02dfep-23), + (double2)(0x1.6859700000000p-2, 0x1.3ebf4dfbeda87p-23), + (double2)(0x1.6bdea00000000p-2, 0x1.9397aed9cb475p-23), + (double2)(0x1.6f61900000000p-2, 0x1.07937bc239c54p-24), + (double2)(0x1.72e2200000000p-2, 0x1.aa754553131b6p-23), + (double2)(0x1.7660700000000p-2, 0x1.4a05d407c45dcp-24), + (double2)(0x1.79dc600000000p-2, 0x1.132231a206dd0p-23), + (double2)(0x1.7d56000000000p-2, 0x1.2d8ecfdd69c88p-24), + (double2)(0x1.80cd400000000p-2, 0x1.a852c74218606p-24), + (double2)(0x1.8442200000000p-2, 0x1.71bf2baeebb50p-23), + (double2)(0x1.87b4b00000000p-2, 0x1.83d7db7491820p-27), + (double2)(0x1.8b24d00000000p-2, 0x1.ca50d92b6da14p-25), + (double2)(0x1.8e92900000000p-2, 0x1.6f5cde8530298p-26), + (double2)(0x1.91fde00000000p-2, 0x1.f343198910740p-24), + (double2)(0x1.9566d00000000p-2, 0x1.0e8d241ccd80ap-24), + (double2)(0x1.98cd500000000p-2, 0x1.1535ac619e6c8p-24), + (double2)(0x1.9c31600000000p-2, 0x1.7316041c36cd2p-24), + (double2)(0x1.9f93000000000p-2, 0x1.985a000637d8ep-24), + (double2)(0x1.a2f2300000000p-2, 0x1.f2f29858c0a68p-25), + (double2)(0x1.a64ee00000000p-2, 0x1.879847f96d909p-23), + (double2)(0x1.a9a9200000000p-2, 0x1.ab3d319e12e42p-23), + (double2)(0x1.ad00f00000000p-2, 0x1.5088162dfc4c2p-24), + (double2)(0x1.b056400000000p-2, 0x1.05749a1cd9d8cp-25), + (double2)(0x1.b3a9100000000p-2, 0x1.da65c6c6b8618p-26), + (double2)(0x1.b6f9600000000p-2, 0x1.739bf7df1ad64p-25), + 
(double2)(0x1.ba47300000000p-2, 0x1.bc31252aa3340p-25), + (double2)(0x1.bd92800000000p-2, 0x1.e528191ad3aa8p-26), + (double2)(0x1.c0db400000000p-2, 0x1.929d93df19f18p-23), + (double2)(0x1.c421900000000p-2, 0x1.ff11eb693a080p-26), + (double2)(0x1.c765500000000p-2, 0x1.55ae3f145a3a0p-27), + (double2)(0x1.caa6800000000p-2, 0x1.cbcd8c6c0ca82p-24), + (double2)(0x1.cde5300000000p-2, 0x1.0cb04d425d304p-24), + (double2)(0x1.d121500000000p-2, 0x1.9adfcab5be678p-24), + (double2)(0x1.d45ae00000000p-2, 0x1.93d90c5662508p-23), + (double2)(0x1.d791f00000000p-2, 0x1.68489bd35ff40p-24), + (double2)(0x1.dac6700000000p-2, 0x1.586ed3da2b7e0p-28), + (double2)(0x1.ddf8500000000p-2, 0x1.7604d2e850eeep-23), + (double2)(0x1.e127b00000000p-2, 0x1.ac1d12bfb53d8p-24), + (double2)(0x1.e454800000000p-2, 0x1.9b3d468274740p-28), + (double2)(0x1.e77eb00000000p-2, 0x1.fc5d68d10e53cp-24), + (double2)(0x1.eaa6500000000p-2, 0x1.8f9e51884becbp-23), + (double2)(0x1.edcb600000000p-2, 0x1.a87f0869c06d1p-23), + (double2)(0x1.f0ede00000000p-2, 0x1.31e7279f685fap-23), + (double2)(0x1.f40dd00000000p-2, 0x1.6a8282f9719b0p-27), + (double2)(0x1.f72b200000000p-2, 0x1.0d2724a8a44e0p-25), + (double2)(0x1.fa45d00000000p-2, 0x1.a60524b11ad4ep-23), + (double2)(0x1.fd5e000000000p-2, 0x1.75fdf832750f0p-26), + (double2)(0x1.0039c00000000p-1, 0x1.cf06902e4cd36p-23), + (double2)(0x1.01c3400000000p-1, 0x1.e82422d4f6d10p-25), + (double2)(0x1.034b700000000p-1, 0x1.24a091063e6c0p-26), + (double2)(0x1.04d2500000000p-1, 0x1.8a1a172dc6f38p-24), + (double2)(0x1.0657e00000000p-1, 0x1.29b6619f8a92dp-22), + (double2)(0x1.07dc300000000p-1, 0x1.9274d9c1b70c8p-24), + (double2)(0x1.095f300000000p-1, 0x1.0c34b1fbb7930p-26), + (double2)(0x1.0ae0e00000000p-1, 0x1.639866c20eb50p-25), + (double2)(0x1.0c61400000000p-1, 0x1.6d6d0f6832e9ep-23), + (double2)(0x1.0de0500000000p-1, 0x1.af54def99f25ep-22), + (double2)(0x1.0f5e200000000p-1, 0x1.16cfc52a00262p-22), + (double2)(0x1.10daa00000000p-1, 0x1.dcc1e83569c32p-23), + 
(double2)(0x1.1255d00000000p-1, 0x1.37f7a551ed425p-22), + (double2)(0x1.13cfb00000000p-1, 0x1.f6360adc98887p-22), + (double2)(0x1.1548500000000p-1, 0x1.2c6ec8d35a2c1p-22), + (double2)(0x1.16bfa00000000p-1, 0x1.bd44df84cb036p-23), + (double2)(0x1.1835a00000000p-1, 0x1.117cf826e310ep-22), + (double2)(0x1.19aa500000000p-1, 0x1.ca533f332cfc9p-22), + (double2)(0x1.1b1dc00000000p-1, 0x1.0f208509dbc2ep-22), + (double2)(0x1.1c8fe00000000p-1, 0x1.cd07d93c945dep-23), + (double2)(0x1.1e00b00000000p-1, 0x1.57bdfd67e6d72p-22), + (double2)(0x1.1f70400000000p-1, 0x1.aab89c516c658p-24), + (double2)(0x1.20de800000000p-1, 0x1.3e823b1a1b8a0p-25), + (double2)(0x1.224b700000000p-1, 0x1.307464a9d6d3cp-23), + (double2)(0x1.23b7100000000p-1, 0x1.c5993cd438843p-22), + (double2)(0x1.2521700000000p-1, 0x1.ba2fca02ab554p-22), + (double2)(0x1.268a900000000p-1, 0x1.01a5b6983a268p-23), + (double2)(0x1.27f2600000000p-1, 0x1.273d1b350efc8p-25), + (double2)(0x1.2958e00000000p-1, 0x1.64c238c37b0c6p-23), + (double2)(0x1.2abe200000000p-1, 0x1.aded07370a300p-25), + (double2)(0x1.2c22100000000p-1, 0x1.78091197eb47ep-23), + (double2)(0x1.2d84c00000000p-1, 0x1.4b0f245e0dabcp-24), + (double2)(0x1.2ee6200000000p-1, 0x1.080d9794e2eafp-22), + (double2)(0x1.3046400000000p-1, 0x1.d4ec242b60c76p-23), + (double2)(0x1.31a5200000000p-1, 0x1.221d2f940caa0p-27), + (double2)(0x1.3302b00000000p-1, 0x1.cdbc42b2bba5cp-24), + (double2)(0x1.345f000000000p-1, 0x1.cce37bb440840p-25), + (double2)(0x1.35ba000000000p-1, 0x1.6c1d999cf1dd0p-22), + (double2)(0x1.3713d00000000p-1, 0x1.bed8a07eb0870p-26), + (double2)(0x1.386c500000000p-1, 0x1.69ed88f490e3cp-24), + (double2)(0x1.39c3900000000p-1, 0x1.cd41719b73ef0p-25), + (double2)(0x1.3b19800000000p-1, 0x1.cbc4ac95b41b7p-22), + (double2)(0x1.3c6e400000000p-1, 0x1.238f1b890f5d7p-22), + (double2)(0x1.3dc1c00000000p-1, 0x1.50c4282259cc4p-24), + (double2)(0x1.3f13f00000000p-1, 0x1.713d2de87b3e2p-22), + (double2)(0x1.4064f00000000p-1, 0x1.1d5a7d2255276p-23), + 
(double2)(0x1.41b4a00000000p-1, 0x1.c0dfd48227ac1p-22), + (double2)(0x1.4303200000000p-1, 0x1.1c964dab76753p-22), + (double2)(0x1.4450600000000p-1, 0x1.6de56d5704496p-23), + (double2)(0x1.459c600000000p-1, 0x1.4aeb71fd19968p-23), + (double2)(0x1.46e7200000000p-1, 0x1.fbf91c57b1918p-23), + (double2)(0x1.4830a00000000p-1, 0x1.d6bef7fbe5d9ap-22), + (double2)(0x1.4978f00000000p-1, 0x1.464d3dc249066p-22), + (double2)(0x1.4ac0000000000p-1, 0x1.638e2ec4d9073p-22), + (double2)(0x1.4c05e00000000p-1, 0x1.16f4a7247ea7cp-24), + (double2)(0x1.4d4a800000000p-1, 0x1.1a0a740f1d440p-28), + (double2)(0x1.4e8de00000000p-1, 0x1.6edbb0114a33cp-23), + (double2)(0x1.4fd0100000000p-1, 0x1.dbee8bf1d513cp-24), + (double2)(0x1.5111000000000p-1, 0x1.5b8bdb0248f73p-22), + (double2)(0x1.5250c00000000p-1, 0x1.7de3d3f5eac64p-22), + (double2)(0x1.538f500000000p-1, 0x1.ee24187ae448ap-23), + (double2)(0x1.54cca00000000p-1, 0x1.e06c591ec5192p-22), + (double2)(0x1.5608d00000000p-1, 0x1.4e3861a332738p-24), + (double2)(0x1.5743c00000000p-1, 0x1.a9599dcc2bfe4p-24), + (double2)(0x1.587d800000000p-1, 0x1.f732fbad43468p-25), + (double2)(0x1.59b6000000000p-1, 0x1.eb9f573b727d9p-22), + (double2)(0x1.5aed600000000p-1, 0x1.8b212a2eb9897p-22), + (double2)(0x1.5c23900000000p-1, 0x1.384884c167215p-22), + (double2)(0x1.5d58900000000p-1, 0x1.0e2d363020051p-22), + (double2)(0x1.5e8c600000000p-1, 0x1.2820879fbd022p-22), + (double2)(0x1.5fbf000000000p-1, 0x1.a1ab9893e4b30p-22), + (double2)(0x1.60f0800000000p-1, 0x1.2d1b817a24478p-23), + (double2)(0x1.6220d00000000p-1, 0x1.15d7b8ded4878p-25), + (double2)(0x1.634ff00000000p-1, 0x1.8968f9db3a5e4p-24), + (double2)(0x1.647de00000000p-1, 0x1.71c4171fe135fp-22), + (double2)(0x1.65aab00000000p-1, 0x1.6d80f605d0d8cp-22), + (double2)(0x1.66d6600000000p-1, 0x1.c91f043691590p-24), + (double2)(0x1.6800e00000000p-1, 0x1.39f8a15fce2b2p-23), + (double2)(0x1.692a400000000p-1, 0x1.55beda9d94b80p-27), + (double2)(0x1.6a52700000000p-1, 0x1.b12c15d60949ap-23), + 
(double2)(0x1.6b79800000000p-1, 0x1.24167b312bfe3p-22), + (double2)(0x1.6c9f700000000p-1, 0x1.0ab8633070277p-22), + (double2)(0x1.6dc4400000000p-1, 0x1.54554ebbc80eep-23), + (double2)(0x1.6ee7f00000000p-1, 0x1.0204aef5a4bb8p-25), + (double2)(0x1.700a700000000p-1, 0x1.8af08c679cf2cp-22), + (double2)(0x1.712be00000000p-1, 0x1.0852a330ae6c8p-22), + (double2)(0x1.724c300000000p-1, 0x1.6d3eb9ec32916p-23), + (double2)(0x1.736b600000000p-1, 0x1.685cb7fcbbafep-23), + (double2)(0x1.7489700000000p-1, 0x1.1f751c1e0bd95p-22), + (double2)(0x1.75a6700000000p-1, 0x1.705b1b0f72560p-26), + (double2)(0x1.76c2400000000p-1, 0x1.b98d8d808ca92p-22), + (double2)(0x1.77dd100000000p-1, 0x1.2ea22c75cc980p-25), + (double2)(0x1.78f6b00000000p-1, 0x1.7aba62bca0350p-22), + (double2)(0x1.7a0f400000000p-1, 0x1.d73833442278cp-22), + (double2)(0x1.7b26c00000000p-1, 0x1.5a5ca1fb18bf9p-22), + (double2)(0x1.7c3d300000000p-1, 0x1.1a6092b6ecf28p-25), + (double2)(0x1.7d52800000000p-1, 0x1.44fd049aac104p-24), + (double2)(0x1.7e66c00000000p-1, 0x1.c114fd8df5180p-29), + (double2)(0x1.7f79e00000000p-1, 0x1.5972f130feae5p-22), + (double2)(0x1.808c000000000p-1, 0x1.ca034a55fe198p-24), + (double2)(0x1.819d000000000p-1, 0x1.6e2b149990227p-22), + (double2)(0x1.82ad000000000p-1, 0x1.b00000294592cp-24), + (double2)(0x1.83bbe00000000p-1, 0x1.8b9bdc442620ep-22), + (double2)(0x1.84c9c00000000p-1, 0x1.d94fdfabf3e4ep-23), + (double2)(0x1.85d6900000000p-1, 0x1.5db30b145ad9ap-23), + (double2)(0x1.86e2500000000p-1, 0x1.e3e1eb95022b0p-23), + (double2)(0x1.87ed000000000p-1, 0x1.d5b8b45442bd6p-22), + (double2)(0x1.88f6b00000000p-1, 0x1.7a046231ecd2ep-22), + (double2)(0x1.89ff500000000p-1, 0x1.feafe3ef55232p-22), + (double2)(0x1.8b06f00000000p-1, 0x1.839e7bfd78267p-22), + (double2)(0x1.8c0d900000000p-1, 0x1.45cf49d6fa900p-25), + (double2)(0x1.8d13200000000p-1, 0x1.be3132b27f380p-27), + (double2)(0x1.8e17a00000000p-1, 0x1.533980bb84f9fp-22), + (double2)(0x1.8f1b300000000p-1, 0x1.889e2ce3ba390p-26), + 
(double2)(0x1.901db00000000p-1, 0x1.f7778c3ad0cc8p-24), + (double2)(0x1.911f300000000p-1, 0x1.46660cec4eba2p-23), + (double2)(0x1.921fb00000000p-1, 0x1.5110b4611a626p-23), +}; + +DECLARE_TABLE(double2, TWO_TO_JBY64_EP, 64) = { + (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0), + (double2)(0x1.02c9a30000000p+0, 0x1.cef00c1dcdef9p-25), + (double2)(0x1.059b0d0000000p+0, 0x1.8ac2ba1d73e2ap-27), + (double2)(0x1.0874510000000p+0, 0x1.0eb37901186bep-25), + (double2)(0x1.0b55860000000p+0, 0x1.9f3121ec53172p-25), + (double2)(0x1.0e3ec30000000p+0, 0x1.69e8d10103a17p-27), + (double2)(0x1.11301d0000000p+0, 0x1.25b50a4ebbf1ap-32), + (double2)(0x1.1429aa0000000p+0, 0x1.d525bbf668203p-25), + (double2)(0x1.172b830000000p+0, 0x1.8faa2f5b9bef9p-25), + (double2)(0x1.1a35be0000000p+0, 0x1.6df96ea796d31p-25), + (double2)(0x1.1d48730000000p+0, 0x1.68b9aa7805b80p-28), + (double2)(0x1.2063b80000000p+0, 0x1.0c519ac771dd6p-25), + (double2)(0x1.2387a60000000p+0, 0x1.ceac470cd83f5p-25), + (double2)(0x1.26b4560000000p+0, 0x1.789f37495e99cp-26), + (double2)(0x1.29e9df0000000p+0, 0x1.47f7b84b09745p-26), + (double2)(0x1.2d285a0000000p+0, 0x1.b900c2d002475p-26), + (double2)(0x1.306fe00000000p+0, 0x1.4636e2a5bd1abp-25), + (double2)(0x1.33c08b0000000p+0, 0x1.320b7fa64e430p-27), + (double2)(0x1.371a730000000p+0, 0x1.ceaa72a9c5154p-26), + (double2)(0x1.3a7db30000000p+0, 0x1.3967fdba86f24p-26), + (double2)(0x1.3dea640000000p+0, 0x1.82468446b6824p-25), + (double2)(0x1.4160a20000000p+0, 0x1.f72e29f84325bp-28), + (double2)(0x1.44e0860000000p+0, 0x1.8624b40c4dbd0p-30), + (double2)(0x1.486a2b0000000p+0, 0x1.704f3404f068ep-26), + (double2)(0x1.4bfdad0000000p+0, 0x1.4d8a89c750e5ep-26), + (double2)(0x1.4f9b270000000p+0, 0x1.a74b29ab4cf62p-26), + (double2)(0x1.5342b50000000p+0, 0x1.a753e077c2a0fp-26), + (double2)(0x1.56f4730000000p+0, 0x1.ad49f699bb2c0p-26), + (double2)(0x1.5ab07d0000000p+0, 0x1.a90a852b19260p-25), + (double2)(0x1.5e76f10000000p+0, 0x1.6b48521ba6f93p-26), + (double2)(0x1.6247eb0000000p+0, 
0x1.d2ac258f87d03p-31), + (double2)(0x1.6623880000000p+0, 0x1.2a91124893ecfp-27), + (double2)(0x1.6a09e60000000p+0, 0x1.9fcef32422cbep-26), + (double2)(0x1.6dfb230000000p+0, 0x1.8ca345de441c5p-25), + (double2)(0x1.71f75e0000000p+0, 0x1.1d8bee7ba46e1p-25), + (double2)(0x1.75feb50000000p+0, 0x1.9099f22fdba6ap-26), + (double2)(0x1.7a11470000000p+0, 0x1.f580c36bea881p-27), + (double2)(0x1.7e2f330000000p+0, 0x1.b3d398841740ap-26), + (double2)(0x1.8258990000000p+0, 0x1.2999c25159f11p-25), + (double2)(0x1.868d990000000p+0, 0x1.68925d901c83bp-25), + (double2)(0x1.8ace540000000p+0, 0x1.15506dadd3e2ap-27), + (double2)(0x1.8f1ae90000000p+0, 0x1.22aee6c57304ep-25), + (double2)(0x1.93737b0000000p+0, 0x1.9b8bc9e8a0387p-29), + (double2)(0x1.97d8290000000p+0, 0x1.fbc9c9f173d24p-25), + (double2)(0x1.9c49180000000p+0, 0x1.51f8480e3e235p-27), + (double2)(0x1.a0c6670000000p+0, 0x1.6bbcac96535b5p-25), + (double2)(0x1.a5503b0000000p+0, 0x1.1f12ae45a1224p-27), + (double2)(0x1.a9e6b50000000p+0, 0x1.5e7f6fd0fac90p-26), + (double2)(0x1.ae89f90000000p+0, 0x1.2b5a75abd0e69p-25), + (double2)(0x1.b33a2b0000000p+0, 0x1.09e2bf5ed7fa1p-25), + (double2)(0x1.b7f76f0000000p+0, 0x1.7daf237553d84p-27), + (double2)(0x1.bcc1e90000000p+0, 0x1.2f074891ee83dp-30), + (double2)(0x1.c199bd0000000p+0, 0x1.b0aa538444196p-25), + (double2)(0x1.c67f120000000p+0, 0x1.cafa29694426fp-25), + (double2)(0x1.cb720d0000000p+0, 0x1.9df20d22a0797p-25), + (double2)(0x1.d072d40000000p+0, 0x1.40f12f71a1e45p-25), + (double2)(0x1.d5818d0000000p+0, 0x1.9f7490e4bb40bp-25), + (double2)(0x1.da9e600000000p+0, 0x1.ed9942b84600dp-27), + (double2)(0x1.dfc9730000000p+0, 0x1.bdcdaf5cb4656p-27), + (double2)(0x1.e502ee0000000p+0, 0x1.e2cffd89cf44cp-26), + (double2)(0x1.ea4afa0000000p+0, 0x1.52486cc2c7b9dp-27), + (double2)(0x1.efa1be0000000p+0, 0x1.cc2b44eee3fa4p-25), + (double2)(0x1.f507650000000p+0, 0x1.6dc8a80ce9f09p-25), + (double2)(0x1.fa7c180000000p+0, 0x1.9e90d82e90a7ep-28) + +}; + + +TABLE_FUNCTION(double2, ATAN_JBY256_TBL, 
atan_jby256_tbl); +TABLE_FUNCTION(double2, TWO_TO_JBY64_EP, two_to_jby64_ep_tbl); + +#endif // cl_khr_fp64 diff --git a/libclc/generic/lib/math/tables.h b/libclc/generic/lib/math/tables.h new file mode 100644 index 0000000..1348fe1 --- /dev/null +++ b/libclc/generic/lib/math/tables.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#define TABLE_SPACE __constant + +#define TABLE_MANGLE(NAME) __clc_##NAME + +#define DECLARE_TABLE(TYPE,NAME,LENGTH) \ + TABLE_SPACE TYPE NAME [ LENGTH ] + +#define TABLE_FUNCTION(TYPE,TABLE,NAME) \ + TYPE TABLE_MANGLE(NAME)(size_t idx) { \ + return TABLE[idx]; \ + } + +#define TABLE_FUNCTION_DECL(TYPE, NAME) \ + TYPE TABLE_MANGLE(NAME)(size_t idx); + +#define USE_TABLE(NAME, IDX) \ + TABLE_MANGLE(NAME)(IDX) + +TABLE_FUNCTION_DECL(float2, loge_tbl); +TABLE_FUNCTION_DECL(float, log_inv_tbl); +TABLE_FUNCTION_DECL(float2, log2_tbl); +TABLE_FUNCTION_DECL(uint4, pibits_tbl); + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +TABLE_FUNCTION_DECL(double2, ln_tbl); +TABLE_FUNCTION_DECL(double2, atan_jby256_tbl); +TABLE_FUNCTION_DECL(double2, two_to_jby64_ep_tbl); +#endif // cl_khr_fp64 diff --git a/libclc/generic/lib/math/tan.cl b/libclc/generic/lib/math/tan.cl new file mode 100644 index 0000000..a447999 --- /dev/null +++ b/libclc/generic/lib/math/tan.cl @@ -0,0 +1,8 @@ +#include <clc/clc.h> + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY <tan.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/tan.inc b/libclc/generic/lib/math/tan.inc new file mode 100644 index 0000000..b9ce33e --- /dev/null +++ b/libclc/generic/lib/math/tan.inc @@ -0,0 +1,17 @@ +/* + * Note: tan(x) = sin(x)/cos(x) also, but the final assembly ends up being + * twice as long for R600 (maybe for others as well). 
+ */ + +#if __CLC_FPSIZE == 32 +#define __CLC_CONST(x) x ## f +#else +#define __CLC_CONST(x) x +#endif + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE tan(__CLC_GENTYPE x) { + __CLC_GENTYPE sinx = sin(x); + return sinx / sqrt( (__CLC_GENTYPE) __CLC_CONST(1.0) - (sinx*sinx) ); +} + +#undef __CLC_CONST diff --git a/libclc/generic/lib/math/tanh.cl b/libclc/generic/lib/math/tanh.cl new file mode 100644 index 0000000..e9c4079 --- /dev/null +++ b/libclc/generic/lib/math/tanh.cl @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#include "math.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float tanh(float x) +{ + // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent + // to the following three formulae: + // 1. (exp(x) - exp(-x))/(exp(x) + exp(-x)) + // 2. (1 - (2/(exp(2*x) + 1 ))) + // 3. 
(exp(2*x) - 1)/(exp(2*x) + 1) + // but computationally, some formulae are better on some ranges. + + const float large_threshold = 0x1.0a2b24p+3f; + + uint ux = as_uint(x); + uint aux = ux & EXSIGNBIT_SP32; + uint xs = ux ^ aux; + + float y = as_float(aux); + float y2 = y*y; + + float a1 = mad(y2, + mad(y2, 0.4891631088530669873e-4F, -0.14628356048797849e-2F), + -0.28192806108402678e0F); + float b1 = mad(y2, 0.3427017942262751343e0F, 0.845784192581041099e0F); + + float a2 = mad(y2, + mad(y2, 0.3827534993599483396e-4F, -0.12325644183611929e-2F), + -0.24069858695196524e0F); + float b2 = mad(y2, 0.292529068698052819e0F, 0.72209738473684982e0F); + + int c = y < 0.9f; + float a = c ? a1 : a2; + float b = c ? b1 : b2; + float zlo = mad(MATH_DIVIDE(a, b), y*y2, y); + + float p = exp(2.0f * y) + 1.0f; + float zhi = 1.0F - MATH_DIVIDE(2.0F, p); + + float z = y <= 1.0f ? zlo : zhi; + z = as_float(xs | as_uint(z)); + + // Edge cases + float sone = as_float(0x3f800000U | xs); + z = y > large_threshold ? sone : z; + z = aux < 0x39000000 | aux > 0x7f800000 ? x : z; + + return z; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, tanh, float); + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double tanh(double x) +{ + // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent + // to the following three formulae: + // 1. (exp(x) - exp(-x))/(exp(x) + exp(-x)) + // 2. (1 - (2/(exp(2*x) + 1 ))) + // 3. (exp(2*x) - 1)/(exp(2*x) + 1) + // but computationally, some formulae are better on some ranges. 
+ + // The point at which e^-x is insignificant compared to e^x = ln(2^27) + const double large_threshold = 0x1.2b708872320e2p+4; + + ulong ux = as_ulong(x); + ulong ax = ux & ~SIGNBIT_DP64; + ulong sx = ux ^ ax; + double y = as_double(ax); + double y2 = y * y; + + // y < 0.9 + double znl = fma(y2, + fma(y2, + fma(y2, -0.142077926378834722618091e-7, -0.200047621071909498730453e-3), + -0.176016349003044679402273e-1), + -0.274030424656179760118928e0); + + double zdl = fma(y2, + fma(y2, + fma(y2, 0.2091140262529164482568557e-3, 0.201562166026937652780575e-1), + 0.381641414288328849317962e0), + 0.822091273968539282568011e0); + + // 0.9 <= y <= 1 + double znm = fma(y2, + fma(y2, + fma(y2, -0.115475878996143396378318e-7, -0.165597043903549960486816e-3), + -0.146173047288731678404066e-1), + -0.227793870659088295252442e0); + + double zdm = fma(y2, + fma(y2, + fma(y2, 0.173076050126225961768710e-3, 0.167358775461896562588695e-1), + 0.317204558977294374244770e0), + 0.683381611977295894959554e0); + + int c = y < 0.9; + double zn = c ? znl : znm; + double zd = c ? zdl : zdm; + double z = y + y*y2 * MATH_DIVIDE(zn, zd); + + // y > 1 + double p = exp(2.0 * y) + 1.0; + double zg = 1.0 - 2.0 / p; + + z = y > 1.0 ? zg : z; + + // Other cases + z = y < 0x1.0p-28 | ax > PINFBITPATT_DP64 ? x : z; + + z = y > large_threshold ? 
1.0 : z; + + return as_double(sx | as_ulong(z)); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, tanh, double); + +#endif // cl_khr_fp64 diff --git a/libclc/generic/lib/relational/all.cl b/libclc/generic/lib/relational/all.cl new file mode 100644 index 0000000..607d7a9 --- /dev/null +++ b/libclc/generic/lib/relational/all.cl @@ -0,0 +1,29 @@ +#include <clc/clc.h> + +#define _CLC_ALL(v) (((v) >> ((sizeof(v) * 8) - 1)) & 0x1) +#define _CLC_ALL2(v) (_CLC_ALL((v).s0) & _CLC_ALL((v).s1)) +#define _CLC_ALL3(v) (_CLC_ALL2((v)) & _CLC_ALL((v).s2)) +#define _CLC_ALL4(v) (_CLC_ALL3((v)) & _CLC_ALL((v).s3)) +#define _CLC_ALL8(v) (_CLC_ALL4((v)) & _CLC_ALL((v).s4) & _CLC_ALL((v).s5) \ + & _CLC_ALL((v).s6) & _CLC_ALL((v).s7)) +#define _CLC_ALL16(v) (_CLC_ALL8((v)) & _CLC_ALL((v).s8) & _CLC_ALL((v).s9) \ + & _CLC_ALL((v).sA) & _CLC_ALL((v).sB) \ + & _CLC_ALL((v).sC) & _CLC_ALL((v).sD) \ + & _CLC_ALL((v).sE) & _CLC_ALL((v).sf)) + + +#define ALL_ID(TYPE) \ + _CLC_OVERLOAD _CLC_DEF int all(TYPE v) + +#define ALL_VECTORIZE(TYPE) \ + ALL_ID(TYPE) { return _CLC_ALL(v); } \ + ALL_ID(TYPE##2) { return _CLC_ALL2(v); } \ + ALL_ID(TYPE##3) { return _CLC_ALL3(v); } \ + ALL_ID(TYPE##4) { return _CLC_ALL4(v); } \ + ALL_ID(TYPE##8) { return _CLC_ALL8(v); } \ + ALL_ID(TYPE##16) { return _CLC_ALL16(v); } + +ALL_VECTORIZE(char) +ALL_VECTORIZE(short) +ALL_VECTORIZE(int) +ALL_VECTORIZE(long) diff --git a/libclc/generic/lib/relational/any.cl b/libclc/generic/lib/relational/any.cl new file mode 100644 index 0000000..4d37210 --- /dev/null +++ b/libclc/generic/lib/relational/any.cl @@ -0,0 +1,30 @@ +#include <clc/clc.h> + +#define _CLC_ANY(v) (((v) >> ((sizeof(v) * 8) - 1)) & 0x1) +#define _CLC_ANY2(v) (_CLC_ANY((v).s0) | _CLC_ANY((v).s1)) +#define _CLC_ANY3(v) (_CLC_ANY2((v)) | _CLC_ANY((v).s2)) +#define _CLC_ANY4(v) (_CLC_ANY3((v)) | _CLC_ANY((v).s3)) +#define _CLC_ANY8(v) (_CLC_ANY4((v)) | _CLC_ANY((v).s4) | _CLC_ANY((v).s5) \ + | _CLC_ANY((v).s6) | _CLC_ANY((v).s7)) +#define 
_CLC_ANY16(v) (_CLC_ANY8((v)) | _CLC_ANY((v).s8) | _CLC_ANY((v).s9) \ + | _CLC_ANY((v).sA) | _CLC_ANY((v).sB) \ + | _CLC_ANY((v).sC) | _CLC_ANY((v).sD) \ + | _CLC_ANY((v).sE) | _CLC_ANY((v).sf)) + + +#define ANY_ID(TYPE) \ + _CLC_OVERLOAD _CLC_DEF int any(TYPE v) + +#define ANY_VECTORIZE(TYPE) \ + ANY_ID(TYPE) { return _CLC_ANY(v); } \ + ANY_ID(TYPE##2) { return _CLC_ANY2(v); } \ + ANY_ID(TYPE##3) { return _CLC_ANY3(v); } \ + ANY_ID(TYPE##4) { return _CLC_ANY4(v); } \ + ANY_ID(TYPE##8) { return _CLC_ANY8(v); } \ + ANY_ID(TYPE##16) { return _CLC_ANY16(v); } + +ANY_VECTORIZE(char) +ANY_VECTORIZE(short) +ANY_VECTORIZE(int) +ANY_VECTORIZE(long) + diff --git a/libclc/generic/lib/relational/bitselect.cl b/libclc/generic/lib/relational/bitselect.cl new file mode 100644 index 0000000..af4e70c --- /dev/null +++ b/libclc/generic/lib/relational/bitselect.cl @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#include "../clcmacro.h" + +#define __CLC_BODY <bitselect.inc> +#include <clc/integer/gentype.inc> +#undef __CLC_BODY + +#define FLOAT_BITSELECT(f_type, i_type, width) \ + _CLC_OVERLOAD _CLC_DEF f_type##width bitselect(f_type##width x, f_type##width y, f_type##width z) { \ + return as_##f_type##width(bitselect(as_##i_type##width(x), as_##i_type##width(y), as_##i_type##width(z))); \ +} + +FLOAT_BITSELECT(float, uint, ) +FLOAT_BITSELECT(float, uint, 2) +FLOAT_BITSELECT(float, uint, 3) +FLOAT_BITSELECT(float, uint, 4) +FLOAT_BITSELECT(float, uint, 8) +FLOAT_BITSELECT(float, uint, 16) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +FLOAT_BITSELECT(double, ulong, ) +FLOAT_BITSELECT(double, ulong, 2) +FLOAT_BITSELECT(double, ulong, 3) +FLOAT_BITSELECT(double, ulong, 4) +FLOAT_BITSELECT(double, ulong, 8) +FLOAT_BITSELECT(double, ulong, 16) + +#endif diff --git a/libclc/generic/lib/relational/bitselect.inc b/libclc/generic/lib/relational/bitselect.inc new file mode 100644 index 0000000..3a78a8c --- /dev/null +++ b/libclc/generic/lib/relational/bitselect.inc @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE bitselect(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z) { + return ((x) ^ ((z) & ((y) ^ (x)))); +} diff --git a/libclc/generic/lib/relational/isequal.cl b/libclc/generic/lib/relational/isequal.cl new file mode 100644 index 0000000..9d79ba6 --- /dev/null +++ b/libclc/generic/lib/relational/isequal.cl @@ -0,0 +1,30 @@ +#include <clc/clc.h> + +#define _CLC_DEFINE_ISEQUAL(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ + return (x == y); \ +} \ + +_CLC_DEFINE_ISEQUAL(int, isequal, float, float) +_CLC_DEFINE_ISEQUAL(int2, isequal, float2, float2) +_CLC_DEFINE_ISEQUAL(int3, isequal, float3, float3) +_CLC_DEFINE_ISEQUAL(int4, isequal, float4, float4) +_CLC_DEFINE_ISEQUAL(int8, isequal, float8, float8) +_CLC_DEFINE_ISEQUAL(int16, isequal, float16, float16) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// The scalar version of isequal(double) returns an int, but the vector versions +// return long. +_CLC_DEFINE_ISEQUAL(int, isequal, double, double) +_CLC_DEFINE_ISEQUAL(long2, isequal, double2, double2) +_CLC_DEFINE_ISEQUAL(long3, isequal, double3, double3) +_CLC_DEFINE_ISEQUAL(long4, isequal, double4, double4) +_CLC_DEFINE_ISEQUAL(long8, isequal, double8, double8) +_CLC_DEFINE_ISEQUAL(long16, isequal, double16, double16) + +#endif + +#undef _CLC_DEFINE_ISEQUAL
\ No newline at end of file diff --git a/libclc/generic/lib/relational/isfinite.cl b/libclc/generic/lib/relational/isfinite.cl new file mode 100644 index 0000000..d0658c0 --- /dev/null +++ b/libclc/generic/lib/relational/isfinite.cl @@ -0,0 +1,18 @@ +#include <clc/clc.h> +#include "relational.h" + +_CLC_DEFINE_RELATIONAL_UNARY(int, isfinite, __builtin_isfinite, float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// The scalar version of isfinite(double) returns an int, but the vector versions +// return long. +_CLC_DEF _CLC_OVERLOAD int isfinite(double x) { + return __builtin_isfinite(x); +} + +_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, isfinite, double) + +#endif diff --git a/libclc/generic/lib/relational/isgreater.cl b/libclc/generic/lib/relational/isgreater.cl new file mode 100644 index 0000000..79456e5 --- /dev/null +++ b/libclc/generic/lib/relational/isgreater.cl @@ -0,0 +1,22 @@ +#include <clc/clc.h> +#include "relational.h" + +//Note: It would be nice to use __builtin_isgreater with vector inputs, but it seems to only take scalar values as +// input, which will produce incorrect output for vector input types. + +_CLC_DEFINE_RELATIONAL_BINARY(int, isgreater, __builtin_isgreater, float, float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// The scalar version of isgreater(double, double) returns an int, but the vector versions +// return long. 
+ +_CLC_DEF _CLC_OVERLOAD int isgreater(double x, double y){ + return __builtin_isgreater(x, y); +} + +_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isgreater, double, double) + +#endif diff --git a/libclc/generic/lib/relational/isgreaterequal.cl b/libclc/generic/lib/relational/isgreaterequal.cl new file mode 100644 index 0000000..2d5ebe57 --- /dev/null +++ b/libclc/generic/lib/relational/isgreaterequal.cl @@ -0,0 +1,22 @@ +#include <clc/clc.h> +#include "relational.h" + +//Note: It would be nice to use __builtin_isgreaterequal with vector inputs, but it seems to only take scalar values as +// input, which will produce incorrect output for vector input types. + +_CLC_DEFINE_RELATIONAL_BINARY(int, isgreaterequal, __builtin_isgreaterequal, float, float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// The scalar version of isgreaterequal(double, double) returns an int, but the vector versions +// return long. + +_CLC_DEF _CLC_OVERLOAD int isgreaterequal(double x, double y){ + return __builtin_isgreaterequal(x, y); +} + +_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isgreaterequal, double, double) + +#endif diff --git a/libclc/generic/lib/relational/isinf.cl b/libclc/generic/lib/relational/isinf.cl new file mode 100644 index 0000000..1452d91 --- /dev/null +++ b/libclc/generic/lib/relational/isinf.cl @@ -0,0 +1,18 @@ +#include <clc/clc.h> +#include "relational.h" + +_CLC_DEFINE_RELATIONAL_UNARY(int, isinf, __builtin_isinf, float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// The scalar version of isinf(double) returns an int, but the vector versions +// return long. 
+_CLC_DEF _CLC_OVERLOAD int isinf(double x) { + return __builtin_isinf(x); +} + +_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, isinf, double) + +#endif diff --git a/libclc/generic/lib/relational/isless.cl b/libclc/generic/lib/relational/isless.cl new file mode 100644 index 0000000..56a3e13 --- /dev/null +++ b/libclc/generic/lib/relational/isless.cl @@ -0,0 +1,22 @@ +#include <clc/clc.h> +#include "relational.h" + +//Note: It would be nice to use __builtin_isless with vector inputs, but it seems to only take scalar values as +// input, which will produce incorrect output for vector input types. + +_CLC_DEFINE_RELATIONAL_BINARY(int, isless, __builtin_isless, float, float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// The scalar version of isless(double, double) returns an int, but the vector versions +// return long. + +_CLC_DEF _CLC_OVERLOAD int isless(double x, double y){ + return __builtin_isless(x, y); +} + +_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isless, double, double) + +#endif diff --git a/libclc/generic/lib/relational/islessequal.cl b/libclc/generic/lib/relational/islessequal.cl new file mode 100644 index 0000000..259c307 --- /dev/null +++ b/libclc/generic/lib/relational/islessequal.cl @@ -0,0 +1,22 @@ +#include <clc/clc.h> +#include "relational.h" + +//Note: It would be nice to use __builtin_islessequal with vector inputs, but it seems to only take scalar values as +// input, which will produce incorrect output for vector input types. + +_CLC_DEFINE_RELATIONAL_BINARY(int, islessequal, __builtin_islessequal, float, float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// The scalar version of islessequal(double, double) returns an int, but the vector versions +// return long. 
+ +_CLC_DEF _CLC_OVERLOAD int islessequal(double x, double y){ + return __builtin_islessequal(x, y); +} + +_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, islessequal, double, double) + +#endif diff --git a/libclc/generic/lib/relational/islessgreater.cl b/libclc/generic/lib/relational/islessgreater.cl new file mode 100644 index 0000000..fc029f3 --- /dev/null +++ b/libclc/generic/lib/relational/islessgreater.cl @@ -0,0 +1,22 @@ +#include <clc/clc.h> +#include "relational.h" + +//Note: It would be nice to use __builtin_islessgreater with vector inputs, but it seems to only take scalar values as +// input, which will produce incorrect output for vector input types. + +_CLC_DEFINE_RELATIONAL_BINARY(int, islessgreater, __builtin_islessgreater, float, float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// The scalar version of islessgreater(double, double) returns an int, but the vector versions +// return long. + +_CLC_DEF _CLC_OVERLOAD int islessgreater(double x, double y){ + return __builtin_islessgreater(x, y); +} + +_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, islessgreater, double, double) + +#endif diff --git a/libclc/generic/lib/relational/isnan.cl b/libclc/generic/lib/relational/isnan.cl new file mode 100644 index 0000000..f82dc5d --- /dev/null +++ b/libclc/generic/lib/relational/isnan.cl @@ -0,0 +1,18 @@ +#include <clc/clc.h> +#include "relational.h" + +_CLC_DEFINE_RELATIONAL_UNARY(int, isnan, __builtin_isnan, float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// The scalar version of isnan(double) returns an int, but the vector versions +// return long. 
+_CLC_DEF _CLC_OVERLOAD int isnan(double x) { + return __builtin_isnan(x); +} + +_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, isnan, double) + +#endif diff --git a/libclc/generic/lib/relational/isnormal.cl b/libclc/generic/lib/relational/isnormal.cl new file mode 100644 index 0000000..2e6b42d --- /dev/null +++ b/libclc/generic/lib/relational/isnormal.cl @@ -0,0 +1,18 @@ +#include <clc/clc.h> +#include "relational.h" + +_CLC_DEFINE_RELATIONAL_UNARY(int, isnormal, __builtin_isnormal, float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// The scalar version of isnormal(double) returns an int, but the vector versions +// return long. +_CLC_DEF _CLC_OVERLOAD int isnormal(double x) { + return __builtin_isnormal(x); +} + +_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, isnormal, double) + +#endif diff --git a/libclc/generic/lib/relational/isnotequal.cl b/libclc/generic/lib/relational/isnotequal.cl new file mode 100644 index 0000000..787fd8d --- /dev/null +++ b/libclc/generic/lib/relational/isnotequal.cl @@ -0,0 +1,23 @@ +#include <clc/clc.h> +#include "relational.h" + +#define _CLC_DEFINE_ISNOTEQUAL(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ + return (x != y); \ +} \ + +_CLC_DEFINE_ISNOTEQUAL(int, isnotequal, float, float) +_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(int, isnotequal, float, float) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// The scalar version of isnotequal(double, double) returns an int, but the vector versions +// return long. 
+ +_CLC_DEFINE_ISNOTEQUAL(int, isnotequal, double, double) +_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isnotequal, double, double) + +#endif + +/* The helper macro is local to isnotequal.cl, so it is removed again here. */ #undef _CLC_DEFINE_ISNOTEQUAL diff --git a/libclc/generic/lib/relational/isordered.cl b/libclc/generic/lib/relational/isordered.cl new file mode 100644 index 0000000..ebda2eb --- /dev/null +++ b/libclc/generic/lib/relational/isordered.cl @@ -0,0 +1,23 @@ +#include <clc/clc.h> +#include "relational.h" + +/* Generates a scalar overload of FUNCTION that is true only when each argument compares equal to itself (presumably the NaN test; relies on isequal declared elsewhere). */ #define _CLC_DEFINE_ISORDERED(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ + return isequal(x, x) && isequal(y, y); \ +} \ + +_CLC_DEFINE_ISORDERED(int, isordered, float, float) +_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(int, isordered, float, float) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// The scalar version of isordered(double, double) returns an int, but the vector versions +// return long. + +_CLC_DEFINE_ISORDERED(int, isordered, double, double) +_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isordered, double, double) + +#endif + +#undef _CLC_DEFINE_ISORDERED diff --git a/libclc/generic/lib/relational/isunordered.cl b/libclc/generic/lib/relational/isunordered.cl new file mode 100644 index 0000000..8bc5e3f --- /dev/null +++ b/libclc/generic/lib/relational/isunordered.cl @@ -0,0 +1,22 @@ +#include <clc/clc.h> +#include "relational.h" + +//Note: It would be nice to use __builtin_isunordered with vector inputs, but it seems to only take scalar values as +// input, which will produce incorrect output for vector input types. + +/* Defines isunordered for float via __builtin_isunordered, plus the float2..float16 overloads returning intN. */ _CLC_DEFINE_RELATIONAL_BINARY(int, isunordered, __builtin_isunordered, float, float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// The scalar version of isunordered(double, double) returns an int, but the vector versions +// return long. 
+ +/* Scalar double overload: forwards to __builtin_isunordered and returns int. */ _CLC_DEF _CLC_OVERLOAD int isunordered(double x, double y){ + return __builtin_isunordered(x, y); +} + +_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, isunordered, double, double) + +#endif diff --git a/libclc/generic/lib/relational/relational.h b/libclc/generic/lib/relational/relational.h new file mode 100644 index 0000000..e492750 --- /dev/null +++ b/libclc/generic/lib/relational/relational.h @@ -0,0 +1,117 @@ +/* + * Contains relational macros that have to return 1 for scalar and -1 for vector + * when the result is true. + */ + +/* Scalar unary wrapper: FUNCTION(x) returns BUILTIN_NAME(x) converted to RET_TYPE. */ #define _CLC_DEFINE_RELATIONAL_UNARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_NAME, ARG_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x){ \ + return BUILTIN_NAME(x); \ +} + +/* Vector unary wrappers (one macro per width): call the scalar FUNCTION on every component, pack the results into RET_TYPE, then compare that vector against 0 so the result comes from a vector comparison. */ #define _CLC_DEFINE_RELATIONAL_UNARY_VEC2(RET_TYPE, FUNCTION, ARG_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \ + return (RET_TYPE)( (RET_TYPE){FUNCTION(x.lo), FUNCTION(x.hi)} != (RET_TYPE)0); \ +} + +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC3(RET_TYPE, FUNCTION, ARG_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \ + return (RET_TYPE)( (RET_TYPE){FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2)} != (RET_TYPE)0); \ +} + +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC4(RET_TYPE, FUNCTION, ARG_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \ + return (RET_TYPE)( \ + (RET_TYPE){ \ + FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3) \ + } != (RET_TYPE)0); \ +} + +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC8(RET_TYPE, FUNCTION, ARG_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \ + return (RET_TYPE)( \ + (RET_TYPE){ \ + FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3), \ + FUNCTION(x.s4), FUNCTION(x.s5), FUNCTION(x.s6), FUNCTION(x.s7) \ + } != (RET_TYPE)0); \ +} + +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC16(RET_TYPE, FUNCTION, ARG_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \ + return (RET_TYPE)( \ + (RET_TYPE){ \ + FUNCTION(x.s0), FUNCTION(x.s1), 
FUNCTION(x.s2), FUNCTION(x.s3), \ + FUNCTION(x.s4), FUNCTION(x.s5), FUNCTION(x.s6), FUNCTION(x.s7), \ + FUNCTION(x.s8), FUNCTION(x.s9), FUNCTION(x.sa), FUNCTION(x.sb), \ + FUNCTION(x.sc), FUNCTION(x.sd), FUNCTION(x.se), FUNCTION(x.sf) \ + } != (RET_TYPE)0); \ +} + +/* Instantiates the 2-, 3-, 4-, 8- and 16-component unary overloads by pasting the width onto both type names. */ #define _CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(RET_TYPE, FUNCTION, ARG_TYPE) \ +_CLC_DEFINE_RELATIONAL_UNARY_VEC2(RET_TYPE##2, FUNCTION, ARG_TYPE##2) \ +_CLC_DEFINE_RELATIONAL_UNARY_VEC3(RET_TYPE##3, FUNCTION, ARG_TYPE##3) \ +_CLC_DEFINE_RELATIONAL_UNARY_VEC4(RET_TYPE##4, FUNCTION, ARG_TYPE##4) \ +_CLC_DEFINE_RELATIONAL_UNARY_VEC8(RET_TYPE##8, FUNCTION, ARG_TYPE##8) \ +_CLC_DEFINE_RELATIONAL_UNARY_VEC16(RET_TYPE##16, FUNCTION, ARG_TYPE##16) + +/* Scalar overload plus every vector width; only usable when scalar and vector results share the same element type. */ #define _CLC_DEFINE_RELATIONAL_UNARY(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, ARG_TYPE) \ +_CLC_DEFINE_RELATIONAL_UNARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, ARG_TYPE) \ +_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(RET_TYPE, FUNCTION, ARG_TYPE) \ + +/* Scalar binary wrapper: FUNCTION(x, y) returns BUILTIN_NAME(x, y) converted to RET_TYPE. */ #define _CLC_DEFINE_RELATIONAL_BINARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_NAME, ARG0_TYPE, ARG1_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y){ \ + return BUILTIN_NAME(x, y); \ +} + +/* NOTE(review): this body is identical to _CLC_DEFINE_RELATIONAL_BINARY_VEC2 and nothing visible in this patch expands it - confirm whether it is still needed. */ #define _CLC_DEFINE_RELATIONAL_BINARY_VEC(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ + return (RET_TYPE)( (RET_TYPE){FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)} != (RET_TYPE)0); \ +} + +/* Vector binary wrappers (one macro per width): pair up matching components of x and y, call the scalar FUNCTION on each pair, then compare the packed results against 0. */ #define _CLC_DEFINE_RELATIONAL_BINARY_VEC2(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ + return (RET_TYPE)( (RET_TYPE){FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)} != (RET_TYPE)0); \ +} + +#define _CLC_DEFINE_RELATIONAL_BINARY_VEC3(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ + return (RET_TYPE)( (RET_TYPE){FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), FUNCTION(x.s2, y.s2)} != (RET_TYPE)0); \ +} + +#define 
_CLC_DEFINE_RELATIONAL_BINARY_VEC4(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ + return (RET_TYPE)( \ + (RET_TYPE){ \ + FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3) \ + } != (RET_TYPE)0); \ +} + +#define _CLC_DEFINE_RELATIONAL_BINARY_VEC8(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ + return (RET_TYPE)( \ + (RET_TYPE){ \ + FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3), \ + FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7) \ + } != (RET_TYPE)0); \ +} + +#define _CLC_DEFINE_RELATIONAL_BINARY_VEC16(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) \ +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ + return (RET_TYPE)( \ + (RET_TYPE){ \ + FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3), \ + FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7), \ + FUNCTION(x.s8, y.s8), FUNCTION(x.s9, y.s9), FUNCTION(x.sa, y.sa), FUNCTION(x.sb, y.sb), \ + FUNCTION(x.sc, y.sc), FUNCTION(x.sd, y.sd), FUNCTION(x.se, y.se), FUNCTION(x.sf, y.sf) \ + } != (RET_TYPE)0); \ +} + +/* Instantiates the 2-, 3-, 4-, 8- and 16-component binary overloads by pasting the width onto all three type names. */ #define _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) \ +_CLC_DEFINE_RELATIONAL_BINARY_VEC2(RET_TYPE##2, FUNCTION, ARG0_TYPE##2, ARG1_TYPE##2) \ +_CLC_DEFINE_RELATIONAL_BINARY_VEC3(RET_TYPE##3, FUNCTION, ARG0_TYPE##3, ARG1_TYPE##3) \ +_CLC_DEFINE_RELATIONAL_BINARY_VEC4(RET_TYPE##4, FUNCTION, ARG0_TYPE##4, ARG1_TYPE##4) \ +_CLC_DEFINE_RELATIONAL_BINARY_VEC8(RET_TYPE##8, FUNCTION, ARG0_TYPE##8, ARG1_TYPE##8) \ +_CLC_DEFINE_RELATIONAL_BINARY_VEC16(RET_TYPE##16, FUNCTION, ARG0_TYPE##16, ARG1_TYPE##16) + +/* Scalar overload plus every vector width; only usable when scalar and vector results share the same element type. */ #define _CLC_DEFINE_RELATIONAL_BINARY(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, ARG0_TYPE, ARG1_TYPE) \ 
+_CLC_DEFINE_RELATIONAL_BINARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, ARG0_TYPE, ARG1_TYPE) \ +_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(RET_TYPE, FUNCTION, ARG0_TYPE, ARG1_TYPE) diff --git a/libclc/generic/lib/relational/signbit.cl b/libclc/generic/lib/relational/signbit.cl new file mode 100644 index 0000000..ab37d2f --- /dev/null +++ b/libclc/generic/lib/relational/signbit.cl @@ -0,0 +1,19 @@ +#include <clc/clc.h> +#include "relational.h" + +/* float signbit: the scalar overload forwards to __builtin_signbitf (the float-specific builtin); the vector widths are generated from it. */ _CLC_DEFINE_RELATIONAL_UNARY(int, signbit, __builtin_signbitf, float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// The scalar version of signbit(double) returns an int, but the vector versions +// return long. + +_CLC_DEF _CLC_OVERLOAD int signbit(double x){ + return __builtin_signbit(x); +} + +_CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(long, signbit, double) + +#endif diff --git a/libclc/generic/lib/shared/clamp.cl b/libclc/generic/lib/shared/clamp.cl new file mode 100644 index 0000000..c79a358 --- /dev/null +++ b/libclc/generic/lib/shared/clamp.cl @@ -0,0 +1,11 @@ +#include <clc/clc.h> + +/* clamp.inc is expanded twice: first through clc/integer/gentype.inc, then (after enabling fp64 when available) through clc/math/gentype.inc. */ #define __CLC_BODY <clamp.inc> +#include <clc/integer/gentype.inc> + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY <clamp.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/shared/clamp.inc b/libclc/generic/lib/shared/clamp.inc new file mode 100644 index 0000000..c918f9c --- /dev/null +++ b/libclc/generic/lib/shared/clamp.inc @@ -0,0 +1,9 @@ +/* clamp(x, y, z): y acts as the lower bound and z as the upper bound; the upper bound is tested first, so z wins if the bounds are inverted. */ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE clamp(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z) { + return (x > z ? z : (x < y ? y : x)); +} + +/* Vector value with scalar bounds: both bounds are cast to the vector type before comparing. */ #ifndef __CLC_SCALAR +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE clamp(__CLC_GENTYPE x, __CLC_SCALAR_GENTYPE y, __CLC_SCALAR_GENTYPE z) { + return (x > (__CLC_GENTYPE)z ? (__CLC_GENTYPE)z : (x < (__CLC_GENTYPE)y ? 
(__CLC_GENTYPE)y : x)); +} +#endif diff --git a/libclc/generic/lib/shared/max.cl b/libclc/generic/lib/shared/max.cl new file mode 100644 index 0000000..1c4457c --- /dev/null +++ b/libclc/generic/lib/shared/max.cl @@ -0,0 +1,11 @@ +#include <clc/clc.h> + +/* max.inc is expanded through the integer gentype list and then through the math gentype list. */ #define __CLC_BODY <max.inc> +#include <clc/integer/gentype.inc> + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY <max.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/shared/max.inc b/libclc/generic/lib/shared/max.inc new file mode 100644 index 0000000..75a24c0 --- /dev/null +++ b/libclc/generic/lib/shared/max.inc @@ -0,0 +1,9 @@ +/* max(a, b): selects a where a > b, otherwise b (so b is returned when the comparison is false for any reason). */ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, __CLC_GENTYPE b) { + return (a > b ? a : b); +} + +/* Vector/scalar form: b is cast to the vector type before comparing. */ #ifndef __CLC_SCALAR +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b) { + return (a > (__CLC_GENTYPE)b ? a : (__CLC_GENTYPE)b); +} +#endif diff --git a/libclc/generic/lib/shared/min.cl b/libclc/generic/lib/shared/min.cl new file mode 100644 index 0000000..433087a --- /dev/null +++ b/libclc/generic/lib/shared/min.cl @@ -0,0 +1,11 @@ +#include <clc/clc.h> + +/* min.inc is expanded through the integer gentype list and then through the math gentype list. */ #define __CLC_BODY <min.inc> +#include <clc/integer/gentype.inc> + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY <min.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/shared/min.inc b/libclc/generic/lib/shared/min.inc new file mode 100644 index 0000000..fe42864 --- /dev/null +++ b/libclc/generic/lib/shared/min.inc @@ -0,0 +1,9 @@ +/* min(a, b): selects a where a < b, otherwise b. */ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_GENTYPE b) { + return (a < b ? a : b); +} + +/* Vector/scalar form: b is cast to the vector type before comparing. */ #ifndef __CLC_SCALAR +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b) { + return (a < (__CLC_GENTYPE)b ? 
a : (__CLC_GENTYPE)b); +} +#endif diff --git a/libclc/generic/lib/shared/vload.cl b/libclc/generic/lib/shared/vload.cl new file mode 100644 index 0000000..8897200 --- /dev/null +++ b/libclc/generic/lib/shared/vload.cl @@ -0,0 +1,52 @@ +#include <clc/clc.h> + +/* Generates vload2/3/4/8/16 for one element type and one address space. Each width gets a typedef of the vector type whose alignment is lowered to sizeof(PRIM_TYPE), and the load dereferences &x[N*offset] through that typedef. vload3 reads the first two elements through the 2-wide typedef and the third with a scalar index. */ #define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ + typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ + return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&x[2*offset])); \ + } \ +\ + typedef PRIM_TYPE##3 less_aligned_##ADDR_SPACE##PRIM_TYPE##3 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ + PRIM_TYPE##2 vec = *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&x[3*offset])); \ + return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset*3+2]); \ + } \ +\ + typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ + return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4*) (&x[4*offset])); \ + } \ +\ + typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ + return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8*) (&x[8*offset])); \ + } \ +\ + typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ + return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16*) (&x[16*offset])); \ + } \ + +/* Instantiates VLOAD_VECTORIZE for the __private, __local, __constant and __global address spaces. */ #define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \ + VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, 
__private) \ + VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \ + VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \ + VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \ + +/* Element types that are always instantiated; double is added separately below, only under cl_khr_fp64. */ #define VLOAD_TYPES() \ + VLOAD_ADDR_SPACES(char) \ + VLOAD_ADDR_SPACES(uchar) \ + VLOAD_ADDR_SPACES(short) \ + VLOAD_ADDR_SPACES(ushort) \ + VLOAD_ADDR_SPACES(int) \ + VLOAD_ADDR_SPACES(uint) \ + VLOAD_ADDR_SPACES(long) \ + VLOAD_ADDR_SPACES(ulong) \ + VLOAD_ADDR_SPACES(float) \ + +VLOAD_TYPES() + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + VLOAD_ADDR_SPACES(double) +#endif diff --git a/libclc/generic/lib/shared/vstore.cl b/libclc/generic/lib/shared/vstore.cl new file mode 100644 index 0000000..4777b7e --- /dev/null +++ b/libclc/generic/lib/shared/vstore.cl @@ -0,0 +1,52 @@ +#include <clc/clc.h> + +#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable + +/* Generates vstore2/3/4/8/16 for one element type and one address space, storing through typedefs whose alignment is lowered to sizeof(PRIM_TYPE). vstore3 has no typedef of its own: it stores the first two components through the 2-wide typedef and the third with a scalar index. */ #define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ + typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ + _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ + *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&mem[2*offset])) = vec; \ + } \ +\ + _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ + *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&mem[3*offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1); \ + mem[3 * offset + 2] = vec.s2;\ + } \ +\ + typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ + _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ + *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4*) (&mem[4*offset])) = vec; \ + } \ +\ + typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ + _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE 
*mem) { \ + *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8*) (&mem[8*offset])) = vec; \ + } \ +\ + typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ + _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ + *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16*) (&mem[16*offset])) = vec; \ + } \ + +/* Instantiates the stores for __private, __local and __global only; there is no __constant entry here, unlike the vload list. */ #define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \ + VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \ + VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \ + VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \ + +/* Element types that are always instantiated; double is added separately below, only under cl_khr_fp64. */ #define VSTORE_TYPES() \ + VSTORE_ADDR_SPACES(char) \ + VSTORE_ADDR_SPACES(uchar) \ + VSTORE_ADDR_SPACES(short) \ + VSTORE_ADDR_SPACES(ushort) \ + VSTORE_ADDR_SPACES(int) \ + VSTORE_ADDR_SPACES(uint) \ + VSTORE_ADDR_SPACES(long) \ + VSTORE_ADDR_SPACES(ulong) \ + VSTORE_ADDR_SPACES(float) \ + +VSTORE_TYPES() + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + VSTORE_ADDR_SPACES(double) +#endif diff --git a/libclc/generic/lib/subnormal_config.cl b/libclc/generic/lib/subnormal_config.cl new file mode 100644 index 0000000..4bcecfd --- /dev/null +++ b/libclc/generic/lib/subnormal_config.cl @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <clc/clc.h> + +#include "config.h" + +/* The generic implementation always reports fp16 subnormals as unsupported. */ _CLC_DEF bool __clc_fp16_subnormals_supported() { + return false; +} + +/* The generic implementation always reports fp32 subnormals as unsupported. */ _CLC_DEF bool __clc_fp32_subnormals_supported() { + return false; +} + +/* fp64 subnormal support is the negation of __clc_subnormals_disabled(), which the patch defines in subnormal_helper_func.ll as a load of @__CLC_SUBNORMAL_DISABLE. */ _CLC_DEF bool __clc_fp64_subnormals_supported() { + return !__clc_subnormals_disabled(); +} diff --git a/libclc/generic/lib/subnormal_disable.ll b/libclc/generic/lib/subnormal_disable.ll new file mode 100644 index 0000000..b935583 --- /dev/null +++ b/libclc/generic/lib/subnormal_disable.ll @@ -0,0 +1 @@ +@__CLC_SUBNORMAL_DISABLE = unnamed_addr constant i1 true diff --git a/libclc/generic/lib/subnormal_helper_func.ll b/libclc/generic/lib/subnormal_helper_func.ll new file mode 100644 index 0000000..fb1b5d2 --- /dev/null +++ b/libclc/generic/lib/subnormal_helper_func.ll @@ -0,0 +1,8 @@ +@__CLC_SUBNORMAL_DISABLE = external global i1 + +define i1 @__clc_subnormals_disabled() #0 { + %disable = load i1, i1* @__CLC_SUBNORMAL_DISABLE + ret i1 %disable +} + +attributes #0 = { alwaysinline } diff --git a/libclc/generic/lib/subnormal_use_default.ll b/libclc/generic/lib/subnormal_use_default.ll new file mode 100644 index 0000000..d70c63b --- /dev/null +++ b/libclc/generic/lib/subnormal_use_default.ll @@ -0,0 +1 @@ +@__CLC_SUBNORMAL_DISABLE = unnamed_addr constant i1 false diff --git a/libclc/generic/lib/workitem/get_global_id.cl b/libclc/generic/lib/workitem/get_global_id.cl new file mode 100644 index 0000000..fdd83d2 --- /dev/null +++ b/libclc/generic/lib/workitem/get_global_id.cl @@ -0,0 +1,5 @@ 
+#include <clc/clc.h> + +/* Global work-item id for dimension dim, computed as get_group_id(dim)*get_local_size(dim) + get_local_id(dim). NOTE(review): no global work offset term is added here - confirm whether launches with a non-zero offset must be supported. */ _CLC_DEF size_t get_global_id(uint dim) { + return get_group_id(dim)*get_local_size(dim) + get_local_id(dim); +} diff --git a/libclc/generic/lib/workitem/get_global_size.cl b/libclc/generic/lib/workitem/get_global_size.cl new file mode 100644 index 0000000..5ae649e --- /dev/null +++ b/libclc/generic/lib/workitem/get_global_size.cl @@ -0,0 +1,5 @@ +#include <clc/clc.h> + +/* Global size for dimension dim, computed as get_num_groups(dim)*get_local_size(dim); this assumes every work-group has the same local size - TODO confirm for this target. */ _CLC_DEF size_t get_global_size(uint dim) { + return get_num_groups(dim)*get_local_size(dim); +} |