7 files changed, 556 insertions, 449 deletions
diff --git a/target-arm/helper.c b/target-arm/helper.c
index c61c610..ee6cd59 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -1922,3 +1922,248 @@ void cpu_arm_set_cp_io(CPUARMState *env, int cpnum,
 }
 
 #endif
+
+/* Note that signed overflow is undefined in C.  The following routines are
+   careful to use unsigned types where modulo arithmetic is required.
+   Failure to do so _will_ break on newer gcc.  */
+
+/* Signed saturating arithmetic.  */
+
+/* Perform 16-bit signed saturating addition on raw two's-complement bits.  */
+static inline uint16_t add16_sat(uint16_t a, uint16_t b)
+{
+    const uint16_t sign = 0x8000;
+    uint16_t sum = a + b;
+
+    /* No overflow if the operand signs differ or the sum kept a's sign.  */
+    if (((a ^ b) & sign) || !((sum ^ a) & sign)) {
+        return sum;
+    }
+
+    /* Clamp towards the (shared) sign of the operands.  */
+    return (a & sign) ? 0x8000 : 0x7fff;
+}
+
+/* Perform 8-bit signed saturating addition on raw two's-complement bits.  */
+static inline uint8_t add8_sat(uint8_t a, uint8_t b)
+{
+    const uint8_t sign = 0x80;
+    uint8_t sum = a + b;
+
+    /* No overflow if the operand signs differ or the sum kept a's sign.  */
+    if (((a ^ b) & sign) || !((sum ^ a) & sign)) {
+        return sum;
+    }
+
+    /* Clamp towards the (shared) sign of the operands.  */
+    return (a & sign) ? 0x80 : 0x7f;
+}
+
+/* Perform 16-bit signed saturating subtraction on raw two's-complement bits.  */
+static inline uint16_t sub16_sat(uint16_t a, uint16_t b)
+{
+    const uint16_t sign = 0x8000;
+    uint16_t diff = a - b;
+
+    /* No overflow if the operand signs match or the result kept a's sign.  */
+    if (!((a ^ b) & sign) || !((diff ^ a) & sign)) {
+        return diff;
+    }
+
+    /* Clamp towards the sign of the minuend.  */
+    return (a & sign) ? 0x8000 : 0x7fff;
+}
+
+/* Perform 8-bit signed saturating subtraction on raw two's-complement bits.  */
+static inline uint8_t sub8_sat(uint8_t a, uint8_t b)
+{
+    const uint8_t sign = 0x80;
+    uint8_t diff = a - b;
+
+    /* No overflow if the operand signs match or the result kept a's sign.  */
+    if (!((a ^ b) & sign) || !((diff ^ a) & sign)) {
+        return diff;
+    }
+
+    /* Clamp towards the sign of the minuend.  */
+    return (a & sign) ? 0x80 : 0x7f;
+}
+
+#define ADD16(a, b, n) RESULT(add16_sat(a, b), n, 16);
+#define SUB16(a, b, n) RESULT(sub16_sat(a, b), n, 16);
+#define ADD8(a, b, n)  RESULT(add8_sat(a, b), n, 8);
+#define SUB8(a, b, n)  RESULT(sub8_sat(a, b), n, 8);
+#define PFX q
+
+#include "op_addsub.h"
+
+/* Unsigned saturating arithmetic.  Both operands are full-width lanes.  */
+static inline uint16_t add16_usat(uint16_t a, uint16_t b)
+{
+    uint16_t res = a + b;
+    /* A wrapped 16-bit sum is smaller than either operand: clamp to max.  */
+    if (res < a)
+        res = 0xffff;
+    return res;
+}
+
+static inline uint16_t sub16_usat(uint16_t a, uint16_t b)
+{
+    /* 16-bit unsigned saturating subtract: a - b, clamped at zero.  */
+    if (a > b)
+        return a - b;
+    return 0;
+}
+
+static inline uint8_t add8_usat(uint8_t a, uint8_t b)
+{
+    /* 8-bit unsigned saturating add, computed in a wider type.  */
+    unsigned int wide = (unsigned int)a + b;
+
+    /* Anything above the byte range saturates to the maximum.  */
+    return (wide > 0xff) ? 0xff : (uint8_t)wide;
+}
+
+static inline uint8_t sub8_usat(uint8_t a, uint8_t b)
+{
+    /* 8-bit unsigned saturating subtract: a - b, clamped at zero.  */
+    if (a > b)
+        return a - b;
+    return 0;
+}
+
+#define ADD16(a, b, n) RESULT(add16_usat(a, b), n, 16);
+#define SUB16(a, b, n) RESULT(sub16_usat(a, b), n, 16);
+#define ADD8(a, b, n)  RESULT(add8_usat(a, b), n, 8);
+#define SUB8(a, b, n)  RESULT(sub8_usat(a, b), n, 8);
+#define PFX uq
+
+#include "op_addsub.h"
+
+/* Signed modulo arithmetic.  GE reflects the sign of the full-precision
+   result, so the sum is formed from sign-extended lanes before truncation.  */
+#define SARITH16(a, b, n, op) do { \
+    int32_t sum; \
+    sum = (int32_t)(int16_t)(a) op (int32_t)(int16_t)(b); \
+    RESULT(sum, n, 16); \
+    if (sum >= 0) \
+        ge |= 3 << (n * 2); \
+    } while(0)
+
+#define SARITH8(a, b, n, op) do { \
+    int32_t sum; \
+    sum = (int32_t)(int8_t)(a) op (int32_t)(int8_t)(b); \
+    RESULT(sum, n, 8); \
+    if (sum >= 0) \
+        ge |= 1 << n; \
+    } while(0)
+
+#define ADD16(a, b, n) SARITH16(a, b, n, +)
+#define SUB16(a, b, n) SARITH16(a, b, n, -)
+#define ADD8(a, b, n)  SARITH8(a, b, n, +)
+#define SUB8(a, b, n)  SARITH8(a, b, n, -)
+#define PFX s
+#define ARITH_GE
+
+#include "op_addsub.h"
+
+/* Unsigned modulo arithmetic.  GE: carry out (add) or no borrow (sub).  */
+#define ADD16(a, b, n) do { \
+    uint32_t sum; \
+    sum = (uint32_t)(uint16_t)(a) + (uint32_t)(uint16_t)(b); \
+    RESULT(sum, n, 16); \
+    if ((sum >> 16) == 1) \
+        ge |= 3 << (n * 2); \
+    } while(0)
+
+#define ADD8(a, b, n) do { \
+    uint32_t sum; \
+    sum = (uint32_t)(uint8_t)(a) + (uint32_t)(uint8_t)(b); \
+    RESULT(sum, n, 8); \
+    if ((sum >> 8) == 1) \
+        ge |= 1 << n; \
+    } while(0)
+
+#define SUB16(a, b, n) do { \
+    uint32_t sum; \
+    sum = (uint32_t)(uint16_t)(a) - (uint32_t)(uint16_t)(b); \
+    RESULT(sum, n, 16); \
+    if ((sum >> 16) == 0) \
+        ge |= 3 << (n * 2); \
+    } while(0)
+
+#define SUB8(a, b, n) do { \
+    uint32_t sum; \
+    sum = (uint32_t)(uint8_t)(a) - (uint32_t)(uint8_t)(b); \
+    RESULT(sum, n, 8); \
+    if ((sum >> 8) == 0) \
+        ge |= 1 << n; \
+    } while(0)
+
+#define PFX u
+#define ARITH_GE
+
+#include "op_addsub.h"
+
+/* Halved signed arithmetic.  */
+#define ADD16(a, b, n) \
+  RESULT(((int32_t)(int16_t)(a) + (int32_t)(int16_t)(b)) >> 1, n, 16)
+#define SUB16(a, b, n) \
+  RESULT(((int32_t)(int16_t)(a) - (int32_t)(int16_t)(b)) >> 1, n, 16)
+#define ADD8(a, b, n) \
+  RESULT(((int32_t)(int8_t)(a) + (int32_t)(int8_t)(b)) >> 1, n, 8)
+#define SUB8(a, b, n) \
+  RESULT(((int32_t)(int8_t)(a) - (int32_t)(int8_t)(b)) >> 1, n, 8)
+#define PFX sh
+
+#include "op_addsub.h"
+
+/* Halved unsigned arithmetic.  */
+#define ADD16(a, b, n) \
+  RESULT(((uint32_t)(uint16_t)(a) + (uint32_t)(uint16_t)(b)) >> 1, n, 16)
+#define SUB16(a, b, n) \
+  RESULT(((uint32_t)(uint16_t)(a) - (uint32_t)(uint16_t)(b)) >> 1, n, 16)
+#define ADD8(a, b, n) \
+  RESULT(((uint32_t)(uint8_t)(a) + (uint32_t)(uint8_t)(b)) >> 1, n, 8)
+#define SUB8(a, b, n) \
+  RESULT(((uint32_t)(uint8_t)(a) - (uint32_t)(uint8_t)(b)) >> 1, n, 8)
+#define PFX uh
+
+#include "op_addsub.h"
+
+static inline uint8_t do_usad(uint8_t a, uint8_t b)
+{
+    /* Absolute difference of two bytes: larger operand minus smaller.  */
+    uint8_t hi = (a > b) ? a : b;
+    uint8_t lo = (a > b) ? b : a;
+    return hi - lo;
+}
+
+/* Unsigned sum of absolute byte differences over the four byte lanes.  */
+uint32_t HELPER(usad8)(uint32_t a, uint32_t b)
+{
+    uint32_t total = 0;
+    int lane;
+
+    for (lane = 0; lane < 4; lane++)
+        total += do_usad(a >> (lane * 8), b >> (lane * 8));
+    return total;
+}
+
+/* For ARMv6 SEL instruction: pick each byte from a or b per flag bit.  */
+uint32_t HELPER(sel_flags)(uint32_t flags, uint32_t a, uint32_t b)
+{
+    uint32_t mask = 0;
+    int lane;
+
+    /* Flag bit i set means byte lane i of the result comes from a.  */
+    for (lane = 0; lane < 4; lane++) {
+        if (flags & (1u << lane)) {
+            mask |= 0xffu << (lane * 8);
+        }
+    }
+
+    /* Lanes outside the mask are taken from b.  */
+    return (a & mask) | (b & ~mask);
+}
+
diff --git a/target-arm/helpers.h b/target-arm/helpers.h
index 9f60814..854b67c 100644
--- a/target-arm/helpers.h
+++ b/target-arm/helpers.h
@@ -1,4 +1,4 @@
-#define DEF_HELPER(name, ret, args) ret helper_##name args;
+#define DEF_HELPER(name, ret, args) ret glue(helper_,name) args;
 
 #ifdef GEN_HELPER
 #define DEF_HELPER_1_1(name, ret, args) \
@@ -13,10 +13,18 @@ static inline void gen_helper_##name(TCGv ret, TCGv arg1, TCGv arg2) \
 { \
     tcg_gen_helper_1_2(helper_##name, ret, arg1, arg2); \
 }
+#define DEF_HELPER_1_3(name, ret, args) \
+DEF_HELPER(name, ret, args) \
+static inline void gen_helper_##name(TCGv ret, \
+    TCGv arg1, TCGv arg2, TCGv arg3) \
+{ \
+    tcg_gen_helper_1_3(helper_##name, ret, arg1, arg2, arg3); \
+}
 #else /* !GEN_HELPER */
 #define DEF_HELPER_1_1 DEF_HELPER
 #define DEF_HELPER_1_2 DEF_HELPER
-#define HELPER(x) helper_##x
+#define DEF_HELPER_1_3 DEF_HELPER
+#define HELPER(x) glue(helper_,x)
 #endif
 
 DEF_HELPER_1_1(clz, uint32_t, (uint32_t))
@@ -33,6 +41,40 @@ DEF_HELPER_1_2(sdiv, int32_t, (int32_t, int32_t))
 DEF_HELPER_1_2(udiv, uint32_t, (uint32_t, uint32_t))
 DEF_HELPER_1_1(rbit, uint32_t, (uint32_t))
 
+#define PAS_OP(pfx)  \
+    DEF_HELPER_1_3(pfx ## add8, uint32_t, (uint32_t, uint32_t, uint32_t *)) \
+    DEF_HELPER_1_3(pfx ## sub8, uint32_t, (uint32_t, uint32_t, uint32_t *)) \
+    DEF_HELPER_1_3(pfx ## sub16, uint32_t, (uint32_t, uint32_t, uint32_t *)) \
+    DEF_HELPER_1_3(pfx ## add16, uint32_t, (uint32_t, uint32_t, uint32_t *)) \
+    DEF_HELPER_1_3(pfx ## addsubx, uint32_t, (uint32_t, uint32_t, uint32_t *)) \
+    DEF_HELPER_1_3(pfx ## subaddx, uint32_t, (uint32_t, uint32_t, uint32_t *))
+
+PAS_OP(s)
+PAS_OP(u)
+#undef PAS_OP
+
+#define PAS_OP(pfx)  \
+    DEF_HELPER_1_2(pfx ## add8, uint32_t, (uint32_t, uint32_t)) \
+    DEF_HELPER_1_2(pfx ## sub8, uint32_t, (uint32_t, uint32_t)) \
+    DEF_HELPER_1_2(pfx ## sub16, uint32_t, (uint32_t, uint32_t)) \
+    DEF_HELPER_1_2(pfx ## add16, uint32_t, (uint32_t, uint32_t)) \
+    DEF_HELPER_1_2(pfx ## addsubx, uint32_t, (uint32_t, uint32_t)) \
+    DEF_HELPER_1_2(pfx ## subaddx, uint32_t, (uint32_t, uint32_t))
+PAS_OP(q)
+PAS_OP(sh)
+PAS_OP(uq)
+PAS_OP(uh)
+#undef PAS_OP
+
+DEF_HELPER_1_2(ssat, uint32_t, (uint32_t, uint32_t))
+DEF_HELPER_1_2(usat, uint32_t, (uint32_t, uint32_t))
+DEF_HELPER_1_2(ssat16, uint32_t, (uint32_t, uint32_t))
+DEF_HELPER_1_2(usat16, uint32_t, (uint32_t, uint32_t))
+
+DEF_HELPER_1_2(usad8, uint32_t, (uint32_t, uint32_t))
+
+DEF_HELPER_1_3(sel_flags, uint32_t, (uint32_t, uint32_t, uint32_t))
+
 #undef DEF_HELPER
 #undef DEF_HELPER_1_1
 #undef DEF_HELPER_1_2
diff --git a/target-arm/op.c b/target-arm/op.c
index e714d41..c3150ad 100644
--- a/target-arm/op.c
+++ b/target-arm/op.c
@@ -805,327 +805,6 @@ void OPPROTO op_movl_user_T0(void)
     FORCE_RET();
 }
 
-/* ARMv6 Media instructions.  */
-
-/* Note that signed overflow is undefined in C.  The following routines are
-   careful to use unsigned types where modulo arithmetic is required.
-   Failure to do so _will_ break on newer gcc.  */
-
-/* Signed saturating arithmetic.  */
-
-/* Perform 16-bit signed satruating addition.  */
-static inline uint16_t add16_sat(uint16_t a, uint16_t b)
-{
-    uint16_t res;
-
-    res = a + b;
-    if (((res ^ a) & 0x8000) && !((a ^ b) & 0x8000)) {
-        if (a & 0x8000)
-            res = 0x8000;
-        else
-            res = 0x7fff;
-    }
-    return res;
-}
-
-/* Perform 8-bit signed satruating addition.  */
-static inline uint8_t add8_sat(uint8_t a, uint8_t b)
-{
-    uint8_t res;
-
-    res = a + b;
-    if (((res ^ a) & 0x80) && !((a ^ b) & 0x80)) {
-        if (a & 0x80)
-            res = 0x80;
-        else
-            res = 0x7f;
-    }
-    return res;
-}
-
-/* Perform 16-bit signed satruating subtraction.  */
-static inline uint16_t sub16_sat(uint16_t a, uint16_t b)
-{
-    uint16_t res;
-
-    res = a - b;
-    if (((res ^ a) & 0x8000) && ((a ^ b) & 0x8000)) {
-        if (a & 0x8000)
-            res = 0x8000;
-        else
-            res = 0x7fff;
-    }
-    return res;
-}
-
-/* Perform 8-bit signed satruating subtraction.  */
-static inline uint8_t sub8_sat(uint8_t a, uint8_t b)
-{
-    uint8_t res;
-
-    res = a - b;
-    if (((res ^ a) & 0x80) && ((a ^ b) & 0x80)) {
-        if (a & 0x80)
-            res = 0x80;
-        else
-            res = 0x7f;
-    }
-    return res;
-}
-
-#define ADD16(a, b, n) RESULT(add16_sat(a, b), n, 16);
-#define SUB16(a, b, n) RESULT(sub16_sat(a, b), n, 16);
-#define ADD8(a, b, n)  RESULT(add8_sat(a, b), n, 8);
-#define SUB8(a, b, n)  RESULT(sub8_sat(a, b), n, 8);
-#define PFX q
-
-#include "op_addsub.h"
-
-/* Unsigned saturating arithmetic.  */
-static inline uint16_t add16_usat(uint16_t a, uint8_t b)
-{
-    uint16_t res;
-    res = a + b;
-    if (res < a)
-        res = 0xffff;
-    return res;
-}
-
-static inline uint16_t sub16_usat(uint16_t a, uint8_t b)
-{
-    if (a < b)
-        return a - b;
-    else
-        return 0;
-}
-
-static inline uint8_t add8_usat(uint8_t a, uint8_t b)
-{
-    uint8_t res;
-    res = a + b;
-    if (res < a)
-        res = 0xff;
-    return res;
-}
-
-static inline uint8_t sub8_usat(uint8_t a, uint8_t b)
-{
-    if (a < b)
-        return a - b;
-    else
-        return 0;
-}
-
-#define ADD16(a, b, n) RESULT(add16_usat(a, b), n, 16);
-#define SUB16(a, b, n) RESULT(sub16_usat(a, b), n, 16);
-#define ADD8(a, b, n)  RESULT(add8_usat(a, b), n, 8);
-#define SUB8(a, b, n)  RESULT(sub8_usat(a, b), n, 8);
-#define PFX uq
-
-#include "op_addsub.h"
-
-/* Signed modulo arithmetic.  */
-#define SARITH16(a, b, n, op) do { \
-    int32_t sum; \
-    sum = (int16_t)((uint16_t)(a) op (uint16_t)(b)); \
-    RESULT(sum, n, 16); \
-    if (sum >= 0) \
-        ge |= 3 << (n * 2); \
-    } while(0)
-
-#define SARITH8(a, b, n, op) do { \
-    int32_t sum; \
-    sum = (int8_t)((uint8_t)(a) op (uint8_t)(b)); \
-    RESULT(sum, n, 8); \
-    if (sum >= 0) \
-        ge |= 1 << n; \
-    } while(0)
-
-
-#define ADD16(a, b, n) SARITH16(a, b, n, +)
-#define SUB16(a, b, n) SARITH16(a, b, n, -)
-#define ADD8(a, b, n)  SARITH8(a, b, n, +)
-#define SUB8(a, b, n)  SARITH8(a, b, n, -)
-#define PFX s
-#define ARITH_GE
-
-#include "op_addsub.h"
-
-/* Unsigned modulo arithmetic.  */
-#define ADD16(a, b, n) do { \
-    uint32_t sum; \
-    sum = (uint32_t)(uint16_t)(a) + (uint32_t)(uint16_t)(b); \
-    RESULT(sum, n, 16); \
-    if ((sum >> 16) == 0) \
-        ge |= 3 << (n * 2); \
-    } while(0)
-
-#define ADD8(a, b, n) do { \
-    uint32_t sum; \
-    sum = (uint32_t)(uint8_t)(a) + (uint32_t)(uint8_t)(b); \
-    RESULT(sum, n, 8); \
-    if ((sum >> 8) == 0) \
-        ge |= 3 << (n * 2); \
-    } while(0)
-
-#define SUB16(a, b, n) do { \
-    uint32_t sum; \
-    sum = (uint32_t)(uint16_t)(a) - (uint32_t)(uint16_t)(b); \
-    RESULT(sum, n, 16); \
-    if ((sum >> 16) == 0) \
-        ge |= 3 << (n * 2); \
-    } while(0)
-
-#define SUB8(a, b, n) do { \
-    uint32_t sum; \
-    sum = (uint32_t)(uint8_t)(a) - (uint32_t)(uint8_t)(b); \
-    RESULT(sum, n, 8); \
-    if ((sum >> 8) == 0) \
-        ge |= 3 << (n * 2); \
-    } while(0)
-
-#define PFX u
-#define ARITH_GE
-
-#include "op_addsub.h"
-
-/* Halved signed arithmetic.  */
-#define ADD16(a, b, n) \
-  RESULT(((int32_t)(int16_t)(a) + (int32_t)(int16_t)(b)) >> 1, n, 16)
-#define SUB16(a, b, n) \
-  RESULT(((int32_t)(int16_t)(a) - (int32_t)(int16_t)(b)) >> 1, n, 16)
-#define ADD8(a, b, n) \
-  RESULT(((int32_t)(int8_t)(a) + (int32_t)(int8_t)(b)) >> 1, n, 8)
-#define SUB8(a, b, n) \
-  RESULT(((int32_t)(int8_t)(a) - (int32_t)(int8_t)(b)) >> 1, n, 8)
-#define PFX sh
-
-#include "op_addsub.h"
-
-/* Halved unsigned arithmetic.  */
-#define ADD16(a, b, n) \
-  RESULT(((uint32_t)(uint16_t)(a) + (uint32_t)(uint16_t)(b)) >> 1, n, 16)
-#define SUB16(a, b, n) \
-  RESULT(((uint32_t)(uint16_t)(a) - (uint32_t)(uint16_t)(b)) >> 1, n, 16)
-#define ADD8(a, b, n) \
-  RESULT(((uint32_t)(uint8_t)(a) + (uint32_t)(uint8_t)(b)) >> 1, n, 8)
-#define SUB8(a, b, n) \
-  RESULT(((uint32_t)(uint8_t)(a) - (uint32_t)(uint8_t)(b)) >> 1, n, 8)
-#define PFX uh
-
-#include "op_addsub.h"
-
-void OPPROTO op_sel_T0_T1(void)
-{
-    uint32_t mask;
-    uint32_t flags;
-
-    flags = env->GE;
-    mask = 0;
-    if (flags & 1)
-        mask |= 0xff;
-    if (flags & 2)
-        mask |= 0xff00;
-    if (flags & 4)
-        mask |= 0xff0000;
-    if (flags & 8)
-        mask |= 0xff000000;
-    T0 = (T0 & mask) | (T1 & ~mask);
-    FORCE_RET();
-}
-
-/* Signed saturation.  */
-static inline uint32_t do_ssat(int32_t val, int shift)
-{
-    int32_t top;
-    uint32_t mask;
-
-    shift = PARAM1;
-    top = val >> shift;
-    mask = (1u << shift) - 1;
-    if (top > 0) {
-        env->QF = 1;
-        return mask;
-    } else if (top < -1) {
-        env->QF = 1;
-        return ~mask;
-    }
-    return val;
-}
-
-/* Unsigned saturation.  */
-static inline uint32_t do_usat(int32_t val, int shift)
-{
-    uint32_t max;
-
-    shift = PARAM1;
-    max = (1u << shift) - 1;
-    if (val < 0) {
-        env->QF = 1;
-        return 0;
-    } else if (val > max) {
-        env->QF = 1;
-        return max;
-    }
-    return val;
-}
-
-/* Signed saturate.  */
-void OPPROTO op_ssat_T1(void)
-{
-    T0 = do_ssat(T0, PARAM1);
-    FORCE_RET();
-}
-
-/* Dual halfword signed saturate.  */
-void OPPROTO op_ssat16_T1(void)
-{
-    uint32_t res;
-
-    res = (uint16_t)do_ssat((int16_t)T0, PARAM1);
-    res |= do_ssat(((int32_t)T0) >> 16, PARAM1) << 16;
-    T0 = res;
-    FORCE_RET();
-}
-
-/* Unsigned saturate.  */
-void OPPROTO op_usat_T1(void)
-{
-    T0 = do_usat(T0, PARAM1);
-    FORCE_RET();
-}
-
-/* Dual halfword unsigned saturate.  */
-void OPPROTO op_usat16_T1(void)
-{
-    uint32_t res;
-
-    res = (uint16_t)do_usat((int16_t)T0, PARAM1);
-    res |= do_usat(((int32_t)T0) >> 16, PARAM1) << 16;
-    T0 = res;
-    FORCE_RET();
-}
-
-/* Dual 16-bit add.  */
-static inline uint8_t do_usad(uint8_t a, uint8_t b)
-{
-    if (a > b)
-        return a - b;
-    else
-        return b - a;
-}
-
-/* Unsigned sum of absolute byte differences.  */
-void OPPROTO op_usad8_T0_T1(void)
-{
-    uint32_t sum;
-    sum = do_usad(T0, T1);
-    sum += do_usad(T0 >> 8, T1 >> 8);
-    sum += do_usad(T0 >> 16, T1 >>16);
-    sum += do_usad(T0 >> 24, T1 >> 24);
-    T0 = sum;
-}
-
 void OPPROTO op_movl_T1_r13_banked(void)
 {
     T1 = helper_get_r13_banked(env, PARAM1);
diff --git a/target-arm/op_addsub.h b/target-arm/op_addsub.h
index d15360d..376ee27 100644
--- a/target-arm/op_addsub.h
+++ b/target-arm/op_addsub.h
@@ -8,9 +8,11 @@
  */
 
 #ifdef ARITH_GE
+#define GE_ARG , uint32_t *gep
 #define DECLARE_GE uint32_t ge = 0
-#define SET_GE env->GE = ge
+#define SET_GE *gep = ge
 #else
+#define GE_ARG
 #define DECLARE_GE do{}while(0)
 #define SET_GE do{}while(0)
 #endif
@@ -18,82 +20,77 @@
 #define RESULT(val, n, width) \
     res |= ((uint32_t)(glue(glue(uint,width),_t))(val)) << (n * width)
 
-void OPPROTO glue(glue(op_,PFX),add16_T0_T1)(void)
+uint32_t HELPER(glue(PFX,add16))(uint32_t a, uint32_t b GE_ARG)
 {
     uint32_t res = 0;
     DECLARE_GE;
 
-    ADD16(T0, T1, 0);
-    ADD16(T0 >> 16, T1 >> 16, 1);
+    ADD16(a, b, 0);
+    ADD16(a >> 16, b >> 16, 1);
     SET_GE;
-    T0 = res;
-    FORCE_RET();
+    return res;
 }
 
-void OPPROTO glue(glue(op_,PFX),add8_T0_T1)(void)
+uint32_t HELPER(glue(PFX,add8))(uint32_t a, uint32_t b GE_ARG)
 {
     uint32_t res = 0;
     DECLARE_GE;
 
-    ADD8(T0, T1, 0);
-    ADD8(T0 >> 8, T1 >> 8, 1);
-    ADD8(T0 >> 16, T1 >> 16, 2);
-    ADD8(T0 >> 24, T1 >> 24, 3);
+    ADD8(a, b, 0);
+    ADD8(a >> 8, b >> 8, 1);
+    ADD8(a >> 16, b >> 16, 2);
+    ADD8(a >> 24, b >> 24, 3);
     SET_GE;
-    T0 = res;
-    FORCE_RET();
+    return res;
 }
 
-void OPPROTO glue(glue(op_,PFX),sub16_T0_T1)(void)
+uint32_t HELPER(glue(PFX,sub16))(uint32_t a, uint32_t b GE_ARG)
 {
     uint32_t res = 0;
     DECLARE_GE;
 
-    SUB16(T0, T1, 0);
-    SUB16(T0 >> 16, T1 >> 16, 1);
+    SUB16(a, b, 0);
+    SUB16(a >> 16, b >> 16, 1);
     SET_GE;
-    T0 = res;
-    FORCE_RET();
+    return res;
 }
 
-void OPPROTO glue(glue(op_,PFX),sub8_T0_T1)(void)
+uint32_t HELPER(glue(PFX,sub8))(uint32_t a, uint32_t b GE_ARG)
 {
     uint32_t res = 0;
     DECLARE_GE;
 
-    SUB8(T0, T1, 0);
-    SUB8(T0 >> 8, T1 >> 8, 1);
-    SUB8(T0 >> 16, T1 >> 16, 2);
-    SUB8(T0 >> 24, T1 >> 24, 3);
+    SUB8(a, b, 0);
+    SUB8(a >> 8, b >> 8, 1);
+    SUB8(a >> 16, b >> 16, 2);
+    SUB8(a >> 24, b >> 24, 3);
     SET_GE;
-    T0 = res;
-    FORCE_RET();
+    return res;
 }
 
-void OPPROTO glue(glue(op_,PFX),subaddx_T0_T1)(void)
+uint32_t HELPER(glue(PFX,subaddx))(uint32_t a, uint32_t b GE_ARG)
 {
     uint32_t res = 0;
     DECLARE_GE;
 
-    ADD16(T0, T1, 0);
-    SUB16(T0 >> 16, T1 >> 16, 1);
+    ADD16(a, b, 0);
+    SUB16(a >> 16, b >> 16, 1);
     SET_GE;
-    T0 = res;
-    FORCE_RET();
+    return res;
 }
 
-void OPPROTO glue(glue(op_,PFX),addsubx_T0_T1)(void)
+uint32_t HELPER(glue(PFX,addsubx))(uint32_t a, uint32_t b GE_ARG)
 {
     uint32_t res = 0;
     DECLARE_GE;
 
-    SUB16(T0, T1, 0);
-    ADD16(T0 >> 16, T1 >> 16, 1);
+    SUB16(a, b, 0);
+    ADD16(a >> 16, b >> 16, 1);
     SET_GE;
-    T0 = res;
-    FORCE_RET();
+    return res;
 }
 
+#undef GE_ARG
 #undef DECLARE_GE
 #undef SET_GE
 #undef RESULT
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index 1b90f58..2d3abfd 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -369,3 +369,70 @@ uint32_t HELPER(sub_usaturate)(uint32_t a, uint32_t b)
     return res;
 }
 
+/* Signed saturation of val to (shift + 1) bits; sets env->QF on clamp.  */
+static inline uint32_t do_ssat(int32_t val, int shift)
+{
+    int32_t top;
+    uint32_t mask;
+
+    /* The saturation width is the shift argument supplied by the caller.  */
+    top = val >> shift;
+    mask = (1u << shift) - 1;
+    if (top > 0) {
+        env->QF = 1;
+        return mask;
+    } else if (top < -1) {
+        env->QF = 1;
+        return ~mask;
+    }
+    return val;
+}
+
+/* Unsigned saturation of val to [0, 2^shift - 1]; sets env->QF on clamp.  */
+static inline uint32_t do_usat(int32_t val, int shift)
+{
+    uint32_t max;
+
+    /* The saturation width is the shift argument supplied by the caller.  */
+    max = (1u << shift) - 1;
+    if (val < 0) {
+        env->QF = 1;
+        return 0;
+    } else if (val > max) {
+        env->QF = 1;
+        return max;
+    }
+    return val;
+}
+
+/* Signed saturate: forwards x and shift unchanged to do_ssat().  */
+uint32_t HELPER(ssat)(uint32_t x, uint32_t shift)
+{
+    return do_ssat(x, shift);
+}
+
+/* Dual halfword signed saturate.  */
+uint32_t HELPER(ssat16)(uint32_t x, uint32_t shift)
+{
+    /* Sign-extend each halfword, saturate it, then repack low | high.  */
+    uint32_t lo = (uint16_t)do_ssat((int16_t)x, shift);
+    uint32_t hi = do_ssat(((int32_t)x) >> 16, shift) << 16;
+
+    return lo | hi;
+}
+
+/* Unsigned saturate: forwards x and shift unchanged to do_usat().  */
+uint32_t HELPER(usat)(uint32_t x, uint32_t shift)
+{
+    return do_usat(x, shift);
+}
+
+/* Dual halfword unsigned saturate.  */
+uint32_t HELPER(usat16)(uint32_t x, uint32_t shift)
+{
+    /* Sign-extend each halfword, saturate it, then repack low | high.  */
+    uint32_t lo = (uint16_t)do_usat((int16_t)x, shift);
+    uint32_t hi = do_usat(((int32_t)x) >> 16, shift) << 16;
+
+    return lo | hi;
+}
diff --git a/target-arm/translate.c b/target-arm/translate.c
index e46cfb9..f732f91 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -226,7 +226,6 @@ static void gen_smul_dual(TCGv a, TCGv b)
 {
     TCGv tmp1 = new_tmp();
     TCGv tmp2 = new_tmp();
-    TCGv res;
     tcg_gen_ext8s_i32(tmp1, a);
     tcg_gen_ext8s_i32(tmp2, b);
     tcg_gen_mul_i32(tmp1, tmp1, tmp2);
@@ -495,49 +494,93 @@ static inline void gen_arm_shift_im(TCGv var, int shiftop, int shift, int flags)
     }
 };
 
-#define PAS_OP(pfx) {  \
-    gen_op_ ## pfx ## add16_T0_T1, \
-    gen_op_ ## pfx ## addsubx_T0_T1, \
-    gen_op_ ## pfx ## subaddx_T0_T1, \
-    gen_op_ ## pfx ## sub16_T0_T1, \
-    gen_op_ ## pfx ## add8_T0_T1, \
-    NULL, \
-    NULL, \
-    gen_op_ ## pfx ## sub8_T0_T1 }
-
-static GenOpFunc *gen_arm_parallel_addsub[8][8] = {
-    {},
-    PAS_OP(s),
-    PAS_OP(q),
-    PAS_OP(sh),
-    {},
-    PAS_OP(u),
-    PAS_OP(uq),
-    PAS_OP(uh),
-};
+#define PAS_OP(pfx) \
+    switch (op2) {  \
+    case 0: gen_pas_helper(glue(pfx,add16)); break; \
+    case 1: gen_pas_helper(glue(pfx,addsubx)); break; \
+    case 2: gen_pas_helper(glue(pfx,subaddx)); break; \
+    case 3: gen_pas_helper(glue(pfx,sub16)); break; \
+    case 4: gen_pas_helper(glue(pfx,add8)); break; \
+    case 7: gen_pas_helper(glue(pfx,sub8)); break; \
+    }
+void gen_arm_parallel_addsub(int op1, int op2, TCGv a, TCGv b)
+{
+    TCGv tmp;
+
+    switch (op1) {
+#define gen_pas_helper(name) glue(gen_helper_,name)(a, a, b, tmp)
+    case 1:
+        tmp = tcg_temp_new(TCG_TYPE_PTR);
+        tcg_gen_addi_ptr(tmp, cpu_env, offsetof(CPUState, GE));
+        PAS_OP(s)
+        break;
+    case 5:
+        tmp = tcg_temp_new(TCG_TYPE_PTR);
+        tcg_gen_addi_ptr(tmp, cpu_env, offsetof(CPUState, GE));
+        PAS_OP(u)
+        break;
+#undef gen_pas_helper
+#define gen_pas_helper(name) glue(gen_helper_,name)(a, a, b)
+    case 2:
+        PAS_OP(q);
+        break;
+    case 3:
+        PAS_OP(sh);
+        break;
+    case 6:
+        PAS_OP(uq);
+        break;
+    case 7:
+        PAS_OP(uh);
+        break;
+#undef gen_pas_helper
+    }
+}
 #undef PAS_OP
 
-/* For unknown reasons Arm and Thumb-2 use arbitrarily diffenet encodings.  */
-#define PAS_OP(pfx) {  \
-    gen_op_ ## pfx ## add8_T0_T1, \
-    gen_op_ ## pfx ## add16_T0_T1, \
-    gen_op_ ## pfx ## addsubx_T0_T1, \
-    NULL, \
-    gen_op_ ## pfx ## sub8_T0_T1, \
-    gen_op_ ## pfx ## sub16_T0_T1, \
-    gen_op_ ## pfx ## subaddx_T0_T1, \
-    NULL }
-
-static GenOpFunc *gen_thumb2_parallel_addsub[8][8] = {
-    PAS_OP(s),
-    PAS_OP(q),
-    PAS_OP(sh),
-    {},
-    PAS_OP(u),
-    PAS_OP(uq),
-    PAS_OP(uh),
-    {}
-};
+/* For unknown reasons Arm and Thumb-2 use arbitrarily different encodings.  */
+#define PAS_OP(pfx) \
+    switch (op2) {  \
+    case 0: gen_pas_helper(glue(pfx,add8)); break; \
+    case 1: gen_pas_helper(glue(pfx,add16)); break; \
+    case 2: gen_pas_helper(glue(pfx,addsubx)); break; \
+    case 4: gen_pas_helper(glue(pfx,sub8)); break; \
+    case 5: gen_pas_helper(glue(pfx,sub16)); break; \
+    case 6: gen_pas_helper(glue(pfx,subaddx)); break; \
+    }
+void gen_thumb2_parallel_addsub(int op1, int op2, TCGv a, TCGv b)
+{
+    TCGv tmp;
+
+    switch (op1) {
+#define gen_pas_helper(name) glue(gen_helper_,name)(a, a, b, tmp)
+    case 0:
+        tmp = tcg_temp_new(TCG_TYPE_PTR);
+        tcg_gen_addi_ptr(tmp, cpu_env, offsetof(CPUState, GE));
+        PAS_OP(s)
+        break;
+    case 4:
+        tmp = tcg_temp_new(TCG_TYPE_PTR);
+        tcg_gen_addi_ptr(tmp, cpu_env, offsetof(CPUState, GE));
+        PAS_OP(u)
+        break;
+#undef gen_pas_helper
+#define gen_pas_helper(name) glue(gen_helper_,name)(a, a, b)
+    case 1:
+        PAS_OP(q);
+        break;
+    case 2:
+        PAS_OP(sh);
+        break;
+    case 5:
+        PAS_OP(uq);
+        break;
+    case 6:
+        PAS_OP(uh);
+        break;
+#undef gen_pas_helper
+    }
+}
 #undef PAS_OP
 
 static GenOpFunc1 *gen_test_cc[14] = {
@@ -4906,6 +4949,7 @@ static void disas_arm_insn(CPUState * env, DisasContext *s)
     unsigned int cond, insn, val, op1, i, shift, rm, rs, rn, rd, sh;
     TCGv tmp;
     TCGv tmp2;
+    TCGv tmp3;
 
     insn = ldl_code(s->pc);
     s->pc += 4;
@@ -5591,13 +5635,14 @@ static void disas_arm_insn(CPUState * env, DisasContext *s)
                 switch ((insn >> 23) & 3) {
                 case 0: /* Parallel add/subtract.  */
                     op1 = (insn >> 20) & 7;
-                    gen_movl_T0_reg(s, rn);
-                    gen_movl_T1_reg(s, rm);
+                    tmp = load_reg(s, rn);
+                    tmp2 = load_reg(s, rm);
                     sh = (insn >> 5) & 7;
                     if ((op1 & 3) == 0 || sh == 5 || sh == 6)
                         goto illegal_op;
-                    gen_arm_parallel_addsub[op1][sh]();
-                    gen_movl_reg_T0(s, rd);
+                    gen_arm_parallel_addsub(op1, sh, tmp, tmp2);
+                    dead_tmp(tmp2);
+                    store_reg(s, rd, tmp);
                     break;
                 case 1:
                     if ((insn & 0x00700020) == 0) {
@@ -5620,40 +5665,44 @@ static void disas_arm_insn(CPUState * env, DisasContext *s)
                         store_reg(s, rd, tmp);
                     } else if ((insn & 0x00200020) == 0x00200000) {
                         /* [us]sat */
-                        gen_movl_T1_reg(s, rm);
+                        tmp = load_reg(s, rm);
                         shift = (insn >> 7) & 0x1f;
                         if (insn & (1 << 6)) {
                             if (shift == 0)
                                 shift = 31;
-                            gen_op_sarl_T1_im(shift);
+                            tcg_gen_sari_i32(tmp, tmp, shift);
                         } else {
-                            gen_op_shll_T1_im(shift);
+                            tcg_gen_shli_i32(tmp, tmp, shift);
                         }
                         sh = (insn >> 16) & 0x1f;
                         if (sh != 0) {
                             if (insn & (1 << 22))
-                                gen_op_usat_T1(sh);
+                                gen_helper_usat(tmp, tmp, tcg_const_i32(sh));
                             else
-                                gen_op_ssat_T1(sh);
+                                gen_helper_ssat(tmp, tmp, tcg_const_i32(sh));
                         }
-                        gen_movl_T1_reg(s, rd);
+                        store_reg(s, rd, tmp);
                     } else if ((insn & 0x00300fe0) == 0x00200f20) {
                         /* [us]sat16 */
-                        gen_movl_T1_reg(s, rm);
+                        tmp = load_reg(s, rm);
                         sh = (insn >> 16) & 0x1f;
                         if (sh != 0) {
                             if (insn & (1 << 22))
-                                gen_op_usat16_T1(sh);
+                                gen_helper_usat16(tmp, tmp, tcg_const_i32(sh));
                             else
-                                gen_op_ssat16_T1(sh);
+                                gen_helper_ssat16(tmp, tmp, tcg_const_i32(sh));
                         }
-                        gen_movl_T1_reg(s, rd);
+                        store_reg(s, rd, tmp);
                     } else if ((insn & 0x00700fe0) == 0x00000fa0) {
                         /* Select bytes.  */
-                        gen_movl_T0_reg(s, rn);
-                        gen_movl_T1_reg(s, rm);
-                        gen_op_sel_T0_T1();
-                        gen_movl_reg_T0(s, rd);
+                        tmp = load_reg(s, rn);
+                        tmp2 = load_reg(s, rm);
+                        tmp3 = new_tmp();
+                        tcg_gen_ld_i32(tmp3, cpu_env, offsetof(CPUState, GE));
+                        gen_helper_sel_flags(tmp, tmp3, tmp, tmp2);
+                        dead_tmp(tmp3);
+                        dead_tmp(tmp2);
+                        store_reg(s, rd, tmp);
                     } else if ((insn & 0x000003e0) == 0x00000060) {
                         gen_movl_T1_reg(s, rm);
                         shift = (insn >> 10) & 3;
@@ -5755,15 +5804,17 @@ static void disas_arm_insn(CPUState * env, DisasContext *s)
                     op1 = ((insn >> 17) & 0x38) | ((insn >> 5) & 7);
                     switch (op1) {
                     case 0: /* Unsigned sum of absolute differences.  */
-                            goto illegal_op;
-                        gen_movl_T0_reg(s, rm);
-                        gen_movl_T1_reg(s, rs);
-                        gen_op_usad8_T0_T1();
+                        ARCH(6);
+                        tmp = load_reg(s, rm);
+                        tmp2 = load_reg(s, rs);
+                        gen_helper_usad8(tmp, tmp, tmp2);
+                        dead_tmp(tmp2);
                         if (rn != 15) {
-                            gen_movl_T1_reg(s, rn);
-                            gen_op_addl_T0_T1();
+                            tmp2 = load_reg(s, rn);
+                            tcg_gen_add_i32(tmp, tmp, tmp2);
+                            dead_tmp(tmp2);
                         }
-                        gen_movl_reg_T0(s, rd);
+                        store_reg(s, rd, tmp);
                         break;
                     case 0x20: case 0x24: case 0x28: case 0x2c:
                         /* Bitfield insert/clear.  */
@@ -6120,6 +6171,8 @@ static int disas_thumb2_insn(CPUState *env, DisasContext *s, uint16_t insn_hw1)
     uint32_t insn, imm, shift, offset, addr;
     uint32_t rd, rn, rm, rs;
     TCGv tmp;
+    TCGv tmp2;
+    TCGv tmp3;
     int op;
     int shiftop;
     int conds;
@@ -6464,10 +6517,11 @@ static int disas_thumb2_insn(CPUState *env, DisasContext *s, uint16_t insn_hw1)
             shift = (insn >> 4) & 7;
             if ((op & 3) == 3 || (shift & 3) == 3)
                 goto illegal_op;
-            gen_movl_T0_reg(s, rn);
-            gen_movl_T1_reg(s, rm);
-            gen_thumb2_parallel_addsub[op][shift]();
-            gen_movl_reg_T0(s, rd);
+            tmp = load_reg(s, rn);
+            tmp2 = load_reg(s, rm);
+            gen_thumb2_parallel_addsub(op, shift, tmp, tmp2);
+            dead_tmp(tmp2);
+            store_reg(s, rd, tmp);
             break;
         case 3: /* Other data processing.  */
             op = ((insn >> 17) & 0x38) | ((insn >> 4) & 7);
@@ -6498,7 +6552,10 @@ static int disas_thumb2_insn(CPUState *env, DisasContext *s, uint16_t insn_hw1)
                     break;
                 case 0x10: /* sel */
                     gen_movl_T1_reg(s, rm);
-                    gen_op_sel_T0_T1();
+                    tmp3 = new_tmp();
+                    tcg_gen_ld_i32(tmp3, cpu_env, offsetof(CPUState, GE));
+                    gen_helper_sel_flags(cpu_T[0], tmp3, cpu_T[0], cpu_T[1]);
+                    dead_tmp(tmp3);
                     break;
                 case 0x18: /* clz */
                     gen_helper_clz(cpu_T[0], cpu_T[0]);
@@ -6581,7 +6638,7 @@ static int disas_thumb2_insn(CPUState *env, DisasContext *s, uint16_t insn_hw1)
                 gen_movl_reg_T0(s, rd);
                 break;
             case 7: /* Unsigned sum of absolute differences.  */
-                gen_op_usad8_T0_T1();
+                gen_helper_usad8(cpu_T[0], cpu_T[0], cpu_T[1]);
                 if (rs != 15) {
                     gen_movl_T1_reg(s, rs);
                     gen_op_addl_T0_T1();
@@ -6821,63 +6878,64 @@ static int disas_thumb2_insn(CPUState *env, DisasContext *s, uint16_t insn_hw1)
                     op = (insn >> 21) & 7;
                     imm = insn & 0x1f;
                     shift = ((insn >> 6) & 3) | ((insn >> 10) & 0x1c);
-                    if (rn == 15)
-                        gen_op_movl_T1_im(0);
-                    else
-                        gen_movl_T1_reg(s, rn);
+                    if (rn == 15) {
+                        tmp = new_tmp();
+                        tcg_gen_movi_i32(tmp, 0);
+                    } else {
+                        tmp = load_reg(s, rn);
+                    }
                     switch (op) {
                     case 2: /* Signed bitfield extract.  */
                         imm++;
                         if (shift + imm > 32)
                             goto illegal_op;
                         if (imm < 32)
-                            gen_sbfx(cpu_T[1], shift, imm);
+                            gen_sbfx(tmp, shift, imm);
                         break;
                     case 6: /* Unsigned bitfield extract.  */
                         imm++;
                         if (shift + imm > 32)
                             goto illegal_op;
                         if (imm < 32)
-                            gen_ubfx(cpu_T[1], shift, (1u << imm) - 1);
+                            gen_ubfx(tmp, shift, (1u << imm) - 1);
                         break;
                     case 3: /* Bitfield insert/clear.  */
                         if (imm < shift)
                             goto illegal_op;
                         imm = imm + 1 - shift;
                         if (imm != 32) {
-                            gen_movl_T0_reg(s, rd);
-                            gen_bfi(cpu_T[1], cpu_T[0], cpu_T[1],
+                            tmp2 = load_reg(s, rd);
+                            gen_bfi(tmp, tmp2, tmp,
                                     shift, ((1u << imm) - 1) << shift);
+                            dead_tmp(tmp2);
                         }
                         break;
                     case 7:
                         goto illegal_op;
                     default: /* Saturate.  */
-                        gen_movl_T1_reg(s, rn);
                         if (shift) {
                             if (op & 1)
-                                gen_op_sarl_T1_im(shift);
+                                tcg_gen_sari_i32(tmp, tmp, shift);
                             else
-                                gen_op_shll_T1_im(shift);
+                                tcg_gen_shli_i32(tmp, tmp, shift);
                         }
+                        tmp2 = tcg_const_i32(imm);
                         if (op & 4) {
                             /* Unsigned.  */
-                            gen_op_ssat_T1(imm);
                             if ((op & 1) && shift == 0)
-                                gen_op_usat16_T1(imm);
+                                gen_helper_usat16(tmp, tmp, tmp2);
                             else
-                                gen_op_usat_T1(imm);
+                                gen_helper_usat(tmp, tmp, tmp2);
                         } else {
                             /* Signed.  */
-                            gen_op_ssat_T1(imm);
                             if ((op & 1) && shift == 0)
-                                gen_op_ssat16_T1(imm);
+                                gen_helper_ssat16(tmp, tmp, tmp2);
                             else
-                                gen_op_ssat_T1(imm);
+                                gen_helper_ssat(tmp, tmp, tmp2);
                         }
                         break;
                     }
-                    gen_movl_reg_T1(s, rd);
+                    store_reg(s, rd, tmp);
                 } else {
                     imm = ((insn & 0x04000000) >> 15)
                           | ((insn & 0x7000) >> 4) | (insn & 0xff);
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index f05c135..27f83b5 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -237,6 +237,18 @@ static inline void tcg_gen_helper_1_2(void *func, TCGv ret,
                  1, &ret, 2, args);
 }
 
+/* Emit a call to helper FUNC, passing three input operands and
+   receiving one output operand in RET.  */
+static inline void tcg_gen_helper_1_3(void *func, TCGv ret,
+                                      TCGv arg1, TCGv arg2, TCGv arg3)
+{
+    TCGv args[3] = { arg1, arg2, arg3 };
+
+    tcg_gen_call(&tcg_ctx,
+                 tcg_const_ptr((tcg_target_long)func), TCG_HELPER_CALL_FLAGS,
+                 1, &ret, 3, args);
+}
+
 static inline void tcg_gen_helper_1_4(void *func, TCGv ret,
                                       TCGv arg1, TCGv arg2, TCGv arg3,
                                       TCGv arg4)
@@ -1416,3 +1428,10 @@ static inline void tcg_gen_qemu_st64(TCGv arg, TCGv addr, int mem_index)
 #define tcg_gen_ext_tl_i64 tcg_gen_ext_i32_i64
 #define tcg_const_tl tcg_const_i32
 #endif
+
+#if TCG_TARGET_REG_BITS == 32 /* alias the _ptr add-immediate op by width */
+#define tcg_gen_addi_ptr tcg_gen_addi_i32
+#else /* TCG_TARGET_REG_BITS != 32 */
+#define tcg_gen_addi_ptr tcg_gen_addi_i64
+#endif /* TCG_TARGET_REG_BITS != 32 */
+