;; Machine description for AArch64 SVE.
;; Copyright (C) 2009-2019 Free Software Foundation, Inc.
;; Contributed by ARM Ltd.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3. If not see
;; <http://www.gnu.org/licenses/>.
;; The file is organised into the following sections (search for the full
;; line):
;;
;; == General notes
;; ---- Note on the handling of big-endian SVE
;;
;; == Moves
;; ---- Moves of single vectors
;; ---- Moves of multiple vectors
;; ---- Moves of predicates
;;
;; == Loads
;; ---- Normal contiguous loads
;; ---- Normal gather loads
;;
;; == Stores
;; ---- Normal contiguous stores
;; ---- Normal scatter stores
;;
;; == Vector creation
;; ---- [INT,FP] Duplicate element
;; ---- [INT,FP] Initialize from individual elements
;; ---- [INT] Linear series
;; ---- [PRED] Duplicate element
;;
;; == Vector decomposition
;; ---- [INT,FP] Extract index
;; ---- [INT,FP] Extract active element
;; ---- [PRED] Extract index
;;
;; == Unary arithmetic
;; ---- [INT] General unary arithmetic corresponding to rtx codes
;; ---- [FP] General unary arithmetic corresponding to unspecs
;; ---- [PRED] Inverse
;;
;; == Binary arithmetic
;; ---- [INT] General binary arithmetic corresponding to rtx codes
;; ---- [INT] Addition
;; ---- [INT] Subtraction
;; ---- [INT] Absolute difference
;; ---- [INT] Multiplication
;; ---- [INT] Highpart multiplication
;; ---- [INT] Division
;; ---- [INT] Binary logical operations
;; ---- [INT] Binary logical operations (inverted second input)
;; ---- [INT] Shifts
;; ---- [INT] Maximum and minimum
;; ---- [FP] General binary arithmetic corresponding to rtx codes
;; ---- [FP] General binary arithmetic corresponding to unspecs
;; ---- [FP] Addition
;; ---- [FP] Subtraction
;; ---- [FP] Absolute difference
;; ---- [FP] Multiplication
;; ---- [FP] Division
;; ---- [FP] Binary logical operations
;; ---- [FP] Sign copying
;; ---- [FP] Maximum and minimum
;; ---- [PRED] Binary logical operations
;; ---- [PRED] Binary logical operations (inverted second input)
;; ---- [PRED] Binary logical operations (inverted result)
;;
;; == Ternary arithmetic
;; ---- [INT] MLA and MAD
;; ---- [INT] MLS and MSB
;; ---- [INT] Dot product
;; ---- [INT] Sum of absolute differences
;; ---- [FP] General ternary arithmetic corresponding to unspecs
;;
;; == Comparisons and selects
;; ---- [INT,FP] Select based on predicates
;; ---- [INT,FP] Compare and select
;; ---- [INT] Comparisons
;; ---- [INT] While tests
;; ---- [FP] Comparisons
;; ---- [PRED] Test bits
;;
;; == Reductions
;; ---- [INT,FP] Conditional reductions
;; ---- [INT] Tree reductions
;; ---- [FP] Tree reductions
;; ---- [FP] Left-to-right reductions
;;
;; == Permutes
;; ---- [INT,FP] General permutes
;; ---- [INT,FP] Special-purpose unary permutes
;; ---- [INT,FP] Special-purpose binary permutes
;; ---- [PRED] Special-purpose binary permutes
;;
;; == Conversions
;; ---- [INT<-INT] Packs
;; ---- [INT<-INT] Unpacks
;; ---- [INT<-FP] Conversions
;; ---- [INT<-FP] Packs
;; ---- [INT<-FP] Unpacks
;; ---- [FP<-INT] Conversions
;; ---- [FP<-INT] Packs
;; ---- [FP<-INT] Unpacks
;; ---- [FP<-FP] Packs
;; ---- [FP<-FP] Unpacks
;; ---- [PRED<-PRED] Packs
;; ---- [PRED<-PRED] Unpacks
;;
;; =========================================================================
;; == General notes
;; =========================================================================
;;
;; -------------------------------------------------------------------------
;; ---- Note on the handling of big-endian SVE
;; -------------------------------------------------------------------------
;;
;; On big-endian systems, Advanced SIMD mov<mode> patterns act in the
;; same way as movdi or movti would: the first byte of memory goes
;; into the most significant byte of the register and the last byte
;; of memory goes into the least significant byte of the register.
;; This is the most natural ordering for Advanced SIMD and matches
;; the ABI layout for 64-bit and 128-bit vector types.
;;
;; As a result, the order of bytes within the register is what GCC
;; expects for a big-endian target, and subreg offsets therefore work
;; as expected, with the first element in memory having subreg offset 0
;; and the last element in memory having the subreg offset associated
;; with a big-endian lowpart. However, this ordering also means that
;; GCC's lane numbering does not match the architecture's numbering:
;; GCC always treats the element at the lowest address in memory
;; (subreg offset 0) as element 0, while the architecture treats
;; the least significant end of the register as element 0.
;;
;; The situation for SVE is different. We want the layout of the
;; SVE register to be the same for mov<mode> as it is for maskload<mode>:
;; logically, a mov<mode> load must be indistinguishable from a
;; maskload<mode> whose mask is all true. We therefore need the
;; register layout to match LD1 rather than LDR. The ABI layout of
;; SVE types also matches LD1 byte ordering rather than LDR byte ordering.
;;
;; As a result, the architecture lane numbering matches GCC's lane
;; numbering, with element 0 always being the first in memory.
;; However:
;;
;; - Applying a subreg offset to a register does not give the element
;; that GCC expects: the first element in memory has the subreg offset
;; associated with a big-endian lowpart while the last element in memory
;; has subreg offset 0. We handle this via TARGET_CAN_CHANGE_MODE_CLASS.
;;
;; - We cannot use LDR and STR for spill slots that might be accessed
;; via subregs, since although the elements have the order GCC expects,
;; the order of the bytes within the elements is different. We instead
;; access spill slots via LD1 and ST1, using secondary reloads to
;; reserve a predicate register.
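;;
;; As a purely illustrative sketch of the above (the vector length and
;; values are made up), take a big-endian VNx4SI vector whose memory image
;; is the 32-bit values { 1, 2, 3, 4 } at increasing addresses:
;;
;;   - LD1W produces the layout the SVE patterns rely on: the value 1 is
;;     element 0 in both GCC's and the architecture's numbering, and each
;;     32-bit element is byteswapped as it is loaded, as for a normal
;;     big-endian scalar load.
;;   - LDR transfers the bytes unchanged, so although the elements still
;;     appear in the order GCC expects, the bytes within each 32-bit
;;     element end up in the opposite order from what LD1W would give.
;;
;; This is why spill slots that might be accessed via subregs must use
;; LD1/ST1 rather than LDR/STR on big-endian targets.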
;; =========================================================================
;; == Moves
;; =========================================================================
;; -------------------------------------------------------------------------
;; ---- Moves of single vectors
;; -------------------------------------------------------------------------
;; Includes:
;; - MOV (including aliases)
;; - LD1B (contiguous form)
;; - LD1D ( " " )
;; - LD1H ( " " )
;; - LD1W ( " " )
;; - LDR
;; - ST1B (contiguous form)
;; - ST1D ( " " )
;; - ST1H ( " " )
;; - ST1W ( " " )
;; - STR
;; -------------------------------------------------------------------------
(define_expand "mov"
[(set (match_operand:SVE_ALL 0 "nonimmediate_operand")
(match_operand:SVE_ALL 1 "general_operand"))]
"TARGET_SVE"
{
/* Use the predicated load and store patterns where possible.
This is required for big-endian targets (see the comment at the
head of the file) and increases the addressing choices for
little-endian. */
if ((MEM_P (operands[0]) || MEM_P (operands[1]))
&& can_create_pseudo_p ())
{
aarch64_expand_sve_mem_move (operands[0], operands[1], <VPRED>mode);
DONE;
}
if (CONSTANT_P (operands[1]))
{
aarch64_expand_mov_immediate (operands[0], operands[1],
gen_vec_duplicate<mode>);
DONE;
}
/* Optimize subregs on big-endian targets: we can use REV[BHW]
instead of going through memory. */
if (BYTES_BIG_ENDIAN
&& aarch64_maybe_expand_sve_subreg_move (operands[0], operands[1]))
DONE;
}
)
(define_expand "movmisalign"
[(set (match_operand:SVE_ALL 0 "nonimmediate_operand")
(match_operand:SVE_ALL 1 "general_operand"))]
"TARGET_SVE"
{
/* Equivalent to a normal move for our purposes. */
emit_move_insn (operands[0], operands[1]);
DONE;
}
)
;; Unpredicated moves (little-endian). Only allow memory operations
;; during and after RA; before RA we want the predicated load and
;; store patterns to be used instead.
(define_insn "*aarch64_sve_mov_le"
[(set (match_operand:SVE_ALL 0 "aarch64_sve_nonimmediate_operand" "=w, Utr, w, w")
(match_operand:SVE_ALL 1 "aarch64_sve_general_operand" "Utr, w, w, Dn"))]
"TARGET_SVE
&& !BYTES_BIG_ENDIAN
&& ((lra_in_progress || reload_completed)
|| (register_operand (operands[0], <MODE>mode)
&& nonmemory_operand (operands[1], <MODE>mode)))"
"@
ldr\t%0, %1
str\t%1, %0
mov\t%0.d, %1.d
* return aarch64_output_sve_mov_immediate (operands[1]);"
)
;; Unpredicated moves (big-endian). Memory accesses require secondary
;; reloads.
(define_insn "*aarch64_sve_mov_be"
[(set (match_operand:SVE_ALL 0 "register_operand" "=w, w")
(match_operand:SVE_ALL 1 "aarch64_nonmemory_operand" "w, Dn"))]
"TARGET_SVE && BYTES_BIG_ENDIAN"
"@
mov\t%0.d, %1.d
* return aarch64_output_sve_mov_immediate (operands[1]);"
)
;; Handle big-endian memory reloads. We use byte PTRUE for all modes
;; to try to encourage reuse.
;; This pattern needs constraints because of the TARGET_SECONDARY_RELOAD hook.
(define_expand "aarch64_sve_reload_be"
[(parallel
[(set (match_operand 0)
(match_operand 1))
(clobber (match_operand:VNx16BI 2 "register_operand" "=Upl"))])]
"TARGET_SVE && BYTES_BIG_ENDIAN"
{
/* Create a PTRUE. */
emit_move_insn (operands[2], CONSTM1_RTX (VNx16BImode));
/* Refer to the PTRUE in the appropriate mode for this move. */
machine_mode mode = GET_MODE (operands[0]);
machine_mode pred_mode
= aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (mode)).require ();
rtx pred = gen_lowpart (pred_mode, operands[2]);
/* Emit a predicated load or store. */
aarch64_emit_sve_pred_move (operands[0], pred, operands[1]);
DONE;
}
)
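;;
;; For illustration only (the register numbers and spill mode are
;; hypothetical), a big-endian reload of a VNx4SI spill slot through this
;; expander emits a sequence along the lines of:
;;
;;   ptrue   p0.b                   // operands[2], shared byte PTRUE
;;   ld1w    { z0.s }, p0/z, [sp]   // predicated load instead of LDR
;;
;; so that the register layout matches the LD1 convention described at
;; the head of the file.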
;; A predicated move in which the predicate is known to be all-true.
;; Note that this pattern is generated directly by aarch64_emit_sve_pred_move,
;; so changes to this pattern will need changes there as well.
(define_insn_and_split "@aarch64_pred_mov"
[(set (match_operand:SVE_ALL 0 "nonimmediate_operand" "=w, w, m")
(unspec:SVE_ALL
[(match_operand: 1 "register_operand" "Upl, Upl, Upl")
(match_operand:SVE_ALL 2 "nonimmediate_operand" "w, m, w")]
UNSPEC_MERGE_PTRUE))]
"TARGET_SVE
&& (register_operand (operands[0], <MODE>mode)
|| register_operand (operands[2], <MODE>mode))"
"@
#
ld1<Vesize>\t%0.<Vetype>, %1/z, %2
st1<Vesize>\t%2.<Vetype>, %1, %0"
"&& register_operand (operands[0], <MODE>mode)
&& register_operand (operands[2], <MODE>mode)"
[(set (match_dup 0) (match_dup 2))]
)
;; A pattern for optimizing SUBREGs that have a reinterpreting effect
;; on big-endian targets; see aarch64_maybe_expand_sve_subreg_move
;; for details. We use a special predicate for operand 2 to reduce
;; the number of patterns.
(define_insn_and_split "*aarch64_sve_mov_subreg_be"
[(set (match_operand:SVE_ALL 0 "aarch64_sve_nonimmediate_operand" "=w")
(unspec:SVE_ALL
[(match_operand:VNx16BI 1 "register_operand" "Upl")
(match_operand 2 "aarch64_any_register_operand" "w")]
UNSPEC_REV_SUBREG))]
"TARGET_SVE && BYTES_BIG_ENDIAN"
"#"
"&& reload_completed"
[(const_int 0)]
{
aarch64_split_sve_subreg_move (operands[0], operands[1], operands[2]);
DONE;
}
)
;; -------------------------------------------------------------------------
;; ---- Moves of multiple vectors
;; -------------------------------------------------------------------------
;; All patterns in this section are synthetic and split to real
;; instructions after reload.
;; -------------------------------------------------------------------------
(define_expand "mov"
[(set (match_operand:SVE_STRUCT 0 "nonimmediate_operand")
(match_operand:SVE_STRUCT 1 "general_operand"))]
"TARGET_SVE"
{
/* Big-endian loads and stores need to be done via LD1 and ST1;
see the comment at the head of the file for details. */
if ((MEM_P (operands[0]) || MEM_P (operands[1]))
&& BYTES_BIG_ENDIAN)
{
gcc_assert (can_create_pseudo_p ());
aarch64_expand_sve_mem_move (operands[0], operands[1], <VPRED>mode);
DONE;
}
if (CONSTANT_P (operands[1]))
{
aarch64_expand_mov_immediate (operands[0], operands[1]);
DONE;
}
}
)
;; Unpredicated structure moves (little-endian).
(define_insn "*aarch64_sve_mov_le"
[(set (match_operand:SVE_STRUCT 0 "aarch64_sve_nonimmediate_operand" "=w, Utr, w, w")
(match_operand:SVE_STRUCT 1 "aarch64_sve_general_operand" "Utr, w, w, Dn"))]
"TARGET_SVE && !BYTES_BIG_ENDIAN"
"#"
[(set_attr "length" "")]
)
;; Unpredicated structure moves (big-endian). Memory accesses require
;; secondary reloads.
(define_insn "*aarch64_sve_mov_be"
[(set (match_operand:SVE_STRUCT 0 "register_operand" "=w, w")
(match_operand:SVE_STRUCT 1 "aarch64_nonmemory_operand" "w, Dn"))]
"TARGET_SVE && BYTES_BIG_ENDIAN"
"#"
[(set_attr "length" "")]
)
;; Split unpredicated structure moves into pieces. This is the same
;; for both big-endian and little-endian code, although it only needs
;; to handle memory operands for little-endian code.
(define_split
[(set (match_operand:SVE_STRUCT 0 "aarch64_sve_nonimmediate_operand")
(match_operand:SVE_STRUCT 1 "aarch64_sve_general_operand"))]
"TARGET_SVE && reload_completed"
[(const_int 0)]
{
rtx dest = operands[0];
rtx src = operands[1];
if (REG_P (dest) && REG_P (src))
aarch64_simd_emit_reg_reg_move (operands, <VSINGLE>mode, <vector_count>);
else
for (unsigned int i = 0; i < <vector_count>; ++i)
{
rtx subdest = simplify_gen_subreg (<VSINGLE>mode, dest, <MODE>mode,
i * BYTES_PER_SVE_VECTOR);
rtx subsrc = simplify_gen_subreg (<VSINGLE>mode, src, <MODE>mode,
i * BYTES_PER_SVE_VECTOR);
emit_insn (gen_rtx_SET (subdest, subsrc));
}
DONE;
}
)
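;;
;; As an illustrative example (modes chosen arbitrarily), a register move
;; of a two-vector tuple mode is split by the pattern above into two
;; single-vector register moves, while a memory access is split into
;; accesses to the subparts at byte offsets 0 and BYTES_PER_SVE_VECTOR,
;; each in the corresponding single-vector mode.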
;; Predicated structure moves. This works for both endiannesses but in
;; practice is only useful for big-endian.
(define_insn_and_split "@aarch64_pred_mov"
[(set (match_operand:SVE_STRUCT 0 "aarch64_sve_struct_nonimmediate_operand" "=w, w, Utx")
(unspec:SVE_STRUCT
[(match_operand: 1 "register_operand" "Upl, Upl, Upl")
(match_operand:SVE_STRUCT 2 "aarch64_sve_struct_nonimmediate_operand" "w, Utx, w")]
UNSPEC_MERGE_PTRUE))]
"TARGET_SVE
&& (register_operand (operands[0], <MODE>mode)
|| register_operand (operands[2], <MODE>mode))"
"#"
"&& reload_completed"
[(const_int 0)]
{
for (unsigned int i = 0; i < <vector_count>; ++i)
{
rtx subdest = simplify_gen_subreg (<VSINGLE>mode, operands[0],
<MODE>mode,
i * BYTES_PER_SVE_VECTOR);
rtx subsrc = simplify_gen_subreg (<VSINGLE>mode, operands[2],
<MODE>mode,
i * BYTES_PER_SVE_VECTOR);
aarch64_emit_sve_pred_move (subdest, operands[1], subsrc);
}
DONE;
}
[(set_attr "length" "")]
)
;; -------------------------------------------------------------------------
;; ---- Moves of predicates
;; -------------------------------------------------------------------------
;; Includes:
;; - MOV
;; - LDR
;; - PFALSE
;; - PTRUE
;; - STR
;; -------------------------------------------------------------------------
(define_expand "mov"
[(set (match_operand:PRED_ALL 0 "nonimmediate_operand")
(match_operand:PRED_ALL 1 "general_operand"))]
"TARGET_SVE"
{
if (GET_CODE (operands[0]) == MEM)
operands[1] = force_reg (<MODE>mode, operands[1]);
}
)
(define_insn "*aarch64_sve_mov"
[(set (match_operand:PRED_ALL 0 "nonimmediate_operand" "=Upa, m, Upa, Upa, Upa")
(match_operand:PRED_ALL 1 "general_operand" "Upa, Upa, m, Dz, Dm"))]
"TARGET_SVE
&& (register_operand (operands[0], <MODE>mode)
|| register_operand (operands[1], <MODE>mode))"
"@
mov\t%0.b, %1.b
str\t%1, %0
ldr\t%0, %1
pfalse\t%0.b
* return aarch64_output_ptrue (<MODE>mode, '<Vetype>');"
)
;; =========================================================================
;; == Loads
;; =========================================================================
;; -------------------------------------------------------------------------
;; ---- Normal contiguous loads
;; -------------------------------------------------------------------------
;; Includes contiguous forms of:
;; - LD1B
;; - LD1D
;; - LD1H
;; - LD1W
;; - LD2B
;; - LD2D
;; - LD2H
;; - LD2W
;; - LD3B
;; - LD3D
;; - LD3H
;; - LD3W
;; - LD4B
;; - LD4D
;; - LD4H
;; - LD4W
;; -------------------------------------------------------------------------
;; Predicated LD1.
(define_insn "maskload"
[(set (match_operand:SVE_ALL 0 "register_operand" "=w")
(unspec:SVE_ALL
[(match_operand: 2 "register_operand" "Upl")
(match_operand:SVE_ALL 1 "memory_operand" "m")]
UNSPEC_LD1_SVE))]
"TARGET_SVE"
"ld1\t%0., %2/z, %1"
)
;; Unpredicated LD[234].
(define_expand "vec_load_lanes"
[(set (match_operand:SVE_STRUCT 0 "register_operand")
(unspec:SVE_STRUCT
[(match_dup 2)
(match_operand:SVE_STRUCT 1 "memory_operand")]
UNSPEC_LDN))]
"TARGET_SVE"
{
operands[2] = aarch64_ptrue_reg (<VPRED>mode);
}
)
;; Predicated LD[234].
(define_insn "vec_mask_load_lanes"
[(set (match_operand:SVE_STRUCT 0 "register_operand" "=w")
(unspec:SVE_STRUCT
[(match_operand: 2 "register_operand" "Upl")
(match_operand:SVE_STRUCT 1 "memory_operand" "m")]
UNSPEC_LDN))]
"TARGET_SVE"
"ld\t%0, %2/z, %1"
)
;; -------------------------------------------------------------------------
;; ---- Normal gather loads
;; -------------------------------------------------------------------------
;; Includes gather forms of:
;; - LD1D
;; - LD1W
;; -------------------------------------------------------------------------
;; Unpredicated gather loads.
(define_expand "gather_load"
[(set (match_operand:SVE_SD 0 "register_operand")
(unspec:SVE_SD
[(match_dup 5)
(match_operand:DI 1 "aarch64_reg_or_zero")
(match_operand: 2 "register_operand")
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE"
{
operands[5] = aarch64_ptrue_reg (<VPRED>mode);
}
)
;; Predicated gather loads for 32-bit elements. Operand 3 is true for
;; unsigned extension and false for signed extension.
(define_insn "mask_gather_load"
[(set (match_operand:SVE_S 0 "register_operand" "=w, w, w, w, w")
(unspec:SVE_S
[(match_operand: 5 "register_operand" "Upl, Upl, Upl, Upl, Upl")
(match_operand:DI 1 "aarch64_reg_or_zero" "Z, rk, rk, rk, rk")
(match_operand: 2 "register_operand" "w, w, w, w, w")
(match_operand:DI 3 "const_int_operand" "i, Z, Ui1, Z, Ui1")
(match_operand:DI 4 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, i, i")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE"
"@
ld1w\t%0.s, %5/z, [%2.s]
ld1w\t%0.s, %5/z, [%1, %2.s, sxtw]
ld1w\t%0.s, %5/z, [%1, %2.s, uxtw]
ld1w\t%0.s, %5/z, [%1, %2.s, sxtw %p4]
ld1w\t%0.s, %5/z, [%1, %2.s, uxtw %p4]"
)
;; Predicated gather loads for 64-bit elements. The value of operand 3
;; doesn't matter in this case.
(define_insn "mask_gather_load"
[(set (match_operand:SVE_D 0 "register_operand" "=w, w, w")
(unspec:SVE_D
[(match_operand: 5 "register_operand" "Upl, Upl, Upl")
(match_operand:DI 1 "aarch64_reg_or_zero" "Z, rk, rk")
(match_operand: 2 "register_operand" "w, w, w")
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, Ui1, i")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE"
"@
ld1d\t%0.d, %5/z, [%2.d]
ld1d\t%0.d, %5/z, [%1, %2.d]
ld1d\t%0.d, %5/z, [%1, %2.d, lsl %p4]"
)
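;;
;; As a rough illustration of how these patterns are reached (the function
;; below is a made-up example and the vectorizer's exact choices can vary),
;; a C gather such as:
;;
;;   void f (int *restrict dst, int *restrict src, int *restrict idx, int n)
;;   {
;;     for (int i = 0; i < n; ++i)
;;       dst[i] = src[idx[i]];
;;   }
;;
;; is vectorized using a masked gather internal function, which maps onto
;; mask_gather_load<mode> above; the sign-extended, scaled addressing form
;; (the "sxtw %p4" alternative) is the one that typically matches int
;; offsets scaled by the element size.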
;; =========================================================================
;; == Stores
;; =========================================================================
;; -------------------------------------------------------------------------
;; ---- Normal contiguous stores
;; -------------------------------------------------------------------------
;; Includes contiguous forms of:
;; - ST1B
;; - ST1D
;; - ST1H
;; - ST1W
;; - ST2B
;; - ST2D
;; - ST2H
;; - ST2W
;; - ST3B
;; - ST3D
;; - ST3H
;; - ST3W
;; - ST4B
;; - ST4D
;; - ST4H
;; - ST4W
;; -------------------------------------------------------------------------
;; Predicated ST1.
(define_insn "maskstore"
[(set (match_operand:SVE_ALL 0 "memory_operand" "+m")
(unspec:SVE_ALL [(match_operand: 2 "register_operand" "Upl")
(match_operand:SVE_ALL 1 "register_operand" "w")
(match_dup 0)]
UNSPEC_ST1_SVE))]
"TARGET_SVE"
"st1\t%1., %2, %0"
)
;; Unpredicated ST[234]. This is always a full update, so the dependence
;; on the old value of the memory location (via (match_dup 0)) is redundant.
;; There doesn't seem to be any obvious benefit to treating the all-true
;; case differently though. In particular, it's very unlikely that we'll
;; only find out during RTL that a store_lanes is dead.
(define_expand "vec_store_lanes"
[(set (match_operand:SVE_STRUCT 0 "memory_operand")
(unspec:SVE_STRUCT
[(match_dup 2)
(match_operand:SVE_STRUCT 1 "register_operand")
(match_dup 0)]
UNSPEC_STN))]
"TARGET_SVE"
{
operands[2] = aarch64_ptrue_reg (<VPRED>mode);
}
)
;; Predicated ST[234].
(define_insn "vec_mask_store_lanes"
[(set (match_operand:SVE_STRUCT 0 "memory_operand" "+m")
(unspec:SVE_STRUCT
[(match_operand: 2 "register_operand" "Upl")
(match_operand:SVE_STRUCT 1 "register_operand" "w")
(match_dup 0)]
UNSPEC_STN))]
"TARGET_SVE"
"st\t%1, %2, %0"
)
;; -------------------------------------------------------------------------
;; ---- Normal scatter stores
;; -------------------------------------------------------------------------
;; Includes scatter forms of:
;; - ST1D
;; - ST1W
;; -------------------------------------------------------------------------
;; Unpredicated scatter stores.
(define_expand "scatter_store"
[(set (mem:BLK (scratch))
(unspec:BLK
[(match_dup 5)
(match_operand:DI 0 "aarch64_reg_or_zero")
(match_operand: 1 "register_operand")
(match_operand:DI 2 "const_int_operand")
(match_operand:DI 3 "aarch64_gather_scale_operand_")
(match_operand:SVE_SD 4 "register_operand")]
UNSPEC_ST1_SCATTER))]
"TARGET_SVE"
{
operands[5] = aarch64_ptrue_reg (<VPRED>mode);
}
)
;; Predicated scatter stores for 32-bit elements. Operand 2 is true for
;; unsigned extension and false for signed extension.
(define_insn "mask_scatter_store"
[(set (mem:BLK (scratch))
(unspec:BLK
[(match_operand: 5 "register_operand" "Upl, Upl, Upl, Upl, Upl")
(match_operand:DI 0 "aarch64_reg_or_zero" "Z, rk, rk, rk, rk")
(match_operand: 1 "register_operand" "w, w, w, w, w")
(match_operand:DI 2 "const_int_operand" "i, Z, Ui1, Z, Ui1")
(match_operand:DI 3 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, i, i")
(match_operand:SVE_S 4 "register_operand" "w, w, w, w, w")]
UNSPEC_ST1_SCATTER))]
"TARGET_SVE"
"@
st1w\t%4.s, %5, [%1.s]
st1w\t%4.s, %5, [%0, %1.s, sxtw]
st1w\t%4.s, %5, [%0, %1.s, uxtw]
st1w\t%4.s, %5, [%0, %1.s, sxtw %p3]
st1w\t%4.s, %5, [%0, %1.s, uxtw %p3]"
)
;; Predicated scatter stores for 64-bit elements. The value of operand 2
;; doesn't matter in this case.
(define_insn "mask_scatter_store"
[(set (mem:BLK (scratch))
(unspec:BLK
[(match_operand: 5 "register_operand" "Upl, Upl, Upl")
(match_operand:DI 0 "aarch64_reg_or_zero" "Z, rk, rk")
(match_operand: 1 "register_operand" "w, w, w")
(match_operand:DI 2 "const_int_operand")
(match_operand:DI 3 "aarch64_gather_scale_operand_d" "Ui1, Ui1, i")
(match_operand:SVE_D 4 "register_operand" "w, w, w")]
UNSPEC_ST1_SCATTER))]
"TARGET_SVE"
"@
st1d\t%4.d, %5, [%1.d]
st1d\t%4.d, %5, [%0, %1.d]
st1d\t%4.d, %5, [%0, %1.d, lsl %p3]"
)
;; =========================================================================
;; == Vector creation
;; =========================================================================
;; -------------------------------------------------------------------------
;; ---- [INT,FP] Duplicate element
;; -------------------------------------------------------------------------
;; Includes:
;; - MOV
;; - LD1RB
;; - LD1RD
;; - LD1RH
;; - LD1RW
;; - LD1RQB
;; - LD1RQD
;; - LD1RQH
;; - LD1RQW
;; -------------------------------------------------------------------------
(define_expand "vec_duplicate"
[(parallel
[(set (match_operand:SVE_ALL 0 "register_operand")
(vec_duplicate:SVE_ALL
(match_operand: 1 "aarch64_sve_dup_operand")))
(clobber (scratch:))])]
"TARGET_SVE"
{
if (MEM_P (operands[1]))
{
rtx ptrue = aarch64_ptrue_reg (<VPRED>mode);
emit_insn (gen_sve_ld1r<mode> (operands[0], ptrue, operands[1],
CONST0_RTX (<MODE>mode)));
DONE;
}
}
)
;; Accept memory operands for the benefit of combine, and also in case
;; the scalar input gets spilled to memory during RA. We want to split
;; the load at the first opportunity in order to allow the PTRUE to be
;; optimized with surrounding code.
(define_insn_and_split "*vec_duplicate_reg"
[(set (match_operand:SVE_ALL 0 "register_operand" "=w, w, w")
(vec_duplicate:SVE_ALL
(match_operand: 1 "aarch64_sve_dup_operand" "r, w, Uty")))
(clobber (match_scratch: 2 "=X, X, Upl"))]
"TARGET_SVE"
"@
mov\t%0.<Vetype>, %<vwcore>1
mov\t%0.<Vetype>, %<Vetype>1
#"
"&& MEM_P (operands[1])"
[(const_int 0)]
{
if (GET_CODE (operands[2]) == SCRATCH)
operands[2] = gen_reg_rtx (<VPRED>mode);
emit_move_insn (operands[2], CONSTM1_RTX (<VPRED>mode));
emit_insn (gen_sve_ld1r<mode> (operands[0], operands[2], operands[1],
CONST0_RTX (<MODE>mode)));
DONE;
}
[(set_attr "length" "4,4,8")]
)
;; This is used for vec_duplicates from memory, but can also
;; be used by combine to optimize selects of a vec_duplicate
;; with zero.
(define_insn "sve_ld1r"
[(set (match_operand:SVE_ALL 0 "register_operand" "=w")
(unspec:SVE_ALL
[(match_operand: 1 "register_operand" "Upl")
(vec_duplicate:SVE_ALL
(match_operand: 2 "aarch64_sve_ld1r_operand" "Uty"))
(match_operand:SVE_ALL 3 "aarch64_simd_imm_zero")]
UNSPEC_SEL))]
"TARGET_SVE"
"ld1r\t%0., %1/z, %2"
)
;; Load 128 bits from memory and duplicate to fill a vector. Since there
;; are so few operations on 128-bit "elements", we don't define a VNx1TI
;; and simply use vectors of bytes instead.
(define_insn "*sve_ld1rq"
[(set (match_operand:SVE_ALL 0 "register_operand" "=w")
(unspec:SVE_ALL
[(match_operand: 1 "register_operand" "Upl")
(match_operand:TI 2 "aarch64_sve_ld1r_operand" "Uty")]
UNSPEC_LD1RQ))]
"TARGET_SVE"
"ld1rq\t%0., %1/z, %2"
)
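;;
;; Illustrative use of LD1RQ (the constant is a made-up example): a VNx4SI
;; constant such as { 1, 2, 3, 4, 1, 2, 3, 4, ... } repeats every 128 bits,
;; so it can be materialized by placing { 1, 2, 3, 4 } in memory or the
;; constant pool and broadcasting it with LD1RQW; element i of the result
;; is then equal to element (i & 3) of the 128-bit block.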
;; -------------------------------------------------------------------------
;; ---- [INT,FP] Initialize from individual elements
;; -------------------------------------------------------------------------
;; Includes:
;; - INSR
;; -------------------------------------------------------------------------
(define_expand "vec_init"
[(match_operand:SVE_ALL 0 "register_operand")
(match_operand 1 "")]
"TARGET_SVE"
{
aarch64_sve_expand_vector_init (operands[0], operands[1]);
DONE;
}
)
;; Shift an SVE vector left and insert a scalar into element 0.
(define_insn "vec_shl_insert_"
[(set (match_operand:SVE_ALL 0 "register_operand" "=?w, w, ??&w, ?&w")
(unspec:SVE_ALL
[(match_operand:SVE_ALL 1 "register_operand" "0, 0, w, w")
(match_operand: 2 "aarch64_reg_or_zero" "rZ, w, rZ, w")]
UNSPEC_INSR))]
"TARGET_SVE"
"@
insr\t%0.<Vetype>, %<vwcore>2
insr\t%0.<Vetype>, %<Vetype>2
movprfx\t%0, %1\;insr\t%0.<Vetype>, %<vwcore>2
movprfx\t%0, %1\;insr\t%0.<Vetype>, %<Vetype>2"
[(set_attr "movprfx" "*,*,yes,yes")]
)
;; -------------------------------------------------------------------------
;; ---- [INT] Linear series
;; -------------------------------------------------------------------------
;; Includes:
;; - INDEX
;; -------------------------------------------------------------------------
(define_insn "vec_series"
[(set (match_operand:SVE_I 0 "register_operand" "=w, w, w")
(vec_series:SVE_I
(match_operand: 1 "aarch64_sve_index_operand" "Usi, r, r")
(match_operand: 2 "aarch64_sve_index_operand" "r, Usi, r")))]
"TARGET_SVE"
"@
index\t%0.<Vetype>, #%1, %<vw>2
index\t%0.<Vetype>, %<vw>1, #%2
index\t%0.<Vetype>, %<vw>1, %<vw>2"
)
;; Optimize {x, x, x, x, ...} + {0, n, 2*n, 3*n, ...} if n is in range
;; of an INDEX instruction.
(define_insn "*vec_series_plus"
[(set (match_operand:SVE_I 0 "register_operand" "=w")
(plus:SVE_I
(vec_duplicate:SVE_I
(match_operand: 1 "register_operand" "r"))
(match_operand:SVE_I 2 "immediate_operand")))]
"TARGET_SVE && aarch64_check_zero_based_sve_index_immediate (operands[2])"
{
operands[2] = aarch64_check_zero_based_sve_index_immediate (operands[2]);
return "index\t%0., %1, #%2";
}
)
;; -------------------------------------------------------------------------
;; ---- [PRED] Duplicate element
;; -------------------------------------------------------------------------
;; The patterns in this section are synthetic.
;; -------------------------------------------------------------------------
;; Implement a predicate broadcast by shifting the low bit of the scalar
;; input into the top bit and using a WHILELO. An alternative would be to
;; duplicate the input and do a compare with zero.
(define_expand "vec_duplicate"
[(set (match_operand:PRED_ALL 0 "register_operand")
(vec_duplicate:PRED_ALL (match_operand 1 "register_operand")))]
"TARGET_SVE"
{
rtx tmp = gen_reg_rtx (DImode);
rtx op1 = gen_lowpart (DImode, operands[1]);
emit_insn (gen_ashldi3 (tmp, op1, gen_int_mode (63, DImode)));
emit_insn (gen_while_ultdi<mode> (operands[0], const0_rtx, tmp));
DONE;
}
)
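;;
;; A worked example of the trick above: if the scalar input has bit 0 set,
;; the shift leaves 1 << 63 in the temporary, and WHILELO (0, tmp) is true
;; for every element number, giving an all-true predicate. If bit 0 is
;; clear, the shift leaves 0 and WHILELO (0, 0) gives an all-false
;; predicate. Only bit 0 of the input matters, which matches the semantics
;; of duplicating a boolean.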
;; =========================================================================
;; == Vector decomposition
;; =========================================================================
;; -------------------------------------------------------------------------
;; ---- [INT,FP] Extract index
;; -------------------------------------------------------------------------
;; Includes:
;; - DUP (Advanced SIMD)
;; - DUP (SVE)
;; - EXT (SVE)
;; - ST1 (Advanced SIMD)
;; - UMOV (Advanced SIMD)
;; -------------------------------------------------------------------------
(define_expand "vec_extract"
[(set (match_operand: 0 "register_operand")
(vec_select:
(match_operand:SVE_ALL 1 "register_operand")
(parallel [(match_operand:SI 2 "nonmemory_operand")])))]
"TARGET_SVE"
{
poly_int64 val;
if (poly_int_rtx_p (operands[2], &val)
&& known_eq (val, GET_MODE_NUNITS (<MODE>mode) - 1))
{
/* The last element can be extracted with a LASTB and a false
predicate. */
rtx sel = aarch64_pfalse_reg (<VPRED>mode);
emit_insn (gen_extract_last_<mode> (operands[0], sel, operands[1]));
DONE;
}
if (!CONST_INT_P (operands[2]))
{
/* Create an index with operand[2] as the base and -1 as the step.
It will then be zero for the element we care about. */
rtx index = gen_lowpart (mode, operands[2]);
index = force_reg (mode, index);
rtx series = gen_reg_rtx (mode);
emit_insn (gen_vec_series (series, index, constm1_rtx));
/* Get a predicate that is true for only that element. */
rtx zero = CONST0_RTX (mode);
rtx cmp = gen_rtx_EQ (mode, series, zero);
rtx sel = gen_reg_rtx (<VPRED>mode);
emit_insn (gen_vec_cmp (sel, cmp, series, zero));
/* Select the element using LASTB. */
emit_insn (gen_extract_last_<mode> (operands[0], sel, operands[1]));
DONE;
}
}
)
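;;
;; A short worked example of the variable-index path above, for an
;; (illustrative) VNx4SI extract of element 2: INDEX produces the series
;; { 2, 1, 0, -1, ... }, the comparison with zero yields a predicate in
;; which only element 2 is active, and LASTB then returns the last (and
;; only) active element of the input vector.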
;; Extract element zero. This is a special case because we want to force
;; the registers to be the same for the second alternative, and then
;; split the instruction into nothing after RA.
(define_insn_and_split "*vec_extract_0"
[(set (match_operand: 0 "aarch64_simd_nonimmediate_operand" "=r, w, Utv")
(vec_select:
(match_operand:SVE_ALL 1 "register_operand" "w, 0, w")
(parallel [(const_int 0)])))]
"TARGET_SVE"
{
operands[1] = gen_rtx_REG (<V128>mode, REGNO (operands[1]));
switch (which_alternative)
{
case 0:
return "umov\\t%0, %1.[0]";
case 1:
return "#";
case 2:
return "st1\\t{%1.}[0], %0";
default:
gcc_unreachable ();
}
}
"&& reload_completed
&& REG_P (operands[0])
&& REGNO (operands[0]) == REGNO (operands[1])"
[(const_int 0)]
{
emit_note (NOTE_INSN_DELETED);
DONE;
}
[(set_attr "type" "neon_to_gp_q, untyped, neon_store1_one_lane_q")]
)
;; Extract an element from the Advanced SIMD portion of the register.
;; We don't just reuse the aarch64-simd.md pattern because we don't
;; want any change in lane number on big-endian targets.
(define_insn "*vec_extract_v128"
[(set (match_operand: 0 "aarch64_simd_nonimmediate_operand" "=r, w, Utv")
(vec_select:
(match_operand:SVE_ALL 1 "register_operand" "w, w, w")
(parallel [(match_operand:SI 2 "const_int_operand")])))]
"TARGET_SVE
&& IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (mode), 1, 15)"
{
operands[1] = gen_rtx_REG (<V128>mode, REGNO (operands[1]));
switch (which_alternative)
{
case 0:
return "umov\\t%0, %1.[%2]";
case 1:
return "dup\\t%0, %1.[%2]";
case 2:
return "st1\\t{%1.}[%2], %0";
default:
gcc_unreachable ();
}
}
[(set_attr "type" "neon_to_gp_q, neon_dup_q, neon_store1_one_lane_q")]
)
;; Extract an element in the range of DUP. This pattern allows the
;; source and destination to be different.
(define_insn "*vec_extract_dup"
[(set (match_operand: 0 "register_operand" "=w")
(vec_select:
(match_operand:SVE_ALL 1 "register_operand" "w")
(parallel [(match_operand:SI 2 "const_int_operand")])))]
"TARGET_SVE
&& IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (mode), 16, 63)"
{
operands[0] = gen_rtx_REG (mode, REGNO (operands[0]));
return "dup\t%0., %1.[%2]";
}
)
;; Extract an element outside the range of DUP. This pattern requires the
;; source and destination to be the same.
(define_insn "*vec_extract_ext"
[(set (match_operand: 0 "register_operand" "=w")
(vec_select:
(match_operand:SVE_ALL 1 "register_operand" "0")
(parallel [(match_operand:SI 2 "const_int_operand")])))]
"TARGET_SVE && INTVAL (operands[2]) * GET_MODE_SIZE (mode) >= 64"
{
operands[0] = gen_rtx_REG (mode, REGNO (operands[0]));
operands[2] = GEN_INT (INTVAL (operands[2]) * GET_MODE_SIZE (mode));
return "ext\t%0.b, %0.b, %0.b, #%2";
}
)
;; -------------------------------------------------------------------------
;; ---- [INT,FP] Extract active element
;; -------------------------------------------------------------------------
;; Includes:
;; - LASTB
;; -------------------------------------------------------------------------
;; Extract the last active element of operand 1 into operand 0.
;; If no elements are active, extract the last inactive element instead.
(define_insn "extract_last_"
[(set (match_operand: 0 "register_operand" "=r, w")
(unspec:
[(match_operand: 1 "register_operand" "Upl, Upl")
(match_operand:SVE_ALL 2 "register_operand" "w, w")]
UNSPEC_LASTB))]
"TARGET_SVE"
"@
lastb\t%<vwcore>0, %1, %2.<Vetype>
lastb\t%<Vetype>0, %1, %2.<Vetype>"
)
;; -------------------------------------------------------------------------
;; ---- [PRED] Extract index
;; -------------------------------------------------------------------------
;; The patterns in this section are synthetic.
;; -------------------------------------------------------------------------
;; Handle extractions from a predicate by converting to an integer vector
;; and extracting from there.
(define_expand "vec_extract"
[(match_operand: 0 "register_operand")
(match_operand: 1 "register_operand")
(match_operand:SI 2 "nonmemory_operand")
;; Dummy operand to which we can attach the iterator.
(reg:SVE_I V0_REGNUM)]
"TARGET_SVE"
{
rtx tmp = gen_reg_rtx (<MODE>mode);
emit_insn (gen_aarch64_sve_dup<mode>_const (tmp, operands[1],
CONST1_RTX (<MODE>mode),
CONST0_RTX (<MODE>mode)));
emit_insn (gen_vec_extract<mode><Vel> (operands[0], tmp, operands[2]));
DONE;
}
)
;; =========================================================================
;; == Unary arithmetic
;; =========================================================================
;; -------------------------------------------------------------------------
;; ---- [INT] General unary arithmetic corresponding to rtx codes
;; -------------------------------------------------------------------------
;; Includes:
;; - ABS
;; - CNT (= popcount)
;; - NEG
;; - NOT
;; -------------------------------------------------------------------------
;; Unpredicated integer unary arithmetic.
(define_expand "2"
[(set (match_operand:SVE_I 0 "register_operand")
(unspec:SVE_I
[(match_dup 2)
(SVE_INT_UNARY:SVE_I (match_operand:SVE_I 1 "register_operand"))]
UNSPEC_MERGE_PTRUE))]
"TARGET_SVE"
{
operands[2] = aarch64_ptrue_reg (<MODE>mode);
}
)
;; Integer unary arithmetic predicated with a PTRUE.
(define_insn "*2"
[(set (match_operand:SVE_I 0 "register_operand" "=w")
(unspec:SVE_I
[(match_operand: 1 "register_operand" "Upl")
(SVE_INT_UNARY:SVE_I
(match_operand:SVE_I 2 "register_operand" "w"))]
UNSPEC_MERGE_PTRUE))]
"TARGET_SVE"
"\t%0., %1/m, %2."
)
;; -------------------------------------------------------------------------
;; ---- [FP] General unary arithmetic corresponding to unspecs
;; -------------------------------------------------------------------------
;; Includes:
;; - FABS
;; - FNEG
;; - FRINTA
;; - FRINTI
;; - FRINTM
;; - FRINTN
;; - FRINTP
;; - FRINTX
;; - FRINTZ
;; - FSQRT
;; -------------------------------------------------------------------------
;; Unpredicated floating-point unary operations.
(define_expand "2"
[(set (match_operand:SVE_F 0 "register_operand")
(unspec:SVE_F
[(match_dup 2)
(match_operand:SVE_F 1 "register_operand")]
SVE_COND_FP_UNARY))]
"TARGET_SVE"
{
operands[2] = aarch64_ptrue_reg (<MODE>mode);
}
)
;; Predicated floating-point unary operations.
(define_insn "*2"
[(set (match_operand:SVE_F 0 "register_operand" "=w")
(unspec:SVE_F
[(match_operand: 1 "register_operand" "Upl")
(match_operand:SVE_F 2 "register_operand" "w")]
SVE_COND_FP_UNARY))]
"TARGET_SVE"
"\t%0., %1/m, %2."
)
;; -------------------------------------------------------------------------
;; ---- [PRED] Inverse
;; -------------------------------------------------------------------------
;; Includes:
;; - NOT
;; -------------------------------------------------------------------------
;; Unpredicated predicate inverse.
(define_expand "one_cmpl2"
[(set (match_operand:PRED_ALL 0 "register_operand")
(and:PRED_ALL
(not:PRED_ALL (match_operand:PRED_ALL 1 "register_operand"))
(match_dup 2)))]
"TARGET_SVE"
{
operands[2] = aarch64_ptrue_reg (<MODE>mode);
}
)
;; Predicated predicate inverse.
(define_insn "*one_cmpl3"
[(set (match_operand:PRED_ALL 0 "register_operand" "=Upa")
(and:PRED_ALL
(not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand" "Upa"))
(match_operand:PRED_ALL 1 "register_operand" "Upa")))]
"TARGET_SVE"
"not\t%0.b, %1/z, %2.b"
)
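;;
;; As a small example of why the inverse is written as an AND with a
;; PTRUE: "not p0.b, p1/z, p2.b" computes p0 = p1 & ~p2, so with p1 an
;; all-true predicate it acts as a plain inverse of p2. (The register
;; numbers here are illustrative only.)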
;; =========================================================================
;; == Binary arithmetic
;; =========================================================================
;; -------------------------------------------------------------------------
;; ---- [INT] General binary arithmetic corresponding to rtx codes
;; -------------------------------------------------------------------------
;; Includes merging patterns for:
;; - ADD
;; - AND
;; - EOR
;; - MUL
;; - ORR
;; - SMAX
;; - SMIN
;; - SUB
;; - UMAX
;; - UMIN
;; -------------------------------------------------------------------------
;; Predicated integer operations with merging.
(define_expand "cond_"
[(set (match_operand:SVE_I 0 "register_operand")
(unspec:SVE_I
[(match_operand: 1 "register_operand")
(SVE_INT_BINARY:SVE_I
(match_operand:SVE_I 2 "register_operand")
(match_operand:SVE_I 3 "register_operand"))
(match_operand:SVE_I 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE"
)
;; Predicated integer operations, merging with the first input.
(define_insn "*cond__2"
[(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
(unspec:SVE_I
[(match_operand: 1 "register_operand" "Upl, Upl")
(SVE_INT_BINARY:SVE_I
(match_operand:SVE_I 2 "register_operand" "0, w")
(match_operand:SVE_I 3 "register_operand" "w, w"))
(match_dup 2)]
UNSPEC_SEL))]
"TARGET_SVE"
"@
<sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>