/* Subroutines used to expand string operations for RISC-V.
   Copyright (C) 2023-2024 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ira.h"
#include "print-tree.h"
#include "varasm.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "target.h"
#include "predict.h"
#include "optabs.h"
#include "riscv-protos.h"
#include "recog.h"
#include "tm-constrs.h"

/* Emit proper instruction depending on mode of dest.  */

#define GEN_EMIT_HELPER2(name)						\
static rtx_insn *							\
do_## name ## 2 (rtx dest, rtx src)					\
{									\
  rtx_insn *insn;							\
  if (GET_MODE (dest) == DImode)					\
    insn = emit_insn (gen_ ## name ## di2 (dest, src));			\
  else									\
    insn = emit_insn (gen_ ## name ## si2 (dest, src));			\
  return insn;								\
}

/* Emit proper instruction depending on mode of dest.  */

#define GEN_EMIT_HELPER3(name)						\
static rtx_insn *							\
do_## name ## 3 (rtx dest, rtx src1, rtx src2)				\
{									\
  rtx_insn *insn;							\
  if (GET_MODE (dest) == DImode)					\
    insn = emit_insn (gen_ ## name ## di3 (dest, src1, src2));		\
  else									\
    insn = emit_insn (gen_ ## name ## si3 (dest, src1, src2));		\
  return insn;								\
}

GEN_EMIT_HELPER3(add) /* do_add3  */
GEN_EMIT_HELPER3(and) /* do_and3  */
GEN_EMIT_HELPER3(ashl) /* do_ashl3  */
GEN_EMIT_HELPER2(bswap) /* do_bswap2  */
GEN_EMIT_HELPER2(clz) /* do_clz2  */
GEN_EMIT_HELPER2(ctz) /* do_ctz2  */
GEN_EMIT_HELPER3(ior) /* do_ior3  */
GEN_EMIT_HELPER3(ior_not) /* do_ior_not3  */
GEN_EMIT_HELPER3(lshr) /* do_lshr3  */
GEN_EMIT_HELPER2(neg) /* do_neg2  */
GEN_EMIT_HELPER2(orcb) /* do_orcb2  */
GEN_EMIT_HELPER2(one_cmpl) /* do_one_cmpl2  */
GEN_EMIT_HELPER3(rotr) /* do_rotr3  */
GEN_EMIT_HELPER3(sub) /* do_sub3  */
GEN_EMIT_HELPER2(th_rev) /* do_th_rev2  */
GEN_EMIT_HELPER2(th_tstnbz) /* do_th_tstnbz2  */
GEN_EMIT_HELPER3(xor) /* do_xor3  */
GEN_EMIT_HELPER2(zero_extendqi) /* do_zero_extendqi2  */
GEN_EMIT_HELPER2(zero_extendhi) /* do_zero_extendhi2  */

#undef GEN_EMIT_HELPER2
#undef GEN_EMIT_HELPER3

/* Helper function to emit zero-extended loads.

   MODE is the mode to use for the load.
   DEST is the destination register for the data.
   MEM is the source to load from.  */

static void
do_load (machine_mode mode, rtx dest, rtx mem)
{
  if (mode == QImode)
    do_zero_extendqi2 (dest, mem);
  else if (mode == HImode)
    do_zero_extendhi2 (dest, mem);
  else if (mode == SImode && TARGET_64BIT)
    emit_insn (gen_zero_extendsidi2 (dest, mem));
  else if (mode == Xmode)
    emit_move_insn (dest, mem);
  else
    gcc_unreachable ();
}
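/* For illustration only (added note, not used anywhere): assuming the usual
   token pasting of the helper macros above, GEN_EMIT_HELPER3 (add) produces
   a function roughly equivalent to

     static rtx_insn *
     do_add3 (rtx dest, rtx src1, rtx src2)
     {
       rtx_insn *insn;
       if (GET_MODE (dest) == DImode)
	 insn = emit_insn (gen_adddi3 (dest, src1, src2));
       else
	 insn = emit_insn (gen_addsi3 (dest, src1, src2));
       return insn;
     }

   i.e. each do_<name>2 / do_<name>3 helper simply dispatches to the DImode
   or SImode named pattern depending on the mode of DEST.  */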
/* Helper function to emit zero-extended loads.

   MODE is the mode to use for the load (QImode or Pmode).
   DEST is the destination register for the data.
   ADDR_REG is the register that holds the address.
   ADDR is the address expression to load from.  */

static void
do_load_from_addr (machine_mode mode, rtx dest, rtx addr_reg, rtx addr)
{
  rtx mem = gen_rtx_MEM (mode, addr_reg);
  MEM_COPY_ATTRIBUTES (mem, addr);
  set_mem_size (mem, GET_MODE_SIZE (mode));

  do_load (mode, dest, mem);
}

/* Generate a sequence to compare single characters in data1 and data2.

   RESULT is the register where the return value of str(n)cmp will be stored.
   DATA1 is a register which contains character1.
   DATA2 is a register which contains character2.
   FINAL_LABEL is the location after the calculation of the return value.  */

static void
emit_strcmp_scalar_compare_byte (rtx result, rtx data1, rtx data2,
				 rtx final_label)
{
  do_sub3 (result, data1, data2);
  emit_jump_insn (gen_jump (final_label));
  emit_barrier (); /* No fall-through.  */
}

/* Generate a sequence to compare two strings in data1 and data2.

   DATA1 is a register which contains string1.
   DATA2 is a register which contains string2.
   ORC1 is a register where orc.b(data1) will be stored.
   CMP_BYTES is the length of the strings.
   END_LABEL is the location of the code that calculates the return value.  */

static void
emit_strcmp_scalar_compare_subword (rtx data1, rtx data2, rtx orc1,
				    unsigned HOST_WIDE_INT cmp_bytes,
				    rtx end_label)
{
  /* Set a NUL-byte after the relevant data (behind the string).  */
  long long im = -256ll;
  rtx imask = gen_rtx_CONST_INT (Xmode, im);
  rtx m_reg = gen_reg_rtx (Xmode);
  emit_insn (gen_rtx_SET (m_reg, imask));
  do_rotr3 (m_reg, m_reg,
	    GEN_INT (BITS_PER_WORD - cmp_bytes * BITS_PER_UNIT));
  do_and3 (data1, m_reg, data1);
  do_and3 (data2, m_reg, data2);
  if (TARGET_ZBB)
    do_orcb2 (orc1, data1);
  else
    do_th_tstnbz2 (orc1, data1);
  emit_jump_insn (gen_jump (end_label));
  emit_barrier (); /* No fall-through.  */
}

/* Generate a sequence to compare two strings in data1 and data2.

   DATA1 is a register which contains string1.
   DATA2 is a register which contains string2.
   ORC1 is a register where orc.b(data1) will be stored.
   TESTVAL is the value to test ORC1 against.
   END_LABEL is the location of the code that calculates the return value.
   NONUL_END_LABEL is the location of the code that calculates the return
   value in case the first string does not contain a NULL-byte.  */

static void
emit_strcmp_scalar_compare_word (rtx data1, rtx data2, rtx orc1, rtx testval,
				 rtx end_label, rtx nonul_end_label)
{
  /* Check if data1 contains a NUL character.  */
  if (TARGET_ZBB)
    do_orcb2 (orc1, data1);
  else
    do_th_tstnbz2 (orc1, data1);
  rtx cond1 = gen_rtx_NE (VOIDmode, orc1, testval);
  emit_unlikely_jump_insn (gen_cbranch4 (Pmode, cond1, orc1, testval,
					 end_label));

  /* Break out if data1 != data2.  */
  rtx cond2 = gen_rtx_NE (VOIDmode, data1, data2);
  emit_unlikely_jump_insn (gen_cbranch4 (Pmode, cond2, data1, data2,
					 nonul_end_label));
  /* Fall-through on equality.  */
}
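/* Illustrative note (added, not part of the original sources): with Zbb,
   orc.b sets every byte of the result to 0xff if the corresponding source
   byte is non-zero and to 0x00 if it is zero, so orc.b(data1) equals
   all-ones exactly when data1 contains no NUL byte; hence TESTVAL is -1 for
   Zbb.  For example, the 4-byte little-endian word holding "ab\0c"
   (0x63006261) maps to 0xff00ffff.  XTheadBb's th.tstnbz uses the inverse
   polarity (0xff marks a zero byte), which is why TESTVAL is 0 in that
   case.  */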
/* Generate the sequence of compares for strcmp/strncmp using zbb
   instructions.

   RESULT is the register where the return value of str(n)cmp will be stored.
   The strings are referenced by SRC1 and SRC2.
   The number of bytes to compare is defined by NBYTES.
   DATA1 is a register where string1 will be stored.
   DATA2 is a register where string2 will be stored.
   ORC1 is a register where orc.b(data1) will be stored.
   END_LABEL is the location of the code that calculates the return value.
   NONUL_END_LABEL is the location of the code that calculates the return
   value in case the first string does not contain a NULL-byte.
   FINAL_LABEL is the location of the code that comes after the calculation
   of the return value.  */

static void
emit_strcmp_scalar_load_and_compare (rtx result, rtx src1, rtx src2,
				     unsigned HOST_WIDE_INT nbytes,
				     rtx data1, rtx data2, rtx orc1,
				     rtx end_label, rtx nonul_end_label,
				     rtx final_label)
{
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);
  unsigned HOST_WIDE_INT offset = 0;

  rtx testval = gen_reg_rtx (Xmode);
  if (TARGET_ZBB)
    emit_insn (gen_rtx_SET (testval, constm1_rtx));
  else
    emit_insn (gen_rtx_SET (testval, const0_rtx));

  while (nbytes > 0)
    {
      unsigned HOST_WIDE_INT cmp_bytes = xlen < nbytes ? xlen : nbytes;
      machine_mode load_mode;
      if (cmp_bytes == 1)
	load_mode = QImode;
      else
	load_mode = Xmode;

      rtx addr1 = adjust_address (src1, load_mode, offset);
      do_load (load_mode, data1, addr1);
      rtx addr2 = adjust_address (src2, load_mode, offset);
      do_load (load_mode, data2, addr2);

      if (cmp_bytes == 1)
	{
	  emit_strcmp_scalar_compare_byte (result, data1, data2, final_label);
	  return;
	}
      else if (cmp_bytes < xlen)
	{
	  emit_strcmp_scalar_compare_subword (data1, data2, orc1,
					      cmp_bytes, end_label);
	  return;
	}
      else
	emit_strcmp_scalar_compare_word (data1, data2, orc1, testval,
					 end_label, nonul_end_label);

      offset += cmp_bytes;
      nbytes -= cmp_bytes;
    }
}

/* Fixup pointers and generate a call to strcmp.

   RESULT is the register where the return value of str(n)cmp will be stored.
   The strings are referenced by SRC1 and SRC2.
   The number of already compared bytes is defined by NBYTES.  */

static void
emit_strcmp_scalar_call_to_libc (rtx result, rtx src1, rtx src2,
				 unsigned HOST_WIDE_INT nbytes)
{
  /* Update pointers past what has been compared already.  */
  rtx src1_addr = force_reg (Pmode, XEXP (src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (src2, 0));
  rtx src1_new = force_reg (Pmode,
			    gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (nbytes)));
  rtx src2_new = force_reg (Pmode,
			    gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (nbytes)));

  /* Construct call to strcmp to compare the rest of the string.  */
  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
			   result, LCT_NORMAL, GET_MODE (result),
			   src1_new, Pmode, src2_new, Pmode);
}

/* Fast strcmp-result calculation if no NUL-byte in string1.

   RESULT is the register where the return value of str(n)cmp will be stored.
   The mismatching strings are stored in DATA1 and DATA2.  */

static void
emit_strcmp_scalar_result_calculation_nonul (rtx result, rtx data1, rtx data2)
{
  /* Words don't match, and no NUL byte in one word.
     Get bytes in big-endian order and compare as words.  */
  do_bswap2 (data1, data1);
  do_bswap2 (data2, data2);
  /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence.  */
  rtx tmp = gen_reg_rtx (Xmode);
  emit_insn (gen_slt_3 (LTU, Xmode, Xmode, tmp, data1, data2));
  do_neg2 (tmp, tmp);
  do_ior3 (result, tmp, const1_rtx);
}

/* strcmp-result calculation.

   RESULT is the register where the return value of str(n)cmp will be stored.
   The strings are stored in DATA1 and DATA2.
   ORC1 contains orc.b(DATA1).  */

static void
emit_strcmp_scalar_result_calculation (rtx result, rtx data1, rtx data2,
				       rtx orc1)
{
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);

  /* Convert non-equal bytes into non-NUL bytes.  */
  rtx diff = gen_reg_rtx (Xmode);
  do_xor3 (diff, data1, data2);
  rtx shift = gen_reg_rtx (Xmode);

  if (TARGET_ZBB)
    {
      /* Convert non-equal or NUL-bytes into non-NUL bytes.  */
      rtx syndrome = gen_reg_rtx (Xmode);
      do_orcb2 (diff, diff);
      do_ior_not3 (syndrome, orc1, diff);
      /* Count the number of equal bits from the beginning of the word.  */
      do_ctz2 (shift, syndrome);
    }
  else
    {
      /* Convert non-equal or NUL-bytes into non-NUL bytes.  */
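      /* Note added for clarity (based on the TESTVAL handling above):
	 th.tstnbz marks zero bytes with 0xff and non-zero bytes with 0x00,
	 i.e. the inverse of orc.b, which is why its result is complemented
	 below before being merged into the syndrome.  */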
      rtx syndrome = gen_reg_rtx (Xmode);
      do_th_tstnbz2 (diff, diff);
      do_one_cmpl2 (diff, diff);
      do_ior3 (syndrome, orc1, diff);
      /* Count the number of equal bits from the beginning of the word.  */
      do_th_rev2 (syndrome, syndrome);
      do_clz2 (shift, syndrome);
    }

  do_bswap2 (data1, data1);
  do_bswap2 (data2, data2);

  /* The most-significant-non-zero bit of the syndrome marks either the
     first bit that is different, or the top bit of the first zero byte.
     Shifting left now will bring the critical information into the
     top bits.  */
  do_ashl3 (data1, data1, gen_lowpart (QImode, shift));
  do_ashl3 (data2, data2, gen_lowpart (QImode, shift));

  /* But we need to zero-extend (char is unsigned) the value and then
     perform a signed 32-bit subtraction.  */
  unsigned int shiftr = (xlen - 1) * BITS_PER_UNIT;
  do_lshr3 (data1, data1, GEN_INT (shiftr));
  do_lshr3 (data2, data2, GEN_INT (shiftr));
  do_sub3 (result, data1, data2);
}

/* Expand str(n)cmp using Zbb/TheadBb instructions.

   The result will be stored in RESULT.
   The strings are referenced by SRC1 and SRC2.
   The number of bytes to compare is defined by NBYTES.
   The alignment is defined by ALIGNMENT.
   If NCOMPARE is false then libc's strcmp() will be called if comparing
   NBYTES of both strings did not find differences or NULL-bytes.

   Return true if expansion was successful, or false otherwise.  */

static bool
riscv_expand_strcmp_scalar (rtx result, rtx src1, rtx src2,
			    unsigned HOST_WIDE_INT nbytes,
			    unsigned HOST_WIDE_INT alignment,
			    bool ncompare)
{
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);

  gcc_assert (TARGET_ZBB || TARGET_XTHEADBB);
  gcc_assert (nbytes > 0);
  gcc_assert ((int) nbytes <= riscv_strcmp_inline_limit);
  gcc_assert (ncompare || (nbytes & (xlen - 1)) == 0);

  /* Limit to 12-bits (maximum load-offset).  */
  if (nbytes > IMM_REACH)
    nbytes = IMM_REACH;

  /* We don't support big endian.  */
  if (BYTES_BIG_ENDIAN)
    return false;

  /* We need xlen-aligned strings.  */
  if (alignment < xlen)
    return false;

  /* Overall structure of emitted code:
       Load-and-compare:
	 - Load data1 and data2
	 - Set orc1 := orc.b (data1) (or th.tstnbz)
	 - Compare strings and either:
	   - Fall-through on equality
	   - Jump to nonul_end_label if data1 != data2
	   - Jump to end_label if data1 contains a NUL-byte
	 - Calculate result value and jump to final_label
       // Fall-through
       Call-to-libc or set result to 0 (depending on ncompare)
       Jump to final_label
     nonul_end_label: // words don't match, and no null byte in first word.
       Calculate result value with the use of data1, data2 and orc1
       Jump to final_label
     end_label:
       Calculate result value with the use of data1, data2 and orc1
       Jump to final_label
     final_label:
       // Nothing.  */

  rtx data1 = gen_reg_rtx (Xmode);
  rtx data2 = gen_reg_rtx (Xmode);
  rtx orc1 = gen_reg_rtx (Xmode);
  rtx nonul_end_label = gen_label_rtx ();
  rtx end_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();

  /* Generate a sequence of zbb instructions to compare out
     to the length specified.  */
  emit_strcmp_scalar_load_and_compare (result, src1, src2, nbytes,
				       data1, data2, orc1,
				       end_label, nonul_end_label,
				       final_label);

  /* All compared and everything was equal.  */
  if (ncompare)
    {
      emit_insn (gen_rtx_SET (result, CONST0_RTX (GET_MODE (result))));
      emit_jump_insn (gen_jump (final_label));
      emit_barrier (); /* No fall-through.  */
    }
  else
    {
      emit_strcmp_scalar_call_to_libc (result, src1, src2, nbytes);
      emit_jump_insn (gen_jump (final_label));
      emit_barrier (); /* No fall-through.  */
    }

  emit_label (nonul_end_label);
  emit_strcmp_scalar_result_calculation_nonul (result, data1, data2);
  emit_jump_insn (gen_jump (final_label));
  emit_barrier (); /* No fall-through.  */
  emit_label (end_label);
  emit_strcmp_scalar_result_calculation (result, data1, data2, orc1);
  emit_jump_insn (gen_jump (final_label));
  emit_barrier (); /* No fall-through.  */

  emit_label (final_label);
  return true;
}

/* Expand a string compare operation.

   The result will be stored in RESULT.
   The strings are referenced by SRC1 and SRC2.
   The argument BYTES_RTX either holds the number of characters to
   compare, or is NULL_RTX.
   The argument ALIGN_RTX holds the alignment.

   Return true if expansion was successful, or false otherwise.  */

bool
riscv_expand_strcmp (rtx result, rtx src1, rtx src2,
		     rtx bytes_rtx, rtx align_rtx)
{
  unsigned HOST_WIDE_INT compare_max;
  unsigned HOST_WIDE_INT nbytes;
  unsigned HOST_WIDE_INT alignment;
  bool ncompare = bytes_rtx != NULL_RTX;
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);

  if (riscv_strcmp_inline_limit == 0)
    return false;

  /* Round down the comparison limit to a multiple of xlen.  */
  compare_max = riscv_strcmp_inline_limit & ~(xlen - 1);

  /* Decide how many bytes to compare inline.  */
  if (bytes_rtx == NULL_RTX)
    {
      nbytes = compare_max;
    }
  else
    {
      /* If we have a length, it must be constant.  */
      if (!CONST_INT_P (bytes_rtx))
	return false;
      nbytes = UINTVAL (bytes_rtx);

      /* If NBYTES is zero the result of strncmp will always be zero,
	 but that would require special casing in the caller.  So for now
	 just don't do an inline expansion.  This probably rarely happens
	 in practice, but it is tested by the testsuite.  */
      if (nbytes == 0)
	return false;

      /* We don't emit parts of a strncmp() call.  */
      if (nbytes > compare_max)
	return false;
    }

  /* Guarantees:
     - nbytes > 0
     - nbytes <= riscv_strcmp_inline_limit
     - nbytes is a multiple of xlen if !ncompare  */

  if (!CONST_INT_P (align_rtx))
    return false;
  alignment = UINTVAL (align_rtx);

  if (TARGET_VECTOR && stringop_strategy & STRATEGY_VECTOR)
    {
      bool ok = riscv_vector::expand_strcmp (result, src1, src2, bytes_rtx,
					     alignment, ncompare);
      if (ok)
	return true;
    }

  if ((TARGET_ZBB || TARGET_XTHEADBB) && stringop_strategy & STRATEGY_SCALAR)
    return riscv_expand_strcmp_scalar (result, src1, src2, nbytes, alignment,
				       ncompare);

  return false;
}

/* If the provided string is aligned, then read XLEN bytes
   in a loop and use orc.b to find NUL-bytes.  */

static bool
riscv_expand_strlen_scalar (rtx result, rtx src, rtx align)
{
  rtx testval, addr, addr_plus_regsz, word, zeros;
  rtx loop_label, cond;

  gcc_assert (TARGET_ZBB || TARGET_XTHEADBB);

  /* The alignment needs to be known and big enough.  */
  if (!CONST_INT_P (align) || UINTVAL (align) < GET_MODE_SIZE (Xmode))
    return false;

  testval = gen_reg_rtx (Xmode);
  addr = copy_addr_to_reg (XEXP (src, 0));
  addr_plus_regsz = gen_reg_rtx (Pmode);
  word = gen_reg_rtx (Xmode);
  zeros = gen_reg_rtx (Xmode);

  if (TARGET_ZBB)
    emit_insn (gen_rtx_SET (testval, constm1_rtx));
  else
    emit_insn (gen_rtx_SET (testval, const0_rtx));

  do_add3 (addr_plus_regsz, addr, GEN_INT (UNITS_PER_WORD));

  loop_label = gen_label_rtx ();
  emit_label (loop_label);

  /* Load a word and use orc.b/th.tstnbz to find a zero-byte.  */
  do_load_from_addr (Xmode, word, addr, src);
  do_add3 (addr, addr, GEN_INT (UNITS_PER_WORD));
  if (TARGET_ZBB)
    do_orcb2 (word, word);
  else
    do_th_tstnbz2 (word, word);
  cond = gen_rtx_EQ (VOIDmode, word, testval);
  emit_unlikely_jump_insn (gen_cbranch4 (Xmode, cond, word, testval,
					 loop_label));

  /* Calculate the return value by counting zero-bits.  */
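  /* Explanatory note (added): after the loop, ADDR points one word past the
     word that contains the NUL byte, while ADDR_PLUS_REGSZ is the original
     string address plus one word.  ZEROS below becomes the byte offset of
     the NUL within that word (the zero-bit count divided by 8 on this
     little-endian target), so ADDR + ZEROS - ADDR_PLUS_REGSZ is exactly the
     string length.  */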
  if (TARGET_ZBB)
    do_one_cmpl2 (word, word);
  if (TARGET_BIG_ENDIAN)
    do_clz2 (zeros, word);
  else if (TARGET_ZBB)
    do_ctz2 (zeros, word);
  else
    {
      do_th_rev2 (word, word);
      do_clz2 (zeros, word);
    }
  do_lshr3 (zeros, zeros, GEN_INT (exact_log2 (BITS_PER_UNIT)));
  do_add3 (addr, addr, zeros);
  do_sub3 (result, addr, addr_plus_regsz);

  return true;
}

/* Expand a strlen operation and return true if successful.
   Return false if we should let the compiler generate normal
   code, probably a strlen call.  */

bool
riscv_expand_strlen (rtx result, rtx src, rtx search_char, rtx align)
{
  if (TARGET_VECTOR && stringop_strategy & STRATEGY_VECTOR)
    {
      riscv_vector::expand_rawmemchr (E_QImode, result, src, search_char,
				      /* strlen */ true);
      return true;
    }

  gcc_assert (search_char == const0_rtx);

  if ((TARGET_ZBB || TARGET_XTHEADBB) && stringop_strategy & STRATEGY_SCALAR)
    return riscv_expand_strlen_scalar (result, src, align);

  return false;
}

/* Generate the sequence of load and compares for memcmp using Zbb.

   RESULT is the register where the return value of memcmp will be stored.
   The source pointers are SRC1 and SRC2 (NBYTES bytes to compare).
   DATA1 and DATA2 are registers where the data chunks will be stored.
   DIFF_LABEL is the location of the code that calculates the return value.
   FINAL_LABEL is the location of the code that comes after the calculation
   of the return value.  */

static void
emit_memcmp_scalar_load_and_compare (rtx result, rtx src1, rtx src2,
				     unsigned HOST_WIDE_INT nbytes,
				     rtx data1, rtx data2,
				     rtx diff_label, rtx final_label)
{
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);
  unsigned HOST_WIDE_INT offset = 0;

  while (nbytes > 0)
    {
      unsigned HOST_WIDE_INT cmp_bytes = xlen < nbytes ? xlen : nbytes;
      machine_mode load_mode;

      /* Special cases to avoid masking of trailing bytes.  */
      if (cmp_bytes == 1)
	load_mode = QImode;
      else if (cmp_bytes == 2)
	load_mode = HImode;
      else if (cmp_bytes == 4)
	load_mode = SImode;
      else
	load_mode = Xmode;

      rtx addr1 = adjust_address (src1, load_mode, offset);
      do_load (load_mode, data1, addr1);
      rtx addr2 = adjust_address (src2, load_mode, offset);
      do_load (load_mode, data2, addr2);

      /* Fast-path for a single byte.  */
      if (cmp_bytes == 1)
	{
	  do_sub3 (result, data1, data2);
	  emit_jump_insn (gen_jump (final_label));
	  emit_barrier (); /* No fall-through.  */
	  return;
	}

      /* Shift off trailing bytes in words if needed.  */
      unsigned int load_bytes = GET_MODE_SIZE (load_mode).to_constant ();
      if (cmp_bytes < load_bytes)
	{
	  int shamt = (load_bytes - cmp_bytes) * BITS_PER_UNIT;
	  do_ashl3 (data1, data1, GEN_INT (shamt));
	  do_ashl3 (data2, data2, GEN_INT (shamt));
	}

      /* Break out if data1 != data2.  */
      rtx cond = gen_rtx_NE (VOIDmode, data1, data2);
      emit_unlikely_jump_insn (gen_cbranch4 (Pmode, cond, data1, data2,
					     diff_label));
      /* Fall-through on equality.  */

      offset += cmp_bytes;
      nbytes -= cmp_bytes;
    }
}

/* memcmp result calculation.

   RESULT is the register where the return value will be stored.
   The two data chunks are in DATA1 and DATA2.  */

static void
emit_memcmp_scalar_result_calculation (rtx result, rtx data1, rtx data2)
{
  /* Get bytes in big-endian order and compare as words.  */
  do_bswap2 (data1, data1);
  do_bswap2 (data2, data2);

  /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence.  */
  emit_insn (gen_slt_3 (LTU, Xmode, Xmode, result, data1, data2));
  do_neg2 (result, result);
  do_ior3 (result, result, const1_rtx);
}
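/* Illustrative note (added): the branchless sequence above amounts to,
   roughly, sltu tmp, data1, data2; neg tmp, tmp; ori result, tmp, 1.
   For data1 < data2 this yields -1 | 1 = -1, and for data1 >= data2 it
   yields 0 | 1 = 1, which is a valid memcmp/strcmp sign for the
   byte-swapped (big-endian ordered) words.  */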
/* Expand memcmp using scalar instructions (incl. Zbb).

   RESULT is the register where the return value will be stored.
   The source pointers are SRC1 and SRC2 (NBYTES bytes to compare).  */

static bool
riscv_expand_block_compare_scalar (rtx result, rtx src1, rtx src2, rtx nbytes)
{
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);

  if (optimize_function_for_size_p (cfun))
    return false;

  /* We don't support big endian.  */
  if (BYTES_BIG_ENDIAN)
    return false;

  if (!CONST_INT_P (nbytes))
    return false;

  /* We need the rev (bswap) instruction.  */
  if (!TARGET_ZBB)
    return false;

  unsigned HOST_WIDE_INT length = UINTVAL (nbytes);

  /* Limit to 12-bits (maximum load-offset).  */
  if (length > IMM_REACH)
    length = IMM_REACH;

  /* We need xlen-aligned memory.  */
  unsigned HOST_WIDE_INT align = MIN (MEM_ALIGN (src1), MEM_ALIGN (src2));
  if (align < (xlen * BITS_PER_UNIT))
    return false;

  if (length > RISCV_MAX_MOVE_BYTES_STRAIGHT)
    return false;

  /* Overall structure of emitted code:
       Load-and-compare:
	 - Load data1 and data2
	 - Compare strings and either:
	   - Fall-through on equality
	   - Jump to diff_label if data1 != data2
       // Fall-through
       Set result to 0 and jump to final_label
     diff_label:
       Calculate result value with the use of data1 and data2
       Jump to final_label
     final_label:
       // Nothing.  */

  rtx data1 = gen_reg_rtx (Xmode);
  rtx data2 = gen_reg_rtx (Xmode);
  rtx diff_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();

  /* Generate a sequence of zbb instructions to compare out
     to the length specified.  */
  emit_memcmp_scalar_load_and_compare (result, src1, src2, length,
				       data1, data2,
				       diff_label, final_label);

  emit_move_insn (result, CONST0_RTX (GET_MODE (result)));
  emit_jump_insn (gen_jump (final_label));
  emit_barrier (); /* No fall-through.  */

  emit_label (diff_label);
  emit_memcmp_scalar_result_calculation (result, data1, data2);
  emit_jump_insn (gen_jump (final_label));
  emit_barrier (); /* No fall-through.  */

  emit_label (final_label);

  return true;
}

/* Expand memcmp operation.

   RESULT is the register where the return value will be stored.
   The source pointers are SRC1 and SRC2 (NBYTES bytes to compare).  */

bool
riscv_expand_block_compare (rtx result, rtx src1, rtx src2, rtx nbytes)
{
  if (stringop_strategy & STRATEGY_SCALAR)
    return riscv_expand_block_compare_scalar (result, src1, src2, nbytes);

  return false;
}

/* Emit straight-line code to move LENGTH bytes from SRC to DEST
   with accesses that are ALIGN bytes aligned.
   Assume that the areas do not overlap.  */

static void
riscv_block_move_straight (rtx dest, rtx src, unsigned HOST_WIDE_INT length,
			   unsigned HOST_WIDE_INT align)
{
  unsigned HOST_WIDE_INT offset, delta;
  unsigned HOST_WIDE_INT bits;
  int i;
  enum machine_mode mode;
  rtx *regs;

  bits = MAX (BITS_PER_UNIT, MIN (BITS_PER_WORD, align));

  mode = mode_for_size (bits, MODE_INT, 0).require ();
  delta = bits / BITS_PER_UNIT;

  /* Allocate a buffer for the temporary registers.  */
  regs = XALLOCAVEC (rtx, length / delta - 1);

  /* Load as many BITS-sized chunks as possible.  Use a normal load if
     the source has enough alignment, otherwise use left/right pairs.  */
  for (offset = 0, i = 0; offset + 2 * delta <= length; offset += delta, i++)
    {
      regs[i] = gen_reg_rtx (mode);
      riscv_emit_move (regs[i], adjust_address (src, mode, offset));
    }

  /* Copy the chunks to the destination.  */
  for (offset = 0, i = 0; offset + 2 * delta <= length; offset += delta, i++)
    riscv_emit_move (adjust_address (dest, mode, offset), regs[i]);

  /* Mop up any left-over bytes.  */
  if (offset < length)
    {
      src = adjust_address (src, BLKmode, offset);
      dest = adjust_address (dest, BLKmode, offset);
      move_by_pieces (dest, src, length - offset,
		      align, RETURN_BEGIN);
    }
}
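/* Added usage note: because the copy loop above only runs while
   offset + 2 * delta <= length, the final chunk (whether full or partial)
   is always left to move_by_pieces.  For example, a 16-byte word-aligned
   copy on rv64 loads bytes 0-7 through a temporary register and hands
   bytes 8-15 to move_by_pieces.  */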
/* Helper function for doing a loop-based block operation on memory
   reference MEM.

   Create a new base register for use within the loop and point it to
   the start of MEM.  Create a new memory reference that uses this
   register and has an alignment of ALIGN.  Store them in *LOOP_REG
   and *LOOP_MEM respectively.  */

static void
riscv_adjust_block_mem (rtx mem, unsigned HOST_WIDE_INT align,
			rtx *loop_reg, rtx *loop_mem)
{
  *loop_reg = copy_addr_to_reg (XEXP (mem, 0));

  /* Although the new mem does not refer to a known location,
     it does keep up to LENGTH bytes of alignment.  */
  *loop_mem = change_address (mem, BLKmode, *loop_reg);
  set_mem_align (*loop_mem, align);
}

/* Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER
   bytes at a time.  LENGTH must be at least BYTES_PER_ITER.  The alignment
   of the access can be set by ALIGN.  Assume that the memory regions do not
   overlap.  */

static void
riscv_block_move_loop (rtx dest, rtx src, unsigned HOST_WIDE_INT length,
		       unsigned HOST_WIDE_INT align,
		       unsigned HOST_WIDE_INT bytes_per_iter)
{
  rtx label, src_reg, dest_reg, final_src, test;
  unsigned HOST_WIDE_INT leftover;

  leftover = length % bytes_per_iter;
  length -= leftover;

  /* Create registers and memory references for use within the loop.  */
  riscv_adjust_block_mem (src, align, &src_reg, &src);
  riscv_adjust_block_mem (dest, align, &dest_reg, &dest);

  /* Calculate the value that SRC_REG should have after the last iteration
     of the loop.  */
  final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length),
				   0, 0, OPTAB_WIDEN);

  /* Emit the start of the loop.  */
  label = gen_label_rtx ();
  emit_label (label);

  /* Emit the loop body.  */
  riscv_block_move_straight (dest, src, bytes_per_iter, align);

  /* Move on to the next block.  */
  riscv_emit_move (src_reg, plus_constant (Pmode, src_reg, bytes_per_iter));
  riscv_emit_move (dest_reg, plus_constant (Pmode, dest_reg, bytes_per_iter));

  /* Emit the loop condition.  */
  test = gen_rtx_NE (VOIDmode, src_reg, final_src);
  emit_jump_insn (gen_cbranch4 (Pmode, test, src_reg, final_src, label));

  /* Mop up any left-over bytes.  */
  if (leftover)
    riscv_block_move_straight (dest, src, leftover, align);
  else
    emit_insn (gen_nop ());
}

/* Expand a cpymemsi instruction, which copies LENGTH bytes from
   memory reference SRC to memory reference DEST.  */

static bool
riscv_expand_block_move_scalar (rtx dest, rtx src, rtx length)
{
  if (!CONST_INT_P (length))
    return false;

  unsigned HOST_WIDE_INT hwi_length = UINTVAL (length);
  unsigned HOST_WIDE_INT factor, align;

  if (riscv_slow_unaligned_access_p)
    {
      align = MIN (MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), BITS_PER_WORD);
      factor = BITS_PER_WORD / align;
    }
  else
    {
      /* Pretend word-alignment.  */
      align = BITS_PER_WORD;
      factor = 1;
    }

  if (optimize_function_for_size_p (cfun)
      && hwi_length * factor * UNITS_PER_WORD > MOVE_RATIO (false))
    return false;

  if (hwi_length <= (RISCV_MAX_MOVE_BYTES_STRAIGHT / factor))
    {
      riscv_block_move_straight (dest, src, hwi_length, align);
      return true;
    }
  else if (optimize && align >= BITS_PER_WORD)
    {
      unsigned min_iter_words
	= RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD;
      unsigned iter_words = min_iter_words;
      unsigned HOST_WIDE_INT bytes = hwi_length;
      unsigned HOST_WIDE_INT words = bytes / UNITS_PER_WORD;

      /* Lengthen the loop body if it shortens the tail.  */
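      /* Worked example (added): with min_iter_words = 4 and words = 11, the
	 search below settles on iter_words = 5 (cost 5 + 11 % 5 = 6) instead
	 of 4 (cost 4 + 11 % 4 = 7), i.e. two 5-word iterations plus a 1-word
	 tail instead of two 4-word iterations plus a 3-word tail.  */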
      for (unsigned i = min_iter_words; i < min_iter_words * 2 - 1; i++)
	{
	  unsigned cur_cost = iter_words + words % iter_words;
	  unsigned new_cost = i + words % i;
	  if (new_cost <= cur_cost)
	    iter_words = i;
	}

      riscv_block_move_loop (dest, src, bytes, align,
			     iter_words * UNITS_PER_WORD);
      return true;
    }

  return false;
}

/* This function delegates block-move expansion to either the vector
   implementation or the scalar one.  Return TRUE if successful or FALSE
   otherwise.  Assume that the memory regions do not overlap.  */

bool
riscv_expand_block_move (rtx dest, rtx src, rtx length)
{
  if ((TARGET_VECTOR && !TARGET_XTHEADVECTOR)
      && stringop_strategy & STRATEGY_VECTOR)
    {
      bool ok = riscv_vector::expand_block_move (dest, src, length, false);
      if (ok)
	return true;
    }

  if (stringop_strategy & STRATEGY_SCALAR)
    return riscv_expand_block_move_scalar (dest, src, length);

  return false;
}

/* Expand a block-clear instruction via cbo.zero instructions.  */

static bool
riscv_expand_block_clear_zicboz_zic64b (rtx dest, rtx length)
{
  unsigned HOST_WIDE_INT hwi_length;
  unsigned HOST_WIDE_INT align;
  const unsigned HOST_WIDE_INT cbo_bytes = 64;

  gcc_assert (TARGET_ZICBOZ && TARGET_ZIC64B);

  if (!CONST_INT_P (length))
    return false;

  hwi_length = UINTVAL (length);
  if (hwi_length < cbo_bytes)
    return false;

  align = MEM_ALIGN (dest) / BITS_PER_UNIT;
  if (align < cbo_bytes)
    return false;

  /* We don't emit loops.  Instead apply move-bytes limitation.  */
  unsigned HOST_WIDE_INT max_bytes = RISCV_MAX_MOVE_BYTES_STRAIGHT
				     / UNITS_PER_WORD * cbo_bytes;
  if (hwi_length > max_bytes)
    return false;

  unsigned HOST_WIDE_INT offset = 0;
  while (offset + cbo_bytes <= hwi_length)
    {
      rtx mem = adjust_address (dest, BLKmode, offset);
      rtx addr = force_reg (Pmode, XEXP (mem, 0));
      if (TARGET_64BIT)
	emit_insn (gen_riscv_zero_di (addr));
      else
	emit_insn (gen_riscv_zero_si (addr));
      offset += cbo_bytes;
    }

  if (offset < hwi_length)
    {
      rtx mem = adjust_address (dest, BLKmode, offset);
      clear_by_pieces (mem, hwi_length - offset, align);
    }

  return true;
}

bool
riscv_expand_block_clear (rtx dest, rtx length)
{
  /* Only use setmem-zero expansion for Zicboz + Zic64b.  */
  if (!TARGET_ZICBOZ || !TARGET_ZIC64B)
    return false;

  if (optimize_function_for_size_p (cfun))
    return false;

  return riscv_expand_block_clear_zicboz_zic64b (dest, length);
}

/* --- Vector expanders --- */

namespace riscv_vector
{
struct stringop_info {
  rtx avl;
  bool need_loop;
  machine_mode vmode;
};

/* If a vectorized stringop should be used populate INFO and return TRUE.
   Otherwise return false and leave INFO unchanged.

   MAX_EW is the maximum element width that the caller wants to use and
   LENGTH_IN is the length of the stringop in bytes.  */

static bool
use_vector_stringop_p (struct stringop_info &info, HOST_WIDE_INT max_ew,
		       rtx length_in)
{
  bool need_loop = true;
  machine_mode vmode = VOIDmode;
  /* The number of elements in the stringop.  */
  rtx avl = length_in;
  HOST_WIDE_INT potential_ew = max_ew;

  if (!TARGET_VECTOR || !(stringop_strategy & STRATEGY_VECTOR))
    return false;

  if (CONST_INT_P (length_in))
    {
      HOST_WIDE_INT length = INTVAL (length_in);

      /* If the VLEN and preferred LMUL allow the entire block to be
	 copied in one go then no loop is needed.  */
      if (known_le (length, BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL))
	{
	  need_loop = false;

	  /* If a single scalar load / store pair can do the job, leave it
	     to the scalar code to do that.  */
	  /* ??? If fast unaligned access is supported, the scalar code could
	     use suitably sized scalars irrespective of alignment.  If that
	     gets fixed, we have to adjust the test here.  */
	  if (pow2p_hwi (length) && length <= potential_ew)
	    return false;
	}

      /* Find the vector mode to use.  Using the largest possible element
	 size is likely to give smaller constants, and thus potentially
	 reduce code size.  However, if we need a loop, we need to update
	 the pointers, and that is more complicated with a larger element
	 size, unless we use an immediate, which prevents us from dynamically
	 using the target's transfer size that the hart supports.  And then,
	 unless we know the *exact* vector size of the hart, we'd need
	 multiple vsetvli / branch statements, so it's not even a size win.
	 If, in the future, we find a RISC-V implementation that is slower
	 for small element widths, we might allow larger element widths for
	 loops too.  */
      if (need_loop)
	potential_ew = 1;
      for (; potential_ew; potential_ew >>= 1)
	{
	  scalar_int_mode elem_mode;
	  unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT;
	  poly_uint64 per_iter;
	  poly_int64 nunits;

	  if (need_loop)
	    per_iter = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
	  else
	    per_iter = length;
	  /* BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL may not be divisible by
	     this potential_ew.  */
	  if (!multiple_p (per_iter, potential_ew, &nunits))
	    continue;

	  /* Unless we get an implementation that's slow for small element
	     size / non-word-aligned accesses, we assume that the hardware
	     handles this well, and we don't want to complicate the code
	     with shifting word contents around or handling extra bytes at
	     the start and/or end.  So we want the total transfer size and
	     alignment to fit with the element size.  */
	  if (length % potential_ew != 0
	      || !int_mode_for_size (bits, 0).exists (&elem_mode))
	    continue;

	  poly_uint64 mode_units;
	  /* Find the mode to use for the copy inside the loop - or the
	     sole copy, if there is no loop.  */
	  if (!need_loop)
	    {
	      /* Try if we have an exact mode for the copy.  */
	      if (riscv_vector::get_vector_mode (elem_mode,
						 nunits).exists (&vmode))
		break;
	      /* Since we don't have a mode that exactly matches the transfer
		 size, we'll need to use pred_store, which is not available
		 for all vector modes, but only iE_RVV_M* modes, hence trying
		 to find a vector mode for a merely rounded-up size is
		 pointless.  Still, by choosing a lower LMUL factor that
		 still allows an entire transfer, we can reduce register
		 pressure.  */
	      for (unsigned lmul = 1; lmul < TARGET_MAX_LMUL; lmul <<= 1)
		if (known_le (length * BITS_PER_UNIT, TARGET_MIN_VLEN * lmul)
		    && multiple_p (BYTES_PER_RISCV_VECTOR * lmul,
				   potential_ew, &mode_units)
		    && (riscv_vector::get_vector_mode
			 (elem_mode, mode_units).exists (&vmode)))
		  break;
	    }

	  /* Stop searching if a suitable vmode has been found.  */
	  if (vmode != VOIDmode)
	    break;

	  /* BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL will at least be
	     divisible by a potential_ew of 1, so this should succeed
	     eventually.  */
	  if (multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
			  potential_ew, &mode_units)
	      && riscv_vector::get_vector_mode (elem_mode,
						mode_units).exists (&vmode))
	    break;

	  /* We may get here if we tried an element size that's larger than
	     the hardware supports, but we should at least find a suitable
	     byte vector mode.  */
	  gcc_assert (potential_ew > 1);
	}
      if (potential_ew > 1)
	avl = GEN_INT (length / potential_ew);
    }
  else
    {
      gcc_assert (get_lmul_mode (QImode, TARGET_MAX_LMUL).exists (&vmode));
    }

  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
     arguments + 1 for the call.  When the RVV sequence would take 7
     instructions and we're optimizing for size, a libcall may be
     preferable.  */
  if (optimize_function_for_size_p (cfun) && need_loop)
    return false;

  info.need_loop = need_loop;
  info.vmode = vmode;
  info.avl = avl;
  return true;
}

/* Used by cpymemsi in riscv.md.  */

bool
expand_block_move (rtx dst_in, rtx src_in, rtx length_in, bool movmem_p)
{
  /*
    memcpy:
	mv a3, a0			# Copy destination
    loop:
	vsetvli t0, a2, e8, m8, ta, ma	# Vectors of 8b
	vle8.v v0, (a1)			# Load bytes
	add a1, a1, t0			# Bump pointer
	sub a2, a2, t0			# Decrement count
	vse8.v v0, (a3)			# Store bytes
	add a3, a3, t0			# Bump pointer
	bnez a2, loop			# Any more?
	ret				# Return
  */
  struct stringop_info info;

  HOST_WIDE_INT potential_ew
    = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD)
       / BITS_PER_UNIT);

  if (!use_vector_stringop_p (info, potential_ew, length_in))
    return false;

  /* Inlining general memmove is a pessimisation: we can't avoid having to
     decide which direction to go at runtime, which is costly in instruction
     count.  However, for situations where the entire move fits in one vector
     operation we can do all reads before doing any writes, so there is
     nothing to worry about; generate the inline vector code in such
     situations.  */
  if (info.need_loop && movmem_p)
    return false;

  rtx src, dst;
  rtx vec;

  /* avl holds the (remaining) length of the required copy.
     cnt holds the length we copy with the current load/store pair.  */
  rtx cnt = info.avl;
  rtx label = NULL_RTX;
  rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
  rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0));

  if (info.need_loop)
    {
      info.avl = copy_to_mode_reg (Pmode, info.avl);
      cnt = gen_reg_rtx (Pmode);
      label = gen_label_rtx ();

      emit_label (label);
      emit_insn (riscv_vector::gen_no_side_effects_vsetvl_rtx (info.vmode,
							       cnt, info.avl));
    }

  vec = gen_reg_rtx (info.vmode);
  src = change_address (src_in, info.vmode, src_addr);
  dst = change_address (dst_in, info.vmode, dst_addr);

  /* If we don't need a loop and have a suitable mode to describe the size,
     just do a load / store pair and leave it up to the later lazy code
     motion pass to insert the appropriate vsetvli.  */
  if (!info.need_loop
      && known_eq (GET_MODE_SIZE (info.vmode), INTVAL (length_in)))
    {
      emit_move_insn (vec, src);
      emit_move_insn (dst, vec);
    }
  else
    {
      machine_mode mask_mode
	= riscv_vector::get_vector_mode (BImode,
					 GET_MODE_NUNITS (info.vmode)).require ();
      rtx mask = CONSTM1_RTX (mask_mode);
      if (!satisfies_constraint_K (cnt))
	cnt = force_reg (Pmode, cnt);
      rtx m_ops[] = {vec, mask, src};
      emit_nonvlmax_insn (code_for_pred_mov (info.vmode),
			  riscv_vector::UNARY_OP_TAMA, m_ops, cnt);
      emit_insn (gen_pred_store (info.vmode, dst, mask, vec, cnt,
				 get_avl_type_rtx (riscv_vector::NONVLMAX)));
    }

  if (info.need_loop)
    {
      emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt)));
      emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt)));
      emit_insn (gen_rtx_SET (info.avl, gen_rtx_MINUS (Pmode, info.avl, cnt)));

      /* Emit the loop condition.  */
      rtx test = gen_rtx_NE (VOIDmode, info.avl, const0_rtx);
      emit_jump_insn (gen_cbranch4 (Pmode, test, info.avl, const0_rtx, label));
      emit_insn (gen_nop ());
    }

  return true;
}

/* Implement rawmemchr and strlen using vector instructions.
   It can be assumed that the needle is in the haystack, otherwise the
   behavior is undefined.  */

void
expand_rawmemchr (machine_mode mode, rtx dst, rtx haystack, rtx needle,
		  bool strlen)
{
  /*
    rawmemchr:
    loop:
       vsetvli a1, zero, e[8,16,32,64], m1, ta, ma
       vle[8,16,32,64]ff.v v8, (a0)  # Load.
       csrr a1, vl		     # Get number of bytes read.
       vmseq.vx v0, v8, pat	     # v0 = (v8 == {pat, pat, ...})
       vfirst.m a2, v0		     # Find first hit.
       add a0, a0, a1		     # Bump pointer.
       bltz a2, loop		     # Not found?

       sub a0, a0, a1		     # Go back by a1.
       slli a2, a2, [0,1,2,3]	     # Shift to get byte offset.
       add a0, a0, a2		     # Add the offset.

       ret
  */
  gcc_assert (TARGET_VECTOR);

  if (strlen)
    gcc_assert (mode == E_QImode);

  unsigned int isize = GET_MODE_SIZE (mode).to_constant ();
  int lmul = TARGET_MAX_LMUL;
  poly_int64 nunits = exact_div (BYTES_PER_RISCV_VECTOR * lmul, isize);

  machine_mode vmode;
  if (!riscv_vector::get_vector_mode (GET_MODE_INNER (mode),
				      nunits).exists (&vmode))
    gcc_unreachable ();

  machine_mode mask_mode = riscv_vector::get_mask_mode (vmode);

  rtx cnt = gen_reg_rtx (Pmode);
  emit_move_insn (cnt, CONST0_RTX (Pmode));

  rtx end = gen_reg_rtx (Pmode);
  rtx vec = gen_reg_rtx (vmode);
  rtx mask = gen_reg_rtx (mask_mode);

  /* After finding the first vector element matching the needle, we need to
     multiply by the vector element width (SEW) in order to return a pointer
     to the matching byte.  */
  unsigned int shift = exact_log2 (GET_MODE_SIZE (mode).to_constant ());

  rtx src_addr = copy_addr_to_reg (XEXP (haystack, 0));
  rtx start_addr = copy_addr_to_reg (XEXP (haystack, 0));

  rtx loop = gen_label_rtx ();
  emit_label (loop);

  rtx vsrc = change_address (haystack, vmode, src_addr);

  /* Bump the pointer.  */
  rtx step = gen_reg_rtx (Pmode);
  emit_insn (gen_rtx_SET (step, gen_rtx_ASHIFT (Pmode, cnt, GEN_INT (shift))));
  emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, step)));

  /* Emit a first-fault load.  */
  rtx vlops[] = {vec, vsrc};
  emit_vlmax_insn (code_for_pred_fault_load (vmode),
		   riscv_vector::UNARY_OP, vlops);

  /* Read how far we read.  */
  if (Pmode == SImode)
    emit_insn (gen_read_vlsi (cnt));
  else
    emit_insn (gen_read_vldi_zero_extend (cnt));

  /* Compare needle with haystack and store in a mask.  */
  rtx eq = gen_rtx_EQ (mask_mode, gen_const_vec_duplicate (vmode, needle),
		       vec);
  rtx vmsops[] = {mask, eq, vec, needle};
  emit_nonvlmax_insn (code_for_pred_cmp_scalar (vmode),
		      riscv_vector::COMPARE_OP, vmsops, cnt);

  /* Find the first bit in the mask.  */
  rtx vfops[] = {end, mask};
  emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
		      riscv_vector::CPOP_OP, vfops, cnt);

  /* Emit the loop condition.  */
  rtx test = gen_rtx_LT (VOIDmode, end, const0_rtx);
  emit_jump_insn (gen_cbranch4 (Pmode, test, end, const0_rtx, loop));

  if (strlen)
    {
      /* For strlen, return the length.  */
      emit_insn (gen_rtx_SET (dst, gen_rtx_PLUS (Pmode, src_addr, end)));
      emit_insn (gen_rtx_SET (dst, gen_rtx_MINUS (Pmode, dst, start_addr)));
    }
  else
    {
      /* For rawmemchr, return the position at SRC + END * [1,2,4,8].  */
      emit_insn (gen_rtx_SET (end, gen_rtx_ASHIFT (Pmode, end,
						   GEN_INT (shift))));
      emit_insn (gen_rtx_SET (dst, gen_rtx_PLUS (Pmode, src_addr, end)));
    }
}

/* Implement cmpstr using vector instructions.  The ALIGNMENT and
   NCOMPARE parameters are unused for now.  */

bool
expand_strcmp (rtx result, rtx src1, rtx src2, rtx nbytes,
	       unsigned HOST_WIDE_INT, bool)
{
  gcc_assert (TARGET_VECTOR);
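  /* Rough sketch of the emitted sequence (added for orientation; register
     names and the exact instruction selection are illustrative only):

       loop:
	  add	   a0, a0, t0	     # Bump src1 by the previous vl.
	  add	   a1, a1, t0	     # Bump src2 by the previous vl.
	  vle8ff.v v8, (a0)	     # Fault-only-first load from src1.
	  vle8ff.v v16, (a1)	     # Fault-only-first load from src2.
	  csrr	   t0, vl	     # Number of elements actually read.
	  vmseq.vi v0, v8, 0	     # NUL bytes in src1.
	  vmsne.vv v1, v8, v16	     # Bytes that differ.
	  vmor.mm  v0, v0, v1	     # Either condition.
	  vfirst.m t1, v0	     # Index of first hit, or -1.
	  bltz	   t1, loop	     # Nothing found yet: next chunk.

     followed by a scalar reload of the differing/terminating byte from each
     string to compute the return value.  */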
  /* We don't support big endian.  */
  if (BYTES_BIG_ENDIAN)
    return false;

  bool with_length = nbytes != NULL_RTX;

  if (with_length
      && (!REG_P (nbytes) && !SUBREG_P (nbytes) && !CONST_INT_P (nbytes)))
    return false;

  if (with_length && CONST_INT_P (nbytes))
    nbytes = force_reg (Pmode, nbytes);

  machine_mode mode = E_QImode;
  unsigned int isize = GET_MODE_SIZE (mode).to_constant ();
  int lmul = TARGET_MAX_LMUL;
  poly_int64 nunits = exact_div (BYTES_PER_RISCV_VECTOR * lmul, isize);

  machine_mode vmode;
  if (!riscv_vector::get_vector_mode (GET_MODE_INNER (mode), nunits)
	 .exists (&vmode))
    gcc_unreachable ();

  machine_mode mask_mode = riscv_vector::get_mask_mode (vmode);

  /* Prepare addresses.  */
  rtx src_addr1 = copy_addr_to_reg (XEXP (src1, 0));
  rtx vsrc1 = change_address (src1, vmode, src_addr1);
  rtx src_addr2 = copy_addr_to_reg (XEXP (src2, 0));
  rtx vsrc2 = change_address (src2, vmode, src_addr2);

  /* Set initial pointer bump to 0.  */
  rtx cnt = gen_reg_rtx (Pmode);
  emit_move_insn (cnt, CONST0_RTX (Pmode));

  rtx sub = gen_reg_rtx (Pmode);
  emit_move_insn (sub, CONST0_RTX (Pmode));

  /* Create source vectors.  */
  rtx vec1 = gen_reg_rtx (vmode);
  rtx vec2 = gen_reg_rtx (vmode);

  rtx done = gen_label_rtx ();
  rtx loop = gen_label_rtx ();
  emit_label (loop);

  /* Bump the pointers.  */
  emit_insn (gen_rtx_SET (src_addr1, gen_rtx_PLUS (Pmode, src_addr1, cnt)));
  emit_insn (gen_rtx_SET (src_addr2, gen_rtx_PLUS (Pmode, src_addr2, cnt)));

  rtx vlops1[] = {vec1, vsrc1};
  rtx vlops2[] = {vec2, vsrc2};

  if (!with_length)
    {
      emit_vlmax_insn (code_for_pred_fault_load (vmode),
		       riscv_vector::UNARY_OP, vlops1);
      emit_vlmax_insn (code_for_pred_fault_load (vmode),
		       riscv_vector::UNARY_OP, vlops2);
    }
  else
    {
      nbytes = gen_lowpart (Pmode, nbytes);
      emit_nonvlmax_insn (code_for_pred_fault_load (vmode),
			  riscv_vector::UNARY_OP, vlops1, nbytes);
      emit_nonvlmax_insn (code_for_pred_fault_load (vmode),
			  riscv_vector::UNARY_OP, vlops2, nbytes);
    }

  /* Read the vl for the next pointer bump.  */
  if (Pmode == SImode)
    emit_insn (gen_read_vlsi (cnt));
  else
    emit_insn (gen_read_vldi_zero_extend (cnt));

  if (with_length)
    {
      rtx test_done = gen_rtx_EQ (VOIDmode, cnt, const0_rtx);
      emit_jump_insn (gen_cbranch4 (Pmode, test_done, cnt, const0_rtx, done));
      emit_insn (gen_rtx_SET (nbytes, gen_rtx_MINUS (Pmode, nbytes, cnt)));
    }

  /* Look for a \0 in the first string.  */
  rtx mask0 = gen_reg_rtx (mask_mode);
  rtx eq0
    = gen_rtx_EQ (mask_mode,
		  gen_const_vec_duplicate (vmode, CONST0_RTX (mode)), vec1);
  rtx vmsops1[] = {mask0, eq0, vec1, CONST0_RTX (mode)};
  emit_nonvlmax_insn (code_for_pred_cmp_scalar (vmode),
		      riscv_vector::COMPARE_OP, vmsops1, cnt);

  /* Look for vec1 != vec2 (includes vec2[i] == 0).  */
  rtx maskne = gen_reg_rtx (mask_mode);
  rtx ne = gen_rtx_NE (mask_mode, vec1, vec2);
  rtx vmsops[] = {maskne, ne, vec1, vec2};
  emit_nonvlmax_insn (code_for_pred_cmp (vmode), riscv_vector::COMPARE_OP,
		      vmsops, cnt);

  /* Combine both masks into one.  */
  rtx mask = gen_reg_rtx (mask_mode);
  rtx vmorops[] = {mask, mask0, maskne};
  emit_nonvlmax_insn (code_for_pred (IOR, mask_mode),
		      riscv_vector::BINARY_MASK_OP, vmorops, cnt);

  /* Find the first bit in the mask (the first unequal element).  */
  rtx found_at = gen_reg_rtx (Pmode);
  rtx vfops[] = {found_at, mask};
  emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
		      riscv_vector::CPOP_OP, vfops, cnt);

  /* Emit the loop condition.  */
  rtx test = gen_rtx_LT (VOIDmode, found_at, const0_rtx);
  emit_jump_insn (gen_cbranch4 (Pmode, test, found_at, const0_rtx, loop));

  /* Walk up to the difference point.  */
  emit_insn (
    gen_rtx_SET (src_addr1, gen_rtx_PLUS (Pmode, src_addr1, found_at)));
  emit_insn (
    gen_rtx_SET (src_addr2, gen_rtx_PLUS (Pmode, src_addr2, found_at)));

  /* Load the respective byte and compute the difference.  */
  rtx c1 = gen_reg_rtx (Pmode);
  rtx c2 = gen_reg_rtx (Pmode);

  do_load_from_addr (mode, c1, src_addr1, src1);
  do_load_from_addr (mode, c2, src_addr2, src2);

  do_sub3 (sub, c1, c2);

  if (with_length)
    emit_label (done);

  emit_move_insn (result, sub);
  return true;
}

/* Check we are permitted to vectorise a memory operation.
   If so, return true and populate lmul_out.
   Otherwise, return false and leave lmul_out unchanged.  */

static bool
check_vectorise_memory_operation (rtx length_in, HOST_WIDE_INT &lmul_out)
{
  /* If we either can't or have been asked not to vectorise, respect this.  */
  if (!TARGET_VECTOR)
    return false;
  if (!(stringop_strategy & STRATEGY_VECTOR))
    return false;

  /* If we can't reason about the length, don't vectorise.  */
  if (!CONST_INT_P (length_in))
    return false;

  HOST_WIDE_INT length = INTVAL (length_in);

  /* If it's tiny, default operation is likely better; maybe worth
     considering fractional lmul in the future as well.  */
  if (length < (TARGET_MIN_VLEN / 8))
    return false;

  /* If we've been asked to use a specific LMUL, check the operation fits
     and do that.  */
  if (rvv_max_lmul != RVV_DYNAMIC)
    {
      lmul_out = TARGET_MAX_LMUL;
      return (length <= ((TARGET_MAX_LMUL * TARGET_MIN_VLEN) / 8));
    }

  /* Find smallest lmul large enough for entire op.  */
  HOST_WIDE_INT lmul = 1;
  while ((lmul <= 8) && (length > ((lmul * TARGET_MIN_VLEN) / 8)))
    {
      lmul <<= 1;
    }

  if (lmul > 8)
    return false;

  lmul_out = lmul;
  return true;
}

/* Used by setmemdi in riscv.md.  */

bool
expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in)
{
  HOST_WIDE_INT lmul;

  /* Check we are able and allowed to vectorise this operation;
     bail if not.  */
  if (!check_vectorise_memory_operation (length_in, lmul))
    return false;

  machine_mode vmode
    = riscv_vector::get_vector_mode (QImode,
				     BYTES_PER_RISCV_VECTOR * lmul).require ();
  rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
  rtx dst = change_address (dst_in, vmode, dst_addr);

  rtx fill_value = gen_reg_rtx (vmode);
  rtx broadcast_ops[] = { fill_value, fill_value_in };

  /* If the length is exactly vlmax for the selected mode, do that.
     Otherwise, use a predicated store.  */
  if (known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
    {
      emit_vlmax_insn (code_for_pred_broadcast (vmode), UNARY_OP,
		       broadcast_ops);
      emit_move_insn (dst, fill_value);
    }
  else
    {
      if (!satisfies_constraint_K (length_in))
	length_in = force_reg (Pmode, length_in);
      emit_nonvlmax_insn (code_for_pred_broadcast (vmode), UNARY_OP,
			  broadcast_ops, length_in);
      machine_mode mask_mode
	= riscv_vector::get_vector_mode (BImode,
					 GET_MODE_NUNITS (vmode)).require ();
      rtx mask = CONSTM1_RTX (mask_mode);
      emit_insn (gen_pred_store (vmode, dst, mask, fill_value, length_in,
				 get_avl_type_rtx (riscv_vector::NONVLMAX)));
    }

  return true;
}

/* Used by cmpmemsi in riscv.md.  */

bool
expand_vec_cmpmem (rtx result_out, rtx blk_a_in, rtx blk_b_in, rtx length_in)
{
  HOST_WIDE_INT lmul;

  /* Check we are able and allowed to vectorise this operation;
     bail if not.  */
  if (!check_vectorise_memory_operation (length_in, lmul))
    return false;

  /* Strategy:
     load entire blocks at a and b into vector regs
     generate mask of bytes that differ
     find first set bit in mask
     find offset of first set bit in mask, use 0 if none set
     result is ((char*)a[offset] - (char*)b[offset])  */

  machine_mode vmode
    = riscv_vector::get_vector_mode (QImode,
				     BYTES_PER_RISCV_VECTOR * lmul).require ();
  rtx blk_a_addr = copy_addr_to_reg (XEXP (blk_a_in, 0));
  rtx blk_a = change_address (blk_a_in, vmode, blk_a_addr);
  rtx blk_b_addr = copy_addr_to_reg (XEXP (blk_b_in, 0));
  rtx blk_b = change_address (blk_b_in, vmode, blk_b_addr);

  rtx vec_a = gen_reg_rtx (vmode);
  rtx vec_b = gen_reg_rtx (vmode);

  machine_mode mask_mode = get_mask_mode (vmode);
  rtx mask = gen_reg_rtx (mask_mode);
  rtx mismatch_ofs = gen_reg_rtx (Pmode);

  rtx ne = gen_rtx_NE (mask_mode, vec_a, vec_b);
  rtx vmsops[] = { mask, ne, vec_a, vec_b };
  rtx vfops[] = { mismatch_ofs, mask };

  /* If the length is exactly vlmax for the selected mode, do that.
     Otherwise, use a predicated store.  */
  if (known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
    {
      emit_move_insn (vec_a, blk_a);
      emit_move_insn (vec_b, blk_b);
      emit_vlmax_insn (code_for_pred_cmp (vmode),
		       riscv_vector::COMPARE_OP, vmsops);
      emit_vlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
		       riscv_vector::CPOP_OP, vfops);
    }
  else
    {
      if (!satisfies_constraint_K (length_in))
	length_in = force_reg (Pmode, length_in);

      rtx memmask = CONSTM1_RTX (mask_mode);

      rtx m_ops_a[] = { vec_a, memmask, blk_a };
      rtx m_ops_b[] = { vec_b, memmask, blk_b };

      emit_nonvlmax_insn (code_for_pred_mov (vmode),
			  riscv_vector::UNARY_OP_TAMA, m_ops_a, length_in);
      emit_nonvlmax_insn (code_for_pred_mov (vmode),
			  riscv_vector::UNARY_OP_TAMA, m_ops_b, length_in);

      emit_nonvlmax_insn (code_for_pred_cmp (vmode),
			  riscv_vector::COMPARE_OP, vmsops, length_in);
      emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
			  riscv_vector::CPOP_OP, vfops, length_in);
    }

  /* Mismatch_ofs is -1 if blocks match, or the offset of
     the first mismatch otherwise.  */
  rtx ltz = gen_reg_rtx (Xmode);
  emit_insn (gen_slt_3 (LT, Xmode, Xmode, ltz, mismatch_ofs, const0_rtx));
  /* mismatch_ofs += (mismatch_ofs < 0) ? 1 : 0.  */
  emit_insn (
    gen_rtx_SET (mismatch_ofs, gen_rtx_PLUS (Pmode, mismatch_ofs, ltz)));

  /* Unconditionally load the bytes at mismatch_ofs and subtract them
     to get our result.  */
  emit_insn (gen_rtx_SET (blk_a_addr,
			  gen_rtx_PLUS (Pmode, mismatch_ofs, blk_a_addr)));
  emit_insn (gen_rtx_SET (blk_b_addr,
			  gen_rtx_PLUS (Pmode, mismatch_ofs, blk_b_addr)));

  blk_a = change_address (blk_a, QImode, blk_a_addr);
  blk_b = change_address (blk_b, QImode, blk_b_addr);

  rtx byte_a = gen_reg_rtx (SImode);
  rtx byte_b = gen_reg_rtx (SImode);
  do_zero_extendqi2 (byte_a, blk_a);
  do_zero_extendqi2 (byte_b, blk_b);

  emit_insn (gen_rtx_SET (result_out,
			  gen_rtx_MINUS (SImode, byte_a, byte_b)));

  return true;
}

}