aboutsummaryrefslogtreecommitdiff
path: root/gcc/tree-dfa.c
diff options
context:
space:
mode:
authorMaciej W. Rozycki <macro@linux-mips.org>2020-12-05 18:26:24 +0000
committerMaciej W. Rozycki <macro@linux-mips.org>2020-12-05 18:26:24 +0000
commitdfb21f37fde4f9e11e9b3b6b6ea6dc9d5fbed644 (patch)
tree14d88856c2d52b48d55c91dd18c3c720f594171a /gcc/tree-dfa.c
parent7920fe3d81ece3a23360ce0d8899f1568a9037c8 (diff)
downloadgcc-dfb21f37fde4f9e11e9b3b6b6ea6dc9d5fbed644.zip
gcc-dfb21f37fde4f9e11e9b3b6b6ea6dc9d5fbed644.tar.gz
gcc-dfb21f37fde4f9e11e9b3b6b6ea6dc9d5fbed644.tar.bz2
VAX: Rationalize expression and address costs
Expression costs are required to be given in terms of COSTS_N_INSNS (n), which is defined to stand for the count of single fast instructions, and actually returns `n * 4'. The VAX backend however instead operates on naked numbers, causing an anomaly for the integer const zero rtx, where the cost given is 4 as opposed to 1 for integers in the [1:63] range, as well as -1 for comparisons. This is because the value of 0 returned by `vax_rtx_costs' is converted to COSTS_N_INSNS (1) in `pattern_cost': return cost > 0 ? cost : COSTS_N_INSNS (1); Consequently, where feasible, 1 or -1 are preferred over 0 by the middle end causing code pessimization, e.g. rather than producing this: subl2 $4,%sp movl 4(%ap),%r0 jgtr .L2 addl2 $2,%r0 .L2: ret or this: subl2 $4,%sp addl3 4(%ap),8(%ap),%r0 jlss .L6 addl2 $2,%r0 .L6: ret code is produced like this: subl2 $4,%sp movl 4(%ap),%r0 cmpl %r0,$1 jgeq .L2 addl2 $2,%r0 .L2: ret or this: subl2 $4,%sp addl3 4(%ap),8(%ap),%r0 cmpl %r0,$-1 jleq .L6 addl2 $2,%r0 .L6: ret from this: int compare_mov (int x) { if (x > 0) return x; else return x + 2; } and this: int compare_add (int x, int y) { int z; z = x + y; if (z < 0) return z; else return z + 2; } respectively, which is slower and larger both at a time. Furthermore once the backend is converted to MODE_CC this anomaly makes it usually impossible to remove redundant comparisons in the comparison elimination pass, because most VAX instructions set the condition codes as per the relation of the instruction's result to 0 and not -1. The middle end has some other assumptions as to rtx costs being given in terms of COSTS_N_INSNS, so wrap all the VAX rtx costs then as they stand into COSTS_N_INSNS invocations, effectively scaling the costs by 4 while preserving their relative values, except for the integer const zero rtx given the value of `COSTS_N_INSNS (1) / 2', half of a fast instruction (this can be further halved if needed in the future). Adjust address costs likewise so that they remain proportional to the new absolute values of rtx costs. Code size stats are as follows, collected from 17639 executables built in `check-c' GCC testing: samples average median -------------------------------------- regressions 1420 0.400% 0.195% unchanged 13811 0.000% 0.000% progressions 2408 -0.504% -0.201% -------------------------------------- total 17639 -0.037% 0.000% with a small number of outliers only (over 5% size change): old new change %change filename ---------------------------------------------------- 4991 5249 258 5.1693 981001-1.exe 2637 2777 140 5.3090 interchange-6.exe 2187 2307 120 5.4869 sprintf.x7 3969 4197 228 5.7445 pr28982a.exe 8264 8816 552 6.6795 vector-compare-1.exe 5199 5575 376 7.2321 pr28982b.exe 2113 2411 298 14.1031 20030323-1.exe 2113 2411 298 14.1031 20030323-1.exe 2113 2411 298 14.1031 20030323-1.exe so it seems we are looking good, and we have complementing reductions to compensate: old new change %change filename ---------------------------------------------------- 2919 2631 -288 -9.8663 pr57521.exe 3427 3167 -260 -7.5868 sabd_1.exe 2985 2765 -220 -7.3701 ssad-run.exe 2985 2765 -220 -7.3701 ssad-run.exe 2985 2765 -220 -7.3701 usad-run.exe 2985 2765 -220 -7.3701 usad-run.exe 4509 4253 -256 -5.6775 vshuf-v2sf.exe 4541 4285 -256 -5.6375 vshuf-v2si.exe 4673 4417 -256 -5.4782 vshuf-v2df.exe 2993 2841 -152 -5.0785 abs-2.x4 2993 2841 -152 -5.0785 abs-3.x4 This actually causes `loop-8.c' to regress: FAIL: gcc.dg/loop-8.c scan-rtl-dump-times loop2_invariant "Decided" 1 FAIL: gcc.dg/loop-8.c scan-rtl-dump-not loop2_invariant "without introducing a new temporary register" but upon a closer inspection this is a red herring. Old code looks as follows: .file "loop-8.c" .text .align 1 .globl f .type f, @function f: .word 0 subl2 $4,%sp movl 4(%ap),%r2 movl 8(%ap),%r3 movl $42,(%r2) clrl %r0 movl $42,%r1 movl %r1,%r4 jbr .L2 .L5: movl %r4,%r1 .L2: movl %r1,(%r3)[%r0] incl %r0 cmpl %r0,$100 jeql .L6 movl $42,(%r2)[%r0] bicl3 $-2,%r0,%r1 jeql .L5 movl %r0,%r1 jbr .L2 .L6: ret .size f, .-f while new one is like below: .file "loop-8.c" .text .align 1 .globl f .type f, @function f: .word 0 subl2 $4,%sp movl 4(%ap),%r2 movl $42,(%r2)+ movl 8(%ap),%r1 clrl %r0 movl $42,%r3 movzbl $100,%r4 movl %r3,%r5 jbr .L2 .L5: movl %r5,%r3 .L2: movl %r3,(%r1)+ incl %r0 cmpl %r0,%r4 jeql .L6 movl $42,(%r2)+ bicl3 $-2,%r0,%r3 jeql .L5 movl %r0,%r3 jbr .L2 .L6: ret .size f, .-f and is clearly better: not only it is smaller, but it also uses the post-increment rather than indexed addressing mode in the loop, of which the former comes for free in terms of both performance and code size while the latter causes an extra byte per operand to be produced for the index register and also incurs an execution penalty for the extra address calculation. Exclude the case from VAX testing then, as already done for some other targets and discussed with commit d242fdaec186 ("gcc.dg/loop-8.c: Skip for mmix."). gcc/ * config/vax/vax.c (vax_address_cost): Express the cost in terms of COSTS_N_INSNS. (vax_rtx_costs): Likewise. gcc/testsuite/ * gcc.dg/loop-8.c: Exclude for `vax-*-*'. * gcc.target/vax/compare-add-zero.c: New test. * gcc.target/vax/compare-mov-zero.c: New test.
Diffstat (limited to 'gcc/tree-dfa.c')
0 files changed, 0 insertions, 0 deletions