From e395166cb75a8fadd72ac24fa87c73edac0970a5 Mon Sep 17 00:00:00 2001
From: Rika Ichinose
Date: Thu, 8 Feb 2024 11:30:32 +0300
Subject: [PATCH] Check for Move overlaps in more obvious way (that also does no jumps in forward case).
---
 rtl/i386/fastmove.inc | 77 ++++++++++++++++++++-----------------------
 rtl/x86_64/x86_64.inc | 13 ++++----
 2 files changed, 41 insertions(+), 49 deletions(-)

diff --git a/rtl/i386/fastmove.inc b/rtl/i386/fastmove.inc
index 8eab5aa0ee..84b59e1844 100644
--- a/rtl/i386/fastmove.inc
+++ b/rtl/i386/fastmove.inc
@@ -7,35 +7,34 @@ procedure Move_8OrMore_Valgrind; assembler; nostackframe;
 { eax = source, edx = dest, ecx = count (ecx >= 8). If FPC_PIC: ebx pushed. }
 asm
-{$ifndef FPC_PIC}
- push %ebx
-{$endif}
- sub %edx, %eax
- jae .LForward
- mov %ecx, %ebx
- add %eax, %ebx { eax is negative => ecx+eax > 0 if regions overlap }
- jb .LBack { if no overlap, still do forward move }
+ sub %eax, %edx { edx = dest - src }
+ cmp %edx, %ecx
+ ja .LBack { count (ecx) > unsigned(dest - src) (edx) if regions overlap }

-.LForward:
{$ifdef FPC_ENABLED_CLD}
 cld
{$endif FPC_ENABLED_CLD}
 push %esi
 push %edi
- lea (%eax,%edx), %esi
- mov %edx, %edi
+ mov %eax, %esi
+ lea (%edx,%eax), %edi
 rep movsb
 pop %edi
 pop %esi
+{$ifdef FPC_PIC}
 pop %ebx
+{$endif}
 ret

.LBack:
+{$ifndef FPC_PIC}
+ push %ebx
+{$endif}
- add %ecx, %edx
+ add %ecx, %eax
.LNextb:
- dec %edx
- mov (%eax,%edx), %bl
- mov %bl, (%edx)
+ dec %eax
+ mov (%eax), %bl
+ mov %bl, (%edx,%eax)
 dec %ecx
 jnz .LNextb
 pop %ebx
@@ -77,13 +76,11 @@
{$ifndef FPC_PIC}
 push %ebx
{$endif}
- jnb .LForward { src>dest => forward move }
+ mov %eax, %ebx
+ neg %ebx
+ cmp %ebx, %ecx
+ ja .Lback { count (ecx) > unsigned(dest - src) (ebx) if regions overlap }

- mov %ecx, %ebx
- add %eax, %ebx { eax is negative => ecx+eax > 0 if regions overlap }
- jb .Lback { if no overlap, still do forward move }
-
-.LForward:
 mov %edx, %ebx { remember original dest to write first 16 bytes }
 add %edx, %ecx { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be writen separately anyway. }
 add $8, %edx
@@ -161,13 +158,11 @@
 movq -8(%eax,%ecx), %mm5
 sub %edx, %eax { eax = src - dest }
 jz .Lquit { exit if src=dest }
- jnb .LForward { src>dest => forward move }
+ mov %eax, %ebx
+ neg %ebx
+ cmp %ebx, %ecx
+ ja .Lback { count (ecx) > unsigned(dest - src) (ebx) if regions overlap }

- mov %ecx, %ebx
- add %eax, %ebx { eax is negative => ecx+eax > 0 if regions overlap }
- jb .Lback { if no overlap, still do forward move }
-
-.LForward:
 mov %edx, %ebx { remember original dest to write first 16 bytes }
 add %edx, %ecx { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be writen separately anyway. }
 add $8, %edx
@@ -237,7 +232,7 @@ end;

 {$ifndef FASTMOVE_DISABLE_SSE}
 label
- Move_8OrMore_SSE_9to16, Move_8OrMore_SSE_33OrMore, Move_8OrMore_SSE_CancelERMSBackwards;
+ Move_8OrMore_SSE_9to15, Move_8OrMore_SSE_33OrMore, Move_8OrMore_SSE_CancelERMSBackwards;

 const
 Move_NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
@@ -248,20 +243,20 @@
 procedure Move_8OrMore_SSE; assembler; nostackframe;
 const
 PrefetchDistance = 512;
 asm
- cmp $16, %ecx
- jle Move_8OrMore_SSE_9to16
+ cmp $15, %ecx
+ jle Move_8OrMore_SSE_9to15
 movups (%eax), %xmm4 { First and last 16 bytes, used both in Move_8OrMore_SSE_33OrMore and 17–32 branch. }
 movups -16(%eax,%ecx), %xmm5
 cmp $32, %ecx
 jg Move_8OrMore_SSE_33OrMore
- movups %xmm4, (%edx) { 17–32 bytes }
+ movups %xmm4, (%edx) { 16–32 bytes }
 movups %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
 pop %ebx
{$endif}
 ret

-Move_8OrMore_SSE_9to16:
+Move_8OrMore_SSE_9to15:
 movlps (%eax), %xmm0
 movlps -8(%eax,%ecx), %xmm1
 movlps %xmm0, (%edx)
@@ -271,7 +266,7 @@ Move_8OrMore_SSE_9to16:
 pop %ebx
{$endif}
 ret
- .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
+ .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }

Move_8OrMore_SSE_33OrMore:
 sub %edx, %eax { eax = src - dest }
@@ -279,13 +274,11 @@ Move_8OrMore_SSE_33OrMore:
{$ifndef FPC_PIC}
 push %ebx
{$endif}
- jnb .LForward { src>dest => forward move }
+ mov %eax, %ebx
+ neg %ebx
+ cmp %ebx, %ecx
+ ja .Lback { count (ecx) > unsigned(dest - src) (ebx) if regions overlap }

- lea -1(%ecx), %ebx
- add %eax, %ebx { eax is negative => ecx+eax > 0 if regions overlap }
- jb .Lback { if no overlap, still do forward move }
-
-.LForward:
 mov %edx, %ebx { remember original dest to write first 16 bytes }
 add %edx, %ecx { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be writen separately anyway. }
 add $16, %edx
@@ -466,15 +459,15 @@
 procedure Move_8OrMore_SSE_ERMS; assembler; nostackframe;
 const
 ErmsThreshold = 1536;
 asm
- cmp $16, %ecx
- jle Move_8OrMore_SSE_9to16
+ cmp $15, %ecx
+ jle Move_8OrMore_SSE_9to15
 cmp $ErmsThreshold, %ecx
 jae .LRepMovs
 movups (%eax), %xmm4 { Same as in Move_8OrMore_SSE. }
 movups -16(%eax,%ecx), %xmm5
 cmp $32, %ecx
 jg Move_8OrMore_SSE_33OrMore
- movups %xmm4, (%edx) { 17–32 bytes }
+ movups %xmm4, (%edx) { 16–32 bytes }
 movups %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
 pop %ebx
diff --git a/rtl/x86_64/x86_64.inc b/rtl/x86_64/x86_64.inc
index baa974b94f..850aa127d7 100644
--- a/rtl/x86_64/x86_64.inc
+++ b/rtl/x86_64/x86_64.inc
@@ -134,18 +134,17 @@ asm
 mov %r9, -8(%rdx,%r8)
.Lquit:
 ret
- .byte 0x90,0x90,0x90 { Turns .balign 16 before .Lloop32f into a no-op. }
+ .byte 102,144 { Turns .balign 16 before .Lloop32f into a no-op. }

.L33OrMore:
 sub %rdx, %rcx { rcx = src - dest }
 jz .Lquit { exit if src=dest }
- jnb .LForward { src>dest => forward move }
- mov %r8, %rax
- add %rcx, %rax { rcx is negative => r8+rcx > 0 if regions overlap }
- jb .Lback { if no overlap, still do forward move }
+ mov %rcx, %rax
+ neg %rax
+ cmp %rax, %r8
+ ja .Lback { count (r8) > unsigned(dest - src) (rax) if regions overlap }

-.LForward:
 mov %rdx, %r9 { remember original dest to write first 16 bytes }
 add %rdx, %r8 { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be writen separately anyway. }
 add $16, %rdx
@@ -217,7 +216,7 @@ asm
 mfence
 add $0x1000, %r8
 jmpq .LRestAfterNTf { go handle remaining bytes }
- .byte 0x90,0x90,0x90 { Turns .balign 16 before .Lloop32b into a no-op. }
+ .byte 102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }

 { backwards move }
.Lback:
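
The new test is the same in all five routines: compute diff = dest - src with natural unsigned wraparound, then let one unsigned comparison pick the copy direction. A backward copy is needed only when dest lands inside [src, src+count), i.e. when dest - src is below count; when src > dest the subtraction wraps to a huge unsigned value, the comparison fails, and execution falls straight through into the forward path, which is the "no jumps in forward case" of the subject line. Below is a minimal Free Pascal sketch of that rule; MoveBytes and the demo program are illustrative stand-ins of mine, not RTL code.

program MoveOverlapDemo;
{$mode objfpc}
{$pointermath on}

{ Hypothetical helper mirroring the patch's overlap test; not the RTL Move.
  diff := dest - src wraps modulo 2^bitness, so one unsigned compare
  classifies all three cases:
    src = dest -> diff = 0:    "count > diff" holds, the backward loop is a
                               harmless self-copy (the larger routines bail
                               out earlier with "jz .Lquit");
    src < dest -> diff small:  regions overlap exactly when count > diff;
    src > dest -> diff huge:   count > diff never holds, so the forward
                               path falls through without taking a jump. }
procedure MoveBytes(src, dest: PByte; count: PtrUInt);
var
  diff, i: PtrUInt;
begin
  diff := PtrUInt(dest) - PtrUInt(src); { edx on i386, rax on x86_64 }
  if count > diff then
  begin
    { overlapping, dest above src: copy highest byte first ("ja .Lback") }
    i := count;
    while i > 0 do
    begin
      Dec(i);
      dest[i] := src[i];
    end;
  end
  else
  begin
    { disjoint, or dest below src: plain forward copy ("rep movsb") }
    i := 0;
    while i < count do
    begin
      dest[i] := src[i];
      Inc(i);
    end;
  end;
end;

var
  buf: array[0..15] of Byte;
  j: Integer;
begin
  for j := 0 to 15 do
    buf[j] := j;
  MoveBytes(@buf[0], @buf[4], 12); { overlapping: a forward copy would reread
                                     already-overwritten bytes }
  for j := 0 to 15 do
    Write(buf[j], ' '); { prints: 0 1 2 3 0 1 2 3 4 5 6 7 8 9 10 11 }
  Writeln;
end.

Folding the old sign test (jae/jnb .LForward) and the old distance test (add; jb .Lback) into a single compare is what makes the removed .LForward label, and the jump to it, unnecessary.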