* Implement low level alpha blend function in win32 64bit

git-svn-id: https://svn.code.sf.net/p/lazarus-ccr/svn@2544 8e941d3f-bd1b-0410-a28a-d453659cc2b4
This commit is contained in:
blikblum 2012-10-03 00:55:46 +00:00
parent fef43dbcff
commit faa98477d0

View File

@ -1,4 +1,6 @@
{$ASMMODE INTEL}
procedure AlphaBlendLineConstant(Source, Destination: Pointer; Count: Integer; ConstantAlpha, Bias: Integer);
// Blends a line of Count pixels from Source to Destination using a constant alpha value.
@ -10,6 +12,62 @@ procedure AlphaBlendLineConstant(Source, Destination: Pointer; Count: Integer; C
asm
{$ifdef CPU64}
// RCX contains Source
// RDX contains Destination
// R8D contains Count
// R9D contains ConstantAlpha
// Bias is on the stack
//.NOFRAME
// Load XMM3 with the constant alpha value (replicate it for every component).
// Expand it to word size.
MOVD XMM3, R9D // ConstantAlpha
PUNPCKLWD XMM3, XMM3
PUNPCKLDQ XMM3, XMM3
// Load XMM5 with the bias value.
MOVD XMM5, [Bias]
PUNPCKLWD XMM5, XMM5
PUNPCKLDQ XMM5, XMM5
// Load XMM4 with 128 to allow for saturated biasing.
MOV R10D, 128
MOVD XMM4, R10D
PUNPCKLWD XMM4, XMM4
PUNPCKLDQ XMM4, XMM4
@1: // The pixel loop calculates an entire pixel in one run.
// Note: The pixel byte values are expanded into the higher bytes of a word due
// to the way unpacking works. We compensate for this with an extra shift.
MOVD XMM1, DWORD PTR [RCX] // data is unaligned
MOVD XMM2, DWORD PTR [RDX] // data is unaligned
PXOR XMM0, XMM0 // clear source pixel register for unpacking
PUNPCKLBW XMM0, XMM1{[RCX]} // unpack source pixel byte values into words
PSRLW XMM0, 8 // move higher bytes to lower bytes
PXOR XMM1, XMM1 // clear target pixel register for unpacking
PUNPCKLBW XMM1, XMM2{[RDX]} // unpack target pixel byte values into words
MOVQ XMM2, XMM1 // make a copy of the shifted values, we need them again
PSRLW XMM1, 8 // move higher bytes to lower bytes
// calculation is: target = (alpha * (source - target) + 256 * target) / 256
PSUBW XMM0, XMM1 // source - target
PMULLW XMM0, XMM3 // alpha * (source - target)
PADDW XMM0, XMM2 // add target (in shifted form)
PSRLW XMM0, 8 // divide by 256
// Bias is accounted for by conversion of range 0..255 to -128..127,
// doing a saturated add and convert back to 0..255.
PSUBW XMM0, XMM4
PADDSW XMM0, XMM5
PADDW XMM0, XMM4
PACKUSWB XMM0, XMM0 // convert words to bytes with saturation
MOVD DWORD PTR [RDX], XMM0 // store the result
@3:
ADD RCX, 4
ADD RDX, 4
DEC R8D
JNZ @1
{$else}
@ -89,6 +147,61 @@ procedure AlphaBlendLinePerPixel(Source, Destination: Pointer; Count, Bias: Inte
asm
{$ifdef CPU64}
// RCX contains Source
// RDX contains Destination
// R8D contains Count
// R9D contains Bias
//.NOFRAME
// Load XMM5 with the bias value.
MOVD XMM5, R9D // Bias
PUNPCKLWD XMM5, XMM5
PUNPCKLDQ XMM5, XMM5
// Load XMM4 with 128 to allow for saturated biasing.
MOV R10D, 128
MOVD XMM4, R10D
PUNPCKLWD XMM4, XMM4
PUNPCKLDQ XMM4, XMM4
@1: // The pixel loop calculates an entire pixel in one run.
// Note: The pixel byte values are expanded into the higher bytes of a word due
// to the way unpacking works. We compensate for this with an extra shift.
MOVD XMM1, DWORD PTR [RCX] // data is unaligned
MOVD XMM2, DWORD PTR [RDX] // data is unaligned
PXOR XMM0, XMM0 // clear source pixel register for unpacking
PUNPCKLBW XMM0, XMM1{[RCX]} // unpack source pixel byte values into words
PSRLW XMM0, 8 // move higher bytes to lower bytes
PXOR XMM1, XMM1 // clear target pixel register for unpacking
PUNPCKLBW XMM1, XMM2{[RDX]} // unpack target pixel byte values into words
MOVQ XMM2, XMM1 // make a copy of the shifted values, we need them again
PSRLW XMM1, 8 // move higher bytes to lower bytes
// Load XMM3 with the source alpha value (replicate it for every component).
// Expand it to word size.
MOVQ XMM3, XMM0
PUNPCKHWD XMM3, XMM3
PUNPCKHDQ XMM3, XMM3
// calculation is: target = (alpha * (source - target) + 256 * target) / 256
PSUBW XMM0, XMM1 // source - target
PMULLW XMM0, XMM3 // alpha * (source - target)
PADDW XMM0, XMM2 // add target (in shifted form)
PSRLW XMM0, 8 // divide by 256
// Bias is accounted for by conversion of range 0..255 to -128..127,
// doing a saturated add and convert back to 0..255.
PSUBW XMM0, XMM4
PADDSW XMM0, XMM5
PADDW XMM0, XMM4
PACKUSWB XMM0, XMM0 // convert words to bytes with saturation
MOVD DWORD PTR [RDX], XMM0 // store the result
@3:
ADD RCX, 4
ADD RDX, 4
DEC R8D
JNZ @1
{$else}
@ -168,7 +281,71 @@ procedure AlphaBlendLineMaster(Source, Destination: Pointer; Count: Integer; Con
asm
{$ifdef CPU64}
// RCX contains Source
// RDX contains Destination
// R8D contains Count
// R9D contains ConstantAlpha
// Bias is on the stack
//.SAVENV XMM6 //todo see how implement in fpc AlphaBlendLineMaster
// Load XMM3 with the constant alpha value (replicate it for every component).
// Expand it to word size.
MOVD XMM3, R9D // ConstantAlpha
PUNPCKLWD XMM3, XMM3
PUNPCKLDQ XMM3, XMM3
// Load XMM5 with the bias value.
MOV R10D, [Bias]
MOVD XMM5, R10D
PUNPCKLWD XMM5, XMM5
PUNPCKLDQ XMM5, XMM5
// Load XMM4 with 128 to allow for saturated biasing.
MOV R10D, 128
MOVD XMM4, R10D
PUNPCKLWD XMM4, XMM4
PUNPCKLDQ XMM4, XMM4
@1: // The pixel loop calculates an entire pixel in one run.
// Note: The pixel byte values are expanded into the higher bytes of a word due
// to the way unpacking works. We compensate for this with an extra shift.
MOVD XMM1, DWORD PTR [RCX] // data is unaligned
MOVD XMM2, DWORD PTR [RDX] // data is unaligned
PXOR XMM0, XMM0 // clear source pixel register for unpacking
PUNPCKLBW XMM0, XMM1{[RCX]} // unpack source pixel byte values into words
PSRLW XMM0, 8 // move higher bytes to lower bytes
PXOR XMM1, XMM1 // clear target pixel register for unpacking
PUNPCKLBW XMM1, XMM2{[RCX]} // unpack target pixel byte values into words
MOVQ XMM2, XMM1 // make a copy of the shifted values, we need them again
PSRLW XMM1, 8 // move higher bytes to lower bytes
// Load XMM6 with the source alpha value (replicate it for every component).
// Expand it to word size.
MOVQ XMM6, XMM0
PUNPCKHWD XMM6, XMM6
PUNPCKHDQ XMM6, XMM6
PMULLW XMM6, XMM3 // source alpha * master alpha
PSRLW XMM6, 8 // divide by 256
// calculation is: target = (alpha * master alpha * (source - target) + 256 * target) / 256
PSUBW XMM0, XMM1 // source - target
PMULLW XMM0, XMM6 // alpha * (source - target)
PADDW XMM0, XMM2 // add target (in shifted form)
PSRLW XMM0, 8 // divide by 256
// Bias is accounted for by conversion of range 0..255 to -128..127,
// doing a saturated add and convert back to 0..255.
PSUBW XMM0, XMM4
PADDSW XMM0, XMM5
PADDW XMM0, XMM4
PACKUSWB XMM0, XMM0 // convert words to bytes with saturation
MOVD DWORD PTR [RDX], XMM0 // store the result
@3:
ADD RCX, 4
ADD RDX, 4
DEC R8D
JNZ @1
{$else}
@ -256,6 +433,51 @@ procedure AlphaBlendLineMasterAndColor(Destination: Pointer; Count: Integer; Con
asm
{$ifdef CPU64}
// RCX contains Destination
// EDX contains Count
// R8D contains ConstantAlpha
// R9D contains Color
//.NOFRAME
// The used formula is: target = (alpha * color + (256 - alpha) * target) / 256.
// alpha * color (factor 1) and 256 - alpha (factor 2) are constant values which can be calculated in advance.
// The remaining calculation is therefore: target = (F1 + F2 * target) / 256
// Load XMM3 with the constant alpha value (replicate it for every component).
// Expand it to word size. (Every calculation here works on word sized operands.)
MOVD XMM3, R8D // ConstantAlpha
PUNPCKLWD XMM3, XMM3
PUNPCKLDQ XMM3, XMM3
// Calculate factor 2.
MOV R10D, $100
MOVD XMM2, R10D
PUNPCKLWD XMM2, XMM2
PUNPCKLDQ XMM2, XMM2
PSUBW XMM2, XMM3 // XMM2 contains now: 255 - alpha = F2
// Now calculate factor 1. Alpha is still in XMM3, but the r and b components of Color must be swapped.
BSWAP R9D // Color
ROR R9D, 8
MOVD XMM1, R9D // Load the color and convert to word sized values.
PXOR XMM4, XMM4
PUNPCKLBW XMM1, XMM4
PMULLW XMM1, XMM3 // XMM1 contains now: color * alpha = F1
@1: // The pixel loop calculates an entire pixel in one run.
MOVD XMM0, DWORD PTR [RCX]
PUNPCKLBW XMM0, XMM4
PMULLW XMM0, XMM2 // calculate F1 + F2 * target
PADDW XMM0, XMM1
PSRLW XMM0, 8 // divide by 256
PACKUSWB XMM0, XMM0 // convert words to bytes with saturation
MOVD DWORD PTR [RCX], XMM0 // store the result
ADD RCX, 4
DEC EDX
JNZ @1
{$else}
@ -421,6 +643,11 @@ var
begin
if not IsRectEmpty(R) then
begin
{$ifdef CPU64}
//avoid MasterAlpha due to incomplete AlphaBlendLineMaster. See comment in procedure
if Mode = bmMasterAlpha then
Mode := bmConstantAlpha;
{$endif}
// Note: it is tempting to optimize the special cases for constant alpha 0 and 255 by just ignoring soure
// (alpha = 0) or simply do a blit (alpha = 255). But this does not take the bias into account.
case Mode of