Repo created
This commit is contained in:
parent
81b91f4139
commit
f8c34fa5ee
22732 changed files with 4815320 additions and 2 deletions
578
TMessagesProj/jni/mozjpeg/simd/i386/jccolext-avx2.asm
Normal file
578
TMessagesProj/jni/mozjpeg/simd/i386/jccolext-avx2.asm
Normal file
|
|
@ -0,0 +1,578 @@
|
|||
;
|
||||
; jccolext.asm - colorspace conversion (AVX2)
|
||||
;
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
|
||||
; JSAMPIMAGE output_buf, JDIMENSION output_row,
|
||||
; int num_rows);
|
||||
;
|
||||
|
||||
; Argument accessors.  (b) must hold the address of the saved-ebp slot that
; the prologue captures in eax right after "push ebp": (b)+0 is the saved
; ebp, (b)+4 the return address, and the stack-passed arguments start at
; (b)+8.
%define img_width(b) (b) + 8 ; JDIMENSION img_width
|
||||
%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
|
||||
%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
|
||||
%define output_row(b) (b) + 20 ; JDIMENSION output_row
|
||||
%define num_rows(b) (b) + 24 ; int num_rows
|
||||
|
||||
; Local frame, addressed from the realigned ebp set up by the prologue:
;   [original_ebp]      - slot where the prologue stores the pre-alignment
;                         stack pointer (reloaded by "pop esp" on exit)
;   wk(0)..wk(WK_NUM-1) - YMMWORD-sized scratch slots directly below ebp,
;                         wk(0) being the lowest address
;   gotptr              - pointer-sized slot immediately below wk(0)
; wk() mentions WK_NUM before its %define; NASM expands single-line macros
; at the point of use, so the later definition is the one that applies.
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
|
||||
; ymmword wk[WK_NUM]
|
||||
%define WK_NUM 8
|
||||
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
|
||||
|
||||
; Entry point.  Builds a YMMWORD-aligned frame, loads the five stack
; arguments and falls through into .rowloop with:
;   ecx = img_width
;   esi = input_buf                      (cursor into the input row array)
;   edi = &output_buf[0][output_row]     (cursor into component 0 row array)
;   ebx = &output_buf[1][output_row]     (cursor into component 1 row array)
;   edx = &output_buf[2][output_row]     (cursor into component 2 row array)
;   eax = num_rows                       (rows still to convert)
; Jumps straight to .return, without touching any buffer, when
; img_width == 0 or num_rows <= 0.
align 32
|
||||
GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
|
||||
|
||||
EXTN(jsimd_rgb_ycc_convert_avx2):
|
||||
push ebp
|
||||
; eax now addresses the saved-ebp slot; the argument macros index from it.
mov eax, esp ; eax = original ebp
|
||||
; Reserve one slot, round esp down to a YMMWORD boundary and store the
; pre-alignment stack pointer there so the epilogue can undo the alignment.
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
; Move esp below the wk[] scratch area.
lea esp, [wk(0)]
|
||||
; NOTE(review): pushpic/get_GOT/movpic are macros from the included headers
; and are not visible here; presumably they only emit code in PIC builds --
; confirm.
pushpic eax ; make a room for GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
mov ecx, JDIMENSION [img_width(eax)]
|
||||
test ecx, ecx
|
||||
jz near .return
|
||||
|
||||
; ecx is needed as a temporary for output_row, so park img_width.
push ecx
|
||||
|
||||
mov esi, JSAMPIMAGE [output_buf(eax)]
|
||||
mov ecx, JDIMENSION [output_row(eax)]
|
||||
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
|
||||
mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
|
||||
mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
|
||||
; Step each component's row-pointer array forward to entry output_row.
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
|
||||
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
|
||||
lea edx, [edx+ecx*SIZEOF_JSAMPROW]
|
||||
|
||||
pop ecx
|
||||
|
||||
; eax is overwritten with num_rows last, after every other argument has
; been read through it.
mov esi, JSAMPARRAY [input_buf(eax)]
|
||||
mov eax, INT [num_rows(eax)]
|
||||
; Signed test: a zero or negative row count means there is nothing to do.
test eax, eax
|
||||
jle near .return
|
||||
alignx 16, 7
|
||||
; Per-row setup.  The row-array cursors, the width and the row counter are
; saved on the stack because esi/edi/ebx/edx/eax are reused below as this
; row's sample pointers and the GOT base, and ecx is consumed as the column
; counter.  The matching pops follow the column loop.
.rowloop:
|
||||
pushpic eax
|
||||
push edx
|
||||
push ebx
|
||||
push edi
|
||||
push esi
|
||||
push ecx ; col
|
||||
|
||||
; Dereference the row arrays to obtain this row's sample pointers.
mov esi, JSAMPROW [esi] ; inptr
|
||||
mov edi, JSAMPROW [edi] ; outptr0
|
||||
mov ebx, JSAMPROW [ebx] ; outptr1
|
||||
mov edx, JSAMPROW [edx] ; outptr2
|
||||
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
||||
|
||||
; ecx counts the pixels still to convert in this row.  One full iteration
; handles SIZEOF_YMMWORD pixels; a shorter row falls through into the
; partial-load code that follows.
cmp ecx, byte SIZEOF_YMMWORD
|
||||
jae near .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
; ---- 3 bytes per pixel ----
; .column_ld1 .. .column_ld64 load the final, partial chunk of a row (fewer
; than SIZEOF_YMMWORD pixels).  The pixel count is turned into a byte count
; and each set bit of that count, from least to most significant, loads that
; many bytes from the tail of the remaining data.  Pieces gathered earlier
; (higher addresses) are shifted or moved up before the next piece is merged
; in below them, so the bytes end up in ymmA, ymmF, ymmB in memory order.
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
.column_ld1:
|
||||
; eax (GOT base) and edx (outptr2) are borrowed as scratch for the
; sub-dword loads and restored at .column_ld4.
push eax
|
||||
push edx
|
||||
lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
|
||||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_BYTE
|
||||
movzx eax, byte [esi+ecx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
movzx edx, word [esi+ecx]
|
||||
shl eax, WORD_BIT
|
||||
or eax, edx
|
||||
.column_ld4:
|
||||
; NOTE(review): eax is not cleared when the byte-sized load above is
; skipped, so the bytes of xmmA above the loaded data can hold leftover
; register contents.  Presumably harmless because those byte lanes lie past
; the last valid pixel of the row -- confirm.
vmovd xmmA, eax
|
||||
pop edx
|
||||
|
||||
pop eax
|
||||
test cl, SIZEOF_DWORD
|
||||
jz short .column_ld8
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
vmovd xmmF, XMM_DWORD [esi+ecx]
|
||||
vpslldq xmmA, xmmA, SIZEOF_DWORD
|
||||
vpor xmmA, xmmA, xmmF
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_MMWORD
|
||||
jz short .column_ld16
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
vmovq xmmB, XMM_MMWORD [esi+ecx]
|
||||
vpslldq xmmA, xmmA, SIZEOF_MMWORD
|
||||
vpor xmmA, xmmA, xmmB
|
||||
.column_ld16:
|
||||
test cl, SIZEOF_XMMWORD
|
||||
jz short .column_ld32
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
; 16 new bytes go into the low lane of ymmB (a VEX-encoded xmm load clears
; the upper lane).  vperm2i128 with selector 1 swaps the two 128-bit lanes
; of ymmA, moving the bytes gathered so far into the upper lane before the
; two registers are merged.
vmovdqu xmmB, XMM_MMWORD [esi+ecx]
|
||||
vperm2i128 ymmA, ymmA, ymmA, 1
|
||||
vpor ymmA, ymmB
|
||||
.column_ld32:
|
||||
test cl, SIZEOF_YMMWORD
|
||||
jz short .column_ld64
|
||||
sub ecx, byte SIZEOF_YMMWORD
|
||||
; The tail gathered so far becomes the second register; the first
; YMMWORD of the chunk is loaded whole.
vmovdqa ymmF, ymmA
|
||||
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
|
||||
.column_ld64:
|
||||
test cl, 2*SIZEOF_YMMWORD
|
||||
; mov leaves the flags of the test above intact.  Setting ecx to exactly
; SIZEOF_YMMWORD makes the subtraction at the end of the column loop reach
; zero, so this chunk is the last one of the row.
mov ecx, SIZEOF_YMMWORD
|
||||
jz short .rgb_ycc_cnv
|
||||
; The tail gathered so far becomes the third register; the first two
; YMMWORDs of the chunk are loaded whole.
vmovdqa ymmB, ymmA
|
||||
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
|
||||
jmp short .rgb_ycc_cnv
|
||||
alignx 16, 7
|
||||
|
||||
; Full chunk: SIZEOF_YMMWORD pixels, i.e. three YMMWORDs of packed samples.
.columnloop:
|
||||
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
|
||||
|
||||
; Deinterleave the packed pixels.  In the layout comments each
; two-character cell is one byte: the first character is the component
; index within the pixel (0..2), the second is the pixel's column within
; this chunk (0-9, then A-V).  After the shuffle ymmA/ymmC/ymmE hold
; components 0/1/2 of the even columns and ymmB/ymmD/ymmF those of the odd
; columns, each sample zero-extended to 16 bits.
.rgb_ycc_cnv:
|
||||
; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
|
||||
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
|
||||
; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
|
||||
|
||||
vmovdqu ymmC, ymmA
|
||||
vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
|
||||
vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
|
||||
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
|
||||
vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
|
||||
; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
|
||||
|
||||
vmovdqa ymmG, ymmA
|
||||
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
|
||||
; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
|
||||
vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
|
||||
; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
|
||||
|
||||
vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
|
||||
; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
|
||||
vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
|
||||
; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
|
||||
|
||||
vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
|
||||
; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
|
||||
vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
|
||||
; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
|
||||
|
||||
vmovdqa ymmD, ymmA
|
||||
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
|
||||
; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
|
||||
vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
|
||||
; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
|
||||
|
||||
vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
|
||||
; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
|
||||
vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
|
||||
; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
|
||||
|
||||
vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
|
||||
; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
|
||||
vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
|
||||
; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
|
||||
|
||||
vmovdqa ymmE, ymmA
|
||||
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
|
||||
; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
|
||||
vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
|
||||
; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
|
||||
|
||||
vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
|
||||
; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
|
||||
vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
|
||||
; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
|
||||
|
||||
vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
|
||||
; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
|
||||
vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
|
||||
; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
|
||||
|
||||
; ymmH is the zero source for widening the bytes to 16-bit words below.
vpxor ymmH, ymmH, ymmH
|
||||
|
||||
vmovdqa ymmC, ymmA
|
||||
vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
|
||||
vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
|
||||
|
||||
vmovdqa ymmB, ymmE
|
||||
vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
|
||||
vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
|
||||
|
||||
vmovdqa ymmF, ymmD
|
||||
vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
|
||||
vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
||||
; ---- 4 bytes per pixel ----
; Partial final chunk of a row (fewer than SIZEOF_YMMWORD pixels).  Here ecx
; stays a pixel count; addresses scale it by RGB_PIXELSIZE.  Each set bit of
; the count, from least to most significant, loads that many pixels from the
; tail of the remaining data, and previously gathered pixels are moved up so
; the data ends up in ymmA, ymmF, ymmE, ymmH in memory order.
; NOTE(review): the tested bits are 1, 2, 4, 8 and 16 pixels provided
; SIZEOF_XMMWORD is 16 (defined outside this view) -- confirm.
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
.column_ld1:
|
||||
test cl, SIZEOF_XMMWORD/16
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_XMMWORD/16
|
||||
vmovd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_XMMWORD/8
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_XMMWORD/8
|
||||
vmovq xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
vpslldq xmmA, xmmA, SIZEOF_MMWORD
|
||||
vpor xmmA, xmmA, xmmF
|
||||
.column_ld4:
|
||||
test cl, SIZEOF_XMMWORD/4
|
||||
jz short .column_ld8
|
||||
sub ecx, byte SIZEOF_XMMWORD/4
|
||||
; Copy the pixels gathered so far to ymmF and swap its 128-bit lanes so
; they sit in the upper lane, load the new pixels into the low lane of
; ymmA, then merge the two.
vmovdqa xmmF, xmmA
|
||||
vperm2i128 ymmF, ymmF, ymmF, 1
|
||||
vmovdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
vpor ymmA, ymmA, ymmF
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_XMMWORD/2
|
||||
jz short .column_ld16
|
||||
sub ecx, byte SIZEOF_XMMWORD/2
|
||||
vmovdqa ymmF, ymmA
|
||||
vmovdqu ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld16:
|
||||
test cl, SIZEOF_XMMWORD
|
||||
; mov leaves the flags of the test above intact.  Setting ecx to exactly
; SIZEOF_YMMWORD makes the subtraction at the end of the column loop reach
; zero, so this chunk is the last one of the row.
mov ecx, SIZEOF_YMMWORD
|
||||
jz short .rgb_ycc_cnv
|
||||
; The pixels gathered so far become the third and fourth registers; the
; first two YMMWORDs of the chunk are loaded whole.
vmovdqa ymmE, ymmA
|
||||
vmovdqa ymmH, ymmF
|
||||
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
|
||||
jmp short .rgb_ycc_cnv
|
||||
alignx 16, 7
|
||||
|
||||
; Full chunk: SIZEOF_YMMWORD pixels, i.e. four YMMWORDs of packed samples.
.columnloop:
|
||||
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
|
||||
|
||||
; Deinterleave the packed pixels.  In the layout comments each
; two-character cell is one byte: the first character is the component
; index within the pixel (0..3), the second is the pixel's column within
; this chunk (0-9, then A-V).  After the shuffle ymmA/ymmC/ymmE/ymmG hold
; components 0..3 of the even columns and ymmB/ymmD/ymmF/ymmH those of the
; odd columns, each sample zero-extended to 16 bits.
.rgb_ycc_cnv:
|
||||
; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
|
||||
; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||
; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
|
||||
; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||
; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
|
||||
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
|
||||
; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
|
||||
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
|
||||
|
||||
vmovdqa ymmB, ymmA
|
||||
vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
|
||||
; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
|
||||
vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
|
||||
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
|
||||
|
||||
vmovdqa ymmB, ymmF
|
||||
vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
|
||||
; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
|
||||
vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
|
||||
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
|
||||
|
||||
vmovdqa ymmD, ymmA
|
||||
vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
|
||||
; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
|
||||
vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
|
||||
; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
|
||||
|
||||
vmovdqa ymmC, ymmF
|
||||
vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
|
||||
; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
|
||||
vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
|
||||
; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
|
||||
|
||||
vmovdqa ymmB, ymmA
|
||||
vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
|
||||
; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
|
||||
vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
|
||||
; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
|
||||
|
||||
vmovdqa ymmG, ymmD
|
||||
vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
|
||||
; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
|
||||
vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
|
||||
; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
|
||||
|
||||
vmovdqa ymmE, ymmA
|
||||
vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
|
||||
; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
|
||||
vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
|
||||
; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
|
||||
|
||||
vmovdqa ymmH, ymmB
|
||||
vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
|
||||
; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
|
||||
vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
|
||||
; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
|
||||
|
||||
; ymmF is the zero source for widening the bytes to 16-bit words below.
vpxor ymmF, ymmF, ymmF
|
||||
|
||||
vmovdqa ymmC, ymmA
|
||||
vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
|
||||
vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
|
||||
|
||||
vmovdqa ymmD, ymmB
|
||||
vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
|
||||
vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
|
||||
|
||||
vmovdqa ymmG, ymmE
|
||||
vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
|
||||
vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
|
||||
|
||||
; ymmF (the zero register) is itself a destination here, so the last pair
; is widened differently: the bytes of ymmH are placed in the high byte of
; each word (interleaved after zero for ymmF, duplicated for ymmH) and the
; logical right shift by BYTE_BIT then zero-extends them.
vpunpcklbw ymmF, ymmF, ymmH
|
||||
vpunpckhbw ymmH, ymmH, ymmH
|
||||
vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
|
||||
vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
; Fixed-point colour arithmetic for one chunk of SIZEOF_YMMWORD pixels.
; On entry ymm0..ymm5 hold the 16-bit samples listed below.
; NOTE(review): which of ymmA..ymmH alias ymm0..ymm7 is decided by the
; included headers (not visible here) according to the pixel format.
; Scratch usage: wk(0..3) keep RE, RO, BE, BO; wk(4..5) and wk(6..7) keep the
; R/G part of the Y sum for the odd and even columns respectively.
; Results are stored to [ebx] (Cb), [edi] (Y) and [edx] (Cr), one YMMWORD
; each.
; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
|
||||
; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
|
||||
|
||||
; (Original)
|
||||
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||
;
|
||||
; (This implementation)
|
||||
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||
|
||||
vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=RE
|
||||
vmovdqa YMMWORD [wk(1)], ymm1 ; wk(1)=RO
|
||||
vmovdqa YMMWORD [wk(2)], ymm4 ; wk(2)=BE
|
||||
vmovdqa YMMWORD [wk(3)], ymm5 ; wk(3)=BO
|
||||
|
||||
; --- Odd columns: R/G terms of Y and Cb ---
; R and G words are interleaved so that each vpmaddwd produces
; R*c1 + G*c2 as one 32-bit sum per pixel.
vmovdqa ymm6, ymm1
|
||||
vpunpcklwd ymm1, ymm1, ymm3
|
||||
vpunpckhwd ymm6, ymm6, ymm3
|
||||
vmovdqa ymm7, ymm1
|
||||
vmovdqa ymm4, ymm6
|
||||
vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF016_MF033)] ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
|
||||
vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)] ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
|
||||
|
||||
vmovdqa YMMWORD [wk(4)], ymm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
vmovdqa YMMWORD [wk(5)], ymm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
; Interleaving zero (low word) with B (high word) gives B << 16 in each
; dword; the shift right by one leaves B * 2^15, which the comments below
; call B*FIX(0.500).  NOTE(review): that reading assumes SCALEBITS is 16 --
; confirm.
vpxor ymm1, ymm1, ymm1
|
||||
vpxor ymm6, ymm6, ymm6
|
||||
vpunpcklwd ymm1, ymm1, ymm5 ; ymm1=BOL
|
||||
vpunpckhwd ymm6, ymm6, ymm5 ; ymm6=BOH
|
||||
vpsrld ymm1, ymm1, 1 ; ymm1=BOL*FIX(0.500)
|
||||
vpsrld ymm6, ymm6, 1 ; ymm6=BOH*FIX(0.500)
|
||||
|
||||
vmovdqa ymm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm5=[PD_ONEHALFM1_CJ]
|
||||
|
||||
; Sum the three terms, add the PD_ONEHALFM1_CJ constant, drop SCALEBITS
; fraction bits and narrow the dwords back to words.
vpaddd ymm7, ymm7, ymm1
|
||||
vpaddd ymm4, ymm4, ymm6
|
||||
vpaddd ymm7, ymm7, ymm5
|
||||
vpaddd ymm4, ymm4, ymm5
|
||||
vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CbOL
|
||||
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbOH
|
||||
vpackssdw ymm7, ymm7, ymm4 ; ymm7=CbO
|
||||
|
||||
vmovdqa ymm1, YMMWORD [wk(2)] ; ymm1=BE
|
||||
|
||||
; --- Even columns: R/G terms of Y and Cb (same steps as above) ---
vmovdqa ymm6, ymm0
|
||||
vpunpcklwd ymm0, ymm0, ymm2
|
||||
vpunpckhwd ymm6, ymm6, ymm2
|
||||
vmovdqa ymm5, ymm0
|
||||
vmovdqa ymm4, ymm6
|
||||
vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF016_MF033)] ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
|
||||
vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)] ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
|
||||
|
||||
vmovdqa YMMWORD [wk(6)], ymm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
vmovdqa YMMWORD [wk(7)], ymm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
vpxor ymm0, ymm0, ymm0
|
||||
vpxor ymm6, ymm6, ymm6
|
||||
vpunpcklwd ymm0, ymm0, ymm1 ; ymm0=BEL
|
||||
vpunpckhwd ymm6, ymm6, ymm1 ; ymm6=BEH
|
||||
vpsrld ymm0, ymm0, 1 ; ymm0=BEL*FIX(0.500)
|
||||
vpsrld ymm6, ymm6, 1 ; ymm6=BEH*FIX(0.500)
|
||||
|
||||
vmovdqa ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm1=[PD_ONEHALFM1_CJ]
|
||||
|
||||
vpaddd ymm5, ymm5, ymm0
|
||||
vpaddd ymm4, ymm4, ymm6
|
||||
vpaddd ymm5, ymm5, ymm1
|
||||
vpaddd ymm4, ymm4, ymm1
|
||||
vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CbEL
|
||||
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbEH
|
||||
vpackssdw ymm5, ymm5, ymm4 ; ymm5=CbE
|
||||
|
||||
; Move the odd-column results into the high byte of each word and OR in
; the even-column results (low byte), giving even/odd byte pairs in memory
; order.
vpsllw ymm7, ymm7, BYTE_BIT
|
||||
vpor ymm5, ymm5, ymm7 ; ymm5=Cb
|
||||
vmovdqu YMMWORD [ebx], ymm5 ; Save Cb
|
||||
|
||||
vmovdqa ymm0, YMMWORD [wk(3)] ; ymm0=BO
|
||||
vmovdqa ymm6, YMMWORD [wk(2)] ; ymm6=BE
|
||||
vmovdqa ymm1, YMMWORD [wk(1)] ; ymm1=RO
|
||||
|
||||
; --- Odd columns: B/G terms of Y and Cr ---
vmovdqa ymm4, ymm0
|
||||
vpunpcklwd ymm0, ymm0, ymm3
|
||||
vpunpckhwd ymm4, ymm4, ymm3
|
||||
vmovdqa ymm7, ymm0
|
||||
vmovdqa ymm5, ymm4
|
||||
vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
||||
vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
||||
vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF008_MF041)] ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
|
||||
vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)] ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
|
||||
|
||||
vmovdqa ymm3, [GOTOFF(eax,PD_ONEHALF)] ; ymm3=[PD_ONEHALF]
|
||||
|
||||
; Complete Y for the odd columns by adding the R/G part saved in wk(4..5).
vpaddd ymm0, ymm0, YMMWORD [wk(4)]
|
||||
vpaddd ymm4, ymm4, YMMWORD [wk(5)]
|
||||
vpaddd ymm0, ymm0, ymm3
|
||||
vpaddd ymm4, ymm4, ymm3
|
||||
vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
|
||||
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
|
||||
vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
|
||||
|
||||
vpxor ymm3, ymm3, ymm3
|
||||
vpxor ymm4, ymm4, ymm4
|
||||
vpunpcklwd ymm3, ymm3, ymm1 ; ymm3=ROL
|
||||
vpunpckhwd ymm4, ymm4, ymm1 ; ymm4=ROH
|
||||
vpsrld ymm3, ymm3, 1 ; ymm3=ROL*FIX(0.500)
|
||||
vpsrld ymm4, ymm4, 1 ; ymm4=ROH*FIX(0.500)
|
||||
|
||||
vmovdqa ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm1=[PD_ONEHALFM1_CJ]
|
||||
|
||||
vpaddd ymm7, ymm7, ymm3
|
||||
vpaddd ymm5, ymm5, ymm4
|
||||
vpaddd ymm7, ymm7, ymm1
|
||||
vpaddd ymm5, ymm5, ymm1
|
||||
vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CrOL
|
||||
vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrOH
|
||||
vpackssdw ymm7, ymm7, ymm5 ; ymm7=CrO
|
||||
|
||||
vmovdqa ymm3, YMMWORD [wk(0)] ; ymm3=RE
|
||||
|
||||
; --- Even columns: B/G terms of Y and Cr ---
vmovdqa ymm4, ymm6
|
||||
vpunpcklwd ymm6, ymm6, ymm2
|
||||
vpunpckhwd ymm4, ymm4, ymm2
|
||||
vmovdqa ymm1, ymm6
|
||||
vmovdqa ymm5, ymm4
|
||||
vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
||||
vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
||||
vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_MF008_MF041)] ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
|
||||
vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)] ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
|
||||
|
||||
vmovdqa ymm2, [GOTOFF(eax,PD_ONEHALF)] ; ymm2=[PD_ONEHALF]
|
||||
|
||||
; Complete Y for the even columns by adding the R/G part saved in wk(6..7).
vpaddd ymm6, ymm6, YMMWORD [wk(6)]
|
||||
vpaddd ymm4, ymm4, YMMWORD [wk(7)]
|
||||
vpaddd ymm6, ymm6, ymm2
|
||||
vpaddd ymm4, ymm4, ymm2
|
||||
vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
|
||||
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
|
||||
vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
|
||||
|
||||
; Odd results to the high byte, even results to the low byte of each word.
vpsllw ymm0, ymm0, BYTE_BIT
|
||||
vpor ymm6, ymm6, ymm0 ; ymm6=Y
|
||||
vmovdqu YMMWORD [edi], ymm6 ; Save Y
|
||||
|
||||
vpxor ymm2, ymm2, ymm2
|
||||
vpxor ymm4, ymm4, ymm4
|
||||
vpunpcklwd ymm2, ymm2, ymm3 ; ymm2=REL
|
||||
vpunpckhwd ymm4, ymm4, ymm3 ; ymm4=REH
|
||||
vpsrld ymm2, ymm2, 1 ; ymm2=REL*FIX(0.500)
|
||||
vpsrld ymm4, ymm4, 1 ; ymm4=REH*FIX(0.500)
|
||||
|
||||
vmovdqa ymm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm0=[PD_ONEHALFM1_CJ]
|
||||
|
||||
vpaddd ymm1, ymm1, ymm2
|
||||
vpaddd ymm5, ymm5, ymm4
|
||||
vpaddd ymm1, ymm1, ymm0
|
||||
vpaddd ymm5, ymm5, ymm0
|
||||
vpsrld ymm1, ymm1, SCALEBITS ; ymm1=CrEL
|
||||
vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrEH
|
||||
vpackssdw ymm1, ymm1, ymm5 ; ymm1=CrE
|
||||
|
||||
; Odd results to the high byte, even results to the low byte of each word.
vpsllw ymm7, ymm7, BYTE_BIT
|
||||
vpor ymm1, ymm1, ymm7 ; ymm1=Cr
|
||||
vmovdqu YMMWORD [edx], ymm1 ; Save Cr
|
||||
|
||||
; Advance to the next chunk.  While at least SIZEOF_YMMWORD pixels remain
; the full-width loop repeats; a non-zero smaller remainder is finished by
; the partial-load path at .column_ld1.
sub ecx, byte SIZEOF_YMMWORD
|
||||
add esi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
|
||||
add edi, byte SIZEOF_YMMWORD ; outptr0
|
||||
add ebx, byte SIZEOF_YMMWORD ; outptr1
|
||||
add edx, byte SIZEOF_YMMWORD ; outptr2
|
||||
cmp ecx, byte SIZEOF_YMMWORD
|
||||
jae near .columnloop
|
||||
test ecx, ecx
|
||||
jnz near .column_ld1
|
||||
|
||||
; Row finished: restore the width, the row-array cursors and the row counter
; saved at the top of .rowloop (popped in reverse push order), then step
; every cursor to the next row pointer.
pop ecx ; col
|
||||
pop esi
|
||||
pop edi
|
||||
pop ebx
|
||||
pop edx
|
||||
poppic eax
|
||||
|
||||
add esi, byte SIZEOF_JSAMPROW ; input_buf
|
||||
add edi, byte SIZEOF_JSAMPROW
|
||||
add ebx, byte SIZEOF_JSAMPROW
|
||||
add edx, byte SIZEOF_JSAMPROW
|
||||
; jg consumes the flags set by dec: loop while rows remain.
dec eax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
; Common exit, also reached directly from the prologue when there is
; nothing to convert.  vzeroupper clears the upper halves of the ymm
; registers before control returns to the caller.  The three pops mirror the
; prologue's pushes of ebx/esi/edi; reloading esp from ebp then discards the
; wk[] area and everything below it, and "pop esp" reloads the pre-alignment
; stack pointer that the prologue stored at [ebp], leaving the saved ebp on
; top of the stack for the final pop.
.return:
|
||||
vzeroupper
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
476
TMessagesProj/jni/mozjpeg/simd/i386/jccolext-mmx.asm
Normal file
476
TMessagesProj/jni/mozjpeg/simd/i386/jccolext-mmx.asm
Normal file
|
|
@ -0,0 +1,476 @@
|
|||
;
|
||||
; jccolext.asm - colorspace conversion (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_rgb_ycc_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
|
||||
; JSAMPIMAGE output_buf, JDIMENSION output_row,
|
||||
; int num_rows);
|
||||
;
|
||||
|
||||
%define img_width(b) (b) + 8 ; JDIMENSION img_width
|
||||
%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
|
||||
%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
|
||||
%define output_row(b) (b) + 20 ; JDIMENSION output_row
|
||||
%define num_rows(b) (b) + 24 ; int num_rows
|
||||
|
||||
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
|
||||
; mmword wk[WK_NUM]
|
||||
%define WK_NUM 8
|
||||
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_mmx)
|
||||
|
||||
EXTN(jsimd_rgb_ycc_convert_mmx):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic eax ; make a room for GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
mov ecx, JDIMENSION [img_width(eax)] ; num_cols
|
||||
test ecx, ecx
|
||||
jz near .return
|
||||
|
||||
push ecx
|
||||
|
||||
mov esi, JSAMPIMAGE [output_buf(eax)]
|
||||
mov ecx, JDIMENSION [output_row(eax)]
|
||||
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
|
||||
mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
|
||||
mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
|
||||
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
|
||||
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
|
||||
lea edx, [edx+ecx*SIZEOF_JSAMPROW]
|
||||
|
||||
pop ecx
|
||||
|
||||
mov esi, JSAMPARRAY [input_buf(eax)]
|
||||
mov eax, INT [num_rows(eax)]
|
||||
test eax, eax
|
||||
jle near .return
|
||||
alignx 16, 7
|
||||
.rowloop:
|
||||
pushpic eax
|
||||
push edx
|
||||
push ebx
|
||||
push edi
|
||||
push esi
|
||||
push ecx ; col
|
||||
|
||||
mov esi, JSAMPROW [esi] ; inptr
|
||||
mov edi, JSAMPROW [edi] ; outptr0
|
||||
mov ebx, JSAMPROW [ebx] ; outptr1
|
||||
mov edx, JSAMPROW [edx] ; outptr2
|
||||
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
||||
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jae short .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
.column_ld1:
|
||||
push eax
|
||||
push edx
|
||||
lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
|
||||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_BYTE
|
||||
xor eax, eax
|
||||
mov al, byte [esi+ecx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
xor edx, edx
|
||||
mov dx, word [esi+ecx]
|
||||
shl eax, WORD_BIT
|
||||
or eax, edx
|
||||
.column_ld4:
|
||||
movd mmA, eax
|
||||
pop edx
|
||||
pop eax
|
||||
test cl, SIZEOF_DWORD
|
||||
jz short .column_ld8
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
movd mmG, dword [esi+ecx]
|
||||
psllq mmA, DWORD_BIT
|
||||
por mmA, mmG
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_MMWORD
|
||||
jz short .column_ld16
|
||||
movq mmG, mmA
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
mov ecx, SIZEOF_MMWORD
|
||||
jmp short .rgb_ycc_cnv
|
||||
.column_ld16:
|
||||
test cl, 2*SIZEOF_MMWORD
|
||||
mov ecx, SIZEOF_MMWORD
|
||||
jz short .rgb_ycc_cnv
|
||||
movq mmF, mmA
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
jmp short .rgb_ycc_cnv
|
||||
alignx 16, 7
|
||||
|
||||
.columnloop:
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
|
||||
|
||||
.rgb_ycc_cnv:
|
||||
; mmA=(00 10 20 01 11 21 02 12)
|
||||
; mmG=(22 03 13 23 04 14 24 05)
|
||||
; mmF=(15 25 06 16 26 07 17 27)
|
||||
|
||||
movq mmD, mmA
|
||||
psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
|
||||
psrlq mmD, 4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
|
||||
|
||||
punpckhbw mmA, mmG ; mmA=(00 04 10 14 20 24 01 05)
|
||||
psllq mmG, 4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
|
||||
|
||||
punpcklbw mmD, mmF ; mmD=(11 15 21 25 02 06 12 16)
|
||||
punpckhbw mmG, mmF ; mmG=(22 26 03 07 13 17 23 27)
|
||||
|
||||
movq mmE, mmA
|
||||
psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
|
||||
psrlq mmE, 4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
|
||||
|
||||
punpckhbw mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
|
||||
psllq mmD, 4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
|
||||
|
||||
punpcklbw mmE, mmG ; mmE=(20 22 24 26 01 03 05 07)
|
||||
punpckhbw mmD, mmG ; mmD=(11 13 15 17 21 23 25 27)
|
||||
|
||||
pxor mmH, mmH
|
||||
|
||||
movq mmC, mmA
|
||||
punpcklbw mmA, mmH ; mmA=(00 02 04 06)
|
||||
punpckhbw mmC, mmH ; mmC=(10 12 14 16)
|
||||
|
||||
movq mmB, mmE
|
||||
punpcklbw mmE, mmH ; mmE=(20 22 24 26)
|
||||
punpckhbw mmB, mmH ; mmB=(01 03 05 07)
|
||||
|
||||
movq mmF, mmD
|
||||
punpcklbw mmD, mmH ; mmD=(11 13 15 17)
|
||||
punpckhbw mmF, mmH ; mmF=(21 23 25 27)
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
.column_ld1:
|
||||
test cl, SIZEOF_MMWORD/8
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_MMWORD/8
|
||||
movd mmA, dword [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_MMWORD/4
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_MMWORD/4
|
||||
movq mmF, mmA
|
||||
movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld4:
|
||||
test cl, SIZEOF_MMWORD/2
|
||||
mov ecx, SIZEOF_MMWORD
|
||||
jz short .rgb_ycc_cnv
|
||||
movq mmD, mmA
|
||||
movq mmC, mmF
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
jmp short .rgb_ycc_cnv
|
||||
alignx 16, 7
|
||||
|
||||
.columnloop:
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
|
||||
movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
|
||||
|
||||
.rgb_ycc_cnv:
|
||||
; mmA=(00 10 20 30 01 11 21 31)
|
||||
; mmF=(02 12 22 32 03 13 23 33)
|
||||
; mmD=(04 14 24 34 05 15 25 35)
|
||||
; mmC=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movq mmB, mmA
|
||||
punpcklbw mmA, mmF ; mmA=(00 02 10 12 20 22 30 32)
|
||||
punpckhbw mmB, mmF ; mmB=(01 03 11 13 21 23 31 33)
|
||||
|
||||
movq mmG, mmD
|
||||
punpcklbw mmD, mmC ; mmD=(04 06 14 16 24 26 34 36)
|
||||
punpckhbw mmG, mmC ; mmG=(05 07 15 17 25 27 35 37)
|
||||
|
||||
movq mmE, mmA
|
||||
punpcklwd mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
|
||||
punpckhwd mmE, mmD ; mmE=(20 22 24 26 30 32 34 36)
|
||||
|
||||
movq mmH, mmB
|
||||
punpcklwd mmB, mmG ; mmB=(01 03 05 07 11 13 15 17)
|
||||
punpckhwd mmH, mmG ; mmH=(21 23 25 27 31 33 35 37)
|
||||
|
||||
pxor mmF, mmF
|
||||
|
||||
movq mmC, mmA
|
||||
punpcklbw mmA, mmF ; mmA=(00 02 04 06)
|
||||
punpckhbw mmC, mmF ; mmC=(10 12 14 16)
|
||||
|
||||
movq mmD, mmB
|
||||
punpcklbw mmB, mmF ; mmB=(01 03 05 07)
|
||||
punpckhbw mmD, mmF ; mmD=(11 13 15 17)
|
||||
|
||||
movq mmG, mmE
|
||||
punpcklbw mmE, mmF ; mmE=(20 22 24 26)
|
||||
punpckhbw mmG, mmF ; mmG=(30 32 34 36)
|
||||
|
||||
punpcklbw mmF, mmH
|
||||
punpckhbw mmH, mmH
|
||||
psrlw mmF, BYTE_BIT ; mmF=(21 23 25 27)
|
||||
psrlw mmH, BYTE_BIT ; mmH=(31 33 35 37)
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
|
||||
; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
|
||||
|
||||
; (Original)
|
||||
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||
;
|
||||
; (This implementation)
|
||||
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||
|
||||
movq MMWORD [wk(0)], mm0 ; wk(0)=RE
|
||||
movq MMWORD [wk(1)], mm1 ; wk(1)=RO
|
||||
movq MMWORD [wk(2)], mm4 ; wk(2)=BE
|
||||
movq MMWORD [wk(3)], mm5 ; wk(3)=BO
|
||||
|
||||
movq mm6, mm1
|
||||
punpcklwd mm1, mm3
|
||||
punpckhwd mm6, mm3
|
||||
movq mm7, mm1
|
||||
movq mm4, mm6
|
||||
pmaddwd mm1, [GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
pmaddwd mm7, [GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
|
||||
pmaddwd mm4, [GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
|
||||
|
||||
movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
pxor mm1, mm1
|
||||
pxor mm6, mm6
|
||||
punpcklwd mm1, mm5 ; mm1=BOL
|
||||
punpckhwd mm6, mm5 ; mm6=BOH
|
||||
psrld mm1, 1 ; mm1=BOL*FIX(0.500)
|
||||
psrld mm6, 1 ; mm6=BOH*FIX(0.500)
|
||||
|
||||
movq mm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd mm7, mm1
|
||||
paddd mm4, mm6
|
||||
paddd mm7, mm5
|
||||
paddd mm4, mm5
|
||||
psrld mm7, SCALEBITS ; mm7=CbOL
|
||||
psrld mm4, SCALEBITS ; mm4=CbOH
|
||||
packssdw mm7, mm4 ; mm7=CbO
|
||||
|
||||
movq mm1, MMWORD [wk(2)] ; mm1=BE
|
||||
|
||||
movq mm6, mm0
|
||||
punpcklwd mm0, mm2
|
||||
punpckhwd mm6, mm2
|
||||
movq mm5, mm0
|
||||
movq mm4, mm6
|
||||
pmaddwd mm0, [GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
pmaddwd mm5, [GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
|
||||
pmaddwd mm4, [GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
|
||||
|
||||
movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
pxor mm0, mm0
|
||||
pxor mm6, mm6
|
||||
punpcklwd mm0, mm1 ; mm0=BEL
|
||||
punpckhwd mm6, mm1 ; mm6=BEH
|
||||
psrld mm0, 1 ; mm0=BEL*FIX(0.500)
|
||||
psrld mm6, 1 ; mm6=BEH*FIX(0.500)
|
||||
|
||||
movq mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd mm5, mm0
|
||||
paddd mm4, mm6
|
||||
paddd mm5, mm1
|
||||
paddd mm4, mm1
|
||||
psrld mm5, SCALEBITS ; mm5=CbEL
|
||||
psrld mm4, SCALEBITS ; mm4=CbEH
|
||||
packssdw mm5, mm4 ; mm5=CbE
|
||||
|
||||
psllw mm7, BYTE_BIT
|
||||
por mm5, mm7 ; mm5=Cb
|
||||
movq MMWORD [ebx], mm5 ; Save Cb
|
||||
|
||||
movq mm0, MMWORD [wk(3)] ; mm0=BO
|
||||
movq mm6, MMWORD [wk(2)] ; mm6=BE
|
||||
movq mm1, MMWORD [wk(1)] ; mm1=RO
|
||||
|
||||
movq mm4, mm0
|
||||
punpcklwd mm0, mm3
|
||||
punpckhwd mm4, mm3
|
||||
movq mm7, mm0
|
||||
movq mm5, mm4
|
||||
pmaddwd mm0, [GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
||||
pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
||||
pmaddwd mm7, [GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
|
||||
pmaddwd mm5, [GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
|
||||
|
||||
movq mm3, [GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
|
||||
|
||||
paddd mm0, MMWORD [wk(4)]
|
||||
paddd mm4, MMWORD [wk(5)]
|
||||
paddd mm0, mm3
|
||||
paddd mm4, mm3
|
||||
psrld mm0, SCALEBITS ; mm0=YOL
|
||||
psrld mm4, SCALEBITS ; mm4=YOH
|
||||
packssdw mm0, mm4 ; mm0=YO
|
||||
|
||||
pxor mm3, mm3
|
||||
pxor mm4, mm4
|
||||
punpcklwd mm3, mm1 ; mm3=ROL
|
||||
punpckhwd mm4, mm1 ; mm4=ROH
|
||||
psrld mm3, 1 ; mm3=ROL*FIX(0.500)
|
||||
psrld mm4, 1 ; mm4=ROH*FIX(0.500)
|
||||
|
||||
movq mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd mm7, mm3
|
||||
paddd mm5, mm4
|
||||
paddd mm7, mm1
|
||||
paddd mm5, mm1
|
||||
psrld mm7, SCALEBITS ; mm7=CrOL
|
||||
psrld mm5, SCALEBITS ; mm5=CrOH
|
||||
packssdw mm7, mm5 ; mm7=CrO
|
||||
|
||||
movq mm3, MMWORD [wk(0)] ; mm3=RE
|
||||
|
||||
movq mm4, mm6
|
||||
punpcklwd mm6, mm2
|
||||
punpckhwd mm4, mm2
|
||||
movq mm1, mm6
|
||||
movq mm5, mm4
|
||||
pmaddwd mm6, [GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
||||
pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
||||
pmaddwd mm1, [GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
|
||||
pmaddwd mm5, [GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
|
||||
|
||||
movq mm2, [GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
|
||||
|
||||
paddd mm6, MMWORD [wk(6)]
|
||||
paddd mm4, MMWORD [wk(7)]
|
||||
paddd mm6, mm2
|
||||
paddd mm4, mm2
|
||||
psrld mm6, SCALEBITS ; mm6=YEL
|
||||
psrld mm4, SCALEBITS ; mm4=YEH
|
||||
packssdw mm6, mm4 ; mm6=YE
|
||||
|
||||
psllw mm0, BYTE_BIT
|
||||
por mm6, mm0 ; mm6=Y
|
||||
movq MMWORD [edi], mm6 ; Save Y
|
||||
|
||||
pxor mm2, mm2
|
||||
pxor mm4, mm4
|
||||
punpcklwd mm2, mm3 ; mm2=REL
|
||||
punpckhwd mm4, mm3 ; mm4=REH
|
||||
psrld mm2, 1 ; mm2=REL*FIX(0.500)
|
||||
psrld mm4, 1 ; mm4=REH*FIX(0.500)
|
||||
|
||||
movq mm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd mm1, mm2
|
||||
paddd mm5, mm4
|
||||
paddd mm1, mm0
|
||||
paddd mm5, mm0
|
||||
psrld mm1, SCALEBITS ; mm1=CrEL
|
||||
psrld mm5, SCALEBITS ; mm5=CrEH
|
||||
packssdw mm1, mm5 ; mm1=CrE
|
||||
|
||||
psllw mm7, BYTE_BIT
|
||||
por mm1, mm7 ; mm1=Cr
|
||||
movq MMWORD [edx], mm1 ; Save Cr
|
||||
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
|
||||
add edi, byte SIZEOF_MMWORD ; outptr0
|
||||
add ebx, byte SIZEOF_MMWORD ; outptr1
|
||||
add edx, byte SIZEOF_MMWORD ; outptr2
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jae near .columnloop
|
||||
test ecx, ecx
|
||||
jnz near .column_ld1
|
||||
|
||||
pop ecx ; col
|
||||
pop esi
|
||||
pop edi
|
||||
pop ebx
|
||||
pop edx
|
||||
poppic eax
|
||||
|
||||
add esi, byte SIZEOF_JSAMPROW ; input_buf
|
||||
add edi, byte SIZEOF_JSAMPROW
|
||||
add ebx, byte SIZEOF_JSAMPROW
|
||||
add edx, byte SIZEOF_JSAMPROW
|
||||
dec eax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
.return:
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
503
TMessagesProj/jni/mozjpeg/simd/i386/jccolext-sse2.asm
Normal file
503
TMessagesProj/jni/mozjpeg/simd/i386/jccolext-sse2.asm
Normal file
|
|
@ -0,0 +1,503 @@
|
|||
;
|
||||
; jccolext.asm - colorspace conversion (SSE2)
|
||||
;
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler); it
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
|
||||
; JSAMPIMAGE output_buf, JDIMENSION output_row,
|
||||
; int num_rows);
|
||||
;
|
||||
|
||||
; Stack-argument accessors. Each macro takes a base register b and expands
; to the address expression of one argument. The function passes the value
; esp had right after its own "push ebp", so (b)+0 is the saved ebp,
; (b)+4 is presumably the return address, and the arguments start at (b)+8.
%define img_width(b) (b) + 8 ; JDIMENSION img_width
|
||||
%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
|
||||
%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
|
||||
%define output_row(b) (b) + 20 ; JDIMENSION output_row
|
||||
%define num_rows(b) (b) + 24 ; int num_rows
|
||||
|
||||
; Local frame, addressed from the 16-byte-aligned ebp set up by the prologue.
; [ebp+0] holds the pre-alignment frame base so the epilogue can restore esp.
%define original_ebp ebp + 0
|
||||
; wk(i) is the i-th XMMWORD scratch slot below ebp: wk(0) is the lowest
; address (ebp - WK_NUM*SIZEOF_XMMWORD) and wk(WK_NUM-1) ends at ebp.
; WK_NUM is defined after wk(i); wk(i) is only used after both are defined.
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 8
|
||||
; gotptr is the pointer-sized slot immediately below wk(0).
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
|
||||
|
||||
EXTN(jsimd_rgb_ycc_convert_sse2):
; Converts num_rows rows of packed RGB pixels (RGB_PIXELSIZE bytes each) into
; three separate Y, Cb and Cr sample rows, 16 columns per loop iteration.
; Returns immediately when img_width is 0 or num_rows <= 0.
;
; Register roles (as established by the code below):
;   eax  frame base while the arguments are read, then the num_rows counter;
;        inside a row it is reloaded from gotptr (movpic) and used as the
;        GOTOFF base for the constant tables
;   ecx  img_width; inside a row, the number of columns still to convert
;   esi  cursor over the input_buf row pointers; inside a row, inptr
;   edi, ebx, edx  cursors over the row pointers of output_buf[0], [1], [2],
;        starting at output_row; inside a row, outptr0/1/2 (Y, Cb, Cr)
;   ebx, esi, edi are saved and restored; ecx and edx are not.
;
; Memory access assumptions visible in the code:
;   - input rows are read with movdqu (no alignment needed)
;   - output rows are written with movdqa, so each output row pointer must
;     be 16-byte aligned, and a full 16 bytes is stored even for the last
;     partial group of columns (output rows must have room for that)
;
; NOTE(review): pushpic/poppic/movpic/get_GOT/GOTOFF and the xmmA..xmmH
; aliases are defined outside this file; presumably the *pic macros are
; active only in PIC builds and xmmA..xmmH map to xmm0..xmm7 according to
; the RGB_* layout so that xmm0..xmm5 end up holding RE/RO/GE/GO/BE/BO.
|
||||
; Prologue: keep the incoming frame base in eax, carve out a 16-byte-aligned
; frame, store the old base at [ebp] and reserve WK_NUM XMMWORD slots.
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic eax ; make a room for GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
; Nothing to do for a zero-width image.
mov ecx, JDIMENSION [img_width(eax)]
|
||||
test ecx, ecx
|
||||
jz near .return
|
||||
|
||||
; ecx (img_width) is parked on the stack while it is borrowed to hold
; output_row for indexing the three output row-pointer arrays.
push ecx
|
||||
|
||||
mov esi, JSAMPIMAGE [output_buf(eax)]
|
||||
mov ecx, JDIMENSION [output_row(eax)]
|
||||
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
|
||||
mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
|
||||
mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
|
||||
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
|
||||
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
|
||||
lea edx, [edx+ecx*SIZEOF_JSAMPROW]
|
||||
|
||||
pop ecx
|
||||
|
||||
; eax is overwritten last because it is the base for the argument reads.
mov esi, JSAMPARRAY [input_buf(eax)]
|
||||
mov eax, INT [num_rows(eax)]
|
||||
test eax, eax
|
||||
jle near .return
|
||||
alignx 16, 7
|
||||
; Per-row loop: the row-pointer cursors and the column count are saved on
; the stack, then the registers are reused for the row's data pointers.
.rowloop:
|
||||
pushpic eax
|
||||
push edx
|
||||
push ebx
|
||||
push edi
|
||||
push esi
|
||||
push ecx ; col
|
||||
|
||||
mov esi, JSAMPROW [esi] ; inptr
|
||||
mov edi, JSAMPROW [edi] ; outptr0
|
||||
mov ebx, JSAMPROW [ebx] ; outptr1
|
||||
mov edx, JSAMPROW [edx] ; outptr2
|
||||
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
||||
|
||||
; At least 16 columns left: take the full-width path; otherwise fall
; through into the partial-group loader below.
cmp ecx, byte SIZEOF_XMMWORD
|
||||
jae near .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
; Partial group (fewer than 16 pixels), 3 bytes per pixel. ecx becomes the
; remaining byte count and the tail is gathered in 1-, 2-, 4-, 8-, 16- and
; 32-byte pieces selected by the bits of that count, starting from the end
; of the row, so each load covers only bytes that belong to the row.
; eax/edx are used as scratch for the 1- and 2-byte pieces and restored.
.column_ld1:
|
||||
push eax
|
||||
push edx
|
||||
lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
|
||||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_BYTE
|
||||
movzx eax, byte [esi+ecx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
movzx edx, word [esi+ecx]
|
||||
shl eax, WORD_BIT
|
||||
or eax, edx
|
||||
.column_ld4:
|
||||
movd xmmA, eax
|
||||
pop edx
|
||||
pop eax
|
||||
test cl, SIZEOF_DWORD
|
||||
jz short .column_ld8
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
movd xmmF, XMM_DWORD [esi+ecx]
|
||||
pslldq xmmA, SIZEOF_DWORD
|
||||
por xmmA, xmmF
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_MMWORD
|
||||
jz short .column_ld16
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
movq xmmB, XMM_MMWORD [esi+ecx]
|
||||
pslldq xmmA, SIZEOF_MMWORD
|
||||
por xmmA, xmmB
|
||||
.column_ld16:
|
||||
test cl, SIZEOF_XMMWORD
|
||||
jz short .column_ld32
|
||||
movdqa xmmF, xmmA
|
||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||
; ecx is forced to one full group so the loop bottom ends the row after
; this conversion.
mov ecx, SIZEOF_XMMWORD
|
||||
jmp short .rgb_ycc_cnv
|
||||
.column_ld32:
|
||||
test cl, 2*SIZEOF_XMMWORD
|
||||
mov ecx, SIZEOF_XMMWORD
|
||||
jz short .rgb_ycc_cnv
|
||||
movdqa xmmB, xmmA
|
||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||
jmp short .rgb_ycc_cnv
|
||||
alignx 16, 7
|
||||
|
||||
; Full group: 16 pixels * 3 bytes = three unaligned XMMWORD loads.
.columnloop:
|
||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||
movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
|
||||
|
||||
; De-interleave the packed pixels. In the lane comments the first digit is
; the byte position within a pixel and the second is the column (hex).
.rgb_ycc_cnv:
|
||||
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
|
||||
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
||||
|
||||
movdqa xmmG, xmmA
|
||||
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
|
||||
psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
|
||||
|
||||
punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
|
||||
pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
|
||||
|
||||
punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
|
||||
punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
|
||||
|
||||
movdqa xmmD, xmmA
|
||||
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
|
||||
psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
|
||||
|
||||
punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
|
||||
pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
|
||||
|
||||
punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
|
||||
punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
|
||||
|
||||
movdqa xmmE, xmmA
|
||||
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
|
||||
psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
|
||||
|
||||
punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
||||
pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
|
||||
|
||||
punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
|
||||
punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
|
||||
|
||||
; Widen every byte to a 16-bit word by interleaving with zero.
pxor xmmH, xmmH
|
||||
|
||||
movdqa xmmC, xmmA
|
||||
punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
||||
punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
||||
|
||||
movdqa xmmB, xmmE
|
||||
punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
||||
punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
||||
|
||||
movdqa xmmF, xmmD
|
||||
punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
||||
punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
; Partial group (fewer than 16 pixels), 4 bytes per pixel. ecx stays a pixel
; count; its bits select 1-, 2-, 4- and 8-pixel loads taken from the end of
; the row backwards, so each load covers only pixels that belong to the row.
.column_ld1:
|
||||
test cl, SIZEOF_XMMWORD/16
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_XMMWORD/16
|
||||
movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_XMMWORD/8
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_XMMWORD/8
|
||||
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
pslldq xmmA, SIZEOF_MMWORD
|
||||
por xmmA, xmmE
|
||||
.column_ld4:
|
||||
test cl, SIZEOF_XMMWORD/4
|
||||
jz short .column_ld8
|
||||
sub ecx, byte SIZEOF_XMMWORD/4
|
||||
movdqa xmmE, xmmA
|
||||
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_XMMWORD/2
|
||||
; mov does not alter the flags set by the test above; ecx is forced to one
; full group so the loop bottom ends the row after this conversion.
mov ecx, SIZEOF_XMMWORD
|
||||
jz short .rgb_ycc_cnv
|
||||
movdqa xmmF, xmmA
|
||||
movdqa xmmH, xmmE
|
||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||
jmp short .rgb_ycc_cnv
|
||||
alignx 16, 7
|
||||
|
||||
; Full group: 16 pixels * 4 bytes = four unaligned XMMWORD loads.
.columnloop:
|
||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||
movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
|
||||
movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
|
||||
|
||||
; De-interleave the packed pixels. In the lane comments the first digit is
; the byte position within a pixel and the second is the column (hex).
.rgb_ycc_cnv:
|
||||
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
|
||||
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
||||
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||
|
||||
movdqa xmmD, xmmA
|
||||
punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
|
||||
punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
|
||||
|
||||
movdqa xmmC, xmmF
|
||||
punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
|
||||
punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
|
||||
|
||||
movdqa xmmB, xmmA
|
||||
punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
|
||||
punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
|
||||
|
||||
movdqa xmmG, xmmD
|
||||
punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
|
||||
punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
|
||||
|
||||
movdqa xmmE, xmmA
|
||||
punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
||||
punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
|
||||
|
||||
movdqa xmmH, xmmB
|
||||
punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
|
||||
punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
|
||||
|
||||
; Widen every byte to a 16-bit word by interleaving with zero (xmmF).
pxor xmmF, xmmF
|
||||
|
||||
movdqa xmmC, xmmA
|
||||
punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
||||
punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
||||
|
||||
movdqa xmmD, xmmB
|
||||
punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
||||
punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
||||
|
||||
movdqa xmmG, xmmE
|
||||
punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
||||
punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
|
||||
|
||||
; No spare zero register is left for the last pair, so the bytes are
; unpacked into the high half of each word and shifted down instead.
punpcklbw xmmF, xmmH
|
||||
punpckhbw xmmH, xmmH
|
||||
psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
||||
psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
|
||||
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
|
||||
|
||||
; (Original)
|
||||
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||
;
|
||||
; (This implementation)
|
||||
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||
|
||||
; Spill R and B (even and odd columns) to the work area; all eight xmm
; registers are needed as temporaries below.
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
|
||||
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
|
||||
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
|
||||
movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
|
||||
|
||||
; ---- Odd columns: partial Y sum (R,G terms) and Cb ----
; R and G words are interleaved so one pmaddwd per constant yields
; R*c1 + G*c2 as a 32-bit value per column.
movdqa xmm6, xmm1
|
||||
punpcklwd xmm1, xmm3
|
||||
punpckhwd xmm6, xmm3
|
||||
movdqa xmm7, xmm1
|
||||
movdqa xmm4, xmm6
|
||||
pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
pmaddwd xmm7, [GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
|
||||
pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
|
||||
|
||||
movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
; Multiply by 0.5 without a multiply: unpacking against a zeroed register
; puts each B word in the upper half of a dword (B << 16), and the shift
; right by 1 leaves B << 15, i.e. B * FIX(0.5) when SCALEBITS is 16
; (SCALEBITS comes from the including file).
pxor xmm1, xmm1
|
||||
pxor xmm6, xmm6
|
||||
punpcklwd xmm1, xmm5 ; xmm1=BOL
|
||||
punpckhwd xmm6, xmm5 ; xmm6=BOH
|
||||
psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
|
||||
psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
|
||||
|
||||
movdqa xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
|
||||
|
||||
; Add the rounding/centering constant, then drop the fraction bits.
paddd xmm7, xmm1
|
||||
paddd xmm4, xmm6
|
||||
paddd xmm7, xmm5
|
||||
paddd xmm4, xmm5
|
||||
psrld xmm7, SCALEBITS ; xmm7=CbOL
|
||||
psrld xmm4, SCALEBITS ; xmm4=CbOH
|
||||
packssdw xmm7, xmm4 ; xmm7=CbO
|
||||
|
||||
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
|
||||
|
||||
; ---- Even columns: partial Y sum (R,G terms) and Cb ----
movdqa xmm6, xmm0
|
||||
punpcklwd xmm0, xmm2
|
||||
punpckhwd xmm6, xmm2
|
||||
movdqa xmm5, xmm0
|
||||
movdqa xmm4, xmm6
|
||||
pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
pmaddwd xmm5, [GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
|
||||
pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
|
||||
|
||||
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
pxor xmm0, xmm0
|
||||
pxor xmm6, xmm6
|
||||
punpcklwd xmm0, xmm1 ; xmm0=BEL
|
||||
punpckhwd xmm6, xmm1 ; xmm6=BEH
|
||||
psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
|
||||
psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
|
||||
|
||||
movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd xmm5, xmm0
|
||||
paddd xmm4, xmm6
|
||||
paddd xmm5, xmm1
|
||||
paddd xmm4, xmm1
|
||||
psrld xmm5, SCALEBITS ; xmm5=CbEL
|
||||
psrld xmm4, SCALEBITS ; xmm4=CbEH
|
||||
packssdw xmm5, xmm4 ; xmm5=CbE
|
||||
|
||||
; Re-interleave: odd-column results go to the high byte of each word and
; are OR-ed with the even-column results in the low byte, giving 16
; consecutive output bytes in column order.
psllw xmm7, BYTE_BIT
|
||||
por xmm5, xmm7 ; xmm5=Cb
|
||||
; Aligned store: outptr1 must be 16-byte aligned.
movdqa XMMWORD [ebx], xmm5 ; Save Cb
|
||||
|
||||
movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
|
||||
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
|
||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
|
||||
|
||||
; ---- Odd columns: finish Y (B,G terms + saved R,G sum) and start Cr ----
movdqa xmm4, xmm0
|
||||
punpcklwd xmm0, xmm3
|
||||
punpckhwd xmm4, xmm3
|
||||
movdqa xmm7, xmm0
|
||||
movdqa xmm5, xmm4
|
||||
pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
||||
pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
||||
pmaddwd xmm7, [GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
|
||||
pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
|
||||
|
||||
movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
|
||||
|
||||
paddd xmm0, XMMWORD [wk(4)]
|
||||
paddd xmm4, XMMWORD [wk(5)]
|
||||
paddd xmm0, xmm3
|
||||
paddd xmm4, xmm3
|
||||
psrld xmm0, SCALEBITS ; xmm0=YOL
|
||||
psrld xmm4, SCALEBITS ; xmm4=YOH
|
||||
packssdw xmm0, xmm4 ; xmm0=YO
|
||||
|
||||
; Same shift-based 0.5 multiply as for B above, now applied to R for Cr.
pxor xmm3, xmm3
|
||||
pxor xmm4, xmm4
|
||||
punpcklwd xmm3, xmm1 ; xmm3=ROL
|
||||
punpckhwd xmm4, xmm1 ; xmm4=ROH
|
||||
psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
|
||||
psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
|
||||
|
||||
movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd xmm7, xmm3
|
||||
paddd xmm5, xmm4
|
||||
paddd xmm7, xmm1
|
||||
paddd xmm5, xmm1
|
||||
psrld xmm7, SCALEBITS ; xmm7=CrOL
|
||||
psrld xmm5, SCALEBITS ; xmm5=CrOH
|
||||
packssdw xmm7, xmm5 ; xmm7=CrO
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
|
||||
|
||||
; ---- Even columns: finish Y and start Cr ----
movdqa xmm4, xmm6
|
||||
punpcklwd xmm6, xmm2
|
||||
punpckhwd xmm4, xmm2
|
||||
movdqa xmm1, xmm6
|
||||
movdqa xmm5, xmm4
|
||||
pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
||||
pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
||||
pmaddwd xmm1, [GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
|
||||
pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
|
||||
|
||||
movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
|
||||
|
||||
paddd xmm6, XMMWORD [wk(6)]
|
||||
paddd xmm4, XMMWORD [wk(7)]
|
||||
paddd xmm6, xmm2
|
||||
paddd xmm4, xmm2
|
||||
psrld xmm6, SCALEBITS ; xmm6=YEL
|
||||
psrld xmm4, SCALEBITS ; xmm4=YEH
|
||||
packssdw xmm6, xmm4 ; xmm6=YE
|
||||
|
||||
; Merge odd (high byte) and even (low byte) Y results and store them.
psllw xmm0, BYTE_BIT
|
||||
por xmm6, xmm0 ; xmm6=Y
|
||||
; Aligned store: outptr0 must be 16-byte aligned.
movdqa XMMWORD [edi], xmm6 ; Save Y
|
||||
|
||||
pxor xmm2, xmm2
|
||||
pxor xmm4, xmm4
|
||||
punpcklwd xmm2, xmm3 ; xmm2=REL
|
||||
punpckhwd xmm4, xmm3 ; xmm4=REH
|
||||
psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
|
||||
psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
|
||||
|
||||
movdqa xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd xmm1, xmm2
|
||||
paddd xmm5, xmm4
|
||||
paddd xmm1, xmm0
|
||||
paddd xmm5, xmm0
|
||||
psrld xmm1, SCALEBITS ; xmm1=CrEL
|
||||
psrld xmm5, SCALEBITS ; xmm5=CrEH
|
||||
packssdw xmm1, xmm5 ; xmm1=CrE
|
||||
|
||||
; Merge odd (high byte) and even (low byte) Cr results and store them.
psllw xmm7, BYTE_BIT
|
||||
por xmm1, xmm7 ; xmm1=Cr
|
||||
; Aligned store: outptr2 must be 16-byte aligned.
movdqa XMMWORD [edx], xmm1 ; Save Cr
|
||||
|
||||
; Advance by one group of 16 columns. Another full group loops back to
; .columnloop; a non-zero remainder goes through the partial loader (which
; sets ecx to exactly one group, so the next pass through here leaves 0).
sub ecx, byte SIZEOF_XMMWORD
|
||||
add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr0
|
||||
add ebx, byte SIZEOF_XMMWORD ; outptr1
|
||||
add edx, byte SIZEOF_XMMWORD ; outptr2
|
||||
cmp ecx, byte SIZEOF_XMMWORD
|
||||
jae near .columnloop
|
||||
test ecx, ecx
|
||||
jnz near .column_ld1
|
||||
|
||||
; Row done: restore the column count and the row-pointer cursors saved at
; .rowloop (reverse push order), then step each cursor to the next row.
pop ecx ; col
|
||||
pop esi
|
||||
pop edi
|
||||
pop ebx
|
||||
pop edx
|
||||
poppic eax
|
||||
|
||||
add esi, byte SIZEOF_JSAMPROW ; input_buf
|
||||
add edi, byte SIZEOF_JSAMPROW
|
||||
add ebx, byte SIZEOF_JSAMPROW
|
||||
add edx, byte SIZEOF_JSAMPROW
|
||||
dec eax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
; Epilogue: restore the saved registers, then unwind the aligned frame by
; reloading esp from the frame base stored at [ebp] in the prologue.
.return:
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
121
TMessagesProj/jni/mozjpeg/simd/i386/jccolor-avx2.asm
Normal file
121
TMessagesProj/jni/mozjpeg/simd/i386/jccolor-avx2.asm
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
;
|
||||
; jccolor.asm - colorspace conversion (AVX2)
|
||||
;
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler); it
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point format: each F_* value below is the real coefficient scaled by
; 2^SCALEBITS and rounded to an integer (e.g. 0.25 * 65536 = 16384).
%define SCALEBITS 16
|
||||
|
||||
F_0_081 equ 5329 ; FIX(0.08131)
|
||||
F_0_114 equ 7471 ; FIX(0.11400)
|
||||
F_0_168 equ 11059 ; FIX(0.16874)
|
||||
F_0_250 equ 16384 ; FIX(0.25000)
|
||||
F_0_299 equ 19595 ; FIX(0.29900)
|
||||
F_0_331 equ 21709 ; FIX(0.33126)
|
||||
F_0_418 equ 27439 ; FIX(0.41869)
|
||||
F_0_587 equ 38470 ; FIX(0.58700)
|
||||
; F_0_587 (38470) exceeds 32767, the largest positive signed 16-bit value,
; so the green weight for Y is split into F_0_337 + F_0_250, which both fit.
; Presumably required because the tables below store the weights as words
; for a signed word multiply - confirm against the included jccolext code.
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
|
||||
|
||||
; Constant table for the AVX2 RGB->YCC converter. Every entry is 32 bytes:
; the PW_* rows repeat a pair of 16-bit weights 8 times, the PD_* rows repeat
; one 32-bit value 8 times. The PW_* names spell out the two weights
; (F = positive, MF = negated), matching the R/G or B/G word pairs they are
; presumably multiplied with in the included jccolext-avx2.asm.
EXTN(jconst_rgb_ycc_convert_avx2):
|
||||
|
||||
PW_F0299_F0337 times 8 dw F_0_299, F_0_337
|
||||
PW_F0114_F0250 times 8 dw F_0_114, F_0_250
|
||||
PW_MF016_MF033 times 8 dw -F_0_168, -F_0_331
|
||||
PW_MF008_MF041 times 8 dw -F_0_081, -F_0_418
|
||||
; Cb/Cr bias: (one half minus 1) in fixed point plus CENTERJSAMPLE shifted
; into the integer part. CENTERJSAMPLE is defined outside this file.
PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \
|
||||
(CENTERJSAMPLE << SCALEBITS)
|
||||
; One half in fixed point (rounding term; presumably for the Y sum).
PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
|
||||
; The shared converter body is included once per supported pixel layout.
; Before each further inclusion the RGB_* layout macros are redefined and
; jsimd_rgb_ycc_convert_avx2 is mapped to a layout-specific symbol, so each
; inclusion emits a separately named function.
;
; First inclusion: uses the RGB_* values already in effect (not defined in
; this file - presumably defaults from jsimdext.inc) and the unrenamed
; symbol jsimd_rgb_ycc_convert_avx2.
%include "jccolext-avx2.asm"
|
||||
|
||||
; EXT_RGB layout -> jsimd_extrgb_ycc_convert_avx2
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGB_RED
|
||||
%define RGB_GREEN EXT_RGB_GREEN
|
||||
%define RGB_BLUE EXT_RGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgb_ycc_convert_avx2
|
||||
%include "jccolext-avx2.asm"
|
||||
|
||||
; EXT_RGBX layout -> jsimd_extrgbx_ycc_convert_avx2
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGBX_RED
|
||||
%define RGB_GREEN EXT_RGBX_GREEN
|
||||
%define RGB_BLUE EXT_RGBX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgbx_ycc_convert_avx2
|
||||
%include "jccolext-avx2.asm"
|
||||
|
||||
; EXT_BGR layout -> jsimd_extbgr_ycc_convert_avx2
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGR_RED
|
||||
%define RGB_GREEN EXT_BGR_GREEN
|
||||
%define RGB_BLUE EXT_BGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgr_ycc_convert_avx2
|
||||
%include "jccolext-avx2.asm"
|
||||
|
||||
; EXT_BGRX layout -> jsimd_extbgrx_ycc_convert_avx2
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGRX_RED
|
||||
%define RGB_GREEN EXT_BGRX_GREEN
|
||||
%define RGB_BLUE EXT_BGRX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgrx_ycc_convert_avx2
|
||||
%include "jccolext-avx2.asm"
|
||||
|
||||
; EXT_XBGR layout -> jsimd_extxbgr_ycc_convert_avx2
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XBGR_RED
|
||||
%define RGB_GREEN EXT_XBGR_GREEN
|
||||
%define RGB_BLUE EXT_XBGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_avx2 jsimd_extxbgr_ycc_convert_avx2
|
||||
%include "jccolext-avx2.asm"
|
||||
|
||||
; EXT_XRGB layout -> jsimd_extxrgb_ycc_convert_avx2
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XRGB_RED
|
||||
%define RGB_GREEN EXT_XRGB_GREEN
|
||||
%define RGB_BLUE EXT_XRGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_avx2 jsimd_extxrgb_ycc_convert_avx2
|
||||
%include "jccolext-avx2.asm"
|
||||
121
TMessagesProj/jni/mozjpeg/simd/i386/jccolor-mmx.asm
Normal file
121
TMessagesProj/jni/mozjpeg/simd/i386/jccolor-mmx.asm
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
;
|
||||
; jccolor-mmx.asm - colorspace conversion (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define SCALEBITS  16                   ; fraction bits: FIX(x) = x * 2^16, rounded to nearest

; Conversion coefficients as SCALEBITS-bit fixed-point integers.
F_0_081 equ  5329                       ; FIX(0.08131)
F_0_114 equ  7471                       ; FIX(0.11400)
F_0_168 equ 11059                       ; FIX(0.16874)
F_0_250 equ 16384                       ; FIX(0.25000)
F_0_299 equ 19595                       ; FIX(0.29900)
F_0_331 equ 21709                       ; FIX(0.33126)
F_0_418 equ 27439                       ; FIX(0.41869)
F_0_587 equ 38470                       ; FIX(0.58700)
; F_0_587 (38470) exceeds 32767, the largest signed 16-bit value.  The tables
; below never store it directly; they carry F_0_337 and F_0_250 instead,
; presumably so every word fits a signed 16-bit multiply (the consuming code
; is outside this section).
F_0_337 equ (F_0_587 - F_0_250)         ; FIX(0.58700) - FIX(0.25000)

; --------------------------------------------------------------------------
    SECTION SEG_CONST

    alignz  32
    GLOBAL_DATA(jconst_rgb_ycc_convert_mmx)

EXTN(jconst_rgb_ycc_convert_mmx):

; Every row is replicated with `times 2`, i.e. 8 bytes per row:
; word rows hold two (a, b) pairs, dword rows hold two copies of one value.
PW_F0299_F0337   times 2 dw  F_0_299,  F_0_337
PW_F0114_F0250   times 2 dw  F_0_114,  F_0_250
PW_MF016_MF033   times 2 dw -F_0_168, -F_0_331
PW_MF008_MF041   times 2 dw -F_0_081, -F_0_418
; (one half - 1) in SCALEBITS fixed point, plus CENTERJSAMPLE scaled likewise
PD_ONEHALFM1_CJ  times 2 dd  (1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
; one half in SCALEBITS fixed point
PD_ONEHALF       times 2 dd  (1 << (SCALEBITS - 1))

    alignz  32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Switch to the code section and select 32-bit mode, then assemble the
; template once with no RGB_* overrides from this file: this first copy uses
; whatever layout bindings are already in effect (not visible here).
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
|
||||
%include "jccolext-mmx.asm"
|
||||
|
||||
; EXT_RGB instantiation: re-bind each generic RGB_* layout macro to its
; EXT_RGB_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_RGB_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_RGB_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_RGB_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
%define jsimd_rgb_ycc_convert_mmx  jsimd_extrgb_ycc_convert_mmx
%include "jccolext-mmx.asm"
|
||||
|
||||
; EXT_RGBX instantiation: re-bind each generic RGB_* layout macro to its
; EXT_RGBX_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_RGBX_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_RGBX_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_RGBX_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
%define jsimd_rgb_ycc_convert_mmx  jsimd_extrgbx_ycc_convert_mmx
%include "jccolext-mmx.asm"
|
||||
|
||||
; EXT_BGR instantiation: re-bind each generic RGB_* layout macro to its
; EXT_BGR_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_BGR_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_BGR_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_BGR_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
%define jsimd_rgb_ycc_convert_mmx  jsimd_extbgr_ycc_convert_mmx
%include "jccolext-mmx.asm"
|
||||
|
||||
; EXT_BGRX instantiation: re-bind each generic RGB_* layout macro to its
; EXT_BGRX_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_BGRX_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_BGRX_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_BGRX_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
%define jsimd_rgb_ycc_convert_mmx  jsimd_extbgrx_ycc_convert_mmx
%include "jccolext-mmx.asm"
|
||||
|
||||
; EXT_XBGR instantiation: re-bind each generic RGB_* layout macro to its
; EXT_XBGR_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_XBGR_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_XBGR_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_XBGR_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
%define jsimd_rgb_ycc_convert_mmx  jsimd_extxbgr_ycc_convert_mmx
%include "jccolext-mmx.asm"
|
||||
|
||||
; EXT_XRGB instantiation: re-bind each generic RGB_* layout macro to its
; EXT_XRGB_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_XRGB_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_XRGB_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_XRGB_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
%define jsimd_rgb_ycc_convert_mmx  jsimd_extxrgb_ycc_convert_mmx
%include "jccolext-mmx.asm"
|
||||
120
TMessagesProj/jni/mozjpeg/simd/i386/jccolor-sse2.asm
Normal file
120
TMessagesProj/jni/mozjpeg/simd/i386/jccolor-sse2.asm
Normal file
|
|
@ -0,0 +1,120 @@
|
|||
;
|
||||
; jccolor-sse2.asm - colorspace conversion (SSE2)
|
||||
;
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define SCALEBITS  16                   ; fraction bits: FIX(x) = x * 2^16, rounded to nearest

; Conversion coefficients as SCALEBITS-bit fixed-point integers.
F_0_081 equ  5329                       ; FIX(0.08131)
F_0_114 equ  7471                       ; FIX(0.11400)
F_0_168 equ 11059                       ; FIX(0.16874)
F_0_250 equ 16384                       ; FIX(0.25000)
F_0_299 equ 19595                       ; FIX(0.29900)
F_0_331 equ 21709                       ; FIX(0.33126)
F_0_418 equ 27439                       ; FIX(0.41869)
F_0_587 equ 38470                       ; FIX(0.58700)
; F_0_587 (38470) exceeds 32767, the largest signed 16-bit value.  The tables
; below never store it directly; they carry F_0_337 and F_0_250 instead,
; presumably so every word fits a signed 16-bit multiply (the consuming code
; is outside this section).
F_0_337 equ (F_0_587 - F_0_250)         ; FIX(0.58700) - FIX(0.25000)

; --------------------------------------------------------------------------
    SECTION SEG_CONST

    alignz  32
    GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)

EXTN(jconst_rgb_ycc_convert_sse2):

; Every row is replicated with `times 4`, i.e. 16 bytes per row:
; word rows hold four (a, b) pairs, dword rows hold four copies of one value.
PW_F0299_F0337   times 4 dw  F_0_299,  F_0_337
PW_F0114_F0250   times 4 dw  F_0_114,  F_0_250
PW_MF016_MF033   times 4 dw -F_0_168, -F_0_331
PW_MF008_MF041   times 4 dw -F_0_081, -F_0_418
; (one half - 1) in SCALEBITS fixed point, plus CENTERJSAMPLE scaled likewise
PD_ONEHALFM1_CJ  times 4 dd  (1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
; one half in SCALEBITS fixed point
PD_ONEHALF       times 4 dd  (1 << (SCALEBITS - 1))

    alignz  32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Switch to the code section and select 32-bit mode, then assemble the
; template once with no RGB_* overrides from this file: this first copy uses
; whatever layout bindings are already in effect (not visible here).
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
|
||||
%include "jccolext-sse2.asm"
|
||||
|
||||
; EXT_RGB instantiation: re-bind each generic RGB_* layout macro to its
; EXT_RGB_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_RGB_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_RGB_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_RGB_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
%define jsimd_rgb_ycc_convert_sse2  jsimd_extrgb_ycc_convert_sse2
%include "jccolext-sse2.asm"
|
||||
|
||||
; EXT_RGBX instantiation: re-bind each generic RGB_* layout macro to its
; EXT_RGBX_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_RGBX_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_RGBX_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_RGBX_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
%define jsimd_rgb_ycc_convert_sse2  jsimd_extrgbx_ycc_convert_sse2
%include "jccolext-sse2.asm"
|
||||
|
||||
; EXT_BGR instantiation: re-bind each generic RGB_* layout macro to its
; EXT_BGR_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_BGR_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_BGR_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_BGR_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
%define jsimd_rgb_ycc_convert_sse2  jsimd_extbgr_ycc_convert_sse2
%include "jccolext-sse2.asm"
|
||||
|
||||
; EXT_BGRX instantiation: re-bind each generic RGB_* layout macro to its
; EXT_BGRX_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_BGRX_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_BGRX_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_BGRX_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
%define jsimd_rgb_ycc_convert_sse2  jsimd_extbgrx_ycc_convert_sse2
%include "jccolext-sse2.asm"
|
||||
|
||||
; EXT_XBGR instantiation: re-bind each generic RGB_* layout macro to its
; EXT_XBGR_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_XBGR_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_XBGR_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_XBGR_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
%define jsimd_rgb_ycc_convert_sse2  jsimd_extxbgr_ycc_convert_sse2
%include "jccolext-sse2.asm"
|
||||
|
||||
; EXT_XRGB instantiation: re-bind each generic RGB_* layout macro to its
; EXT_XRGB_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_XRGB_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_XRGB_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_XRGB_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
%define jsimd_rgb_ycc_convert_sse2  jsimd_extxrgb_ycc_convert_sse2
%include "jccolext-sse2.asm"
|
||||
113
TMessagesProj/jni/mozjpeg/simd/i386/jcgray-avx2.asm
Normal file
113
TMessagesProj/jni/mozjpeg/simd/i386/jcgray-avx2.asm
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
;
|
||||
; jcgray-avx2.asm - grayscale colorspace conversion (AVX2)
|
||||
;
|
||||
; Copyright (C) 2011, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define SCALEBITS  16                   ; fraction bits: FIX(x) = x * 2^16, rounded to nearest

; Luminance coefficients as SCALEBITS-bit fixed-point integers.
F_0_114 equ  7471                       ; FIX(0.11400)
F_0_250 equ 16384                       ; FIX(0.25000)
F_0_299 equ 19595                       ; FIX(0.29900)
F_0_587 equ 38470                       ; FIX(0.58700)
; F_0_587 (38470) exceeds 32767, the largest signed 16-bit value.  The tables
; below never store it directly; they carry F_0_337 and F_0_250 instead,
; presumably so every word fits a signed 16-bit multiply (the consuming code
; is outside this block).
F_0_337 equ (F_0_587 - F_0_250)         ; FIX(0.58700) - FIX(0.25000)

; --------------------------------------------------------------------------
    SECTION SEG_CONST

    alignz  32
    GLOBAL_DATA(jconst_rgb_gray_convert_avx2)

EXTN(jconst_rgb_gray_convert_avx2):

; Every row is replicated with `times 8`, i.e. 32 bytes per row:
; word rows hold eight (a, b) pairs, the dword row eight copies of one value.
PW_F0299_F0337  times 8 dw  F_0_299, F_0_337
PW_F0114_F0250  times 8 dw  F_0_114, F_0_250
; one half in SCALEBITS fixed point
PD_ONEHALF      times 8 dd  (1 << (SCALEBITS - 1))

    alignz  32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Switch to the code section and select 32-bit mode, then assemble the
; template once with no RGB_* overrides from this file: this first copy uses
; whatever layout bindings are already in effect (not visible here).
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
|
||||
%include "jcgryext-avx2.asm"
|
||||
|
||||
; EXT_RGB instantiation: re-bind each generic RGB_* layout macro to its
; EXT_RGB_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_RGB_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_RGB_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_RGB_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
%define jsimd_rgb_gray_convert_avx2  jsimd_extrgb_gray_convert_avx2
%include "jcgryext-avx2.asm"
|
||||
|
||||
; EXT_RGBX instantiation: re-bind each generic RGB_* layout macro to its
; EXT_RGBX_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_RGBX_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_RGBX_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_RGBX_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
%define jsimd_rgb_gray_convert_avx2  jsimd_extrgbx_gray_convert_avx2
%include "jcgryext-avx2.asm"
|
||||
|
||||
; EXT_BGR instantiation: re-bind each generic RGB_* layout macro to its
; EXT_BGR_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_BGR_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_BGR_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_BGR_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
%define jsimd_rgb_gray_convert_avx2  jsimd_extbgr_gray_convert_avx2
%include "jcgryext-avx2.asm"
|
||||
|
||||
; EXT_BGRX instantiation: re-bind each generic RGB_* layout macro to its
; EXT_BGRX_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_BGRX_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_BGRX_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_BGRX_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
%define jsimd_rgb_gray_convert_avx2  jsimd_extbgrx_gray_convert_avx2
%include "jcgryext-avx2.asm"
|
||||
|
||||
; EXT_XBGR instantiation: re-bind each generic RGB_* layout macro to its
; EXT_XBGR_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_XBGR_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_XBGR_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_XBGR_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
%define jsimd_rgb_gray_convert_avx2  jsimd_extxbgr_gray_convert_avx2
%include "jcgryext-avx2.asm"
|
||||
|
||||
; EXT_XRGB instantiation: re-bind each generic RGB_* layout macro to its
; EXT_XRGB_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_XRGB_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_XRGB_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_XRGB_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
%define jsimd_rgb_gray_convert_avx2  jsimd_extxrgb_gray_convert_avx2
%include "jcgryext-avx2.asm"
|
||||
113
TMessagesProj/jni/mozjpeg/simd/i386/jcgray-mmx.asm
Normal file
113
TMessagesProj/jni/mozjpeg/simd/i386/jcgray-mmx.asm
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
;
|
||||
; jcgray-mmx.asm - grayscale colorspace conversion (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2011, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define SCALEBITS  16                   ; fraction bits: FIX(x) = x * 2^16, rounded to nearest

; Luminance coefficients as SCALEBITS-bit fixed-point integers.
F_0_114 equ  7471                       ; FIX(0.11400)
F_0_250 equ 16384                       ; FIX(0.25000)
F_0_299 equ 19595                       ; FIX(0.29900)
F_0_587 equ 38470                       ; FIX(0.58700)
; F_0_587 (38470) exceeds 32767, the largest signed 16-bit value.  The tables
; below never store it directly; they carry F_0_337 and F_0_250 instead,
; presumably so every word fits a signed 16-bit multiply (the consuming code
; is outside this section).
F_0_337 equ (F_0_587 - F_0_250)         ; FIX(0.58700) - FIX(0.25000)

; --------------------------------------------------------------------------
    SECTION SEG_CONST

    alignz  32
    GLOBAL_DATA(jconst_rgb_gray_convert_mmx)

EXTN(jconst_rgb_gray_convert_mmx):

; Every row is replicated with `times 2`, i.e. 8 bytes per row:
; word rows hold two (a, b) pairs, the dword row two copies of one value.
PW_F0299_F0337  times 2 dw  F_0_299, F_0_337
PW_F0114_F0250  times 2 dw  F_0_114, F_0_250
; one half in SCALEBITS fixed point
PD_ONEHALF      times 2 dd  (1 << (SCALEBITS - 1))

    alignz  32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Switch to the code section and select 32-bit mode, then assemble the
; template once with no RGB_* overrides from this file: this first copy uses
; whatever layout bindings are already in effect (not visible here).
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
|
||||
%include "jcgryext-mmx.asm"
|
||||
|
||||
; EXT_RGB instantiation: re-bind each generic RGB_* layout macro to its
; EXT_RGB_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_RGB_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_RGB_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_RGB_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
%define jsimd_rgb_gray_convert_mmx  jsimd_extrgb_gray_convert_mmx
%include "jcgryext-mmx.asm"
|
||||
|
||||
; EXT_RGBX instantiation: re-bind each generic RGB_* layout macro to its
; EXT_RGBX_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_RGBX_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_RGBX_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_RGBX_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
%define jsimd_rgb_gray_convert_mmx  jsimd_extrgbx_gray_convert_mmx
%include "jcgryext-mmx.asm"
|
||||
|
||||
; EXT_BGR instantiation: re-bind each generic RGB_* layout macro to its
; EXT_BGR_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_BGR_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_BGR_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_BGR_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
%define jsimd_rgb_gray_convert_mmx  jsimd_extbgr_gray_convert_mmx
%include "jcgryext-mmx.asm"
|
||||
|
||||
; EXT_BGRX instantiation: re-bind each generic RGB_* layout macro to its
; EXT_BGRX_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_BGRX_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_BGRX_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_BGRX_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
%define jsimd_rgb_gray_convert_mmx  jsimd_extbgrx_gray_convert_mmx
%include "jcgryext-mmx.asm"
|
||||
|
||||
; EXT_XBGR instantiation: re-bind each generic RGB_* layout macro to its
; EXT_XBGR_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_XBGR_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_XBGR_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_XBGR_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
%define jsimd_rgb_gray_convert_mmx  jsimd_extxbgr_gray_convert_mmx
%include "jcgryext-mmx.asm"
|
||||
|
||||
; EXT_XRGB instantiation: re-bind each generic RGB_* layout macro to its
; EXT_XRGB_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_XRGB_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_XRGB_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_XRGB_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
%define jsimd_rgb_gray_convert_mmx  jsimd_extxrgb_gray_convert_mmx
%include "jcgryext-mmx.asm"
|
||||
112
TMessagesProj/jni/mozjpeg/simd/i386/jcgray-sse2.asm
Normal file
112
TMessagesProj/jni/mozjpeg/simd/i386/jcgray-sse2.asm
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
;
|
||||
; jcgray-sse2.asm - grayscale colorspace conversion (SSE2)
|
||||
;
|
||||
; Copyright (C) 2011, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define SCALEBITS  16                   ; fraction bits: FIX(x) = x * 2^16, rounded to nearest

; Luminance coefficients as SCALEBITS-bit fixed-point integers.
F_0_114 equ  7471                       ; FIX(0.11400)
F_0_250 equ 16384                       ; FIX(0.25000)
F_0_299 equ 19595                       ; FIX(0.29900)
F_0_587 equ 38470                       ; FIX(0.58700)
; F_0_587 (38470) exceeds 32767, the largest signed 16-bit value.  The tables
; below never store it directly; they carry F_0_337 and F_0_250 instead,
; presumably so every word fits a signed 16-bit multiply (the consuming code
; is outside this section).
F_0_337 equ (F_0_587 - F_0_250)         ; FIX(0.58700) - FIX(0.25000)

; --------------------------------------------------------------------------
    SECTION SEG_CONST

    alignz  32
    GLOBAL_DATA(jconst_rgb_gray_convert_sse2)

EXTN(jconst_rgb_gray_convert_sse2):

; Every row is replicated with `times 4`, i.e. 16 bytes per row:
; word rows hold four (a, b) pairs, the dword row four copies of one value.
PW_F0299_F0337  times 4 dw  F_0_299, F_0_337
PW_F0114_F0250  times 4 dw  F_0_114, F_0_250
; one half in SCALEBITS fixed point
PD_ONEHALF      times 4 dd  (1 << (SCALEBITS - 1))

    alignz  32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Switch to the code section and select 32-bit mode, then assemble the
; template once with no RGB_* overrides from this file: this first copy uses
; whatever layout bindings are already in effect (not visible here).
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
|
||||
%include "jcgryext-sse2.asm"
|
||||
|
||||
; EXT_RGB instantiation: re-bind each generic RGB_* layout macro to its
; EXT_RGB_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_RGB_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_RGB_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_RGB_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
%define jsimd_rgb_gray_convert_sse2  jsimd_extrgb_gray_convert_sse2
%include "jcgryext-sse2.asm"
|
||||
|
||||
; EXT_RGBX instantiation: re-bind each generic RGB_* layout macro to its
; EXT_RGBX_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_RGBX_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_RGBX_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_RGBX_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
%define jsimd_rgb_gray_convert_sse2  jsimd_extrgbx_gray_convert_sse2
%include "jcgryext-sse2.asm"
|
||||
|
||||
; EXT_BGR instantiation: re-bind each generic RGB_* layout macro to its
; EXT_BGR_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_BGR_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_BGR_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_BGR_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
%define jsimd_rgb_gray_convert_sse2  jsimd_extbgr_gray_convert_sse2
%include "jcgryext-sse2.asm"
|
||||
|
||||
; EXT_BGRX instantiation: re-bind each generic RGB_* layout macro to its
; EXT_BGRX_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_BGRX_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_BGRX_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_BGRX_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
%define jsimd_rgb_gray_convert_sse2  jsimd_extbgrx_gray_convert_sse2
%include "jcgryext-sse2.asm"
|
||||
|
||||
; EXT_XBGR instantiation: re-bind each generic RGB_* layout macro to its
; EXT_XBGR_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_XBGR_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_XBGR_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_XBGR_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
%define jsimd_rgb_gray_convert_sse2  jsimd_extxbgr_gray_convert_sse2
%include "jcgryext-sse2.asm"
|
||||
|
||||
; EXT_XRGB instantiation: re-bind each generic RGB_* layout macro to its
; EXT_XRGB_* counterpart, alias the generic entry-point name, and include the
; template again; its code is then expected to be emitted under the alias.
%undef  RGB_RED
%define RGB_RED        EXT_XRGB_RED
%undef  RGB_GREEN
%define RGB_GREEN      EXT_XRGB_GREEN
%undef  RGB_BLUE
%define RGB_BLUE       EXT_XRGB_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
%define jsimd_rgb_gray_convert_sse2  jsimd_extxrgb_gray_convert_sse2
%include "jcgryext-sse2.asm"
|
||||
457
TMessagesProj/jni/mozjpeg/simd/i386/jcgryext-avx2.asm
Normal file
457
TMessagesProj/jni/mozjpeg/simd/i386/jcgryext-avx2.asm
Normal file
|
|
@ -0,0 +1,457 @@
|
|||
;
|
||||
; jcgryext-avx2.asm - grayscale colorspace conversion (AVX2)
|
||||
;
|
||||
; Copyright (C) 2011, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
|
||||
; JSAMPIMAGE output_buf, JDIMENSION output_row,
|
||||
; int num_rows);
|
||||
;
|
||||
|
||||
; Argument slots.  Each macro yields an address expression relative to a base
; register b; the function below passes the stack pointer it captured right
; after `push ebp`, so b+0 is the saved ebp and the arguments start at b+8.
%define img_width(b)   (b) + 8          ; JDIMENSION img_width
%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
%define output_row(b)  (b) + 20         ; JDIMENSION output_row
%define num_rows(b)    (b) + 24         ; int num_rows

; Local frame, addressed from the re-aligned ebp set up by the prologue.
; All four macros expand lazily, so their definition order is irrelevant.
%define WK_NUM        2                                      ; number of ymmword work slots
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]; wk(0) is the lowest slot
%define original_ebp  ebp + 0                                ; slot where the prologue stores the pre-alignment stack pointer
%define gotptr        wk(0) - SIZEOF_POINTER                 ; void * gotptr, directly below wk(0)
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
|
||||
|
||||
EXTN(jsimd_rgb_gray_convert_avx2):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic eax ; make a room for GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
mov ecx, JDIMENSION [img_width(eax)]
|
||||
test ecx, ecx
|
||||
jz near .return
|
||||
|
||||
push ecx
|
||||
|
||||
mov esi, JSAMPIMAGE [output_buf(eax)]
|
||||
mov ecx, JDIMENSION [output_row(eax)]
|
||||
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
|
||||
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
|
||||
|
||||
pop ecx
|
||||
|
||||
mov esi, JSAMPARRAY [input_buf(eax)]
|
||||
mov eax, INT [num_rows(eax)]
|
||||
test eax, eax
|
||||
jle near .return
|
||||
alignx 16, 7
|
||||
.rowloop:
|
||||
pushpic eax
|
||||
push edi
|
||||
push esi
|
||||
push ecx ; col
|
||||
|
||||
mov esi, JSAMPROW [esi] ; inptr
|
||||
mov edi, JSAMPROW [edi] ; outptr0
|
||||
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
||||
|
||||
cmp ecx, byte SIZEOF_YMMWORD
|
||||
jae near .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
.column_ld1:
|
||||
push eax
|
||||
push edx
|
||||
lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
|
||||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_BYTE
|
||||
movzx eax, byte [esi+ecx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
movzx edx, word [esi+ecx]
|
||||
shl eax, WORD_BIT
|
||||
or eax, edx
|
||||
.column_ld4:
|
||||
vmovd xmmA, eax
|
||||
pop edx
|
||||
pop eax
|
||||
test cl, SIZEOF_DWORD
|
||||
jz short .column_ld8
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
vmovd xmmF, XMM_DWORD [esi+ecx]
|
||||
vpslldq xmmA, xmmA, SIZEOF_DWORD
|
||||
vpor xmmA, xmmA, xmmF
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_MMWORD
|
||||
jz short .column_ld16
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
vmovq xmmB, XMM_MMWORD [esi+ecx]
|
||||
vpslldq xmmA, xmmA, SIZEOF_MMWORD
|
||||
vpor xmmA, xmmA, xmmB
|
||||
.column_ld16:
|
||||
test cl, SIZEOF_XMMWORD
|
||||
jz short .column_ld32
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
vmovdqu xmmB, XMM_MMWORD [esi+ecx]
|
||||
vperm2i128 ymmA, ymmA, ymmA, 1
|
||||
vpor ymmA, ymmB
|
||||
.column_ld32:
|
||||
test cl, SIZEOF_YMMWORD
|
||||
jz short .column_ld64
|
||||
sub ecx, byte SIZEOF_YMMWORD
|
||||
vmovdqa ymmF, ymmA
|
||||
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
|
||||
.column_ld64:
|
||||
test cl, 2*SIZEOF_YMMWORD
|
||||
mov ecx, SIZEOF_YMMWORD
|
||||
jz short .rgb_gray_cnv
|
||||
vmovdqa ymmB, ymmA
|
||||
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
|
||||
jmp short .rgb_gray_cnv
|
||||
alignx 16, 7
|
||||
|
||||
.columnloop:
|
||||
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
|
||||
|
||||
.rgb_gray_cnv:
|
||||
; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
|
||||
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
|
||||
; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
|
||||
|
||||
vmovdqu ymmC, ymmA
|
||||
vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
|
||||
vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
|
||||
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
|
||||
vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
|
||||
; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
|
||||
|
||||
vmovdqa ymmG, ymmA
|
||||
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
|
||||
; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
|
||||
vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
|
||||
; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
|
||||
|
||||
vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
|
||||
; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
|
||||
vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
|
||||
; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
|
||||
|
||||
vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
|
||||
; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
|
||||
vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
|
||||
; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
|
||||
|
||||
vmovdqa ymmD, ymmA
|
||||
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
|
||||
; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
|
||||
vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
|
||||
; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
|
||||
|
||||
vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
|
||||
; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
|
||||
vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
|
||||
; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
|
||||
|
||||
vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
|
||||
; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
|
||||
vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
|
||||
; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
|
||||
|
||||
vmovdqa ymmE, ymmA
|
||||
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
|
||||
; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
|
||||
vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
|
||||
; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
|
||||
|
||||
vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
|
||||
; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
|
||||
vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
|
||||
; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
|
||||
|
||||
vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
|
||||
; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
|
||||
vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
|
||||
; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
|
||||
|
||||
vpxor ymmH, ymmH, ymmH
|
||||
|
||||
vmovdqa ymmC, ymmA
|
||||
vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
|
||||
vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
|
||||
|
||||
vmovdqa ymmB, ymmE
|
||||
vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
|
||||
vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
|
||||
|
||||
vmovdqa ymmF, ymmD
|
||||
vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
|
||||
vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
.column_ld1:
|
||||
test cl, SIZEOF_XMMWORD/16
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_XMMWORD/16
|
||||
vmovd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_XMMWORD/8
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_XMMWORD/8
|
||||
vmovq xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
vpslldq xmmA, xmmA, SIZEOF_MMWORD
|
||||
vpor xmmA, xmmA, xmmF
|
||||
.column_ld4:
|
||||
test cl, SIZEOF_XMMWORD/4
|
||||
jz short .column_ld8
|
||||
sub ecx, byte SIZEOF_XMMWORD/4
|
||||
vmovdqa xmmF, xmmA
|
||||
vperm2i128 ymmF, ymmF, ymmF, 1
|
||||
vmovdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
vpor ymmA, ymmA, ymmF
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_XMMWORD/2
|
||||
jz short .column_ld16
|
||||
sub ecx, byte SIZEOF_XMMWORD/2
|
||||
vmovdqa ymmF, ymmA
|
||||
vmovdqu ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld16:
|
||||
test cl, SIZEOF_XMMWORD
|
||||
mov ecx, SIZEOF_YMMWORD
|
||||
jz short .rgb_gray_cnv
|
||||
vmovdqa ymmE, ymmA
|
||||
vmovdqa ymmH, ymmF
|
||||
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
|
||||
jmp short .rgb_gray_cnv
|
||||
alignx 16, 7
|
||||
|
||||
.columnloop:
|
||||
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
|
||||
|
||||
.rgb_gray_cnv:
|
||||
; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
|
||||
; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||
; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
|
||||
; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||
; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
|
||||
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
|
||||
; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
|
||||
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
|
||||
|
||||
vmovdqa ymmB, ymmA
|
||||
vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
|
||||
; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
|
||||
vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
|
||||
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
|
||||
|
||||
vmovdqa ymmB, ymmF
|
||||
vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
|
||||
; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
|
||||
vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
|
||||
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
|
||||
|
||||
vmovdqa ymmD, ymmA
|
||||
vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
|
||||
; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
|
||||
vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
|
||||
; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
|
||||
|
||||
vmovdqa ymmC, ymmF
|
||||
vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
|
||||
; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
|
||||
vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
|
||||
; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
|
||||
|
||||
vmovdqa ymmB, ymmA
|
||||
vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
|
||||
; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
|
||||
vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
|
||||
; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
|
||||
|
||||
vmovdqa ymmG, ymmD
|
||||
vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
|
||||
; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
|
||||
vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
|
||||
; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
|
||||
|
||||
vmovdqa ymmE, ymmA
|
||||
vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
|
||||
; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
|
||||
vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
|
||||
; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
|
||||
|
||||
vmovdqa ymmH, ymmB
|
||||
vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
|
||||
; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
|
||||
vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
|
||||
; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
|
||||
|
||||
vpxor ymmF, ymmF, ymmF
|
||||
|
||||
vmovdqa ymmC, ymmA
|
||||
vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
|
||||
vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
|
||||
|
||||
vmovdqa ymmD, ymmB
|
||||
vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
|
||||
vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
|
||||
|
||||
vmovdqa ymmG, ymmE
|
||||
vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
|
||||
vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
|
||||
|
||||
vpunpcklbw ymmF, ymmF, ymmH
|
||||
vpunpckhbw ymmH, ymmH, ymmH
|
||||
vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
|
||||
vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
|
||||
; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
|
||||
|
||||
; (Original)
|
||||
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
;
|
||||
; (This implementation)
|
||||
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||
|
||||
vmovdqa ymm6, ymm1
|
||||
vpunpcklwd ymm1, ymm1, ymm3
|
||||
vpunpckhwd ymm6, ymm6, ymm3
|
||||
vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
vmovdqa ymm7, ymm6 ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
vmovdqa ymm6, ymm0
|
||||
vpunpcklwd ymm0, ymm0, ymm2
|
||||
vpunpckhwd ymm6, ymm6, ymm2
|
||||
vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
vmovdqa YMMWORD [wk(1)], ymm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
vmovdqa ymm0, ymm5 ; ymm0=BO
|
||||
vmovdqa ymm6, ymm4 ; ymm6=BE
|
||||
|
||||
vmovdqa ymm4, ymm0
|
||||
vpunpcklwd ymm0, ymm0, ymm3
|
||||
vpunpckhwd ymm4, ymm4, ymm3
|
||||
vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
||||
vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
||||
|
||||
vmovdqa ymm3, [GOTOFF(eax,PD_ONEHALF)] ; ymm3=[PD_ONEHALF]
|
||||
|
||||
vpaddd ymm0, ymm0, ymm1
|
||||
vpaddd ymm4, ymm4, ymm7
|
||||
vpaddd ymm0, ymm0, ymm3
|
||||
vpaddd ymm4, ymm4, ymm3
|
||||
vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
|
||||
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
|
||||
vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
|
||||
|
||||
vmovdqa ymm4, ymm6
|
||||
vpunpcklwd ymm6, ymm6, ymm2
|
||||
vpunpckhwd ymm4, ymm4, ymm2
|
||||
vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
||||
vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
||||
|
||||
vmovdqa ymm2, [GOTOFF(eax,PD_ONEHALF)] ; ymm2=[PD_ONEHALF]
|
||||
|
||||
vpaddd ymm6, ymm6, YMMWORD [wk(0)]
|
||||
vpaddd ymm4, ymm4, YMMWORD [wk(1)]
|
||||
vpaddd ymm6, ymm6, ymm2
|
||||
vpaddd ymm4, ymm4, ymm2
|
||||
vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
|
||||
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
|
||||
vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
|
||||
|
||||
vpsllw ymm0, ymm0, BYTE_BIT
|
||||
vpor ymm6, ymm6, ymm0 ; ymm6=Y
|
||||
vmovdqu YMMWORD [edi], ymm6 ; Save Y
|
||||
|
||||
sub ecx, byte SIZEOF_YMMWORD
|
||||
add esi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
|
||||
add edi, byte SIZEOF_YMMWORD ; outptr0
|
||||
cmp ecx, byte SIZEOF_YMMWORD
|
||||
jae near .columnloop
|
||||
test ecx, ecx
|
||||
jnz near .column_ld1
|
||||
|
||||
pop ecx ; col
|
||||
pop esi
|
||||
pop edi
|
||||
poppic eax
|
||||
|
||||
add esi, byte SIZEOF_JSAMPROW ; input_buf
|
||||
add edi, byte SIZEOF_JSAMPROW
|
||||
dec eax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
vzeroupper
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
355
TMessagesProj/jni/mozjpeg/simd/i386/jcgryext-mmx.asm
Normal file
355
TMessagesProj/jni/mozjpeg/simd/i386/jcgryext-mmx.asm
Normal file
|
|
@ -0,0 +1,355 @@
|
|||
;
|
||||
; jcgryext.asm - grayscale colorspace conversion (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2011, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_rgb_gray_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
|
||||
; JSAMPIMAGE output_buf, JDIMENSION output_row,
|
||||
; int num_rows);
|
||||
;
|
||||
|
||||
%define img_width(b) (b) + 8 ; JDIMENSION img_width
|
||||
%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
|
||||
%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
|
||||
%define output_row(b) (b) + 20 ; JDIMENSION output_row
|
||||
%define num_rows(b) (b) + 24 ; int num_rows
|
||||
|
||||
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
|
||||
; mmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_rgb_gray_convert_mmx)
|
||||
|
||||
; Entry point: jsimd_rgb_gray_convert_mmx(img_width, input_buf, output_buf,
;                                         output_row, num_rows)
; Arguments are read from the stack through eax using the img_width(b) ..
; num_rows(b) offset macros defined above (b+8 .. b+24).
; The prologue builds an 8-byte-aligned frame: ebp points at a slot holding
; the pre-alignment stack pointer, and WK_NUM mmword scratch slots (wk(i))
; live directly below ebp.
; Callee-saved registers pushed here: ebx, esi, edi (ecx/edx are not saved).
EXTN(jsimd_rgb_gray_convert_mmx):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
; Store the unaligned frame address at the aligned location so that the
; epilogue can recover it with "pop esp".
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
; Reserve the wk[] scratch area (WK_NUM mmwords) below ebp.
lea esp, [wk(0)]
|
||||
pushpic eax ; make room for the GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
; NOTE(review): get_GOT / movpic / pushpic are macros from the include file;
; presumably they expand to nothing in non-PIC builds -- confirm there.
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
; Nothing to do for a zero-width image.
mov ecx, JDIMENSION [img_width(eax)] ; num_cols
|
||||
test ecx, ecx
|
||||
jz near .return
|
||||
|
||||
; ecx is borrowed for output_row below, so park num_cols on the stack.
push ecx
|
||||
|
||||
; edi = &output_buf[0][output_row] (address of the first output row pointer)
mov esi, JSAMPIMAGE [output_buf(eax)]
|
||||
mov ecx, JDIMENSION [output_row(eax)]
|
||||
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
|
||||
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
|
||||
|
||||
pop ecx
|
||||
|
||||
; esi = input_buf (address of the first input row pointer),
; eax = num_rows; a non-positive row count returns immediately.
mov esi, JSAMPARRAY [input_buf(eax)]
|
||||
mov eax, INT [num_rows(eax)]
|
||||
test eax, eax
|
||||
jle near .return
|
||||
alignx 16, 7
|
||||
; Per-row setup.
; On entry: eax = rows remaining, esi = address of current input row pointer,
;           edi = address of current output row pointer, ecx = num_cols.
; All four are saved on the stack and restored after the column loop, because
; the column loop reuses eax (GOT base), esi/edi (sample pointers) and ecx
; (columns remaining).
.rowloop:
|
||||
pushpic eax
|
||||
push edi
|
||||
push esi
|
||||
push ecx ; col
|
||||
|
||||
mov esi, JSAMPROW [esi] ; inptr
|
||||
mov edi, JSAMPROW [edi] ; outptr0
|
||||
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
||||
|
||||
; A full vector (SIZEOF_MMWORD columns) goes to the main loop; otherwise
; fall through into the partial-load path at .column_ld1.
cmp ecx, byte SIZEOF_MMWORD
|
||||
jae short .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
.column_ld1:
        ; Tail path (RGB_PIXELSIZE == 3): fewer than SIZEOF_MMWORD columns remain.
        ; ecx is converted from columns to bytes; each set bit of cl then selects
        ; one power-of-two sized load, taken from the end of the row backwards.
        ; The 1- and 2-byte pieces are gathered in eax (edx as scratch); both
        ; registers are saved here and restored at .column_ld4.
        push    eax
        push    edx
        lea     ecx, [ecx+ecx*2]                ; imul ecx,RGB_PIXELSIZE
        test    cl, SIZEOF_BYTE
        jz      short .column_ld2
        sub     ecx, byte SIZEOF_BYTE
        movzx   eax, byte [esi+ecx]             ; zero-extending load (no partial-register write)
.column_ld2:
        test    cl, SIZEOF_WORD
        jz      short .column_ld4
        sub     ecx, byte SIZEOF_WORD
        movzx   edx, word [esi+ecx]             ; zero-extending load (no 16-bit operand-size prefix)
        shl     eax, WORD_BIT                   ; make room below the earlier byte
        or      eax, edx                        ; eax = (byte << 16) | word
|
||||
; Tail path, continued (RGB_PIXELSIZE == 3).
; Moves the bytes gathered in eax into mmA, restores the eax/edx saved at
; .column_ld1, then keeps testing bits of the remaining byte count in cl to
; pull in 4-, 8- and 16-byte pieces. Pieces loaded earlier sit at higher
; addresses, so they are shifted up / moved to the later register each time
; a lower-address piece is merged in.
.column_ld4:
|
||||
movd mmA, eax
|
||||
pop edx
|
||||
pop eax
|
||||
test cl, SIZEOF_DWORD
|
||||
jz short .column_ld8
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
movd mmG, dword [esi+ecx]
|
||||
psllq mmA, DWORD_BIT
|
||||
por mmA, mmG
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_MMWORD
|
||||
jz short .column_ld16
|
||||
; One whole mmword precedes the partial data: the partial data becomes the
; second input register and the first mmword of the row becomes mmA.
movq mmG, mmA
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
; ecx = SIZEOF_MMWORD makes the loop-end "sub ecx, SIZEOF_MMWORD" reach zero,
; so this tail block is the last one processed for the row.
mov ecx, SIZEOF_MMWORD
|
||||
jmp short .rgb_gray_cnv
|
||||
.column_ld16:
|
||||
; "mov" does not modify flags, so the jz below still acts on the test result.
test cl, 2*SIZEOF_MMWORD
|
||||
mov ecx, SIZEOF_MMWORD
|
||||
jz short .rgb_gray_cnv
|
||||
; Two whole mmwords precede the partial data: the partial data becomes the
; third input register (mmF).
movq mmF, mmA
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
jmp short .rgb_gray_cnv
|
||||
alignx 16, 7
|
||||
|
||||
; Main loop body (RGB_PIXELSIZE == 3): load 3 mmwords = 8 packed 3-byte pixels.
.columnloop:
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
|
||||
|
||||
; Deinterleave. In the lane comments below "cp" means byte c (0..2) of
; pixel p (0..7). Three rounds of shift + byte-unpack regroup the data so that
; each byte position ends up split into even- and odd-numbered pixels.
.rgb_gray_cnv:
|
||||
; mmA=(00 10 20 01 11 21 02 12)
|
||||
; mmG=(22 03 13 23 04 14 24 05)
|
||||
; mmF=(15 25 06 16 26 07 17 27)
|
||||
|
||||
movq mmD, mmA
|
||||
psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
|
||||
psrlq mmD, 4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
|
||||
|
||||
punpckhbw mmA, mmG ; mmA=(00 04 10 14 20 24 01 05)
|
||||
psllq mmG, 4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
|
||||
|
||||
punpcklbw mmD, mmF ; mmD=(11 15 21 25 02 06 12 16)
|
||||
punpckhbw mmG, mmF ; mmG=(22 26 03 07 13 17 23 27)
|
||||
|
||||
movq mmE, mmA
|
||||
psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
|
||||
psrlq mmE, 4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
|
||||
|
||||
punpckhbw mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
|
||||
psllq mmD, 4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
|
||||
|
||||
punpcklbw mmE, mmG ; mmE=(20 22 24 26 01 03 05 07)
|
||||
punpckhbw mmD, mmG ; mmD=(11 13 15 17 21 23 25 27)
|
||||
|
||||
; Zero register used to widen bytes to 16-bit words via unpack.
pxor mmH, mmH
|
||||
|
||||
movq mmC, mmA
|
||||
punpcklbw mmA, mmH ; mmA=(00 02 04 06)
|
||||
punpckhbw mmC, mmH ; mmC=(10 12 14 16)
|
||||
|
||||
movq mmB, mmE
|
||||
punpcklbw mmE, mmH ; mmE=(20 22 24 26)
|
||||
punpckhbw mmB, mmH ; mmB=(01 03 05 07)
|
||||
|
||||
movq mmF, mmD
|
||||
punpcklbw mmD, mmH ; mmD=(11 13 15 17)
|
||||
punpckhbw mmF, mmH ; mmF=(21 23 25 27)
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
; Tail path (RGB_PIXELSIZE == 4): fewer than SIZEOF_MMWORD columns remain.
; ecx stays in units of columns; the address is scaled by RGB_PIXELSIZE.
; Bits of cl select 1-, 2- and 4-column pieces, loaded from the end of the
; row backwards; earlier (higher-address) pieces are moved to later registers.
.column_ld1:
|
||||
test cl, SIZEOF_MMWORD/8
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_MMWORD/8
|
||||
movd mmA, dword [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_MMWORD/4
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_MMWORD/4
|
||||
movq mmF, mmA
|
||||
movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld4:
|
||||
; "mov" does not modify flags, so the jz below still acts on the test result.
; ecx = SIZEOF_MMWORD makes the loop-end subtraction reach zero after this
; tail block.
test cl, SIZEOF_MMWORD/2
|
||||
mov ecx, SIZEOF_MMWORD
|
||||
jz short .rgb_gray_cnv
|
||||
; Four whole pixels precede the partial data: shift the partial registers to
; the upper half of the input set and load the first two mmwords of the row.
movq mmD, mmA
|
||||
movq mmC, mmF
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
jmp short .rgb_gray_cnv
|
||||
alignx 16, 7
|
||||
|
||||
; Main loop body (RGB_PIXELSIZE == 4): load 4 mmwords = 8 packed 4-byte pixels.
.columnloop:
|
||||
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
|
||||
movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
|
||||
movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
|
||||
movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
|
||||
|
||||
; Deinterleave. In the lane comments below "cp" means byte c (0..3) of
; pixel p (0..7). Byte and word unpacks transpose the data so that each byte
; position ends up split into even- and odd-numbered pixels.
.rgb_gray_cnv:
|
||||
; mmA=(00 10 20 30 01 11 21 31)
|
||||
; mmF=(02 12 22 32 03 13 23 33)
|
||||
; mmD=(04 14 24 34 05 15 25 35)
|
||||
; mmC=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movq mmB, mmA
|
||||
punpcklbw mmA, mmF ; mmA=(00 02 10 12 20 22 30 32)
|
||||
punpckhbw mmB, mmF ; mmB=(01 03 11 13 21 23 31 33)
|
||||
|
||||
movq mmG, mmD
|
||||
punpcklbw mmD, mmC ; mmD=(04 06 14 16 24 26 34 36)
|
||||
punpckhbw mmG, mmC ; mmG=(05 07 15 17 25 27 35 37)
|
||||
|
||||
movq mmE, mmA
|
||||
punpcklwd mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
|
||||
punpckhwd mmE, mmD ; mmE=(20 22 24 26 30 32 34 36)
|
||||
|
||||
movq mmH, mmB
|
||||
punpcklwd mmB, mmG ; mmB=(01 03 05 07 11 13 15 17)
|
||||
punpckhwd mmH, mmG ; mmH=(21 23 25 27 31 33 35 37)
|
||||
|
||||
; mmF is the zero register for widening bytes to words ...
pxor mmF, mmF
|
||||
|
||||
movq mmC, mmA
|
||||
punpcklbw mmA, mmF ; mmA=(00 02 04 06)
|
||||
punpckhbw mmC, mmF ; mmC=(10 12 14 16)
|
||||
|
||||
movq mmD, mmB
|
||||
punpcklbw mmB, mmF ; mmB=(01 03 05 07)
|
||||
punpckhbw mmD, mmF ; mmD=(11 13 15 17)
|
||||
|
||||
movq mmG, mmE
|
||||
punpcklbw mmE, mmF ; mmE=(20 22 24 26)
|
||||
punpckhbw mmG, mmF ; mmG=(30 32 34 36)
|
||||
|
||||
; ... and is then consumed as a destination: the data bytes land in the high
; byte of each word, and the logical right shift by BYTE_BIT moves them down,
; which zero-extends without needing another zero register.
punpcklbw mmF, mmH
|
||||
punpckhbw mmH, mmH
|
||||
psrlw mmF, BYTE_BIT ; mmF=(21 23 25 27)
|
||||
psrlw mmH, BYTE_BIT ; mmH=(31 33 35 37)
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
; Luma computation for 8 pixels, held as 16-bit words split into even (E) and
; odd (O) pixels.
; NOTE(review): mmA..mmH are presumably aliases of mm0..mm7 chosen in the
; include file according to the RGB byte order, which is how the planes end
; up in the registers listed below -- confirm in jcolsamp.inc.
; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
|
||||
; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
|
||||
|
||||
; (Original)
|
||||
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
;
|
||||
; (This implementation)
|
||||
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||
|
||||
; The G coefficient is split in two so that the sum can be formed with two
; pmaddwd operations on interleaved word pairs: (R,G) and (B,G).
; "L"/"H" in the comments below are the low/high halves of each word vector.
movq mm6, mm1
|
||||
punpcklwd mm1, mm3
|
||||
punpckhwd mm6, mm3
|
||||
pmaddwd mm1, [GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
movq mm6, mm0
|
||||
punpcklwd mm0, mm2
|
||||
punpckhwd mm6, mm2
|
||||
pmaddwd mm0, [GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
; Spill the even-pixel partial sums to the stack scratch slots; they are
; added back in after the (B,G) products for the even pixels are formed.
movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
movq mm0, mm5 ; mm0=BO
|
||||
movq mm6, mm4 ; mm6=BE
|
||||
|
||||
movq mm4, mm0
|
||||
punpcklwd mm0, mm3
|
||||
punpckhwd mm4, mm3
|
||||
pmaddwd mm0, [GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
||||
pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
||||
|
||||
movq mm3, [GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
|
||||
|
||||
; Odd pixels: add both partial sums plus the rounding constant, shift back
; down by SCALEBITS, and pack the dwords to words.
paddd mm0, mm1
|
||||
paddd mm4, mm7
|
||||
paddd mm0, mm3
|
||||
paddd mm4, mm3
|
||||
psrld mm0, SCALEBITS ; mm0=YOL
|
||||
psrld mm4, SCALEBITS ; mm4=YOH
|
||||
packssdw mm0, mm4 ; mm0=YO
|
||||
|
||||
; Even pixels: same sequence, using the partial sums saved in wk(0)/wk(1).
movq mm4, mm6
|
||||
punpcklwd mm6, mm2
|
||||
punpckhwd mm4, mm2
|
||||
pmaddwd mm6, [GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
||||
pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
||||
|
||||
movq mm2, [GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
|
||||
|
||||
paddd mm6, MMWORD [wk(0)]
|
||||
paddd mm4, MMWORD [wk(1)]
|
||||
paddd mm6, mm2
|
||||
paddd mm4, mm2
|
||||
psrld mm6, SCALEBITS ; mm6=YEL
|
||||
psrld mm4, SCALEBITS ; mm4=YEH
|
||||
packssdw mm6, mm4 ; mm6=YE
|
||||
|
||||
; Re-interleave: odd-pixel results go to the high byte of each word and
; even-pixel results stay in the low byte, giving 8 consecutive Y bytes.
psllw mm0, BYTE_BIT
|
||||
por mm6, mm0 ; mm6=Y
|
||||
movq MMWORD [edi], mm6 ; Save Y
|
||||
|
||||
; Column-loop bookkeeping: SIZEOF_MMWORD columns were consumed, so advance the
; input pointer by that many pixels and the output pointer by that many bytes.
sub ecx, byte SIZEOF_MMWORD
|
||||
add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
|
||||
add edi, byte SIZEOF_MMWORD ; outptr0
|
||||
; At least one full vector left -> main loop; a non-zero remainder -> partial
; load path; zero -> row finished.
cmp ecx, byte SIZEOF_MMWORD
|
||||
jae near .columnloop
|
||||
test ecx, ecx
|
||||
jnz near .column_ld1
|
||||
|
||||
; Restore the per-row state saved at .rowloop (reverse push order).
pop ecx ; col
|
||||
pop esi
|
||||
pop edi
|
||||
poppic eax
|
||||
|
||||
; Step both row-pointer arrays to the next row and count the row down.
add esi, byte SIZEOF_JSAMPROW ; input_buf
|
||||
add edi, byte SIZEOF_JSAMPROW
|
||||
dec eax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
; Executed only when rows were processed; the early exits jump straight to
; .return, before any MMX instruction has run.
emms ; empty MMX state
|
||||
|
||||
; Epilogue: restore callee-saved registers, then unwind the aligned frame.
.return:
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
382
TMessagesProj/jni/mozjpeg/simd/i386/jcgryext-sse2.asm
Normal file
382
TMessagesProj/jni/mozjpeg/simd/i386/jcgryext-sse2.asm
Normal file
|
|
@ -0,0 +1,382 @@
|
|||
;
|
||||
; jcgryext.asm - grayscale colorspace conversion (SSE2)
|
||||
;
|
||||
; Copyright (C) 2011, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
|
||||
; JSAMPIMAGE output_buf, JDIMENSION output_row,
|
||||
; int num_rows);
|
||||
;
|
||||
|
||||
%define img_width(b) (b) + 8 ; JDIMENSION img_width
|
||||
%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
|
||||
%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
|
||||
%define output_row(b) (b) + 20 ; JDIMENSION output_row
|
||||
%define num_rows(b) (b) + 24 ; int num_rows
|
||||
|
||||
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
|
||||
|
||||
; Entry point: jsimd_rgb_gray_convert_sse2(img_width, input_buf, output_buf,
;                                          output_row, num_rows)
; Arguments are read from the stack through eax using the img_width(b) ..
; num_rows(b) offset macros defined above (b+8 .. b+24).
; The prologue builds a 16-byte-aligned frame: ebp points at a slot holding
; the pre-alignment stack pointer, and WK_NUM xmmword scratch slots (wk(i))
; live directly below ebp, so they can be accessed with movdqa.
; Callee-saved registers pushed here: ebx, esi, edi (ecx/edx are not saved).
EXTN(jsimd_rgb_gray_convert_sse2):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
; Store the unaligned frame address at the aligned location so that the
; epilogue can recover it with "pop esp".
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
; Reserve the wk[] scratch area (WK_NUM xmmwords) below ebp.
lea esp, [wk(0)]
|
||||
pushpic eax ; make room for the GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
; NOTE(review): get_GOT / movpic / pushpic are macros from the include file;
; presumably they expand to nothing in non-PIC builds -- confirm there.
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
; ecx = number of columns; nothing to do for a zero-width image.
mov ecx, JDIMENSION [img_width(eax)]
|
||||
test ecx, ecx
|
||||
jz near .return
|
||||
|
||||
; ecx is borrowed for output_row below, so park the column count on the stack.
push ecx
|
||||
|
||||
; edi = &output_buf[0][output_row] (address of the first output row pointer)
mov esi, JSAMPIMAGE [output_buf(eax)]
|
||||
mov ecx, JDIMENSION [output_row(eax)]
|
||||
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
|
||||
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
|
||||
|
||||
pop ecx
|
||||
|
||||
; esi = input_buf (address of the first input row pointer),
; eax = num_rows; a non-positive row count returns immediately.
mov esi, JSAMPARRAY [input_buf(eax)]
|
||||
mov eax, INT [num_rows(eax)]
|
||||
test eax, eax
|
||||
jle near .return
|
||||
alignx 16, 7
|
||||
; Per-row setup.
; On entry: eax = rows remaining, esi = address of current input row pointer,
;           edi = address of current output row pointer, ecx = column count.
; All four are saved on the stack and restored after the column loop, because
; the column loop reuses eax (GOT base), esi/edi (sample pointers) and ecx
; (columns remaining).
.rowloop:
|
||||
pushpic eax
|
||||
push edi
|
||||
push esi
|
||||
push ecx ; col
|
||||
|
||||
mov esi, JSAMPROW [esi] ; inptr
|
||||
mov edi, JSAMPROW [edi] ; outptr0
|
||||
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
||||
|
||||
; A full vector (SIZEOF_XMMWORD columns) goes to the main loop; otherwise
; fall through into the partial-load path at .column_ld1.
cmp ecx, byte SIZEOF_XMMWORD
|
||||
jae near .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
; Tail path (RGB_PIXELSIZE == 3): fewer than SIZEOF_XMMWORD columns remain.
; ecx is converted from columns to bytes; each set bit of cl then selects one
; power-of-two sized load, taken from the end of the row backwards. Pieces
; loaded earlier sit at higher addresses, so they are shifted up (or moved to
; a later register) whenever a lower-address piece is merged in.
; eax/edx are used as scratch for the 1- and 2-byte pieces and are saved and
; restored around that use.
.column_ld1:
|
||||
push eax
|
||||
push edx
|
||||
lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
|
||||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_BYTE
|
||||
movzx eax, byte [esi+ecx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
movzx edx, word [esi+ecx]
|
||||
shl eax, WORD_BIT
|
||||
or eax, edx
|
||||
.column_ld4:
|
||||
movd xmmA, eax
|
||||
pop edx
|
||||
pop eax
|
||||
test cl, SIZEOF_DWORD
|
||||
jz short .column_ld8
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
movd xmmF, XMM_DWORD [esi+ecx]
|
||||
pslldq xmmA, SIZEOF_DWORD
|
||||
por xmmA, xmmF
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_MMWORD
|
||||
jz short .column_ld16
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
movq xmmB, XMM_MMWORD [esi+ecx]
|
||||
pslldq xmmA, SIZEOF_MMWORD
|
||||
por xmmA, xmmB
|
||||
.column_ld16:
|
||||
test cl, SIZEOF_XMMWORD
|
||||
jz short .column_ld32
|
||||
; One whole xmmword precedes the partial data: the partial data becomes the
; second input register and the first xmmword of the row becomes xmmA.
movdqa xmmF, xmmA
|
||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||
; ecx = SIZEOF_XMMWORD makes the loop-end "sub ecx, SIZEOF_XMMWORD" reach
; zero, so this tail block is the last one processed for the row.
mov ecx, SIZEOF_XMMWORD
|
||||
jmp short .rgb_gray_cnv
|
||||
.column_ld32:
|
||||
; "mov" does not modify flags, so the jz below still acts on the test result.
test cl, 2*SIZEOF_XMMWORD
|
||||
mov ecx, SIZEOF_XMMWORD
|
||||
jz short .rgb_gray_cnv
|
||||
; Two whole xmmwords precede the partial data: the partial data becomes the
; third input register (xmmB).
movdqa xmmB, xmmA
|
||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||
jmp short .rgb_gray_cnv
|
||||
alignx 16, 7
|
||||
|
||||
; Main loop body (RGB_PIXELSIZE == 3): load 3 xmmwords = 16 packed 3-byte
; pixels. Unaligned loads are used for the input row.
.columnloop:
|
||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||
movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
|
||||
|
||||
; Deinterleave. In the lane comments below "cp" means byte c (0..2) of
; pixel p (0..F). Four rounds of byte-shift + byte-unpack regroup the data so
; that each byte position ends up split into even- and odd-numbered pixels.
.rgb_gray_cnv:
|
||||
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
|
||||
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
||||
|
||||
movdqa xmmG, xmmA
|
||||
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
|
||||
psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
|
||||
|
||||
punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
|
||||
pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
|
||||
|
||||
punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
|
||||
punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
|
||||
|
||||
movdqa xmmD, xmmA
|
||||
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
|
||||
psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
|
||||
|
||||
punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
|
||||
pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
|
||||
|
||||
punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
|
||||
punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
|
||||
|
||||
movdqa xmmE, xmmA
|
||||
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
|
||||
psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
|
||||
|
||||
punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
||||
pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
|
||||
|
||||
punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
|
||||
punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
|
||||
|
||||
; Zero register used to widen bytes to 16-bit words via unpack.
pxor xmmH, xmmH
|
||||
|
||||
movdqa xmmC, xmmA
|
||||
punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
||||
punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
||||
|
||||
movdqa xmmB, xmmE
|
||||
punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
||||
punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
||||
|
||||
movdqa xmmF, xmmD
|
||||
punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
||||
punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
; Tail path (RGB_PIXELSIZE == 4): fewer than SIZEOF_XMMWORD columns remain.
; ecx stays in units of columns; the address is scaled by RGB_PIXELSIZE.
; Bits of cl select 1-, 2-, 4- and 8-column pieces, loaded from the end of the
; row backwards; earlier (higher-address) pieces are shifted up or moved to
; later registers as lower-address pieces are merged in.
.column_ld1:
|
||||
test cl, SIZEOF_XMMWORD/16
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_XMMWORD/16
|
||||
movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_XMMWORD/8
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_XMMWORD/8
|
||||
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
pslldq xmmA, SIZEOF_MMWORD
|
||||
por xmmA, xmmE
|
||||
.column_ld4:
|
||||
test cl, SIZEOF_XMMWORD/4
|
||||
jz short .column_ld8
|
||||
sub ecx, byte SIZEOF_XMMWORD/4
|
||||
movdqa xmmE, xmmA
|
||||
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld8:
|
||||
; "mov" does not modify flags, so the jz below still acts on the test result.
; ecx = SIZEOF_XMMWORD makes the loop-end subtraction reach zero after this
; tail block.
test cl, SIZEOF_XMMWORD/2
|
||||
mov ecx, SIZEOF_XMMWORD
|
||||
jz short .rgb_gray_cnv
|
||||
; Eight whole pixels precede the partial data: shift the partial registers to
; the upper half of the input set and load the first two xmmwords of the row.
movdqa xmmF, xmmA
|
||||
movdqa xmmH, xmmE
|
||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||
jmp short .rgb_gray_cnv
|
||||
alignx 16, 7
|
||||
|
||||
; Main loop body (RGB_PIXELSIZE == 4): load 4 xmmwords = 16 packed 4-byte
; pixels. Unaligned loads are used for the input row.
.columnloop:
|
||||
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
|
||||
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
|
||||
movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
|
||||
movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
|
||||
|
||||
; Deinterleave. In the lane comments below "cp" means byte c (0..3) of
; pixel p (0..F). Byte and word unpacks transpose the data so that each byte
; position ends up split into even- and odd-numbered pixels.
.rgb_gray_cnv:
|
||||
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
|
||||
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
||||
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||
|
||||
movdqa xmmD, xmmA
|
||||
punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
|
||||
punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
|
||||
|
||||
movdqa xmmC, xmmF
|
||||
punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
|
||||
punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
|
||||
|
||||
movdqa xmmB, xmmA
|
||||
punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
|
||||
punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
|
||||
|
||||
movdqa xmmG, xmmD
|
||||
punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
|
||||
punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
|
||||
|
||||
movdqa xmmE, xmmA
|
||||
punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
||||
punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
|
||||
|
||||
movdqa xmmH, xmmB
|
||||
punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
|
||||
punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
|
||||
|
||||
; xmmF is the zero register for widening bytes to words ...
pxor xmmF, xmmF
|
||||
|
||||
movdqa xmmC, xmmA
|
||||
punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
||||
punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
||||
|
||||
movdqa xmmD, xmmB
|
||||
punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
||||
punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
||||
|
||||
movdqa xmmG, xmmE
|
||||
punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
||||
punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
|
||||
|
||||
; ... and is then consumed as a destination: the data bytes land in the high
; byte of each word, and the logical right shift by BYTE_BIT moves them down,
; which zero-extends without needing another zero register.
punpcklbw xmmF, xmmH
|
||||
punpckhbw xmmH, xmmH
|
||||
psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
||||
psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
; Luma computation for 16 pixels, held as 16-bit words split into even (E)
; and odd (O) pixels.
; NOTE(review): xmmA..xmmH are presumably aliases of xmm0..xmm7 chosen in the
; include file according to the RGB byte order, which is how the planes end
; up in the registers listed below -- confirm in jcolsamp.inc.
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
|
||||
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
|
||||
|
||||
; (Original)
|
||||
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
;
|
||||
; (This implementation)
|
||||
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||
|
||||
; The G coefficient is split in two so that the sum can be formed with two
; pmaddwd operations on interleaved word pairs: (R,G) and (B,G).
; "L"/"H" in the comments below are the low/high halves of each word vector.
movdqa xmm6, xmm1
|
||||
punpcklwd xmm1, xmm3
|
||||
punpckhwd xmm6, xmm3
|
||||
pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
movdqa xmm6, xmm0
|
||||
punpcklwd xmm0, xmm2
|
||||
punpckhwd xmm6, xmm2
|
||||
pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
; Spill the even-pixel partial sums to the aligned stack scratch slots; they
; are added back in after the (B,G) products for the even pixels are formed.
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
movdqa xmm0, xmm5 ; xmm0=BO
|
||||
movdqa xmm6, xmm4 ; xmm6=BE
|
||||
|
||||
movdqa xmm4, xmm0
|
||||
punpcklwd xmm0, xmm3
|
||||
punpckhwd xmm4, xmm3
|
||||
pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
||||
pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
||||
|
||||
movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
|
||||
|
||||
; Odd pixels: add both partial sums plus the rounding constant, shift back
; down by SCALEBITS, and pack the dwords to words.
paddd xmm0, xmm1
|
||||
paddd xmm4, xmm7
|
||||
paddd xmm0, xmm3
|
||||
paddd xmm4, xmm3
|
||||
psrld xmm0, SCALEBITS ; xmm0=YOL
|
||||
psrld xmm4, SCALEBITS ; xmm4=YOH
|
||||
packssdw xmm0, xmm4 ; xmm0=YO
|
||||
|
||||
; Even pixels: same sequence, using the partial sums saved in wk(0)/wk(1).
movdqa xmm4, xmm6
|
||||
punpcklwd xmm6, xmm2
|
||||
punpckhwd xmm4, xmm2
|
||||
pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
||||
pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
||||
|
||||
movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
|
||||
|
||||
paddd xmm6, XMMWORD [wk(0)]
|
||||
paddd xmm4, XMMWORD [wk(1)]
|
||||
paddd xmm6, xmm2
|
||||
paddd xmm4, xmm2
|
||||
psrld xmm6, SCALEBITS ; xmm6=YEL
|
||||
psrld xmm4, SCALEBITS ; xmm4=YEH
|
||||
packssdw xmm6, xmm4 ; xmm6=YE
|
||||
|
||||
; Re-interleave: odd-pixel results go to the high byte of each word and
; even-pixel results stay in the low byte, giving 16 consecutive Y bytes.
psllw xmm0, BYTE_BIT
|
||||
por xmm6, xmm0 ; xmm6=Y
|
||||
; NOTE(review): movdqa faults on an unaligned address, so this assumes the
; caller's output rows are 16-byte aligned -- confirm against the allocator.
movdqa XMMWORD [edi], xmm6 ; Save Y
|
||||
|
||||
; Column-loop bookkeeping: SIZEOF_XMMWORD columns were consumed, so advance
; the input pointer by that many pixels and the output pointer by that many
; bytes.
sub ecx, byte SIZEOF_XMMWORD
|
||||
add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr0
|
||||
; At least one full vector left -> main loop; a non-zero remainder -> partial
; load path; zero -> row finished.
cmp ecx, byte SIZEOF_XMMWORD
|
||||
jae near .columnloop
|
||||
test ecx, ecx
|
||||
jnz near .column_ld1
|
||||
|
||||
; Restore the per-row state saved at .rowloop (reverse push order).
pop ecx ; col
|
||||
pop esi
|
||||
pop edi
|
||||
poppic eax
|
||||
|
||||
; Step both row-pointer arrays to the next row and count the row down.
add esi, byte SIZEOF_JSAMPROW ; input_buf
|
||||
add edi, byte SIZEOF_JSAMPROW
|
||||
dec eax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
; Epilogue: restore callee-saved registers, then unwind the aligned frame.
.return:
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
424
TMessagesProj/jni/mozjpeg/simd/i386/jchuff-sse2.asm
Normal file
424
TMessagesProj/jni/mozjpeg/simd/i386/jchuff-sse2.asm
Normal file
|
|
@ -0,0 +1,424 @@
|
|||
;
|
||||
; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
|
||||
;
|
||||
; Copyright (C) 2009-2011, 2014-2017, D. R. Commander.
|
||||
; Copyright (C) 2015, Matthieu Darbois.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains an SSE2 implementation for Huffman coding of one block.
|
||||
; The following code is based directly on jchuff.c; see jchuff.c for more
|
||||
; details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_huff_encode_one_block)
|
||||
|
||||
EXTN(jconst_huff_encode_one_block):
|
||||
|
||||
%include "jpeg_nbits_table.inc"
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
|
||||
; These macros perform the same task as the emit_bits() function in the
|
||||
; original libjpeg code. In addition to reducing overhead by explicitly
|
||||
; inlining the code, additional performance is achieved by taking into
|
||||
; account the size of the bit buffer and waiting until it is almost full
|
||||
; before emptying it. This mostly benefits 64-bit platforms, since 6
|
||||
; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
|
||||
|
||||
; EMIT_BYTE: write the next 8 pending bits of put_buffer to the output
; buffer, followed by a stuffed 0x00 byte if the byte written was 0xFF.
; In/out: eax = output pointer (advanced by 1, or by 2 when a zero byte
; is stuffed); put_bits is decreased by 8.
; Clobbers: ecx, edx, flags.
%macro EMIT_BYTE 0
|
||||
sub put_bits, 8 ; put_bits -= 8;
|
||||
mov edx, put_buffer ; work on a copy so put_buffer itself is left intact
|
||||
mov ecx, put_bits ; variable shift counts must be in cl
|
||||
shr edx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
|
||||
mov byte [eax], dl ; *buffer++ = c;
|
||||
add eax, 1
|
||||
cmp dl, 0xFF ; need to stuff a zero byte?
|
||||
jne %%.EMIT_BYTE_END
|
||||
mov byte [eax], 0 ; *buffer++ = 0;
|
||||
add eax, 1
|
||||
%%.EMIT_BYTE_END:
|
||||
%endmacro
|
||||
|
||||
; PUT_BITS code: append bits to the low end of the bit buffer.
; %1 = operand holding the bits to OR into put_buffer.
; ecx = number of bits being appended (size); cl is used as the shift count.
; Modifies put_bits, put_buffer and flags.
%macro PUT_BITS 1
|
||||
add put_bits, ecx ; put_bits += size;
|
||||
shl put_buffer, cl ; put_buffer = (put_buffer << size);
|
||||
or put_buffer, %1 ; put_buffer |= code;
|
||||
%endmacro
|
||||
|
||||
; CHECKBUF15: if at least 16 bits are pending in the bit buffer, flush two
; bytes to the output through EMIT_BYTE.
; The output pointer is loaded from and stored back to the stack slot
; [esp+buffer]. Clobbers eax, ecx, edx and flags when the flush happens.
%macro CHECKBUF15 0
|
||||
cmp put_bits, 16 ; if (put_bits > 15) {
|
||||
jl %%.CHECKBUF15_END
|
||||
mov eax, POINTER [esp+buffer] ; eax = current output pointer
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
mov POINTER [esp+buffer], eax ; save the advanced output pointer
|
||||
%%.CHECKBUF15_END: ; }
|
||||
%endmacro
|
||||
|
||||
; EMIT_BITS code: append the bits in %1 (bit count in ecx) to the bit buffer
; with PUT_BITS, then let CHECKBUF15 flush two bytes if 16 or more bits are
; pending. ecx must hold the size on entry; the flush path clobbers eax,
; ecx and edx.
%macro EMIT_BITS 1
|
||||
PUT_BITS %1
|
||||
CHECKBUF15
|
||||
%endmacro
|
||||
|
||||
%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
|
||||
pxor xmm4, xmm4 ; __m128i neg = _mm_setzero_si128();
|
||||
pxor xmm5, xmm5 ; __m128i neg = _mm_setzero_si128();
|
||||
pxor xmm6, xmm6 ; __m128i neg = _mm_setzero_si128();
|
||||
pxor xmm7, xmm7 ; __m128i neg = _mm_setzero_si128();
|
||||
pinsrw %34, word [esi + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
|
||||
pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
|
||||
pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
|
||||
pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
|
||||
pinsrw %34, word [esi + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
|
||||
pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
|
||||
pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
|
||||
pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
|
||||
pinsrw %34, word [esi + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
|
||||
pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
|
||||
pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
|
||||
pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
|
||||
pinsrw %34, word [esi + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
|
||||
pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
|
||||
pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
|
||||
pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
|
||||
pinsrw %34, word [esi + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
|
||||
pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
|
||||
pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
|
||||
pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
|
||||
pinsrw %34, word [esi + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
|
||||
pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
|
||||
pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
|
||||
pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
|
||||
pinsrw %34, word [esi + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
|
||||
pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
|
||||
pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
|
||||
pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
|
||||
pinsrw %34, word [esi + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
|
||||
pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
|
||||
pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
|
||||
%if %1 != 32
|
||||
pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
|
||||
%else
|
||||
pinsrw %37, ecx, 7 ; xmm_shadow[31] = block[jno31];
|
||||
%endif
|
||||
pcmpgtw xmm4, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
|
||||
pcmpgtw xmm5, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
|
||||
pcmpgtw xmm6, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
|
||||
pcmpgtw xmm7, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
|
||||
paddw %34, xmm4 ; x1 = _mm_add_epi16(x1, neg);
|
||||
paddw %35, xmm5 ; x1 = _mm_add_epi16(x1, neg);
|
||||
paddw %36, xmm6 ; x1 = _mm_add_epi16(x1, neg);
|
||||
paddw %37, xmm7 ; x1 = _mm_add_epi16(x1, neg);
|
||||
pxor %34, xmm4 ; x1 = _mm_xor_si128(x1, neg);
|
||||
pxor %35, xmm5 ; x1 = _mm_xor_si128(x1, neg);
|
||||
pxor %36, xmm6 ; x1 = _mm_xor_si128(x1, neg);
|
||||
pxor %37, xmm7 ; x1 = _mm_xor_si128(x1, neg);
|
||||
pxor xmm4, %34 ; neg = _mm_xor_si128(neg, x1);
|
||||
pxor xmm5, %35 ; neg = _mm_xor_si128(neg, x1);
|
||||
pxor xmm6, %36 ; neg = _mm_xor_si128(neg, x1);
|
||||
pxor xmm7, %37 ; neg = _mm_xor_si128(neg, x1);
|
||||
movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
|
||||
movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
|
||||
movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
|
||||
movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
|
||||
movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
|
||||
movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
|
||||
movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
|
||||
movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
|
||||
%endmacro
|
||||
|
||||
;
|
||||
; Encode a single block's worth of coefficients.
|
||||
;
|
||||
; GLOBAL(JOCTET *)
|
||||
; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
|
||||
; JCOEFPTR block, int last_dc_val,
|
||||
; c_derived_tbl *dctbl, c_derived_tbl *actbl)
|
||||
;
|
||||
|
||||
; eax + 8 = working_state *state
|
||||
; eax + 12 = JOCTET *buffer
|
||||
; eax + 16 = JCOEFPTR block
|
||||
; eax + 20 = int last_dc_val
|
||||
; eax + 24 = c_derived_tbl *dctbl
|
||||
; eax + 28 = c_derived_tbl *actbl
|
||||
|
||||
%define pad 6 * SIZEOF_DWORD ; Align to 16 bytes
|
||||
%define t1 pad
|
||||
%define t2 t1 + (DCTSIZE2 * SIZEOF_WORD)
|
||||
%define block t2 + (DCTSIZE2 * SIZEOF_WORD)
|
||||
%define actbl block + SIZEOF_DWORD
|
||||
%define buffer actbl + SIZEOF_DWORD
|
||||
%define temp buffer + SIZEOF_DWORD
|
||||
%define temp2 temp + SIZEOF_DWORD
|
||||
%define temp3 temp2 + SIZEOF_DWORD
|
||||
%define temp4 temp3 + SIZEOF_DWORD
|
||||
%define temp5 temp4 + SIZEOF_DWORD
|
||||
%define gotptr temp5 + SIZEOF_DWORD ; void *gotptr
|
||||
%define put_buffer ebx
|
||||
%define put_bits edi
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
|
||||
|
||||
EXTN(jsimd_huff_encode_one_block_sse2):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
sub esp, temp5+9*SIZEOF_DWORD-pad
|
||||
push ebx
|
||||
push ecx
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
push ebp
|
||||
|
||||
mov esi, POINTER [eax+8] ; (working_state *state)
|
||||
mov put_buffer, dword [esi+8] ; put_buffer = state->cur.put_buffer;
|
||||
mov put_bits, dword [esi+12] ; put_bits = state->cur.put_bits;
|
||||
push esi ; esi is now scratch
|
||||
|
||||
get_GOT edx ; get GOT address
|
||||
movpic POINTER [esp+gotptr], edx ; save GOT address
|
||||
|
||||
mov ecx, POINTER [eax+28]
|
||||
mov edx, POINTER [eax+16]
|
||||
mov esi, POINTER [eax+12]
|
||||
mov POINTER [esp+actbl], ecx
|
||||
mov POINTER [esp+block], edx
|
||||
mov POINTER [esp+buffer], esi
|
||||
|
||||
; Encode the DC coefficient difference per section F.1.2.1
|
||||
mov esi, POINTER [esp+block] ; block
|
||||
movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val;
|
||||
sub ecx, dword [eax+20]
|
||||
mov esi, ecx
|
||||
|
||||
; This is a well-known technique for obtaining the absolute value
|
||||
; without a branch. It is derived from an assembly language technique
|
||||
; presented in "How to Optimize for the Pentium Processors",
|
||||
; Copyright (c) 1996, 1997 by Agner Fog.
|
||||
mov edx, ecx
|
||||
sar edx, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
|
||||
xor ecx, edx ; temp ^= temp3;
|
||||
sub ecx, edx ; temp -= temp3;
|
||||
|
||||
; For a negative input, want temp2 = bitwise complement of abs(input)
|
||||
; This code assumes we are on a two's complement machine
|
||||
add esi, edx ; temp2 += temp3;
|
||||
mov dword [esp+temp], esi ; backup temp2 in temp
|
||||
|
||||
; Find the number of bits needed for the magnitude of the coefficient
|
||||
movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp)
|
||||
movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)] ; nbits = JPEG_NBITS(temp);
|
||||
mov dword [esp+temp2], edx ; backup nbits in temp2
|
||||
|
||||
; Emit the Huffman-coded symbol for the number of bits
|
||||
mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore
|
||||
mov eax, INT [ebp + edx * 4] ; code = dctbl->ehufco[nbits];
|
||||
movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits];
|
||||
EMIT_BITS eax ; EMIT_BITS(code, size)
|
||||
|
||||
mov ecx, dword [esp+temp2] ; restore nbits
|
||||
|
||||
; Mask off any extra bits in code
|
||||
mov eax, 1
|
||||
shl eax, cl
|
||||
dec eax
|
||||
and eax, dword [esp+temp] ; temp2 &= (((JLONG)1)<<nbits) - 1;
|
||||
|
||||
; Emit that number of bits of the value, if positive,
|
||||
; or the complement of its magnitude, if negative.
|
||||
EMIT_BITS eax ; EMIT_BITS(temp2, nbits)
|
||||
|
||||
; Prepare data
|
||||
xor ecx, ecx
|
||||
mov esi, POINTER [esp+block]
|
||||
kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
|
||||
18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
|
||||
27, 20, 13, 6, 7, 14, 21, 28, 35, \
|
||||
xmm0, xmm1, xmm2, xmm3
|
||||
kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
|
||||
30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
|
||||
53, 60, 61, 54, 47, 55, 62, 63, 63, \
|
||||
xmm0, xmm1, xmm2, xmm3
|
||||
|
||||
pxor xmm7, xmm7
|
||||
movdqa xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
|
||||
movdqa xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
|
||||
movdqa xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
|
||||
movdqa xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
|
||||
pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
|
||||
pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
|
||||
pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
|
||||
pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
|
||||
packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
|
||||
packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
|
||||
pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
|
||||
pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
|
||||
shl ecx, 16
|
||||
or edx, ecx
|
||||
not edx ; index = ~index;
|
||||
|
||||
lea esi, [esp+t1]
|
||||
mov ebp, POINTER [esp+actbl] ; ebp = actbl
|
||||
|
||||
.BLOOP:
|
||||
bsf ecx, edx ; r = __builtin_ctzl(index);
|
||||
jz near .ELOOP
|
||||
lea esi, [esi+ecx*2] ; k += r;
|
||||
shr edx, cl ; index >>= r;
|
||||
mov dword [esp+temp3], edx
|
||||
.BRLOOP:
|
||||
cmp ecx, 16 ; while (r > 15) {
|
||||
jl near .ERLOOP
|
||||
sub ecx, 16 ; r -= 16;
|
||||
mov dword [esp+temp], ecx
|
||||
mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
|
||||
movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
|
||||
EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
|
||||
mov ecx, dword [esp+temp]
|
||||
jmp .BRLOOP
|
||||
.ERLOOP:
|
||||
movsx eax, word [esi] ; temp = t1[k];
|
||||
movpic edx, POINTER [esp+gotptr] ; load GOT address (edx)
|
||||
movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)] ; nbits = JPEG_NBITS(temp);
|
||||
mov dword [esp+temp2], eax
|
||||
; Emit Huffman symbol for run length / number of bits
|
||||
shl ecx, 4 ; temp3 = (r << 4) + nbits;
|
||||
add ecx, eax
|
||||
mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3];
|
||||
movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3];
|
||||
EMIT_BITS eax
|
||||
|
||||
movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
|
||||
; Mask off any extra bits in code
|
||||
mov ecx, dword [esp+temp2]
|
||||
mov eax, 1
|
||||
shl eax, cl
|
||||
dec eax
|
||||
and eax, edx ; temp2 &= (((JLONG)1)<<nbits) - 1;
|
||||
EMIT_BITS eax ; PUT_BITS(temp2, nbits)
|
||||
mov edx, dword [esp+temp3]
|
||||
add esi, 2 ; ++k;
|
||||
shr edx, 1 ; index >>= 1;
|
||||
|
||||
jmp .BLOOP
|
||||
.ELOOP:
|
||||
; Reached when the index for t1[0..31] has no set bits left. Build the
; nonzero mask for the second half, t1[32..63]; xmm7 is the zero vector
; set up by the pxor ahead of .BLOOP.
movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 32));
|
||||
movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 40));
|
||||
movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 48));
|
||||
movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 56));
|
||||
pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
|
||||
pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
|
||||
pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
|
||||
pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
|
||||
packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
|
||||
packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
|
||||
pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
|
||||
pmovmskb ecx, xmm2 ; index |= ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
|
||||
shl ecx, 16
|
||||
or edx, ecx
|
||||
not edx ; index = ~index;
|
||||
|
||||
|
||||
; esi still points at the first coefficient not yet consumed in the first
; half; everything from there up to t1[31] is zero and must be counted
; into the run length of the next nonzero coefficient.
lea eax, [esp + t1 + (DCTSIZE2/2) * 2] ; eax = &t1[DCTSIZE2/2]
|
||||
sub eax, esi ; eax = bytes from the current position to t1[32]
|
||||
shr eax, 1 ; eax = number of trailing zero coefficients in the first half
|
||||
bsf ecx, edx ; r = __builtin_ctzl(index);
|
||||
jz near .ELOOP2 ; no nonzero coefficient in the second half
|
||||
shr edx, cl ; index >>= r;
|
||||
add ecx, eax ; r += zeros carried over from the first half
|
||||
lea esi, [esi+ecx*2] ; k += r;
|
||||
mov dword [esp+temp3], edx ; spill index; it is reloaded at the end of .ERLOOP2
|
||||
jmp .BRLOOP2
|
||||
.BLOOP2:
|
||||
bsf ecx, edx ; r = __builtin_ctzl(index);
|
||||
jz near .ELOOP2
|
||||
lea esi, [esi+ecx*2] ; k += r;
|
||||
shr edx, cl ; index >>= r;
|
||||
mov dword [esp+temp3], edx
|
||||
.BRLOOP2:
|
||||
cmp ecx, 16 ; while (r > 15) {
|
||||
jl near .ERLOOP2
|
||||
sub ecx, 16 ; r -= 16;
|
||||
mov dword [esp+temp], ecx
|
||||
mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
|
||||
movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
|
||||
EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
|
||||
mov ecx, dword [esp+temp]
|
||||
jmp .BRLOOP2
|
||||
.ERLOOP2:
|
||||
movsx eax, word [esi] ; temp = t1[k];
|
||||
bsr eax, eax ; nbits = 32 - __builtin_clz(temp);
|
||||
inc eax
|
||||
mov dword [esp+temp2], eax
|
||||
; Emit Huffman symbol for run length / number of bits
|
||||
shl ecx, 4 ; temp3 = (r << 4) + nbits;
|
||||
add ecx, eax
|
||||
mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3];
|
||||
movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3];
|
||||
EMIT_BITS eax
|
||||
|
||||
movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
|
||||
; Mask off any extra bits in code
|
||||
mov ecx, dword [esp+temp2]
|
||||
mov eax, 1
|
||||
shl eax, cl
|
||||
dec eax
|
||||
and eax, edx ; temp2 &= (((JLONG)1)<<nbits) - 1;
|
||||
EMIT_BITS eax ; PUT_BITS(temp2, nbits)
|
||||
mov edx, dword [esp+temp3]
|
||||
add esi, 2 ; ++k;
|
||||
shr edx, 1 ; index >>= 1;
|
||||
|
||||
jmp .BLOOP2
|
||||
.ELOOP2:
|
||||
; If the last coef(s) were zero, emit an end-of-block code
|
||||
lea edx, [esp + t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
|
||||
cmp edx, esi ; if (r > 0) {
|
||||
je .EFN
|
||||
mov eax, INT [ebp] ; code = actbl->ehufco[0];
|
||||
movzx ecx, byte [ebp + 1024] ; size = actbl->ehufsi[0];
|
||||
EMIT_BITS eax
|
||||
.EFN:
|
||||
mov eax, [esp+buffer]
|
||||
pop esi
|
||||
; Save put_buffer & put_bits
|
||||
mov dword [esi+8], put_buffer ; state->cur.put_buffer = put_buffer;
|
||||
mov dword [esi+12], put_bits ; state->cur.put_bits = put_bits;
|
||||
|
||||
pop ebp
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
pop ecx
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
660
TMessagesProj/jni/mozjpeg/simd/i386/jcphuff-sse2.asm
Normal file
660
TMessagesProj/jni/mozjpeg/simd/i386/jcphuff-sse2.asm
Normal file
|
|
@ -0,0 +1,660 @@
|
|||
;
|
||||
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
|
||||
;
|
||||
; Copyright (C) 2016, 2018, Matthieu Darbois
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains an SSE2 implementation of data preparation for progressive
|
||||
; Huffman encoding. See jcphuff.c for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
|
||||
; jsimd_encode_mcu_AC_refine_prepare_sse2()
|
||||
|
||||
%macro LOAD16 0
|
||||
pxor N0, N0
|
||||
pxor N1, N1
|
||||
|
||||
mov T0, INT [LUT + 0*SIZEOF_INT]
|
||||
mov T1, INT [LUT + 8*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 0
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 0
|
||||
|
||||
mov T0, INT [LUT + 1*SIZEOF_INT]
|
||||
mov T1, INT [LUT + 9*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 1
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 1
|
||||
|
||||
mov T0, INT [LUT + 2*SIZEOF_INT]
|
||||
mov T1, INT [LUT + 10*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 2
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 2
|
||||
|
||||
mov T0, INT [LUT + 3*SIZEOF_INT]
|
||||
mov T1, INT [LUT + 11*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 3
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 3
|
||||
|
||||
mov T0, INT [LUT + 4*SIZEOF_INT]
|
||||
mov T1, INT [LUT + 12*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 4
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 4
|
||||
|
||||
mov T0, INT [LUT + 5*SIZEOF_INT]
|
||||
mov T1, INT [LUT + 13*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 5
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 5
|
||||
|
||||
mov T0, INT [LUT + 6*SIZEOF_INT]
|
||||
mov T1, INT [LUT + 14*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 6
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 6
|
||||
|
||||
mov T0, INT [LUT + 7*SIZEOF_INT]
|
||||
mov T1, INT [LUT + 15*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 7
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 7
|
||||
%endmacro
|
||||
|
||||
%macro LOAD15 0
|
||||
pxor N0, N0
|
||||
pxor N1, N1
|
||||
pxor X1, X1
|
||||
|
||||
mov T0, INT [LUT + 0*SIZEOF_INT]
|
||||
mov T1, INT [LUT + 8*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 0
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 0
|
||||
|
||||
mov T0, INT [LUT + 1*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 1
|
||||
|
||||
mov T0, INT [LUT + 2*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 2
|
||||
|
||||
mov T0, INT [LUT + 3*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 3
|
||||
|
||||
mov T0, INT [LUT + 4*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 4
|
||||
|
||||
mov T0, INT [LUT + 5*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 5
|
||||
|
||||
mov T0, INT [LUT + 6*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 6
|
||||
|
||||
mov T0, INT [LUT + 7*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 7
|
||||
|
||||
cmp LENEND, 2
|
||||
jl %%.ELOAD15
|
||||
mov T1, INT [LUT + 9*SIZEOF_INT]
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 1
|
||||
|
||||
cmp LENEND, 3
|
||||
jl %%.ELOAD15
|
||||
mov T1, INT [LUT + 10*SIZEOF_INT]
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 2
|
||||
|
||||
cmp LENEND, 4
|
||||
jl %%.ELOAD15
|
||||
mov T1, INT [LUT + 11*SIZEOF_INT]
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 3
|
||||
|
||||
cmp LENEND, 5
|
||||
jl %%.ELOAD15
|
||||
mov T1, INT [LUT + 12*SIZEOF_INT]
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 4
|
||||
|
||||
cmp LENEND, 6
|
||||
jl %%.ELOAD15
|
||||
mov T1, INT [LUT + 13*SIZEOF_INT]
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 5
|
||||
|
||||
cmp LENEND, 7
|
||||
jl %%.ELOAD15
|
||||
mov T1, INT [LUT + 14*SIZEOF_INT]
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 6
|
||||
%%.ELOAD15:
|
||||
%endmacro
|
||||
|
||||
; LOAD8: gather eight coefficients into X0.
; For i = 0..7, word lane i of X0 receives BLOCK[LUT[i]], where LUT is an
; array of ints used as element indices into BLOCK (indices are scaled by 2
; bytes). N0 is cleared so that the caller can use it as the comparand for
; the sign test. Clobbers T0.
%macro LOAD8 0
|
||||
pxor N0, N0 ; N0 = 0
|
||||
|
||||
mov T0, INT [LUT + 0*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 0
|
||||
|
||||
mov T0, INT [LUT + 1*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 1
|
||||
|
||||
mov T0, INT [LUT + 2*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 2
|
||||
|
||||
mov T0, INT [LUT + 3*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 3
|
||||
|
||||
mov T0, INT [LUT + 4*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 4
|
||||
|
||||
mov T0, INT [LUT + 5*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 5
|
||||
|
||||
mov T0, INT [LUT + 6*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 6
|
||||
|
||||
mov T0, INT [LUT + 7*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 7
|
||||
%endmacro
|
||||
|
||||
%macro LOAD7 0
|
||||
pxor N0, N0
|
||||
pxor X0, X0
|
||||
|
||||
mov T1, INT [LUT + 0*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T1 * 2], 0
|
||||
|
||||
cmp LENEND, 2
|
||||
jl %%.ELOAD7
|
||||
mov T1, INT [LUT + 1*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T1 * 2], 1
|
||||
|
||||
cmp LENEND, 3
|
||||
jl %%.ELOAD7
|
||||
mov T1, INT [LUT + 2*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T1 * 2], 2
|
||||
|
||||
cmp LENEND, 4
|
||||
jl %%.ELOAD7
|
||||
mov T1, INT [LUT + 3*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T1 * 2], 3
|
||||
|
||||
cmp LENEND, 5
|
||||
jl %%.ELOAD7
|
||||
mov T1, INT [LUT + 4*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T1 * 2], 4
|
||||
|
||||
cmp LENEND, 6
|
||||
jl %%.ELOAD7
|
||||
mov T1, INT [LUT + 5*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T1 * 2], 5
|
||||
|
||||
cmp LENEND, 7
|
||||
jl %%.ELOAD7
|
||||
mov T1, INT [LUT + 6*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T1 * 2], 6
|
||||
%%.ELOAD7:
|
||||
%endmacro
|
||||
|
||||
; REDUCE0: build a 64-bit "nonzero" bitmap for the 64 words at VALUES and
; store it as two 32-bit ints at the pointer held in ZEROBITS. Bit i of the
; result is set when VALUES[i] != 0.
; Expects ZERO to contain all zeros on entry. The last compare writes to
; xmm7 directly, so this only works while ZERO is %defined as xmm7, and
; ZERO no longer holds zero afterwards.
; Clobbers xmm0-xmm7, eax, ecx, edx, esi, edi and flags; edi is overwritten
; with the ZEROBITS pointer at the end.
%macro REDUCE0 0
|
||||
movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
|
||||
movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
|
||||
movdqa xmm2, XMMWORD [VALUES + (16*2)]
|
||||
movdqa xmm3, XMMWORD [VALUES + (24*2)]
|
||||
movdqa xmm4, XMMWORD [VALUES + (32*2)]
|
||||
movdqa xmm5, XMMWORD [VALUES + (40*2)]
|
||||
movdqa xmm6, XMMWORD [VALUES + (48*2)]
|
||||
|
||||
; Each word lane becomes 0xFFFF where the value is zero, 0 otherwise.
pcmpeqw xmm0, ZERO
|
||||
pcmpeqw xmm1, ZERO
|
||||
pcmpeqw xmm2, ZERO
|
||||
pcmpeqw xmm3, ZERO
|
||||
pcmpeqw xmm4, ZERO
|
||||
pcmpeqw xmm5, ZERO
|
||||
pcmpeqw xmm6, ZERO
|
||||
pcmpeqw xmm7, XMMWORD [VALUES + (56*2)] ; compares the zero register with memory in place (destroys it)
|
||||
|
||||
; Narrow the word masks to byte masks: 16 coefficients per register.
packsswb xmm0, xmm1
|
||||
packsswb xmm2, xmm3
|
||||
packsswb xmm4, xmm5
|
||||
packsswb xmm6, xmm7
|
||||
|
||||
; One mask bit per coefficient, 16 bits per register.
pmovmskb eax, xmm0
|
||||
pmovmskb ecx, xmm2
|
||||
pmovmskb edx, xmm4
|
||||
pmovmskb esi, xmm6
|
||||
|
||||
shl ecx, 16
|
||||
shl esi, 16
|
||||
|
||||
or eax, ecx ; eax = "is zero" bits for coefficients 0..31
|
||||
or edx, esi ; edx = "is zero" bits for coefficients 32..63
|
||||
|
||||
not eax ; invert: set bit = nonzero coefficient
|
||||
not edx
|
||||
|
||||
mov edi, ZEROBITS ; edi = destination pointer
|
||||
|
||||
mov INT [edi], eax
|
||||
mov INT [edi+SIZEOF_INT], edx
|
||||
%endmacro
|
||||
|
||||
;
|
||||
; Prepare data for jsimd_encode_mcu_AC_first().
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
|
||||
; const int *jpeg_natural_order_start,
|
||||
; int Sl, int Al, JCOEF *values,
|
||||
; size_t *zerobits)
|
||||
;
|
||||
; eax + 8 = const JCOEF *block
|
||||
; eax + 12 = const int *jpeg_natural_order_start
|
||||
; eax + 16 = int Sl
|
||||
; eax + 20 = int Al
|
||||
; eax + 24 = JCOEF *values
|
||||
; eax + 28 = size_t *zerobits
|
||||
|
||||
%define ZERO xmm7
|
||||
%define X0 xmm0
|
||||
%define X1 xmm1
|
||||
%define N0 xmm2
|
||||
%define N1 xmm3
|
||||
%define AL xmm4
|
||||
%define K eax
|
||||
%define LENEND eax
|
||||
%define LUT ebx
|
||||
%define T0 ecx
|
||||
%define T1 edx
|
||||
%define BLOCK esi
|
||||
%define VALUES edi
|
||||
%define LEN ebp
|
||||
|
||||
%define ZEROBITS INT [esp + 5 * 4]
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
|
||||
|
||||
EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
sub esp, 4
|
||||
push ebx
|
||||
push ecx
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
push ebp
|
||||
|
||||
mov BLOCK, INT [eax + 8]
|
||||
mov LUT, INT [eax + 12]
|
||||
mov VALUES, INT [eax + 24]
|
||||
movd AL, INT [eax + 20]
|
||||
mov T0, INT [eax + 28]
|
||||
mov ZEROBITS, T0
|
||||
mov LEN, INT [eax + 16]
|
||||
pxor ZERO, ZERO
|
||||
mov K, LEN
|
||||
and K, -16
|
||||
shr K, 4
|
||||
jz .ELOOP16
|
||||
.BLOOP16:
|
||||
LOAD16
|
||||
pcmpgtw N0, X0
|
||||
pcmpgtw N1, X1
|
||||
paddw X0, N0
|
||||
paddw X1, N1
|
||||
pxor X0, N0
|
||||
pxor X1, N1
|
||||
psrlw X0, AL
|
||||
psrlw X1, AL
|
||||
pxor N0, X0
|
||||
pxor N1, X1
|
||||
movdqa XMMWORD [VALUES + (0) * 2], X0
|
||||
movdqa XMMWORD [VALUES + (8) * 2], X1
|
||||
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
|
||||
movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
|
||||
add VALUES, 16*2
|
||||
add LUT, 16*SIZEOF_INT
|
||||
dec K
|
||||
jnz .BLOOP16
|
||||
test LEN, 15
|
||||
je .PADDING
|
||||
.ELOOP16:
|
||||
mov LENEND, LEN
|
||||
and LENEND, 7
|
||||
|
||||
test LEN, 8
|
||||
jz .TRY7
|
||||
test LEN, 7
|
||||
jz .TRY8
|
||||
|
||||
LOAD15
|
||||
pcmpgtw N0, X0
|
||||
pcmpgtw N1, X1
|
||||
paddw X0, N0
|
||||
paddw X1, N1
|
||||
pxor X0, N0
|
||||
pxor X1, N1
|
||||
psrlw X0, AL
|
||||
psrlw X1, AL
|
||||
pxor N0, X0
|
||||
pxor N1, X1
|
||||
movdqa XMMWORD [VALUES + (0) * 2], X0
|
||||
movdqa XMMWORD [VALUES + (8) * 2], X1
|
||||
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
|
||||
movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
|
||||
add VALUES, 16*2
|
||||
jmp .PADDING
|
||||
.TRY8:
|
||||
LOAD8
|
||||
pcmpgtw N0, X0
|
||||
paddw X0, N0
|
||||
pxor X0, N0
|
||||
psrlw X0, AL
|
||||
pxor N0, X0
|
||||
movdqa XMMWORD [VALUES + (0) * 2], X0
|
||||
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
|
||||
add VALUES, 8*2
|
||||
jmp .PADDING
|
||||
.TRY7:
|
||||
LOAD7
|
||||
pcmpgtw N0, X0
|
||||
paddw X0, N0
|
||||
pxor X0, N0
|
||||
psrlw X0, AL
|
||||
pxor N0, X0
|
||||
movdqa XMMWORD [VALUES + (0) * 2], X0
|
||||
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
|
||||
add VALUES, 8*2
|
||||
.PADDING:
|
||||
mov K, LEN
|
||||
add K, 7
|
||||
and K, -8
|
||||
shr K, 3
|
||||
sub K, DCTSIZE2/8
|
||||
jz .EPADDING
|
||||
align 16
|
||||
.ZEROLOOP:
|
||||
movdqa XMMWORD [VALUES + 0], ZERO
|
||||
add VALUES, 8*2
|
||||
inc K
|
||||
jnz .ZEROLOOP
|
||||
.EPADDING:
|
||||
sub VALUES, DCTSIZE2*2
|
||||
|
||||
REDUCE0
|
||||
|
||||
pop ebp
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
pop ecx
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
%undef ZERO
|
||||
%undef X0
|
||||
%undef X1
|
||||
%undef N0
|
||||
%undef N1
|
||||
%undef AL
|
||||
%undef K
|
||||
%undef LUT
|
||||
%undef T0
|
||||
%undef T1
|
||||
%undef BLOCK
|
||||
%undef VALUES
|
||||
%undef LEN
|
||||
|
||||
;
|
||||
; Prepare data for jsimd_encode_mcu_AC_refine().
|
||||
;
|
||||
; GLOBAL(int)
|
||||
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
|
||||
; const int *jpeg_natural_order_start,
|
||||
; int Sl, int Al, JCOEF *absvalues,
|
||||
; size_t *bits)
|
||||
;
|
||||
; eax + 8 = const JCOEF *block
|
||||
; eax + 12 = const int *jpeg_natural_order_start
|
||||
; eax + 16 = int Sl
|
||||
; eax + 20 = int Al
|
||||
; eax + 24 = JCOEF *absvalues
|
||||
; eax + 28 = size_t *bits
|
||||
|
||||
%define ZERO xmm7
|
||||
%define ONE xmm5
|
||||
%define X0 xmm0
|
||||
%define X1 xmm1
|
||||
%define N0 xmm2
|
||||
%define N1 xmm3
|
||||
%define AL xmm4
|
||||
%define K eax
|
||||
%define LENEND eax
|
||||
%define LUT ebx
|
||||
%define T0 ecx
|
||||
%define T0w cx
|
||||
%define T1 edx
|
||||
%define BLOCK esi
|
||||
%define VALUES edi
|
||||
%define KK ebp
|
||||
|
||||
%define ZEROBITS INT [esp + 5 * 4]
|
||||
%define EOB INT [esp + 5 * 4 + 4]
|
||||
%define LEN INT [esp + 5 * 4 + 8]
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
|
||||
|
||||
EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
sub esp, 16
|
||||
push ebx
|
||||
push ecx
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
push ebp
|
||||
|
||||
pcmpeqw ONE, ONE
|
||||
psrlw ONE, 15
|
||||
mov BLOCK, INT [eax + 8]
|
||||
mov LUT, INT [eax + 12]
|
||||
mov VALUES, INT [eax + 24]
|
||||
movd AL, INT [eax + 20]
|
||||
mov T0, INT [eax + 28]
|
||||
mov K, INT [eax + 16]
|
||||
mov INT [T0 + 2 * SIZEOF_INT], -1
|
||||
mov INT [T0 + 3 * SIZEOF_INT], -1
|
||||
mov ZEROBITS, T0
|
||||
mov LEN, K
|
||||
pxor ZERO, ZERO
|
||||
and K, -16
|
||||
mov EOB, 0
|
||||
xor KK, KK
|
||||
shr K, 4
|
||||
jz .ELOOPR16
|
||||
.BLOOPR16:
|
||||
LOAD16
|
||||
pcmpgtw N0, X0
|
||||
pcmpgtw N1, X1
|
||||
paddw X0, N0
|
||||
paddw X1, N1
|
||||
pxor X0, N0
|
||||
pxor X1, N1
|
||||
psrlw X0, AL
|
||||
psrlw X1, AL
|
||||
movdqa XMMWORD [VALUES + (0) * 2], X0
|
||||
movdqa XMMWORD [VALUES + (8) * 2], X1
|
||||
pcmpeqw X0, ONE
|
||||
pcmpeqw X1, ONE
|
||||
packsswb N0, N1
|
||||
packsswb X0, X1
|
||||
pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
|
||||
mov T1, ZEROBITS
|
||||
not T0
|
||||
mov word [T1 + 2 * SIZEOF_INT + KK], T0w
|
||||
pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
|
||||
bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
|
||||
jz .CONTINUER16 ; if (idx) {
|
||||
lea T1, [T1+KK*8]
|
||||
mov EOB, T1 ; EOB = k + idx;
|
||||
.CONTINUER16:
|
||||
add VALUES, 16*2
|
||||
add LUT, 16*SIZEOF_INT
|
||||
add KK, 2
|
||||
dec K
|
||||
jnz .BLOOPR16
|
||||
.ELOOPR16:
|
||||
mov LENEND, LEN
|
||||
|
||||
test LENEND, 8
|
||||
jz .TRYR7
|
||||
test LENEND, 7
|
||||
jz .TRYR8
|
||||
|
||||
and LENEND, 7
|
||||
LOAD15
|
||||
pcmpgtw N0, X0
|
||||
pcmpgtw N1, X1
|
||||
paddw X0, N0
|
||||
paddw X1, N1
|
||||
pxor X0, N0
|
||||
pxor X1, N1
|
||||
psrlw X0, AL
|
||||
psrlw X1, AL
|
||||
movdqa XMMWORD [VALUES + (0) * 2], X0
|
||||
movdqa XMMWORD [VALUES + (8) * 2], X1
|
||||
pcmpeqw X0, ONE
|
||||
pcmpeqw X1, ONE
|
||||
packsswb N0, N1
|
||||
packsswb X0, X1
|
||||
pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
|
||||
mov T1, ZEROBITS
|
||||
not T0
|
||||
mov word [T1 + 2 * SIZEOF_INT + KK], T0w
|
||||
pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
|
||||
bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
|
||||
jz .CONTINUER15 ; if (idx) {
|
||||
lea T1, [T1+KK*8]
|
||||
mov EOB, T1 ; EOB = k + idx;
|
||||
.CONTINUER15:
|
||||
add VALUES, 16*2
|
||||
jmp .PADDINGR
|
||||
.TRYR8:
|
||||
LOAD8
|
||||
|
||||
pcmpgtw N0, X0
|
||||
paddw X0, N0
|
||||
pxor X0, N0
|
||||
psrlw X0, AL
|
||||
movdqa XMMWORD [VALUES + (0) * 2], X0
|
||||
pcmpeqw X0, ONE
|
||||
packsswb N0, ZERO
|
||||
packsswb X0, ZERO
|
||||
pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
|
||||
mov T1, ZEROBITS
|
||||
not T0
|
||||
mov word [T1 + 2 * SIZEOF_INT + KK], T0w
|
||||
pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
|
||||
bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
|
||||
jz .CONTINUER8 ; if (idx) {
|
||||
lea T1, [T1+KK*8]
|
||||
mov EOB, T1 ; EOB = k + idx;
|
||||
.CONTINUER8:
|
||||
add VALUES, 8*2
|
||||
jmp .PADDINGR
|
||||
.TRYR7:
|
||||
and LENEND, 7
|
||||
LOAD7
|
||||
|
||||
pcmpgtw N0, X0
|
||||
paddw X0, N0
|
||||
pxor X0, N0
|
||||
psrlw X0, AL
|
||||
movdqa XMMWORD [VALUES + (0) * 2], X0
|
||||
pcmpeqw X0, ONE
|
||||
packsswb N0, ZERO
|
||||
packsswb X0, ZERO
|
||||
pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
|
||||
mov T1, ZEROBITS
|
||||
not T0
|
||||
mov word [T1 + 2 * SIZEOF_INT + KK], T0w
|
||||
pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
|
||||
bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
|
||||
jz .CONTINUER7 ; if (idx) {
|
||||
lea T1, [T1+KK*8]
|
||||
mov EOB, T1 ; EOB = k + idx;
|
||||
.CONTINUER7:
|
||||
add VALUES, 8*2
|
||||
.PADDINGR:
|
||||
mov K, LEN
|
||||
add K, 7
|
||||
and K, -8
|
||||
shr K, 3
|
||||
sub K, DCTSIZE2/8
|
||||
jz .EPADDINGR
|
||||
align 16
|
||||
.ZEROLOOPR:
|
||||
movdqa XMMWORD [VALUES + 0], ZERO
|
||||
add VALUES, 8*2
|
||||
inc K
|
||||
jnz .ZEROLOOPR
|
||||
.EPADDINGR:
|
||||
sub VALUES, DCTSIZE2*2
|
||||
|
||||
REDUCE0
|
||||
|
||||
mov eax, EOB
|
||||
|
||||
pop ebp
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
pop ecx
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
%undef ZERO
|
||||
%undef ONE
|
||||
%undef X0
|
||||
%undef X1
|
||||
%undef N0
|
||||
%undef N1
|
||||
%undef AL
|
||||
%undef K
|
||||
%undef KK
|
||||
%undef EOB
|
||||
%undef SIGN
|
||||
%undef LUT
|
||||
%undef T0
|
||||
%undef T1
|
||||
%undef BLOCK
|
||||
%undef VALUES
|
||||
%undef LEN
|
||||
%undef LENEND
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
388
TMessagesProj/jni/mozjpeg/simd/i386/jcsample-avx2.asm
Normal file
388
TMessagesProj/jni/mozjpeg/simd/i386/jcsample-avx2.asm
Normal file
|
|
@ -0,0 +1,388 @@
|
|||
;
|
||||
; jcsample.asm - downsampling (AVX2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Phase 1 (expand_right_edge): each of the max_v_samp_factor input rows is
; padded on the right, by repeating its last real sample, up to
; 2 * output_cols samples.
; Phase 2: every horizontal pair of samples is averaged into one output
; sample, with a rounding bias that alternates 0, 1 along the row.
;
; Preserved: esi, edi, ebp.  eax, ecx, edx and ymm0-ymm3, ymm6, ymm7 are
; used as scratch.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)

EXTN(jsimd_h2v1_downsample_avx2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return            ; nothing to do for zero blocks

    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge

    push        ecx                     ; keep output_cols for phase 2
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = samples to add per row
    jle         short .expand_end

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows to pad
    test        eax, eax
    jle         short .expand_end

    cld                                 ; stosb must walk forward
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    alignx      16, 7
.expandloop:
    push        eax                     ; al is about to hold the fill sample
    push        ecx                     ; rep stosb consumes ecx

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi = first sample past the image
    mov         al, JSAMPLE [edi-1]     ; last real sample of the row

    rep stosb

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    dec         eax
    jg          short .expandloop

.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern
    vmovd       xmm7, edx
    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    vpcmpeqw    ymm6, ymm6, ymm6
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    alignx      16, 7
.rowloop:
    push        ecx                     ; restored for the next row
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr

    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .columnloop       ; at least one full output vector
    alignx      16, 7

    ; Partial final vector: load only the input that is needed, zero the
    ; rest, and force ecx so that one more pass ends the row.  A whole
    ; YMMWORD is still written to the output row.
.columnloop_r24:
    ; ecx can possibly be 8, 16, 24
    cmp         ecx, 24
    jne         .columnloop_r16
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]  ; upper half of ymm1 = 0
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r16:
    cmp         ecx, 16
    jne         .columnloop_r8
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r8:
    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_YMMWORD]  ; upper half of ymm0 = 0
    vpxor       ymm1, ymm1, ymm1
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .downsample
    alignx      16, 7

.columnloop:
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]

.downsample:
    vpsrlw      ymm2, ymm0, BYTE_BIT    ; odd-numbered samples as words
    vpand       ymm0, ymm0, ymm6        ; even-numbered samples as words
    vpsrlw      ymm3, ymm1, BYTE_BIT
    vpand       ymm1, ymm1, ymm6

    vpaddw      ymm0, ymm0, ymm2        ; even + odd
    vpaddw      ymm1, ymm1, ymm3
    vpaddw      ymm0, ymm0, ymm7        ; + alternating bias
    vpaddw      ymm1, ymm1, ymm7
    vpsrlw      ymm0, ymm0, 1           ; / 2
    vpsrlw      ymm1, ymm1, 1

    vpackuswb   ymm0, ymm0, ymm1        ; packs within each 128-bit lane
    vpermq      ymm0, ymm0, 0xd8        ; restore linear order (qwords 0,2,1,3)

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0

    sub         ecx, byte SIZEOF_YMMWORD    ; outcol
    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr
    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .columnloop
    test        ecx, ecx
    jnz         near .columnloop_r24    ; leftover columns

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         eax                        ; rowctr
    jg          near .rowloop

.return:
    vzeroupper                          ; leave no dirty upper YMM state behind
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Phase 1 (expand_right_edge): each of the max_v_samp_factor input rows is
; padded on the right, by repeating its last real sample, up to
; 2 * output_cols samples.
; Phase 2: every 2x2 group of samples (two consecutive input rows) is
; averaged into one output sample, with a rounding bias that alternates
; 1, 2 along the row.  Two input rows are consumed per output row.
;
; Preserved: esi, edi, ebp.  eax, ecx, edx and ymm0-ymm7 are used as scratch.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)

EXTN(jsimd_h2v2_downsample_avx2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return            ; nothing to do for zero blocks

    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge

    push        ecx                     ; keep output_cols for phase 2
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = samples to add per row
    jle         short .expand_end

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows to pad
    test        eax, eax
    jle         short .expand_end

    cld                                 ; stosb must walk forward
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    alignx      16, 7
.expandloop:
    push        eax                     ; al is about to hold the fill sample
    push        ecx                     ; rep stosb consumes ecx

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi = first sample past the image
    mov         al, JSAMPLE [edi-1]     ; last real sample of the row

    rep stosb

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    dec         eax
    jg          short .expandloop

.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00020001         ; bias pattern
    vmovd       xmm7, edx
    vpcmpeqw    ymm6, ymm6, ymm6
    vpshufd     xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    alignx      16, 7
.rowloop:
    push        ecx                     ; restored for the next row pair
    push        edi
    push        esi

    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         edi, JSAMPROW [edi]                    ; outptr

    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .columnloop       ; at least one full output vector
    alignx      16, 7

    ; Partial final vector (24, 16 or 8 output columns left): load only the
    ; input that is needed, zero the rest, and force ecx so that one more
    ; pass ends the row.  A whole YMMWORD is still written to the output row.
.columnloop_r24:
    cmp         ecx, 24
    jne         .columnloop_r16
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]  ; upper half of ymm2 = 0
    vmovdqu     xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]  ; upper half of ymm3 = 0
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r16:
    cmp         ecx, 16
    jne         .columnloop_r8
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r8:
    vmovdqu     xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]  ; upper half of ymm0 = 0
    vmovdqu     xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; upper half of ymm1 = 0
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .downsample
    alignx      16, 7

.columnloop:
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
    vmovdqu     ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]

.downsample:
    ; first 32 input columns of both rows: horizontal pair sums
    vpand       ymm4, ymm0, ymm6        ; even-numbered samples, row 0
    vpsrlw      ymm0, ymm0, BYTE_BIT    ; odd-numbered samples, row 0
    vpand       ymm5, ymm1, ymm6        ; even-numbered samples, row 1
    vpsrlw      ymm1, ymm1, BYTE_BIT    ; odd-numbered samples, row 1
    vpaddw      ymm0, ymm0, ymm4
    vpaddw      ymm1, ymm1, ymm5

    ; next 32 input columns of both rows
    vpand       ymm4, ymm2, ymm6
    vpsrlw      ymm2, ymm2, BYTE_BIT
    vpand       ymm5, ymm3, ymm6
    vpsrlw      ymm3, ymm3, BYTE_BIT
    vpaddw      ymm2, ymm2, ymm4
    vpaddw      ymm3, ymm3, ymm5

    vpaddw      ymm0, ymm0, ymm1        ; row 0 + row 1
    vpaddw      ymm2, ymm2, ymm3
    vpaddw      ymm0, ymm0, ymm7        ; + alternating bias
    vpaddw      ymm2, ymm2, ymm7
    vpsrlw      ymm0, ymm0, 2           ; / 4
    vpsrlw      ymm2, ymm2, 2

    vpackuswb   ymm0, ymm0, ymm2        ; packs within each 128-bit lane
    vpermq      ymm0, ymm0, 0xd8        ; restore linear order (qwords 0,2,1,3)

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0

    sub         ecx, byte SIZEOF_YMMWORD    ; outcol
    add         edx, byte 2*SIZEOF_YMMWORD  ; inptr0
    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr1
    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
    cmp         ecx, byte SIZEOF_YMMWORD
    jae         near .columnloop
    test        ecx, ecx
    jnz         near .columnloop_r24    ; leftover columns

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         eax                          ; rowctr
    jg          near .rowloop

.return:
    vzeroupper                          ; leave no dirty upper YMM state behind
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
324
TMessagesProj/jni/mozjpeg/simd/i386/jcsample-mmx.asm
Normal file
324
TMessagesProj/jni/mozjpeg/simd/i386/jcsample-mmx.asm
Normal file
|
|
@ -0,0 +1,324 @@
|
|||
;
|
||||
; jcsample.asm - downsampling (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
;                           JDIMENSION v_samp_factor,
;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                           JSAMPARRAY output_data);
;
; Phase 1 (expand_right_edge): each of the max_v_samp_factor input rows is
; padded on the right, by repeating its last real sample, up to
; 2 * output_cols samples.
; Phase 2: every horizontal pair of samples is averaged into one output
; sample, with a rounding bias that alternates 0, 1 along the row.
;
; Preserved: esi, edi, ebp.  eax, ecx, edx and mm0-mm3, mm6, mm7 are used
; as scratch; emms is issued once MMX registers have been touched.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx)

EXTN(jsimd_h2v1_downsample_mmx):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return            ; nothing to do for zero blocks

    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge

    push        ecx                     ; keep output_cols for phase 2
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = samples to add per row
    jle         short .expand_end

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows to pad
    test        eax, eax
    jle         short .expand_end

    cld                                 ; stosb must walk forward
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    alignx      16, 7
.expandloop:
    push        eax                     ; al is about to hold the fill sample
    push        ecx                     ; rep stosb consumes ecx

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi = first sample past the image
    mov         al, JSAMPLE [edi-1]     ; last real sample of the row

    rep stosb

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    dec         eax
    jg          short .expandloop

.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern
    movd        mm7, edx
    pcmpeqw     mm6, mm6
    punpckldq   mm7, mm7                ; mm7={0, 1, 0, 1}
    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    alignx      16, 7
.rowloop:
    push        ecx                     ; restored for the next row
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr
    alignx      16, 7
.columnloop:
    ; ecx is width_in_blocks * 8, so stepping by one MMWORD of output per
    ; pass lands exactly on zero.

    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm1, MMWORD [esi+1*SIZEOF_MMWORD]
    movq        mm2, mm0
    movq        mm3, mm1

    pand        mm0, mm6                ; even-numbered samples as words
    psrlw       mm2, BYTE_BIT           ; odd-numbered samples as words
    pand        mm1, mm6
    psrlw       mm3, BYTE_BIT

    paddw       mm0, mm2                ; even + odd
    paddw       mm1, mm3
    paddw       mm0, mm7                ; + alternating bias
    paddw       mm1, mm7
    psrlw       mm0, 1                  ; / 2
    psrlw       mm1, 1

    packuswb    mm0, mm1                ; 8 words -> 8 output samples

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0

    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
    sub         ecx, byte SIZEOF_MMWORD    ; outcol
    jnz         short .columnloop

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         eax                        ; rowctr
    jg          short .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
;                           JDIMENSION v_samp_factor,
;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                           JSAMPARRAY output_data);
;
; Phase 1 (expand_right_edge): each of the max_v_samp_factor input rows is
; padded on the right, by repeating its last real sample, up to
; 2 * output_cols samples.
; Phase 2: every 2x2 group of samples (two consecutive input rows) is
; averaged into one output sample, with a rounding bias that alternates
; 1, 2 along the row.  Two input rows are consumed per output row.
;
; Preserved: esi, edi, ebp.  eax, ecx, edx and mm0-mm7 are used as scratch;
; emms is issued once MMX registers have been touched.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx)

EXTN(jsimd_h2v2_downsample_mmx):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return            ; nothing to do for zero blocks

    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge

    push        ecx                     ; keep output_cols for phase 2
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = samples to add per row
    jle         short .expand_end

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows to pad
    test        eax, eax
    jle         short .expand_end

    cld                                 ; stosb must walk forward
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    alignx      16, 7
.expandloop:
    push        eax                     ; al is about to hold the fill sample
    push        ecx                     ; rep stosb consumes ecx

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi = first sample past the image
    mov         al, JSAMPLE [edi-1]     ; last real sample of the row

    rep stosb

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    dec         eax
    jg          short .expandloop

.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00020001         ; bias pattern
    movd        mm7, edx
    pcmpeqw     mm6, mm6
    punpckldq   mm7, mm7                ; mm7={1, 2, 1, 2}
    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    alignx      16, 7
.rowloop:
    push        ecx                     ; restored for the next row pair
    push        edi
    push        esi

    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         edi, JSAMPROW [edi]                    ; outptr
    alignx      16, 7
.columnloop:
    ; ecx is width_in_blocks * 8, so stepping by one MMWORD of output per
    ; pass lands exactly on zero.

    movq        mm0, MMWORD [edx+0*SIZEOF_MMWORD]
    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm2, MMWORD [edx+1*SIZEOF_MMWORD]
    movq        mm3, MMWORD [esi+1*SIZEOF_MMWORD]

    ; first 8 input columns of both rows: horizontal pair sums
    movq        mm4, mm0
    movq        mm5, mm1
    pand        mm0, mm6                ; even-numbered samples, row 0
    psrlw       mm4, BYTE_BIT           ; odd-numbered samples, row 0
    pand        mm1, mm6                ; even-numbered samples, row 1
    psrlw       mm5, BYTE_BIT           ; odd-numbered samples, row 1
    paddw       mm0, mm4
    paddw       mm1, mm5

    ; next 8 input columns of both rows
    movq        mm4, mm2
    movq        mm5, mm3
    pand        mm2, mm6
    psrlw       mm4, BYTE_BIT
    pand        mm3, mm6
    psrlw       mm5, BYTE_BIT
    paddw       mm2, mm4
    paddw       mm3, mm5

    paddw       mm0, mm1                ; row 0 + row 1
    paddw       mm2, mm3
    paddw       mm0, mm7                ; + alternating bias
    paddw       mm2, mm7
    psrlw       mm0, 2                  ; / 4
    psrlw       mm2, 2

    packuswb    mm0, mm2                ; 8 words -> 8 output samples

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0

    add         edx, byte 2*SIZEOF_MMWORD  ; inptr0
    add         esi, byte 2*SIZEOF_MMWORD  ; inptr1
    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
    sub         ecx, byte SIZEOF_MMWORD    ; outcol
    jnz         near .columnloop

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         eax                          ; rowctr
    jg          near .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
351
TMessagesProj/jni/mozjpeg/simd/i386/jcsample-sse2.asm
Normal file
351
TMessagesProj/jni/mozjpeg/simd/i386/jcsample-sse2.asm
Normal file
|
|
@ -0,0 +1,351 @@
|
|||
;
|
||||
; jcsample.asm - downsampling (SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Phase 1 (expand_right_edge): each of the max_v_samp_factor input rows is
; padded on the right, by repeating its last real sample, up to
; 2 * output_cols samples.
; Phase 2: every horizontal pair of samples is averaged into one output
; sample, with a rounding bias that alternates 0, 1 along the row.
;
; Rows are accessed with movdqa, so input and output row pointers must be
; 16-byte aligned.
;
; Preserved: esi, edi, ebp.  eax, ecx, edx and xmm0-xmm3, xmm6, xmm7 are
; used as scratch.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return            ; nothing to do for zero blocks

    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge

    push        ecx                     ; keep output_cols for phase 2
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = samples to add per row
    jle         short .expand_end

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows to pad
    test        eax, eax
    jle         short .expand_end

    cld                                 ; stosb must walk forward
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    alignx      16, 7
.expandloop:
    push        eax                     ; al is about to hold the fill sample
    push        ecx                     ; rep stosb consumes ecx

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi = first sample past the image
    mov         al, JSAMPLE [edi-1]     ; last real sample of the row

    rep stosb

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    dec         eax
    jg          short .expandloop

.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    alignx      16, 7
.rowloop:
    push        ecx                     ; restored for the next row
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr

    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop       ; at least one full output vector
    alignx      16, 7

    ; Partial final vector: only one XMMWORD of input is needed.  The second
    ; half is zeroed and ecx is forced so that one more pass ends the row.
    ; A whole XMMWORD is still written to the output row.
.columnloop_r8:
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         ecx, SIZEOF_XMMWORD
    jmp         short .downsample
    alignx      16, 7

.columnloop:
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]

.downsample:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    pand        xmm0, xmm6              ; even-numbered samples as words
    psrlw       xmm2, BYTE_BIT          ; odd-numbered samples as words
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT

    paddw       xmm0, xmm2              ; even + odd
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm1, xmm7
    psrlw       xmm0, 1                 ; / 2
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; 16 words -> 16 output samples

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    test        ecx, ecx
    jnz         short .columnloop_r8    ; leftover columns

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         eax                        ; rowctr
    jg          near .rowloop

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Phase 1 (expand_right_edge): each of the max_v_samp_factor input rows is
; padded on the right, by repeating its last real sample, up to
; 2 * output_cols samples.
; Phase 2: every 2x2 group of samples (two consecutive input rows) is
; averaged into one output sample, with a rounding bias that alternates
; 1, 2 along the row.  Two input rows are consumed per output row.
;
; Rows are accessed with movdqa, so input and output row pointers must be
; 16-byte aligned.
;
; Preserved: esi, edi, ebp.  eax, ecx, edx and xmm0-xmm7 are used as scratch.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return            ; nothing to do for zero blocks

    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge

    push        ecx                     ; keep output_cols for phase 2
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = samples to add per row
    jle         short .expand_end

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows to pad
    test        eax, eax
    jle         short .expand_end

    cld                                 ; stosb must walk forward
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    alignx      16, 7
.expandloop:
    push        eax                     ; al is about to hold the fill sample
    push        ecx                     ; rep stosb consumes ecx

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi = first sample past the image
    mov         al, JSAMPLE [edi-1]     ; last real sample of the row

    rep stosb

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    dec         eax
    jg          short .expandloop

.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00020001         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    alignx      16, 7
.rowloop:
    push        ecx                     ; restored for the next row pair
    push        edi
    push        esi

    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         edi, JSAMPROW [edi]                    ; outptr

    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop       ; at least one full output vector
    alignx      16, 7

    ; Partial final vector: only one XMMWORD per input row is needed.  The
    ; second halves are zeroed and ecx is forced so that one more pass ends
    ; the row.  A whole XMMWORD is still written to the output row.
.columnloop_r8:
    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         ecx, SIZEOF_XMMWORD
    jmp         short .downsample
    alignx      16, 7

.columnloop:
    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]

.downsample:
    ; first 16 input columns of both rows: horizontal pair sums
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6              ; even-numbered samples, row 0
    psrlw       xmm4, BYTE_BIT          ; odd-numbered samples, row 0
    pand        xmm1, xmm6              ; even-numbered samples, row 1
    psrlw       xmm5, BYTE_BIT          ; odd-numbered samples, row 1
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    ; next 16 input columns of both rows
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    paddw       xmm0, xmm1              ; row 0 + row 1
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm2, xmm7
    psrlw       xmm0, 2                 ; / 4
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; 16 words -> 16 output samples

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
    add         edx, byte 2*SIZEOF_XMMWORD  ; inptr0
    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr1
    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         ecx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        ecx, ecx
    jnz         near .columnloop_r8     ; leftover columns

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         eax                          ; rowctr
    jg          near .rowloop

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
515
TMessagesProj/jni/mozjpeg/simd/i386/jdcolext-avx2.asm
Normal file
515
TMessagesProj/jni/mozjpeg/simd/i386/jdcolext-avx2.asm
Normal file
|
|
@ -0,0 +1,515 @@
|
|||
;
|
||||
; jdcolext.asm - colorspace conversion (AVX2)
|
||||
;
|
||||
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2012, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
|
||||
; JDIMENSION input_row, JSAMPARRAY output_buf,
|
||||
; int num_rows)
|
||||
;
|
||||
|
||||
%define out_width(b) (b) + 8 ; JDIMENSION out_width
|
||||
%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
|
||||
%define input_row(b) (b) + 16 ; JDIMENSION input_row
|
||||
%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
|
||||
%define num_rows(b) (b) + 24 ; int num_rows
|
||||
|
||||
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
|
||||
; ymmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
|
||||
|
||||
; Entry point: frame setup and argument loading.
;
; Arguments are read from the caller's stack through the out_width(),
; input_buf(), input_row(), output_buf() and num_rows() offset macros.
; On leaving this part (when it does not return early):
;   ecx         = out_width (columns per row)
;   esi/ebx/edx = &input_buf[0][input_row], &input_buf[1][input_row],
;                 &input_buf[2][input_row]  (arrays of row pointers)
;   edi         = output_buf (array of output row pointers)
;   eax         = num_rows
; ebx, esi and edi are saved here and restored at .return.
; Returns immediately when out_width == 0 or num_rows <= 0.
EXTN(jsimd_ycc_rgb_convert_avx2):
|
||||
push ebp
|
||||
; eax now addresses the saved ebp, i.e. the value a conventional frame
; pointer would hold: [eax]=saved ebp, arguments start at [eax+8].
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4 ; reserve one dword for the saved frame base
|
||||
and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
|
||||
mov [esp], eax ; stash frame base; ".return" reloads it with "pop esp"
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)] ; reserve the WK_NUM ymmword work slots below ebp
|
||||
pushpic eax ; make room for the GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
mov ecx, JDIMENSION [out_width(eax)] ; num_cols
|
||||
test ecx, ecx
|
||||
jz near .return ; nothing to convert
|
||||
|
||||
; ecx is borrowed as a temporary for input_row; keep num_cols on the stack.
push ecx
|
||||
|
||||
mov edi, JSAMPIMAGE [input_buf(eax)]
|
||||
mov ecx, JDIMENSION [input_row(eax)]
|
||||
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0]
|
||||
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1]
|
||||
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2]
|
||||
lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; esi = &input_buf[0][input_row]
|
||||
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] ; ebx = &input_buf[1][input_row]
|
||||
lea edx, [edx+ecx*SIZEOF_JSAMPROW] ; edx = &input_buf[2][input_row]
|
||||
|
||||
pop ecx ; ecx = num_cols again
|
||||
|
||||
mov edi, JSAMPARRAY [output_buf(eax)]
|
||||
mov eax, INT [num_rows(eax)] ; eax = row counter (frame base no longer needed)
|
||||
test eax, eax
|
||||
jle near .return ; signed test: zero or negative row count
|
||||
alignx 16, 7
|
||||
; Per-row setup. Saves the row counter (eax), the four row-pointer cursors
; (edi, edx, ebx, esi) and the column count (ecx); .nextrow pops them in
; the reverse order. The cursors are then dereferenced into sample
; pointers for the column loop, and eax is reused for the GOT address.
.rowloop:
|
||||
push eax
|
||||
push edi
|
||||
push edx
|
||||
push ebx
|
||||
push esi
|
||||
push ecx ; col
|
||||
|
||||
mov esi, JSAMPROW [esi] ; inptr0
|
||||
mov ebx, JSAMPROW [ebx] ; inptr1
|
||||
mov edx, JSAMPROW [edx] ; inptr2
|
||||
mov edi, JSAMPROW [edi] ; outptr
|
||||
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
||||
alignx 16, 7
|
||||
; Column loop, colour math: converts one YMMWORD of Y/Cb/Cr samples.
;   esi = inptr0 (Y), ebx = inptr1 (Cb), edx = inptr2 (Cr),
;   eax = GOT address used by GOTOFF(); ecx (columns left) is not touched.
; Samples are split into even (E) and odd (O) columns and widened to
; 16-bit words for the fixed-point arithmetic. Results on exit:
;   ymm0/ymm1 = R even/odd, ymm2/ymm3 = G even/odd, ymm4/ymm5 = B even/odd,
; each packed to bytes in the low 8 bytes of every 128-bit lane.
; NOTE(review): the three loads always fetch a full YMMWORD, even for the
; final partial group (the column count is only checked before the
; stores), so the input rows are presumably padded - confirm with caller.
.columnloop:
|
||||
|
||||
vmovdqu ymm5, YMMWORD [ebx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
|
||||
vmovdqu ymm1, YMMWORD [edx] ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
|
||||
|
||||
vpcmpeqw ymm0, ymm0, ymm0 ; ymm0=all ones
|
||||
vpcmpeqw ymm7, ymm7, ymm7 ; ymm7=all ones
|
||||
vpsrlw ymm0, ymm0, BYTE_BIT ; ymm0={0xFF 0x00 0xFF 0x00 ..}
|
||||
vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
||||
|
||||
vpand ymm4, ymm0, ymm5 ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
|
||||
vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
|
||||
vpand ymm0, ymm0, ymm1 ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
|
||||
vpsrlw ymm1, ymm1, BYTE_BIT ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
|
||||
|
||||
; Remove the chroma bias: 0xFF80 is -128 as a signed word.
vpaddw ymm2, ymm4, ymm7 ; ymm2=CbE-128
|
||||
vpaddw ymm3, ymm5, ymm7 ; ymm3=CbO-128
|
||||
vpaddw ymm6, ymm0, ymm7 ; ymm6=CrE-128
|
||||
vpaddw ymm7, ymm1, ymm7 ; ymm7=CrO-128
|
||||
|
||||
; (Original)
|
||||
; R = Y + 1.40200 * Cr
|
||||
; G = Y - 0.34414 * Cb - 0.71414 * Cr
|
||||
; B = Y + 1.77200 * Cb
|
||||
;
|
||||
; (This implementation)
|
||||
; R = Y + 0.40200 * Cr + Cr
|
||||
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
||||
; B = Y - 0.22800 * Cb + Cb + Cb
|
||||
|
||||
; The chroma values are doubled before vpmulhw (which keeps the high word
; of each product); adding PW_ONE and shifting right by one afterwards
; halves the result again with rounding.
vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbE
|
||||
vpaddw ymm5, ymm3, ymm3 ; ymm5=2*CbO
|
||||
vpaddw ymm0, ymm6, ymm6 ; ymm0=2*CrE
|
||||
vpaddw ymm1, ymm7, ymm7 ; ymm1=2*CrO
|
||||
|
||||
vpmulhw ymm4, ymm4, [GOTOFF(eax,PW_MF0228)] ; ymm4=(2*CbE * -FIX(0.22800))
|
||||
vpmulhw ymm5, ymm5, [GOTOFF(eax,PW_MF0228)] ; ymm5=(2*CbO * -FIX(0.22800))
|
||||
vpmulhw ymm0, ymm0, [GOTOFF(eax,PW_F0402)] ; ymm0=(2*CrE * FIX(0.40200))
|
||||
vpmulhw ymm1, ymm1, [GOTOFF(eax,PW_F0402)] ; ymm1=(2*CrO * FIX(0.40200))
|
||||
|
||||
vpaddw ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
|
||||
vpaddw ymm5, ymm5, [GOTOFF(eax,PW_ONE)]
|
||||
vpsraw ymm4, ymm4, 1 ; ymm4=(CbE * -FIX(0.22800))
|
||||
vpsraw ymm5, ymm5, 1 ; ymm5=(CbO * -FIX(0.22800))
|
||||
vpaddw ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
|
||||
vpaddw ymm1, ymm1, [GOTOFF(eax,PW_ONE)]
|
||||
vpsraw ymm0, ymm0, 1 ; ymm0=(CrE * FIX(0.40200))
|
||||
vpsraw ymm1, ymm1, 1 ; ymm1=(CrO * FIX(0.40200))
|
||||
|
||||
; Add back the integer parts: "+ Cb + Cb" for blue, "+ Cr" for red.
vpaddw ymm4, ymm4, ymm2
|
||||
vpaddw ymm5, ymm5, ymm3
|
||||
vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
|
||||
vpaddw ymm5, ymm5, ymm3 ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
|
||||
vpaddw ymm0, ymm0, ymm6 ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
|
||||
vpaddw ymm1, ymm1, ymm7 ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
|
||||
|
||||
; Spill (B-Y) to the aligned work area; ymm4/ymm5 are needed as scratch.
vmovdqa YMMWORD [wk(0)], ymm4 ; wk(0)=(B-Y)E
|
||||
vmovdqa YMMWORD [wk(1)], ymm5 ; wk(1)=(B-Y)O
|
||||
|
||||
; (G-Y): interleave the Cb and Cr words so that vpmaddwd produces one
; 32-bit sum Cb*-FIX(0.344) + Cr*FIX(0.285) per pixel (per the constant
; name and the result comments below), then round with PD_ONEHALF, shift
; down by SCALEBITS and narrow back to words. The unpack and pack
; instructions all operate per 128-bit lane, so vpackssdw restores the
; original element order.
vpunpckhwd ymm4, ymm2, ymm6
|
||||
vpunpcklwd ymm2, ymm2, ymm6
|
||||
vpmaddwd ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
vpunpckhwd ymm5, ymm3, ymm7
|
||||
vpunpcklwd ymm3, ymm3, ymm7
|
||||
vpmaddwd ymm3, ymm3, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
|
||||
vpaddd ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
|
||||
vpaddd ymm4, ymm4, [GOTOFF(eax,PD_ONEHALF)]
|
||||
vpsrad ymm2, ymm2, SCALEBITS
|
||||
vpsrad ymm4, ymm4, SCALEBITS
|
||||
vpaddd ymm3, ymm3, [GOTOFF(eax,PD_ONEHALF)]
|
||||
vpaddd ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
|
||||
vpsrad ymm3, ymm3, SCALEBITS
|
||||
vpsrad ymm5, ymm5, SCALEBITS
|
||||
|
||||
vpackssdw ymm2, ymm2, ymm4 ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
|
||||
vpackssdw ymm3, ymm3, ymm5 ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
|
||||
vpsubw ymm2, ymm2, ymm6 ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
|
||||
vpsubw ymm3, ymm3, ymm7 ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
|
||||
|
||||
vmovdqu ymm5, YMMWORD [esi] ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
|
||||
|
||||
vpcmpeqw ymm4, ymm4, ymm4
|
||||
vpsrlw ymm4, ymm4, BYTE_BIT ; ymm4={0xFF 0x00 0xFF 0x00 ..}
|
||||
vpand ymm4, ymm4, ymm5 ; ymm4=Y(02468ACEGIKMOQSU)=YE
|
||||
vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Y(13579BDFHJLNPRTV)=YO
|
||||
|
||||
; Add luma to each colour difference; vpackuswb narrows the words to
; bytes with unsigned saturation, clamping the results to 0..255.
vpaddw ymm0, ymm0, ymm4 ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
|
||||
vpaddw ymm1, ymm1, ymm5 ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
|
||||
vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
|
||||
vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
|
||||
|
||||
vpaddw ymm2, ymm2, ymm4 ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
|
||||
vpaddw ymm3, ymm3, ymm5 ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
|
||||
vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
|
||||
vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
|
||||
|
||||
vpaddw ymm4, ymm4, YMMWORD [wk(0)] ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
|
||||
vpaddw ymm5, ymm5, YMMWORD [wk(1)] ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
|
||||
vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
|
||||
vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
|
||||
|
||||
; 3-bytes-per-pixel output: interleave the six planar byte vectors (three
; components, even/odd columns) into packed pixels.
; In the layout comments each two-character entry is <component><pixel>:
; the first character is the byte position inside the pixel (0..2), the
; second is the pixel index (0..9, A..V). The first row of each comment is
; the low 128-bit lane, the second row the high lane; "**" = don't care.
; ymmA..ymmH are register aliases defined outside this view (presumably in
; jcolsamp.inc, chosen by the output component order) - TODO confirm.
; Result: ymmA, ymmD, ymmF hold output bytes 0-31, 32-63, 64-95 in order.
;
; NOTE(review): the 256-bit vpsrldq shifts each 128-bit lane separately,
; so after each vpsrldq the last two bytes of the LOW lane are really
; zero, not the next lane's bytes shown in the layout comments ("0G 1G",
; "2G 0H", "1H 2H"). Those bytes only propagate into dwords that the later
; vpunpckl* steps never select, so the stored result is unaffected.
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
|
||||
; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
|
||||
; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
|
||||
; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
|
||||
; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
|
||||
; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
|
||||
; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
|
||||
; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
|
||||
|
||||
; Step 1: byte interleave into three vectors of component pairs.
vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
|
||||
; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
|
||||
vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
|
||||
; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
|
||||
vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
|
||||
; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
|
||||
|
||||
; Step 2: word interleave, using copies shifted by one word (2 bytes).
vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
|
||||
; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
|
||||
vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
|
||||
; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
|
||||
vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
|
||||
; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
|
||||
|
||||
vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
|
||||
; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
|
||||
|
||||
vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
|
||||
; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
|
||||
vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
|
||||
; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
|
||||
vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
|
||||
; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
|
||||
|
||||
vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
|
||||
; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
|
||||
vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
|
||||
; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
|
||||
|
||||
; Step 3: dword interleave (0x4E swaps the two qwords of each lane).
vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
|
||||
; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
|
||||
vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
|
||||
vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
|
||||
; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
|
||||
vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
|
||||
; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
|
||||
|
||||
vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
|
||||
; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
|
||||
vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
|
||||
; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
|
||||
vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
|
||||
; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
|
||||
vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
|
||||
|
||||
; Step 4: keep only the low qword of each lane of both sources.
vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
|
||||
vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
|
||||
; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
|
||||
vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
|
||||
|
||||
; Step 5: recombine 128-bit halves so the three vectors are in memory order.
vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
|
||||
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
|
||||
vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
|
||||
|
||||
; 3-byte pixels, full group: store ymmA, ymmD, ymmF (three consecutive
; YMMWORDs of output) when at least SIZEOF_YMMWORD columns remain in ecx;
; otherwise branch to the partial-store code at .column_st64.
cmp ecx, byte SIZEOF_YMMWORD
|
||||
jb short .column_st64
|
||||
|
||||
; vmovntdq requires an aligned address, so it is used only when outptr is
; YMMWORD-aligned; the "sfence" after the row loop orders these stores.
test edi, SIZEOF_YMMWORD-1
|
||||
jnz short .out1
|
||||
; --(aligned)-------------------
|
||||
vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
|
||||
.out0:
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_YMMWORD ; columns left
|
||||
jz near .nextrow ; row finished exactly on a group boundary
|
||||
|
||||
add esi, byte SIZEOF_YMMWORD ; inptr0
|
||||
add ebx, byte SIZEOF_YMMWORD ; inptr1
|
||||
add edx, byte SIZEOF_YMMWORD ; inptr2
|
||||
jmp near .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
; 3-byte pixels, partial group: fewer than SIZEOF_YMMWORD columns remain.
; ecx is converted from columns to output bytes, then the data in
; ymmA/ymmD/ymmF is written in progressively smaller pieces (YMMWORD,
; XMMWORD, 8, 4, 2, 1 bytes) so nothing is written past the end of the
; row. After each piece the next unwritten bytes are moved to the bottom
; of ymmA/xmmA (and finally eax).
.column_st64:
|
||||
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp ecx, byte 2*SIZEOF_YMMWORD
|
||||
jb short .column_st32
|
||||
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
|
||||
add edi, byte 2*SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmF ; third vector becomes the pending data
|
||||
sub ecx, byte 2*SIZEOF_YMMWORD
|
||||
jmp short .column_st31
|
||||
.column_st32:
|
||||
cmp ecx, byte SIZEOF_YMMWORD
|
||||
jb short .column_st31
|
||||
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
|
||||
add edi, byte SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmD ; second vector becomes the pending data
|
||||
sub ecx, byte SIZEOF_YMMWORD
|
||||
jmp short .column_st31 ; redundant: the target is the next instruction
|
||||
.column_st31:
|
||||
cmp ecx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st15
|
||||
vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
vperm2i128 ymmA, ymmA, ymmA, 1 ; swap halves: upper 128 bits move into xmmA
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
.column_st15:
|
||||
; Store the lower 8 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jb short .column_st7
|
||||
vmovq XMM_MMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_MMWORD
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
vpsrldq xmmA, xmmA, SIZEOF_MMWORD
|
||||
.column_st7:
|
||||
; Store the lower 4 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp ecx, byte SIZEOF_DWORD
|
||||
jb short .column_st3
|
||||
vmovd XMM_DWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_DWORD
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
vpsrldq xmmA, xmmA, SIZEOF_DWORD
|
||||
.column_st3:
|
||||
; Store the lower 2 bytes of eax to the output when it has enough
|
||||
; space.
|
||||
; eax held the GOT address up to here; it is overwritten now. .nextrow
|
||||
; pops eax and .rowloop reloads the GOT address before it is used again.
vmovd eax, xmmA
|
||||
cmp ecx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov word [edi], ax
|
||||
add edi, byte SIZEOF_WORD
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
shr eax, 16
|
||||
.column_st1:
|
||||
; Store the lower 1 byte of eax to the output when it has enough
|
||||
; space.
|
||||
test ecx, ecx
|
||||
jz short .nextrow
|
||||
mov byte [edi], al
|
||||
|
||||
; 4-bytes-per-pixel output: ymm6/ymm7 are loaded with the filler byte
; (0xFF when RGBX_FILLER_0XFF is defined, otherwise 0x00), then the eight
; planar byte vectors are transposed into packed 4-byte pixels.
; In the layout comments each two-character entry is <component><pixel>:
; the first character is the byte position inside the pixel (0..3), the
; second is the pixel index (0..9, A..V). The first row of each comment is
; the low 128-bit lane, the second row the high lane; "**" = don't care.
; ymmA..ymmH are register aliases defined outside this view (presumably in
; jcolsamp.inc, chosen by the output component order) - TODO confirm.
; Result: ymmA, ymmD, ymmC, ymmH hold pixels 0-7, 8-F, G-N, O-V in order.
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
%ifdef RGBX_FILLER_0XFF
|
||||
vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
|
||||
vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
|
||||
%else
|
||||
vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
|
||||
vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
|
||||
%endif
|
||||
; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
|
||||
; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
|
||||
; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
|
||||
; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
|
||||
; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
|
||||
; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
|
||||
; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
|
||||
; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
|
||||
|
||||
; Step 1: byte interleave (components 0+1 and 2+3, even and odd pixels).
vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
|
||||
; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
|
||||
vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
|
||||
; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
|
||||
vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
|
||||
; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
|
||||
vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
|
||||
; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
|
||||
|
||||
; Step 2: word interleave -> complete 4-byte pixels (even / odd apart).
vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
|
||||
; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
|
||||
vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
|
||||
; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
|
||||
vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
|
||||
; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
|
||||
vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
|
||||
; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
|
||||
|
||||
; Step 3: dword interleave merges even and odd pixels in pixel order.
vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
|
||||
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
|
||||
vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
|
||||
; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
|
||||
vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
|
||||
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
|
||||
vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
|
||||
; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
|
||||
|
||||
; Step 4: recombine 128-bit halves so the four vectors are in memory order.
vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
|
||||
; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||
vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
|
||||
; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||
vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
|
||||
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
|
||||
vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
|
||||
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
|
||||
|
||||
; 4-byte pixels, full group: store ymmA, ymmD, ymmC, ymmH (four
; consecutive YMMWORDs of output) when at least SIZEOF_YMMWORD columns
; remain in ecx; otherwise branch to the partial-store code at
; .column_st64.
cmp ecx, byte SIZEOF_YMMWORD
|
||||
jb short .column_st64
|
||||
|
||||
; vmovntdq requires an aligned address, so it is used only when outptr is
; YMMWORD-aligned; the "sfence" after the row loop orders these stores.
test edi, SIZEOF_YMMWORD-1
|
||||
jnz short .out1
|
||||
; --(aligned)-------------------
|
||||
vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
|
||||
vmovntdq YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
|
||||
vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
|
||||
.out0:
|
||||
; No "byte" size override here, unlike the other adds in this block.
add edi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_YMMWORD ; columns left
|
||||
jz near .nextrow ; row finished exactly on a group boundary
|
||||
|
||||
add esi, byte SIZEOF_YMMWORD ; inptr0
|
||||
add ebx, byte SIZEOF_YMMWORD ; inptr1
|
||||
add edx, byte SIZEOF_YMMWORD ; inptr2
|
||||
jmp near .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
; 4-byte pixels, partial group: fewer than SIZEOF_YMMWORD columns remain.
; ecx stays in pixels here (it is not scaled to bytes), so each threshold
; is the number of pixels that fit in the piece being stored:
; SIZEOF_YMMWORD/2 pixels = two YMMWORDs, /4 = one YMMWORD, /8 = one
; XMMWORD, /16 = 8 bytes, then a single 4-byte pixel. After each piece
; the next unwritten pixels are moved to the bottom of ymmA/xmmA.
.column_st64:
|
||||
cmp ecx, byte SIZEOF_YMMWORD/2
|
||||
jb short .column_st32
|
||||
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
|
||||
add edi, byte 2*SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmC ; third/fourth vectors become the pending data
|
||||
vmovdqa ymmD, ymmH
|
||||
sub ecx, byte SIZEOF_YMMWORD/2
|
||||
.column_st32:
|
||||
cmp ecx, byte SIZEOF_YMMWORD/4
|
||||
jb short .column_st16
|
||||
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
|
||||
add edi, byte SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmD
|
||||
sub ecx, byte SIZEOF_YMMWORD/4
|
||||
.column_st16:
|
||||
cmp ecx, byte SIZEOF_YMMWORD/8
|
||||
jb short .column_st15
|
||||
vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
vperm2i128 ymmA, ymmA, ymmA, 1 ; swap halves: upper 128 bits move into xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_YMMWORD/8
|
||||
.column_st15:
|
||||
; Store two pixels (8 bytes) of ymmA to the output when it has enough
|
||||
; space.
|
||||
cmp ecx, byte SIZEOF_YMMWORD/16
|
||||
jb short .column_st7
|
||||
vmovq MMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_YMMWORD/16*4 ; pixels stored * 4 bytes each
|
||||
sub ecx, byte SIZEOF_YMMWORD/16
|
||||
vpsrldq xmmA, SIZEOF_YMMWORD/16*4 ; two-operand form: xmmA is source and destination
|
||||
.column_st7:
|
||||
; Store one pixel (4 bytes) of ymmA to the output when it has enough
|
||||
; space.
|
||||
test ecx, ecx
|
||||
jz short .nextrow
|
||||
vmovd XMM_DWORD [edi], xmmA
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
alignx 16, 7
|
||||
|
||||
; End of a row: restore the values saved at the top of .rowloop (popped in
; the reverse of the push order), step every row-pointer cursor to the
; next row, and loop while rows remain.
.nextrow:
|
||||
pop ecx ; column count
|
||||
pop esi ; Y row-pointer cursor
|
||||
pop ebx ; Cb row-pointer cursor
|
||||
pop edx ; Cr row-pointer cursor
|
||||
pop edi ; output row-pointer cursor
|
||||
pop eax ; row counter
|
||||
|
||||
add esi, byte SIZEOF_JSAMPROW
|
||||
add ebx, byte SIZEOF_JSAMPROW
|
||||
add edx, byte SIZEOF_JSAMPROW
|
||||
add edi, byte SIZEOF_JSAMPROW ; output_buf
|
||||
dec eax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
; Orders the non-temporal (vmovntdq) stores issued by the column loop.
sfence ; flush the write buffer
|
||||
|
||||
; Common exit, also reached directly from the entry code when there is
; nothing to do. Restores the callee-saved registers pushed at entry and
; unwinds the aligned frame.
.return:
|
||||
vzeroupper ; clear upper YMM halves before returning to non-AVX code
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
; [ebp] holds the frame base stored by the entry code ("mov [esp], eax").
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
404
TMessagesProj/jni/mozjpeg/simd/i386/jdcolext-mmx.asm
Normal file
404
TMessagesProj/jni/mozjpeg/simd/i386/jdcolext-mmx.asm
Normal file
|
|
@ -0,0 +1,404 @@
|
|||
;
|
||||
; jdcolext.asm - colorspace conversion (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_ycc_rgb_convert_mmx(JDIMENSION out_width, JSAMPIMAGE input_buf,
|
||||
; JDIMENSION input_row, JSAMPARRAY output_buf,
|
||||
; int num_rows)
|
||||
;
|
||||
|
||||
%define out_width(b) (b) + 8 ; JDIMENSION out_width
|
||||
%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
|
||||
%define input_row(b) (b) + 16 ; JDIMENSION input_row
|
||||
%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
|
||||
%define num_rows(b) (b) + 24 ; int num_rows
|
||||
|
||||
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
|
||||
; mmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_mmx)
|
||||
|
||||
EXTN(jsimd_ycc_rgb_convert_mmx):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic eax ; make a room for GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
mov ecx, JDIMENSION [out_width(eax)] ; num_cols
|
||||
test ecx, ecx
|
||||
jz near .return
|
||||
|
||||
push ecx
|
||||
|
||||
mov edi, JSAMPIMAGE [input_buf(eax)]
|
||||
mov ecx, JDIMENSION [input_row(eax)]
|
||||
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
|
||||
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
|
||||
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
|
||||
lea esi, [esi+ecx*SIZEOF_JSAMPROW]
|
||||
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
|
||||
lea edx, [edx+ecx*SIZEOF_JSAMPROW]
|
||||
|
||||
pop ecx
|
||||
|
||||
mov edi, JSAMPARRAY [output_buf(eax)]
|
||||
mov eax, INT [num_rows(eax)]
|
||||
test eax, eax
|
||||
jle near .return
|
||||
alignx 16, 7
|
||||
.rowloop:
|
||||
push eax
|
||||
push edi
|
||||
push edx
|
||||
push ebx
|
||||
push esi
|
||||
push ecx ; col
|
||||
|
||||
mov esi, JSAMPROW [esi] ; inptr0
|
||||
mov ebx, JSAMPROW [ebx] ; inptr1
|
||||
mov edx, JSAMPROW [edx] ; inptr2
|
||||
mov edi, JSAMPROW [edi] ; outptr
|
||||
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
||||
alignx 16, 7
|
||||
.columnloop:
|
||||
|
||||
movq mm5, MMWORD [ebx] ; mm5=Cb(01234567)
|
||||
movq mm1, MMWORD [edx] ; mm1=Cr(01234567)
|
||||
|
||||
pcmpeqw mm4, mm4
|
||||
pcmpeqw mm7, mm7
|
||||
psrlw mm4, BYTE_BIT
|
||||
psllw mm7, 7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
|
||||
movq mm0, mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
|
||||
|
||||
pand mm4, mm5 ; mm4=Cb(0246)=CbE
|
||||
psrlw mm5, BYTE_BIT ; mm5=Cb(1357)=CbO
|
||||
pand mm0, mm1 ; mm0=Cr(0246)=CrE
|
||||
psrlw mm1, BYTE_BIT ; mm1=Cr(1357)=CrO
|
||||
|
||||
paddw mm4, mm7
|
||||
paddw mm5, mm7
|
||||
paddw mm0, mm7
|
||||
paddw mm1, mm7
|
||||
|
||||
; (Original)
|
||||
; R = Y + 1.40200 * Cr
|
||||
; G = Y - 0.34414 * Cb - 0.71414 * Cr
|
||||
; B = Y + 1.77200 * Cb
|
||||
;
|
||||
; (This implementation)
|
||||
; R = Y + 0.40200 * Cr + Cr
|
||||
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
||||
; B = Y - 0.22800 * Cb + Cb + Cb
|
||||
|
||||
movq mm2, mm4 ; mm2=CbE
|
||||
movq mm3, mm5 ; mm3=CbO
|
||||
paddw mm4, mm4 ; mm4=2*CbE
|
||||
paddw mm5, mm5 ; mm5=2*CbO
|
||||
movq mm6, mm0 ; mm6=CrE
|
||||
movq mm7, mm1 ; mm7=CrO
|
||||
paddw mm0, mm0 ; mm0=2*CrE
|
||||
paddw mm1, mm1 ; mm1=2*CrO
|
||||
|
||||
pmulhw mm4, [GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800))
|
||||
pmulhw mm5, [GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800))
|
||||
pmulhw mm0, [GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200))
|
||||
pmulhw mm1, [GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200))
|
||||
|
||||
paddw mm4, [GOTOFF(eax,PW_ONE)]
|
||||
paddw mm5, [GOTOFF(eax,PW_ONE)]
|
||||
psraw mm4, 1 ; mm4=(CbE * -FIX(0.22800))
|
||||
psraw mm5, 1 ; mm5=(CbO * -FIX(0.22800))
|
||||
paddw mm0, [GOTOFF(eax,PW_ONE)]
|
||||
paddw mm1, [GOTOFF(eax,PW_ONE)]
|
||||
psraw mm0, 1 ; mm0=(CrE * FIX(0.40200))
|
||||
psraw mm1, 1 ; mm1=(CrO * FIX(0.40200))
|
||||
|
||||
paddw mm4, mm2
|
||||
paddw mm5, mm3
|
||||
paddw mm4, mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
|
||||
paddw mm5, mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
|
||||
paddw mm0, mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
|
||||
paddw mm1, mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O
|
||||
|
||||
movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E
|
||||
movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O
|
||||
|
||||
movq mm4, mm2
|
||||
movq mm5, mm3
|
||||
punpcklwd mm2, mm6
|
||||
punpckhwd mm4, mm6
|
||||
pmaddwd mm2, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
pmaddwd mm4, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
punpcklwd mm3, mm7
|
||||
punpckhwd mm5, mm7
|
||||
pmaddwd mm3, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
pmaddwd mm5, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
|
||||
paddd mm2, [GOTOFF(eax,PD_ONEHALF)]
|
||||
paddd mm4, [GOTOFF(eax,PD_ONEHALF)]
|
||||
psrad mm2, SCALEBITS
|
||||
psrad mm4, SCALEBITS
|
||||
paddd mm3, [GOTOFF(eax,PD_ONEHALF)]
|
||||
paddd mm5, [GOTOFF(eax,PD_ONEHALF)]
|
||||
psrad mm3, SCALEBITS
|
||||
psrad mm5, SCALEBITS
|
||||
|
||||
packssdw mm2, mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
|
||||
packssdw mm3, mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
|
||||
psubw mm2, mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
|
||||
psubw mm3, mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
|
||||
|
||||
movq mm5, MMWORD [esi] ; mm5=Y(01234567)
|
||||
|
||||
pcmpeqw mm4, mm4
|
||||
psrlw mm4, BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..}
|
||||
pand mm4, mm5 ; mm4=Y(0246)=YE
|
||||
psrlw mm5, BYTE_BIT ; mm5=Y(1357)=YO
|
||||
|
||||
paddw mm0, mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
|
||||
paddw mm1, mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
|
||||
packuswb mm0, mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
|
||||
packuswb mm1, mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
|
||||
|
||||
paddw mm2, mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
|
||||
paddw mm3, mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
|
||||
packuswb mm2, mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
|
||||
packuswb mm3, mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
|
||||
|
||||
paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
|
||||
paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
|
||||
packuswb mm4, mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
|
||||
packuswb mm5, mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
|
||||
; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
|
||||
; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
|
||||
; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
|
||||
|
||||
punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
|
||||
punpcklbw mmE, mmB ; mmE=(20 01 22 03 24 05 26 07)
|
||||
punpcklbw mmD, mmF ; mmD=(11 21 13 23 15 25 17 27)
|
||||
|
||||
movq mmG, mmA
|
||||
movq mmH, mmA
|
||||
punpcklwd mmA, mmE ; mmA=(00 10 20 01 02 12 22 03)
|
||||
punpckhwd mmG, mmE ; mmG=(04 14 24 05 06 16 26 07)
|
||||
|
||||
psrlq mmH, 2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
|
||||
psrlq mmE, 2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
|
||||
|
||||
movq mmC, mmD
|
||||
movq mmB, mmD
|
||||
punpcklwd mmD, mmH ; mmD=(11 21 02 12 13 23 04 14)
|
||||
punpckhwd mmC, mmH ; mmC=(15 25 06 16 17 27 -- --)
|
||||
|
||||
psrlq mmB, 2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
|
||||
|
||||
movq mmF, mmE
|
||||
punpcklwd mmE, mmB ; mmE=(22 03 13 23 24 05 15 25)
|
||||
punpckhwd mmF, mmB ; mmF=(26 07 17 27 -- -- -- --)
|
||||
|
||||
punpckldq mmA, mmD ; mmA=(00 10 20 01 11 21 02 12)
|
||||
punpckldq mmE, mmG ; mmE=(22 03 13 23 04 14 24 05)
|
||||
punpckldq mmC, mmF ; mmC=(15 25 06 16 26 07 17 27)
|
||||
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jb short .column_st16
|
||||
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
|
||||
movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
|
||||
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
jz short .nextrow
|
||||
|
||||
add esi, byte SIZEOF_MMWORD ; inptr0
|
||||
add ebx, byte SIZEOF_MMWORD ; inptr1
|
||||
add edx, byte SIZEOF_MMWORD ; inptr2
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
|
||||
jmp near .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
.column_st16:
|
||||
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp ecx, byte 2*SIZEOF_MMWORD
|
||||
jb short .column_st8
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
|
||||
movq mmA, mmC
|
||||
sub ecx, byte 2*SIZEOF_MMWORD
|
||||
add edi, byte 2*SIZEOF_MMWORD
|
||||
jmp short .column_st4
|
||||
.column_st8:
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jb short .column_st4
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq mmA, mmE
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
add edi, byte SIZEOF_MMWORD
|
||||
.column_st4:
|
||||
movd eax, mmA
|
||||
cmp ecx, byte SIZEOF_DWORD
|
||||
jb short .column_st2
|
||||
mov dword [edi+0*SIZEOF_DWORD], eax
|
||||
psrlq mmA, DWORD_BIT
|
||||
movd eax, mmA
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
add edi, byte SIZEOF_DWORD
|
||||
.column_st2:
|
||||
cmp ecx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov word [edi+0*SIZEOF_WORD], ax
|
||||
shr eax, WORD_BIT
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
add edi, byte SIZEOF_WORD
|
||||
.column_st1:
|
||||
cmp ecx, byte SIZEOF_BYTE
|
||||
jb short .nextrow
|
||||
mov byte [edi+0*SIZEOF_BYTE], al
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
%ifdef RGBX_FILLER_0XFF
|
||||
pcmpeqb mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
|
||||
pcmpeqb mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
|
||||
%else
|
||||
pxor mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
|
||||
pxor mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
|
||||
%endif
|
||||
; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
|
||||
; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
|
||||
; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
|
||||
; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
|
||||
|
||||
punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
|
||||
punpcklbw mmE, mmG ; mmE=(20 30 22 32 24 34 26 36)
|
||||
punpcklbw mmB, mmD ; mmB=(01 11 03 13 05 15 07 17)
|
||||
punpcklbw mmF, mmH ; mmF=(21 31 23 33 25 35 27 37)
|
||||
|
||||
movq mmC, mmA
|
||||
punpcklwd mmA, mmE ; mmA=(00 10 20 30 02 12 22 32)
|
||||
punpckhwd mmC, mmE ; mmC=(04 14 24 34 06 16 26 36)
|
||||
movq mmG, mmB
|
||||
punpcklwd mmB, mmF ; mmB=(01 11 21 31 03 13 23 33)
|
||||
punpckhwd mmG, mmF ; mmG=(05 15 25 35 07 17 27 37)
|
||||
|
||||
movq mmD, mmA
|
||||
punpckldq mmA, mmB ; mmA=(00 10 20 30 01 11 21 31)
|
||||
punpckhdq mmD, mmB ; mmD=(02 12 22 32 03 13 23 33)
|
||||
movq mmH, mmC
|
||||
punpckldq mmC, mmG ; mmC=(04 14 24 34 05 15 25 35)
|
||||
punpckhdq mmH, mmG ; mmH=(06 16 26 36 07 17 27 37)
|
||||
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jb short .column_st16
|
||||
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
|
||||
movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
|
||||
movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
|
||||
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
jz short .nextrow
|
||||
|
||||
add esi, byte SIZEOF_MMWORD ; inptr0
|
||||
add ebx, byte SIZEOF_MMWORD ; inptr1
|
||||
add edx, byte SIZEOF_MMWORD ; inptr2
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
|
||||
jmp near .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
.column_st16:
|
||||
cmp ecx, byte SIZEOF_MMWORD/2
|
||||
jb short .column_st8
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
|
||||
movq mmA, mmC
|
||||
movq mmD, mmH
|
||||
sub ecx, byte SIZEOF_MMWORD/2
|
||||
add edi, byte 2*SIZEOF_MMWORD
|
||||
.column_st8:
|
||||
cmp ecx, byte SIZEOF_MMWORD/4
|
||||
jb short .column_st4
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
|
||||
movq mmA, mmD
|
||||
sub ecx, byte SIZEOF_MMWORD/4
|
||||
add edi, byte 1*SIZEOF_MMWORD
|
||||
.column_st4:
|
||||
cmp ecx, byte SIZEOF_MMWORD/8
|
||||
jb short .nextrow
|
||||
movd dword [edi+0*SIZEOF_DWORD], mmA
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
alignx 16, 7
|
||||
|
||||
.nextrow:
|
||||
pop ecx
|
||||
pop esi
|
||||
pop ebx
|
||||
pop edx
|
||||
pop edi
|
||||
pop eax
|
||||
|
||||
add esi, byte SIZEOF_JSAMPROW
|
||||
add ebx, byte SIZEOF_JSAMPROW
|
||||
add edx, byte SIZEOF_JSAMPROW
|
||||
add edi, byte SIZEOF_JSAMPROW ; output_buf
|
||||
dec eax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
.return:
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
458
TMessagesProj/jni/mozjpeg/simd/i386/jdcolext-sse2.asm
Normal file
458
TMessagesProj/jni/mozjpeg/simd/i386/jdcolext-sse2.asm
Normal file
|
|
@ -0,0 +1,458 @@
|
|||
;
|
||||
; jdcolext-sse2.asm - colorspace conversion (SSE2)
|
||||
;
|
||||
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2012, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
|
||||
; JDIMENSION input_row, JSAMPARRAY output_buf,
|
||||
; int num_rows)
|
||||
;
|
||||
|
||||
%define out_width(b) (b) + 8 ; JDIMENSION out_width
|
||||
%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
|
||||
%define input_row(b) (b) + 16 ; JDIMENSION input_row
|
||||
%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
|
||||
%define num_rows(b) (b) + 24 ; int num_rows
|
||||
|
||||
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
|
||||
|
||||
; jsimd_ycc_rgb_convert_sse2: function entry.
; Prologue: build a 16-byte-aligned frame in ebp so that wk[] can be accessed
; with aligned XMM moves. The pre-alignment frame base is kept in eax (used
; below to address the stack arguments) and is also stored at [ebp] so the
; epilogue can restore esp from it.
EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp (frame base: saved ebp at [eax], args from eax+8)
|
||||
sub esp, byte 4 ; reserve a dword below the saved ebp for the frame-base slot
|
||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [esp], eax ; store the original frame base at the aligned slot
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)] ; reserve the wk[WK_NUM] scratch area below ebp
|
||||
pushpic eax ; make room for the GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
|
||||
; push edx ; need not be preserved
|
||||
|
||||
push esi
|
||||
|
||||
push edi
|
||||
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
; Load the arguments; eax still addresses the caller's argument area.
; On exit from this section:
;   ecx = out_width (num_cols)
;   esi/ebx/edx = &input_buf[0..2][input_row]  (row-pointer cursors)
;   edi = output_buf (row-pointer cursor), eax = num_rows
mov ecx, JDIMENSION [out_width(eax)] ; num_cols
|
||||
test ecx, ecx
|
||||
jz near .return ; zero-width image: nothing to convert
|
||||
|
||||
push ecx ; keep num_cols; ecx is reused for input_row below
|
||||
|
||||
mov edi, JSAMPIMAGE [input_buf(eax)]
|
||||
mov ecx, JDIMENSION [input_row(eax)]
|
||||
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; component 0 rows (read as Y in the column loop)
|
||||
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; component 1 rows (read as Cb)
|
||||
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; component 2 rows (read as Cr)
|
||||
lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; advance each row array to input_row
|
||||
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
|
||||
lea edx, [edx+ecx*SIZEOF_JSAMPROW]
|
||||
|
||||
pop ecx ; ecx = num_cols again
|
||||
|
||||
mov edi, JSAMPARRAY [output_buf(eax)]
|
||||
mov eax, INT [num_rows(eax)] ; last use of eax as the argument base
|
||||
test eax, eax
|
||||
jle near .return ; num_rows <= 0: nothing to convert
|
||||
alignx 16, 7
|
||||
; Per-row setup. The row counter, the four row-array cursors and the column
; count are saved on the stack (they are popped in reverse order at .nextrow),
; then the cursors are dereferenced to get the sample pointers for this row.
.rowloop:
|
||||
push eax ; remaining num_rows
|
||||
push edi ; output_buf cursor
|
||||
push edx ; component 2 row cursor
|
||||
push ebx ; component 1 row cursor
|
||||
push esi ; component 0 row cursor
|
||||
push ecx ; col
|
||||
|
||||
mov esi, JSAMPROW [esi] ; inptr0
|
||||
mov ebx, JSAMPROW [ebx] ; inptr1
|
||||
mov edx, JSAMPROW [edx] ; inptr2
|
||||
mov edi, JSAMPROW [edi] ; outptr
|
||||
movpic eax, POINTER [gotptr] ; load GOT address (eax); base for the GOTOFF() constant loads
|
||||
alignx 16, 7
|
||||
; Column loop: each iteration converts 16 pixels (one XMMWORD per plane).
; This section loads Cb/Cr, splits them into even/odd 16-bit lanes and
; subtracts 128 so the chroma values are centered on zero.
.columnloop:
|
||||
|
||||
; movdqa is an aligned load, so the chroma rows must be 16-byte aligned.
movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
|
||||
movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
|
||||
|
||||
pcmpeqw xmm4, xmm4 ; all ones
|
||||
pcmpeqw xmm7, xmm7 ; all ones
|
||||
psrlw xmm4, BYTE_BIT ; keep only the low byte of each word set
|
||||
psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} (i.e. -128 per word)
|
||||
movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
|
||||
|
||||
pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
|
||||
psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
|
||||
pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
|
||||
psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
|
||||
|
||||
; Add -128 to every lane: chroma becomes signed and centered on zero.
paddw xmm4, xmm7
|
||||
paddw xmm5, xmm7
|
||||
paddw xmm0, xmm7
|
||||
paddw xmm1, xmm7
|
||||
|
||||
; (Original)
|
||||
; R = Y + 1.40200 * Cr
|
||||
; G = Y - 0.34414 * Cb - 0.71414 * Cr
|
||||
; B = Y + 1.77200 * Cb
|
||||
;
|
||||
; (This implementation)
|
||||
; R = Y + 0.40200 * Cr + Cr
|
||||
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
||||
; B = Y - 0.22800 * Cb + Cb + Cb
|
||||
|
||||
; Compute (B-Y) and (R-Y) for the even and odd lanes.
; pmulhw keeps only the high 16 bits of each signed 16x16 product, so the
; chroma is doubled first; the result then carries one extra fraction bit,
; and the following "+1, arithmetic shift right by 1" rounds to nearest.
; Unscaled copies of Cb/Cr are kept in xmm2/xmm3/xmm6/xmm7 for the integer
; parts of the coefficients and for the (G-Y) computation.
movdqa xmm2, xmm4 ; xmm2=CbE
|
||||
movdqa xmm3, xmm5 ; xmm3=CbO
|
||||
paddw xmm4, xmm4 ; xmm4=2*CbE
|
||||
paddw xmm5, xmm5 ; xmm5=2*CbO
|
||||
movdqa xmm6, xmm0 ; xmm6=CrE
|
||||
movdqa xmm7, xmm1 ; xmm7=CrO
|
||||
paddw xmm0, xmm0 ; xmm0=2*CrE
|
||||
paddw xmm1, xmm1 ; xmm1=2*CrO
|
||||
|
||||
pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
|
||||
pmulhw xmm5, [GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
|
||||
pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
|
||||
pmulhw xmm1, [GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
|
||||
|
||||
paddw xmm4, [GOTOFF(eax,PW_ONE)] ; rounding bias before the shift
|
||||
paddw xmm5, [GOTOFF(eax,PW_ONE)]
|
||||
psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
|
||||
psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
|
||||
paddw xmm0, [GOTOFF(eax,PW_ONE)] ; rounding bias before the shift
|
||||
paddw xmm1, [GOTOFF(eax,PW_ONE)]
|
||||
psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
|
||||
psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
|
||||
|
||||
; Add the integer parts: -0.228*Cb + 2*Cb = 1.772*Cb, 0.402*Cr + Cr = 1.402*Cr.
paddw xmm4, xmm2
|
||||
paddw xmm5, xmm3
|
||||
paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
|
||||
paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
|
||||
paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
|
||||
paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
|
||||
|
||||
; Compute (G-Y). (B-Y) is spilled to wk[] so xmm4/xmm5 can be reused as
; temporaries. Cb and Cr words are interleaved into (Cb, Cr) pairs and fed to
; pmaddwd against the (-FIX(0.344), FIX(0.285)) pairs, giving one 32-bit sum
; per pixel; PD_ONEHALF is added before the SCALEBITS shift to round.
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
|
||||
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
|
||||
|
||||
movdqa xmm4, xmm2 ; xmm4=CbE (copy for the high halves)
|
||||
movdqa xmm5, xmm3 ; xmm5=CbO (copy for the high halves)
|
||||
punpcklwd xmm2, xmm6 ; low 4 (CbE, CrE) pairs
|
||||
punpckhwd xmm4, xmm6 ; high 4 (CbE, CrE) pairs
|
||||
pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
pmaddwd xmm4, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
punpcklwd xmm3, xmm7 ; low 4 (CbO, CrO) pairs
|
||||
punpckhwd xmm5, xmm7 ; high 4 (CbO, CrO) pairs
|
||||
pmaddwd xmm3, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
|
||||
paddd xmm2, [GOTOFF(eax,PD_ONEHALF)] ; round to nearest
|
||||
paddd xmm4, [GOTOFF(eax,PD_ONEHALF)]
|
||||
psrad xmm2, SCALEBITS ; drop the fixed-point fraction
|
||||
psrad xmm4, SCALEBITS
|
||||
paddd xmm3, [GOTOFF(eax,PD_ONEHALF)] ; round to nearest
|
||||
paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
|
||||
psrad xmm3, SCALEBITS ; drop the fixed-point fraction
|
||||
psrad xmm5, SCALEBITS
|
||||
|
||||
packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
|
||||
packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
|
||||
psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
|
||||
psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
|
||||
|
||||
; Add Y to the three colour differences and saturate to unsigned bytes.
; After this section the low 8 bytes of xmm0/xmm1 hold R even/odd,
; xmm2/xmm3 hold G even/odd and xmm4/xmm5 hold B even/odd.
; movdqa is an aligned load, so the Y row must be 16-byte aligned.
movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
|
||||
|
||||
pcmpeqw xmm4, xmm4
|
||||
psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
|
||||
pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
|
||||
psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
|
||||
|
||||
paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
|
||||
paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
|
||||
packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
|
||||
packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
|
||||
|
||||
paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
|
||||
paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
|
||||
packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
|
||||
packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
|
||||
|
||||
; B is computed last, in place over YE/YO, from the values spilled to wk[].
paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
|
||||
paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
|
||||
packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
|
||||
packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
|
||||
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
|
||||
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
||||
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
|
||||
|
||||
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
||||
punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
|
||||
punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
|
||||
|
||||
movdqa xmmG, xmmA
|
||||
movdqa xmmH, xmmA
|
||||
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
|
||||
punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
|
||||
|
||||
psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
|
||||
psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
|
||||
|
||||
movdqa xmmC, xmmD
|
||||
movdqa xmmB, xmmD
|
||||
punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
|
||||
punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
|
||||
|
||||
psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
|
||||
|
||||
movdqa xmmF, xmmE
|
||||
punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
|
||||
punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
|
||||
|
||||
pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
|
||||
movdqa xmmB, xmmE
|
||||
punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
|
||||
punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
|
||||
punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
|
||||
|
||||
pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
|
||||
movdqa xmmB, xmmF
|
||||
punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
|
||||
punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
|
||||
punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
|
||||
|
||||
punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
|
||||
punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
||||
|
||||
cmp ecx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st32
|
||||
|
||||
test edi, SIZEOF_XMMWORD-1
|
||||
jnz short .out1
|
||||
; --(aligned)-------------------
|
||||
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
||||
.out0:
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
jz near .nextrow
|
||||
|
||||
add esi, byte SIZEOF_XMMWORD ; inptr0
|
||||
add ebx, byte SIZEOF_XMMWORD ; inptr1
|
||||
add edx, byte SIZEOF_XMMWORD ; inptr2
|
||||
jmp near .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
.column_st32:
|
||||
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp ecx, byte 2*SIZEOF_XMMWORD
|
||||
jb short .column_st16
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA, xmmF
|
||||
sub ecx, byte 2*SIZEOF_XMMWORD
|
||||
jmp short .column_st15
|
||||
.column_st16:
|
||||
cmp ecx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st15
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA, xmmD
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
.column_st15:
|
||||
; Store the lower 8 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jb short .column_st7
|
||||
movq XMM_MMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_MMWORD
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
psrldq xmmA, SIZEOF_MMWORD
|
||||
.column_st7:
|
||||
; Store the lower 4 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp ecx, byte SIZEOF_DWORD
|
||||
jb short .column_st3
|
||||
movd XMM_DWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_DWORD
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
psrldq xmmA, SIZEOF_DWORD
|
||||
.column_st3:
|
||||
; Store the lower 2 bytes of eax to the output when it has enough
|
||||
; space.
|
||||
movd eax, xmmA
|
||||
cmp ecx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov word [edi], ax
|
||||
add edi, byte SIZEOF_WORD
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
shr eax, 16
|
||||
.column_st1:
|
||||
; Store the lower 1 byte of eax to the output when it has enough
|
||||
; space.
|
||||
test ecx, ecx
|
||||
jz short .nextrow
|
||||
mov byte [edi], al
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
%ifdef RGBX_FILLER_0XFF
|
||||
pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
|
||||
pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
|
||||
%else
|
||||
pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
|
||||
pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
|
||||
%endif
|
||||
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
|
||||
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
|
||||
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
||||
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
|
||||
|
||||
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
||||
punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
|
||||
punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
|
||||
punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
|
||||
|
||||
movdqa xmmC, xmmA
|
||||
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
|
||||
punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
|
||||
movdqa xmmG, xmmB
|
||||
punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
|
||||
punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
|
||||
|
||||
movdqa xmmD, xmmA
|
||||
punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
|
||||
punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||
movdqa xmmH, xmmC
|
||||
punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
||||
punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||
|
||||
cmp ecx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st32
|
||||
|
||||
test edi, SIZEOF_XMMWORD-1
|
||||
jnz short .out1
|
||||
; --(aligned)-------------------
|
||||
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
||||
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
||||
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
||||
.out0:
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
jz near .nextrow
|
||||
|
||||
add esi, byte SIZEOF_XMMWORD ; inptr0
|
||||
add ebx, byte SIZEOF_XMMWORD ; inptr1
|
||||
add edx, byte SIZEOF_XMMWORD ; inptr2
|
||||
jmp near .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
.column_st32:
|
||||
cmp ecx, byte SIZEOF_XMMWORD/2
|
||||
jb short .column_st16
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA, xmmC
|
||||
movdqa xmmD, xmmH
|
||||
sub ecx, byte SIZEOF_XMMWORD/2
|
||||
.column_st16:
|
||||
cmp ecx, byte SIZEOF_XMMWORD/4
|
||||
jb short .column_st15
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA, xmmD
|
||||
sub ecx, byte SIZEOF_XMMWORD/4
|
||||
.column_st15:
|
||||
; Store two pixels (8 bytes) of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp ecx, byte SIZEOF_XMMWORD/8
|
||||
jb short .column_st7
|
||||
movq XMM_MMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD/8*4
|
||||
sub ecx, byte SIZEOF_XMMWORD/8
|
||||
psrldq xmmA, SIZEOF_XMMWORD/8*4
|
||||
.column_st7:
|
||||
; Store one pixel (4 bytes) of xmmA to the output when it has enough
|
||||
; space.
|
||||
test ecx, ecx
|
||||
jz short .nextrow
|
||||
movd XMM_DWORD [edi], xmmA
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
alignx 16, 7
|
||||
|
||||
; End of row: restore the values pushed at .rowloop (reverse order), step
; every row-array cursor to the next row pointer, and loop while rows remain.
.nextrow:
|
||||
pop ecx ; num_cols
|
||||
pop esi ; component 0 row cursor
|
||||
pop ebx ; component 1 row cursor
|
||||
pop edx ; component 2 row cursor
|
||||
pop edi ; output_buf cursor
|
||||
pop eax ; remaining num_rows
|
||||
|
||||
add esi, byte SIZEOF_JSAMPROW
|
||||
add ebx, byte SIZEOF_JSAMPROW
|
||||
add edx, byte SIZEOF_JSAMPROW
|
||||
add edi, byte SIZEOF_JSAMPROW ; output_buf
|
||||
dec eax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
; The aligned store path uses non-temporal movntdq stores; sfence orders them
; before returning. The early-exit jumps to .return skip it (nothing stored).
sfence ; flush the write buffer
|
||||
|
||||
; Epilogue: restore callee-saved registers, then unwind the aligned frame.
.return:
|
||||
pop edi
|
||||
|
||||
pop esi
|
||||
|
||||
; pop edx ; need not be preserved
|
||||
|
||||
; pop ecx ; need not be preserved
|
||||
|
||||
pop ebx
|
||||
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp (value stored at [ebp] by the prologue)
|
||||
pop ebp ; restore the caller's ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
118
TMessagesProj/jni/mozjpeg/simd/i386/jdcolor-avx2.asm
Normal file
118
TMessagesProj/jni/mozjpeg/simd/i386/jdcolor-avx2.asm
Normal file
|
|
@ -0,0 +1,118 @@
|
|||
;
|
||||
; jdcolor-avx2.asm - colorspace conversion (AVX2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point precision: FIX(x) below is x * 2^SCALEBITS rounded to nearest.
%define SCALEBITS 16
|
||||
|
||||
F_0_344 equ 22554 ; FIX(0.34414)
|
||||
F_0_714 equ 46802 ; FIX(0.71414)
|
||||
F_1_402 equ 91881 ; FIX(1.40200)
|
||||
F_1_772 equ 116130 ; FIX(1.77200)
|
||||
; Derived coefficients, expressed relative to FIX(1) = 65536 or FIX(2) = 131072.
; The results (26345, 18734, 14942) fit in a signed 16-bit word, unlike
; F_0_714, F_1_402 and F_1_772 themselves.
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
|
||||
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
|
||||
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for the AVX2 YCC->RGB converters. Every entry is 32 bytes
; (one YMM register wide): 16 words or 8 dwords.
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
|
||||
|
||||
EXTN(jconst_ycc_rgb_convert_avx2):
|
||||
|
||||
PW_F0402 times 16 dw F_0_402 ; 16 x FIX(0.402)
|
||||
PW_MF0228 times 16 dw -F_0_228 ; 16 x -FIX(0.228)
|
||||
PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285 ; 8 x (-FIX(0.344), FIX(0.285)) word pairs
|
||||
PW_ONE times 16 dw 1 ; 16 x 1
|
||||
PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) ; 8 x 0.5 in SCALEBITS fixed point (presumably a rounding bias; the consumer is in jdcolext-avx2.asm)
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
|
||||
%include "jdcolext-avx2.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGB_RED
|
||||
%define RGB_GREEN EXT_RGB_GREEN
|
||||
%define RGB_BLUE EXT_RGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2
|
||||
%include "jdcolext-avx2.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGBX_RED
|
||||
%define RGB_GREEN EXT_RGBX_GREEN
|
||||
%define RGB_BLUE EXT_RGBX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2
|
||||
%include "jdcolext-avx2.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGR_RED
|
||||
%define RGB_GREEN EXT_BGR_GREEN
|
||||
%define RGB_BLUE EXT_BGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2
|
||||
%include "jdcolext-avx2.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGRX_RED
|
||||
%define RGB_GREEN EXT_BGRX_GREEN
|
||||
%define RGB_BLUE EXT_BGRX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2
|
||||
%include "jdcolext-avx2.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XBGR_RED
|
||||
%define RGB_GREEN EXT_XBGR_GREEN
|
||||
%define RGB_BLUE EXT_XBGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2
|
||||
%include "jdcolext-avx2.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XRGB_RED
|
||||
%define RGB_GREEN EXT_XRGB_GREEN
|
||||
%define RGB_BLUE EXT_XRGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2
|
||||
%include "jdcolext-avx2.asm"
|
||||
117
TMessagesProj/jni/mozjpeg/simd/i386/jdcolor-mmx.asm
Normal file
117
TMessagesProj/jni/mozjpeg/simd/i386/jdcolor-mmx.asm
Normal file
|
|
@ -0,0 +1,117 @@
|
|||
;
|
||||
; jdcolor-mmx.asm - colorspace conversion (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point precision: FIX(x) below is x * 2^SCALEBITS rounded to nearest.
%define SCALEBITS 16
|
||||
|
||||
F_0_344 equ 22554 ; FIX(0.34414)
|
||||
F_0_714 equ 46802 ; FIX(0.71414)
|
||||
F_1_402 equ 91881 ; FIX(1.40200)
|
||||
F_1_772 equ 116130 ; FIX(1.77200)
|
||||
; Derived coefficients, expressed relative to FIX(1) = 65536 or FIX(2) = 131072.
; The results (26345, 18734, 14942) fit in a signed 16-bit word, unlike
; F_0_714, F_1_402 and F_1_772 themselves.
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
|
||||
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
|
||||
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for the MMX YCC->RGB converters. Every entry is 8 bytes
; (one MMX register wide): 4 words or 2 dwords.
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_ycc_rgb_convert_mmx)
|
||||
|
||||
EXTN(jconst_ycc_rgb_convert_mmx):
|
||||
|
||||
PW_F0402 times 4 dw F_0_402 ; 4 x FIX(0.402)
|
||||
PW_MF0228 times 4 dw -F_0_228 ; 4 x -FIX(0.228)
|
||||
PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 ; 2 x (-FIX(0.344), FIX(0.285)) word pairs
|
||||
PW_ONE times 4 dw 1 ; 4 x 1
|
||||
PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1) ; 2 x 0.5 in SCALEBITS fixed point (presumably a rounding bias; the consumer is in jdcolext-mmx.asm)
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
|
||||
%include "jdcolext-mmx.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGB_RED
|
||||
%define RGB_GREEN EXT_RGB_GREEN
|
||||
%define RGB_BLUE EXT_RGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx
|
||||
%include "jdcolext-mmx.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGBX_RED
|
||||
%define RGB_GREEN EXT_RGBX_GREEN
|
||||
%define RGB_BLUE EXT_RGBX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx
|
||||
%include "jdcolext-mmx.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGR_RED
|
||||
%define RGB_GREEN EXT_BGR_GREEN
|
||||
%define RGB_BLUE EXT_BGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx
|
||||
%include "jdcolext-mmx.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGRX_RED
|
||||
%define RGB_GREEN EXT_BGRX_GREEN
|
||||
%define RGB_BLUE EXT_BGRX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx
|
||||
%include "jdcolext-mmx.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XBGR_RED
|
||||
%define RGB_GREEN EXT_XBGR_GREEN
|
||||
%define RGB_BLUE EXT_XBGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx
|
||||
%include "jdcolext-mmx.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XRGB_RED
|
||||
%define RGB_GREEN EXT_XRGB_GREEN
|
||||
%define RGB_BLUE EXT_XRGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx
|
||||
%include "jdcolext-mmx.asm"
|
||||
117
TMessagesProj/jni/mozjpeg/simd/i386/jdcolor-sse2.asm
Normal file
117
TMessagesProj/jni/mozjpeg/simd/i386/jdcolor-sse2.asm
Normal file
|
|
@ -0,0 +1,117 @@
|
|||
;
|
||||
; jdcolor.asm - colorspace conversion (SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point YCbCr->RGB coefficients.  FIX(x) is x scaled by 2^SCALEBITS
; (65536) and rounded to the nearest integer.  The three derived constants at
; the bottom all fit in a signed 16-bit word, whereas F_0_714, F_1_402 and
; F_1_772 do not; only the derived ones and F_0_344 are stored in the word
; table further down in this file.
%define SCALEBITS 16
|
||||
|
||||
F_0_344 equ 22554 ; FIX(0.34414)
|
||||
F_0_714 equ 46802 ; FIX(0.71414)
|
||||
F_1_402 equ 91881 ; FIX(1.40200)
|
||||
F_1_772 equ 116130 ; FIX(1.77200)
|
||||
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345
|
||||
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734
|
||||
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table published under the jconst_ycc_rgb_convert_sse2 symbol.
; Every row is 16 bytes (8 words or 4 dwords), i.e. one XMM register wide.
; SEG_CONST, alignz, GLOBAL_DATA and EXTN are not defined in this file;
; presumably they come from jsimdext.inc.
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
|
||||
|
||||
EXTN(jconst_ycc_rgb_convert_sse2):
|
||||
|
||||
PW_F0402 times 8 dw F_0_402 ; 8 words of FIX(0.40200)
|
||||
PW_MF0228 times 8 dw -F_0_228 ; 8 words of -FIX(0.22800)
|
||||
PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 ; 4 interleaved (-FIX(0.34414), FIX(0.28586)) word pairs
|
||||
PW_ONE times 8 dw 1 ; 8 words of 1
|
||||
PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) ; 4 dwords of 0.5 in SCALEBITS fixed point
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Code section, 32-bit mode.  This is the first inclusion of the shared
; body; this file sets no RGB_* or function-name bindings before it, so the
; body is assembled with whatever bindings are already in effect here.
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
|
||||
%include "jdcolext-sse2.asm"
|
||||
|
||||
; EXT_RGB pass: drop the previous RGB_* bindings, rebind them to the
; EXT_RGB_* layout values, and alias jsimd_ycc_rgb_convert_sse2 to
; jsimd_ycc_extrgb_convert_sse2 before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGB_RED
|
||||
%define RGB_GREEN EXT_RGB_GREEN
|
||||
%define RGB_BLUE EXT_RGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
|
||||
%include "jdcolext-sse2.asm"
|
||||
|
||||
; EXT_RGBX pass: drop the previous RGB_* bindings, rebind them to the
; EXT_RGBX_* layout values, and alias jsimd_ycc_rgb_convert_sse2 to
; jsimd_ycc_extrgbx_convert_sse2 before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGBX_RED
|
||||
%define RGB_GREEN EXT_RGBX_GREEN
|
||||
%define RGB_BLUE EXT_RGBX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
|
||||
%include "jdcolext-sse2.asm"
|
||||
|
||||
; EXT_BGR pass: drop the previous RGB_* bindings, rebind them to the
; EXT_BGR_* layout values, and alias jsimd_ycc_rgb_convert_sse2 to
; jsimd_ycc_extbgr_convert_sse2 before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGR_RED
|
||||
%define RGB_GREEN EXT_BGR_GREEN
|
||||
%define RGB_BLUE EXT_BGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
|
||||
%include "jdcolext-sse2.asm"
|
||||
|
||||
; EXT_BGRX pass: drop the previous RGB_* bindings, rebind them to the
; EXT_BGRX_* layout values, and alias jsimd_ycc_rgb_convert_sse2 to
; jsimd_ycc_extbgrx_convert_sse2 before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGRX_RED
|
||||
%define RGB_GREEN EXT_BGRX_GREEN
|
||||
%define RGB_BLUE EXT_BGRX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
|
||||
%include "jdcolext-sse2.asm"
|
||||
|
||||
; EXT_XBGR pass: drop the previous RGB_* bindings, rebind them to the
; EXT_XBGR_* layout values, and alias jsimd_ycc_rgb_convert_sse2 to
; jsimd_ycc_extxbgr_convert_sse2 before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XBGR_RED
|
||||
%define RGB_GREEN EXT_XBGR_GREEN
|
||||
%define RGB_BLUE EXT_XBGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
|
||||
%include "jdcolext-sse2.asm"
|
||||
|
||||
; EXT_XRGB pass (last one in this file): drop the previous RGB_* bindings,
; rebind them to the EXT_XRGB_* layout values, and alias
; jsimd_ycc_rgb_convert_sse2 to jsimd_ycc_extxrgb_convert_sse2 before
; including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XRGB_RED
|
||||
%define RGB_GREEN EXT_XRGB_GREEN
|
||||
%define RGB_BLUE EXT_XRGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
|
||||
%include "jdcolext-sse2.asm"
|
||||
136
TMessagesProj/jni/mozjpeg/simd/i386/jdmerge-avx2.asm
Normal file
136
TMessagesProj/jni/mozjpeg/simd/i386/jdmerge-avx2.asm
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
;
|
||||
; jdmerge.asm - merged upsampling/color conversion (AVX2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point YCbCr->RGB coefficients.  FIX(x) is x scaled by 2^SCALEBITS
; (65536) and rounded to the nearest integer.  The three derived constants at
; the bottom all fit in a signed 16-bit word, whereas F_0_714, F_1_402 and
; F_1_772 do not; only the derived ones and F_0_344 are stored in the word
; table further down in this file.
%define SCALEBITS 16
|
||||
|
||||
F_0_344 equ 22554 ; FIX(0.34414)
|
||||
F_0_714 equ 46802 ; FIX(0.71414)
|
||||
F_1_402 equ 91881 ; FIX(1.40200)
|
||||
F_1_772 equ 116130 ; FIX(1.77200)
|
||||
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345
|
||||
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734
|
||||
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table published under the jconst_merged_upsample_avx2 symbol.
; Every row is 32 bytes (16 words or 8 dwords), i.e. one YMM register wide.
; The included jdmrgext-avx2.asm body reads these rows as memory operands via
; GOTOFF(eax, <label>).  SEG_CONST, alignz, GLOBAL_DATA and EXTN are not
; defined in this file; presumably they come from jsimdext.inc.
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_merged_upsample_avx2)
|
||||
|
||||
EXTN(jconst_merged_upsample_avx2):
|
||||
|
||||
PW_F0402 times 16 dw F_0_402 ; 16 words of FIX(0.40200)
|
||||
PW_MF0228 times 16 dw -F_0_228 ; 16 words of -FIX(0.22800)
|
||||
PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285 ; 8 interleaved (-FIX(0.34414), FIX(0.28586)) word pairs
|
||||
PW_ONE times 16 dw 1 ; 16 words of 1
|
||||
PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) ; 8 dwords of 0.5 in SCALEBITS fixed point (added before the SCALEBITS shift)
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Code section, 32-bit mode.  This is the first inclusion of the shared
; body; this file sets no RGB_* or function-name bindings before it, so the
; body is assembled with whatever bindings are already in effect here.
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
|
||||
%include "jdmrgext-avx2.asm"
|
||||
|
||||
; EXT_RGB pass: drop the previous RGB_* bindings, rebind them to the
; EXT_RGB_* layout values, and alias both merged-upsample entry-point names
; (h2v1 and h2v2) to their extrgb names before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGB_RED
|
||||
%define RGB_GREEN EXT_RGB_GREEN
|
||||
%define RGB_BLUE EXT_RGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_avx2 \
|
||||
jsimd_h2v1_extrgb_merged_upsample_avx2
|
||||
%define jsimd_h2v2_merged_upsample_avx2 \
|
||||
jsimd_h2v2_extrgb_merged_upsample_avx2
|
||||
%include "jdmrgext-avx2.asm"
|
||||
|
||||
; EXT_RGBX pass: drop the previous RGB_* bindings, rebind them to the
; EXT_RGBX_* layout values, and alias both merged-upsample entry-point names
; (h2v1 and h2v2) to their extrgbx names before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGBX_RED
|
||||
%define RGB_GREEN EXT_RGBX_GREEN
|
||||
%define RGB_BLUE EXT_RGBX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_avx2 \
|
||||
jsimd_h2v1_extrgbx_merged_upsample_avx2
|
||||
%define jsimd_h2v2_merged_upsample_avx2 \
|
||||
jsimd_h2v2_extrgbx_merged_upsample_avx2
|
||||
%include "jdmrgext-avx2.asm"
|
||||
|
||||
; EXT_BGR pass: drop the previous RGB_* bindings, rebind them to the
; EXT_BGR_* layout values, and alias both merged-upsample entry-point names
; (h2v1 and h2v2) to their extbgr names before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGR_RED
|
||||
%define RGB_GREEN EXT_BGR_GREEN
|
||||
%define RGB_BLUE EXT_BGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_avx2 \
|
||||
jsimd_h2v1_extbgr_merged_upsample_avx2
|
||||
%define jsimd_h2v2_merged_upsample_avx2 \
|
||||
jsimd_h2v2_extbgr_merged_upsample_avx2
|
||||
%include "jdmrgext-avx2.asm"
|
||||
|
||||
; EXT_BGRX pass: drop the previous RGB_* bindings, rebind them to the
; EXT_BGRX_* layout values, and alias both merged-upsample entry-point names
; (h2v1 and h2v2) to their extbgrx names before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGRX_RED
|
||||
%define RGB_GREEN EXT_BGRX_GREEN
|
||||
%define RGB_BLUE EXT_BGRX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_avx2 \
|
||||
jsimd_h2v1_extbgrx_merged_upsample_avx2
|
||||
%define jsimd_h2v2_merged_upsample_avx2 \
|
||||
jsimd_h2v2_extbgrx_merged_upsample_avx2
|
||||
%include "jdmrgext-avx2.asm"
|
||||
|
||||
; EXT_XBGR pass: drop the previous RGB_* bindings, rebind them to the
; EXT_XBGR_* layout values, and alias both merged-upsample entry-point names
; (h2v1 and h2v2) to their extxbgr names before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XBGR_RED
|
||||
%define RGB_GREEN EXT_XBGR_GREEN
|
||||
%define RGB_BLUE EXT_XBGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_avx2 \
|
||||
jsimd_h2v1_extxbgr_merged_upsample_avx2
|
||||
%define jsimd_h2v2_merged_upsample_avx2 \
|
||||
jsimd_h2v2_extxbgr_merged_upsample_avx2
|
||||
%include "jdmrgext-avx2.asm"
|
||||
|
||||
; EXT_XRGB pass (last one in this file): drop the previous RGB_* bindings,
; rebind them to the EXT_XRGB_* layout values, and alias both merged-upsample
; entry-point names (h2v1 and h2v2) to their extxrgb names before including
; the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XRGB_RED
|
||||
%define RGB_GREEN EXT_XRGB_GREEN
|
||||
%define RGB_BLUE EXT_XRGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_avx2 \
|
||||
jsimd_h2v1_extxrgb_merged_upsample_avx2
|
||||
%define jsimd_h2v2_merged_upsample_avx2 \
|
||||
jsimd_h2v2_extxrgb_merged_upsample_avx2
|
||||
%include "jdmrgext-avx2.asm"
|
||||
123
TMessagesProj/jni/mozjpeg/simd/i386/jdmerge-mmx.asm
Normal file
123
TMessagesProj/jni/mozjpeg/simd/i386/jdmerge-mmx.asm
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
;
|
||||
; jdmerge.asm - merged upsampling/color conversion (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point YCbCr->RGB coefficients.  FIX(x) is x scaled by 2^SCALEBITS
; (65536) and rounded to the nearest integer.  The three derived constants at
; the bottom all fit in a signed 16-bit word, whereas F_0_714, F_1_402 and
; F_1_772 do not; only the derived ones and F_0_344 are stored in the word
; table further down in this file.
%define SCALEBITS 16
|
||||
|
||||
F_0_344 equ 22554 ; FIX(0.34414)
|
||||
F_0_714 equ 46802 ; FIX(0.71414)
|
||||
F_1_402 equ 91881 ; FIX(1.40200)
|
||||
F_1_772 equ 116130 ; FIX(1.77200)
|
||||
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345
|
||||
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734
|
||||
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table published under the jconst_merged_upsample_mmx symbol.
; Every row is 8 bytes (4 words or 2 dwords), i.e. one MMX register wide.
; SEG_CONST, alignz, GLOBAL_DATA and EXTN are not defined in this file;
; presumably they come from jsimdext.inc.
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_merged_upsample_mmx)
|
||||
|
||||
EXTN(jconst_merged_upsample_mmx):
|
||||
|
||||
PW_F0402 times 4 dw F_0_402 ; 4 words of FIX(0.40200)
|
||||
PW_MF0228 times 4 dw -F_0_228 ; 4 words of -FIX(0.22800)
|
||||
PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 ; 2 interleaved (-FIX(0.34414), FIX(0.28586)) word pairs
|
||||
PW_ONE times 4 dw 1 ; 4 words of 1
|
||||
PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1) ; 2 dwords of 0.5 in SCALEBITS fixed point
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Code section, 32-bit mode.  This is the first inclusion of the shared
; body; this file sets no RGB_* or function-name bindings before it, so the
; body is assembled with whatever bindings are already in effect here.
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
|
||||
%include "jdmrgext-mmx.asm"
|
||||
|
||||
; EXT_RGB pass: drop the previous RGB_* bindings, rebind them to the
; EXT_RGB_* layout values, and alias both merged-upsample entry-point names
; (h2v1 and h2v2) to their extrgb names before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGB_RED
|
||||
%define RGB_GREEN EXT_RGB_GREEN
|
||||
%define RGB_BLUE EXT_RGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx
|
||||
%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx
|
||||
%include "jdmrgext-mmx.asm"
|
||||
|
||||
; EXT_RGBX pass: drop the previous RGB_* bindings, rebind them to the
; EXT_RGBX_* layout values, and alias both merged-upsample entry-point names
; (h2v1 and h2v2) to their extrgbx names before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGBX_RED
|
||||
%define RGB_GREEN EXT_RGBX_GREEN
|
||||
%define RGB_BLUE EXT_RGBX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx
|
||||
%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx
|
||||
%include "jdmrgext-mmx.asm"
|
||||
|
||||
; EXT_BGR pass: drop the previous RGB_* bindings, rebind them to the
; EXT_BGR_* layout values, and alias both merged-upsample entry-point names
; (h2v1 and h2v2) to their extbgr names before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGR_RED
|
||||
%define RGB_GREEN EXT_BGR_GREEN
|
||||
%define RGB_BLUE EXT_BGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx
|
||||
%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx
|
||||
%include "jdmrgext-mmx.asm"
|
||||
|
||||
; EXT_BGRX pass: drop the previous RGB_* bindings, rebind them to the
; EXT_BGRX_* layout values, and alias both merged-upsample entry-point names
; (h2v1 and h2v2) to their extbgrx names before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGRX_RED
|
||||
%define RGB_GREEN EXT_BGRX_GREEN
|
||||
%define RGB_BLUE EXT_BGRX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx
|
||||
%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx
|
||||
%include "jdmrgext-mmx.asm"
|
||||
|
||||
; EXT_XBGR pass: drop the previous RGB_* bindings, rebind them to the
; EXT_XBGR_* layout values, and alias both merged-upsample entry-point names
; (h2v1 and h2v2) to their extxbgr names before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XBGR_RED
|
||||
%define RGB_GREEN EXT_XBGR_GREEN
|
||||
%define RGB_BLUE EXT_XBGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx
|
||||
%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx
|
||||
%include "jdmrgext-mmx.asm"
|
||||
|
||||
; EXT_XRGB pass (last one in this file): drop the previous RGB_* bindings,
; rebind them to the EXT_XRGB_* layout values, and alias both merged-upsample
; entry-point names (h2v1 and h2v2) to their extxrgb names before including
; the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XRGB_RED
|
||||
%define RGB_GREEN EXT_XRGB_GREEN
|
||||
%define RGB_BLUE EXT_XRGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx
|
||||
%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx
|
||||
%include "jdmrgext-mmx.asm"
|
||||
135
TMessagesProj/jni/mozjpeg/simd/i386/jdmerge-sse2.asm
Normal file
135
TMessagesProj/jni/mozjpeg/simd/i386/jdmerge-sse2.asm
Normal file
|
|
@ -0,0 +1,135 @@
|
|||
;
|
||||
; jdmerge.asm - merged upsampling/color conversion (SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point YCbCr->RGB coefficients.  FIX(x) is x scaled by 2^SCALEBITS
; (65536) and rounded to the nearest integer.  The three derived constants at
; the bottom all fit in a signed 16-bit word, whereas F_0_714, F_1_402 and
; F_1_772 do not; only the derived ones and F_0_344 are stored in the word
; table further down in this file.
%define SCALEBITS 16
|
||||
|
||||
F_0_344 equ 22554 ; FIX(0.34414)
|
||||
F_0_714 equ 46802 ; FIX(0.71414)
|
||||
F_1_402 equ 91881 ; FIX(1.40200)
|
||||
F_1_772 equ 116130 ; FIX(1.77200)
|
||||
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345
|
||||
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734
|
||||
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table published under the jconst_merged_upsample_sse2 symbol.
; Every row is 16 bytes (8 words or 4 dwords), i.e. one XMM register wide.
; SEG_CONST, alignz, GLOBAL_DATA and EXTN are not defined in this file;
; presumably they come from jsimdext.inc.
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_merged_upsample_sse2)
|
||||
|
||||
EXTN(jconst_merged_upsample_sse2):
|
||||
|
||||
PW_F0402 times 8 dw F_0_402 ; 8 words of FIX(0.40200)
|
||||
PW_MF0228 times 8 dw -F_0_228 ; 8 words of -FIX(0.22800)
|
||||
PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 ; 4 interleaved (-FIX(0.34414), FIX(0.28586)) word pairs
|
||||
PW_ONE times 8 dw 1 ; 8 words of 1
|
||||
PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) ; 4 dwords of 0.5 in SCALEBITS fixed point
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Code section, 32-bit mode.  This is the first inclusion of the shared
; body; this file sets no RGB_* or function-name bindings before it, so the
; body is assembled with whatever bindings are already in effect here.
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
|
||||
%include "jdmrgext-sse2.asm"
|
||||
|
||||
; EXT_RGB pass: drop the previous RGB_* bindings, rebind them to the
; EXT_RGB_* layout values, and alias both merged-upsample entry-point names
; (h2v1 and h2v2) to their extrgb names before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGB_RED
|
||||
%define RGB_GREEN EXT_RGB_GREEN
|
||||
%define RGB_BLUE EXT_RGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_sse2 \
|
||||
jsimd_h2v1_extrgb_merged_upsample_sse2
|
||||
%define jsimd_h2v2_merged_upsample_sse2 \
|
||||
jsimd_h2v2_extrgb_merged_upsample_sse2
|
||||
%include "jdmrgext-sse2.asm"
|
||||
|
||||
; EXT_RGBX pass: drop the previous RGB_* bindings, rebind them to the
; EXT_RGBX_* layout values, and alias both merged-upsample entry-point names
; (h2v1 and h2v2) to their extrgbx names before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGBX_RED
|
||||
%define RGB_GREEN EXT_RGBX_GREEN
|
||||
%define RGB_BLUE EXT_RGBX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_sse2 \
|
||||
jsimd_h2v1_extrgbx_merged_upsample_sse2
|
||||
%define jsimd_h2v2_merged_upsample_sse2 \
|
||||
jsimd_h2v2_extrgbx_merged_upsample_sse2
|
||||
%include "jdmrgext-sse2.asm"
|
||||
|
||||
; EXT_BGR pass: drop the previous RGB_* bindings, rebind them to the
; EXT_BGR_* layout values, and alias both merged-upsample entry-point names
; (h2v1 and h2v2) to their extbgr names before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGR_RED
|
||||
%define RGB_GREEN EXT_BGR_GREEN
|
||||
%define RGB_BLUE EXT_BGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_sse2 \
|
||||
jsimd_h2v1_extbgr_merged_upsample_sse2
|
||||
%define jsimd_h2v2_merged_upsample_sse2 \
|
||||
jsimd_h2v2_extbgr_merged_upsample_sse2
|
||||
%include "jdmrgext-sse2.asm"
|
||||
|
||||
; EXT_BGRX pass: drop the previous RGB_* bindings, rebind them to the
; EXT_BGRX_* layout values, and alias both merged-upsample entry-point names
; (h2v1 and h2v2) to their extbgrx names before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGRX_RED
|
||||
%define RGB_GREEN EXT_BGRX_GREEN
|
||||
%define RGB_BLUE EXT_BGRX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_sse2 \
|
||||
jsimd_h2v1_extbgrx_merged_upsample_sse2
|
||||
%define jsimd_h2v2_merged_upsample_sse2 \
|
||||
jsimd_h2v2_extbgrx_merged_upsample_sse2
|
||||
%include "jdmrgext-sse2.asm"
|
||||
|
||||
; EXT_XBGR pass: drop the previous RGB_* bindings, rebind them to the
; EXT_XBGR_* layout values, and alias both merged-upsample entry-point names
; (h2v1 and h2v2) to their extxbgr names before including the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XBGR_RED
|
||||
%define RGB_GREEN EXT_XBGR_GREEN
|
||||
%define RGB_BLUE EXT_XBGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_sse2 \
|
||||
jsimd_h2v1_extxbgr_merged_upsample_sse2
|
||||
%define jsimd_h2v2_merged_upsample_sse2 \
|
||||
jsimd_h2v2_extxbgr_merged_upsample_sse2
|
||||
%include "jdmrgext-sse2.asm"
|
||||
|
||||
; EXT_XRGB pass (last one in this file): drop the previous RGB_* bindings,
; rebind them to the EXT_XRGB_* layout values, and alias both merged-upsample
; entry-point names (h2v1 and h2v2) to their extxrgb names before including
; the shared body again.
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XRGB_RED
|
||||
%define RGB_GREEN EXT_XRGB_GREEN
|
||||
%define RGB_BLUE EXT_XRGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
%define jsimd_h2v1_merged_upsample_sse2 \
|
||||
jsimd_h2v1_extxrgb_merged_upsample_sse2
|
||||
%define jsimd_h2v2_merged_upsample_sse2 \
|
||||
jsimd_h2v2_extxrgb_merged_upsample_sse2
|
||||
%include "jdmrgext-sse2.asm"
|
||||
575
TMessagesProj/jni/mozjpeg/simd/i386/jdmrgext-avx2.asm
Normal file
575
TMessagesProj/jni/mozjpeg/simd/i386/jdmrgext-avx2.asm
Normal file
|
|
@ -0,0 +1,575 @@
|
|||
;
|
||||
; jdmrgext.asm - merged upsampling/color conversion (AVX2)
|
||||
;
|
||||
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2012, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
|
||||
; JSAMPIMAGE input_buf,
|
||||
; JDIMENSION in_row_group_ctr,
|
||||
; JSAMPARRAY output_buf);
|
||||
;
|
||||
|
||||
; Stack-frame accessors for jsimd_h2v1_merged_upsample_avx2.
; The argument macros take a base register b that holds the value of esp
; right after the routine's "push ebp" (the routine keeps it in eax).  The
; first argument sits at b+8 and each further 4-byte argument follows it;
; NOTE(review): b+0 = saved ebp and b+4 = return address under the usual
; i386 stack-argument convention -- confirm against the caller.
%define output_width(b) (b) + 8 ; JDIMENSION output_width
|
||||
%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
|
||||
%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
|
||||
%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
|
||||
|
||||
; After the prologue ebp is aligned to SIZEOF_YMMWORD and [ebp] holds the
; pre-alignment stack pointer.  wk(i) is the i-th YMM-sized scratch slot
; below ebp (wk(0) is the lowest); gotptr is one pointer-sized slot just
; below wk(0).  wk(i) mentions WK_NUM before it is defined, which is fine
; because %define bodies are expanded at the point of use.
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
|
||||
; ymmword wk[WK_NUM]
|
||||
%define WK_NUM 3
|
||||
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
|
||||
|
||||
EXTN(jsimd_h2v1_merged_upsample_avx2):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic eax ; make a room for GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
mov ecx, JDIMENSION [output_width(eax)] ; col
|
||||
test ecx, ecx
|
||||
jz near .return
|
||||
|
||||
push ecx
|
||||
|
||||
mov edi, JSAMPIMAGE [input_buf(eax)]
|
||||
mov ecx, JDIMENSION [in_row_group_ctr(eax)]
|
||||
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
|
||||
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
|
||||
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
|
||||
mov edi, JSAMPARRAY [output_buf(eax)]
|
||||
mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
|
||||
mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
|
||||
mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
|
||||
mov edi, JSAMPROW [edi] ; outptr
|
||||
|
||||
pop ecx ; col
|
||||
|
||||
alignx 16, 7
|
||||
.columnloop:
|
||||
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
||||
|
||||
vmovdqu ymm6, YMMWORD [ebx] ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
|
||||
vmovdqu ymm7, YMMWORD [edx] ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
|
||||
|
||||
vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
|
||||
vpcmpeqw ymm3, ymm3, ymm3
|
||||
vpsllw ymm3, ymm3, 7 ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
||||
|
||||
vpermq ymm6, ymm6, 0xd8 ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
|
||||
vpermq ymm7, ymm7, 0xd8 ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
|
||||
vpunpcklbw ymm4, ymm6, ymm1 ; ymm4=Cb(0123456789ABCDEF)=CbL
|
||||
vpunpckhbw ymm6, ymm6, ymm1 ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
|
||||
vpunpcklbw ymm0, ymm7, ymm1 ; ymm0=Cr(0123456789ABCDEF)=CrL
|
||||
vpunpckhbw ymm7, ymm7, ymm1 ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
|
||||
|
||||
vpaddw ymm5, ymm6, ymm3
|
||||
vpaddw ymm2, ymm4, ymm3
|
||||
vpaddw ymm1, ymm7, ymm3
|
||||
vpaddw ymm3, ymm0, ymm3
|
||||
|
||||
; (Original)
|
||||
; R = Y + 1.40200 * Cr
|
||||
; G = Y - 0.34414 * Cb - 0.71414 * Cr
|
||||
; B = Y + 1.77200 * Cb
|
||||
;
|
||||
; (This implementation)
|
||||
; R = Y + 0.40200 * Cr + Cr
|
||||
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
||||
; B = Y - 0.22800 * Cb + Cb + Cb
|
||||
|
||||
vpaddw ymm6, ymm5, ymm5 ; ymm6=2*CbH
|
||||
vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbL
|
||||
vpaddw ymm7, ymm1, ymm1 ; ymm7=2*CrH
|
||||
vpaddw ymm0, ymm3, ymm3 ; ymm0=2*CrL
|
||||
|
||||
vpmulhw ymm6, ymm6, [GOTOFF(eax,PW_MF0228)] ; ymm6=(2*CbH * -FIX(0.22800))
|
||||
vpmulhw ymm4, ymm4, [GOTOFF(eax,PW_MF0228)] ; ymm4=(2*CbL * -FIX(0.22800))
|
||||
vpmulhw ymm7, ymm7, [GOTOFF(eax,PW_F0402)] ; ymm7=(2*CrH * FIX(0.40200))
|
||||
vpmulhw ymm0, ymm0, [GOTOFF(eax,PW_F0402)] ; ymm0=(2*CrL * FIX(0.40200))
|
||||
|
||||
vpaddw ymm6, ymm6, [GOTOFF(eax,PW_ONE)]
|
||||
vpaddw ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
|
||||
vpsraw ymm6, ymm6, 1 ; ymm6=(CbH * -FIX(0.22800))
|
||||
vpsraw ymm4, ymm4, 1 ; ymm4=(CbL * -FIX(0.22800))
|
||||
vpaddw ymm7, ymm7, [GOTOFF(eax,PW_ONE)]
|
||||
vpaddw ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
|
||||
vpsraw ymm7, ymm7, 1 ; ymm7=(CrH * FIX(0.40200))
|
||||
vpsraw ymm0, ymm0, 1 ; ymm0=(CrL * FIX(0.40200))
|
||||
|
||||
vpaddw ymm6, ymm6, ymm5
|
||||
vpaddw ymm4, ymm4, ymm2
|
||||
vpaddw ymm6, ymm6, ymm5 ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
|
||||
vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
|
||||
vpaddw ymm7, ymm7, ymm1 ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
|
||||
vpaddw ymm0, ymm0, ymm3 ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
|
||||
|
||||
vmovdqa YMMWORD [wk(0)], ymm6 ; wk(0)=(B-Y)H
|
||||
vmovdqa YMMWORD [wk(1)], ymm7 ; wk(1)=(R-Y)H
|
||||
|
||||
vpunpckhwd ymm6, ymm5, ymm1
|
||||
vpunpcklwd ymm5, ymm5, ymm1
|
||||
vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
vpunpckhwd ymm7, ymm2, ymm3
|
||||
vpunpcklwd ymm2, ymm2, ymm3
|
||||
vpmaddwd ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
|
||||
vpaddd ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
|
||||
vpaddd ymm6, ymm6, [GOTOFF(eax,PD_ONEHALF)]
|
||||
vpsrad ymm5, ymm5, SCALEBITS
|
||||
vpsrad ymm6, ymm6, SCALEBITS
|
||||
vpaddd ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
|
||||
vpaddd ymm7, ymm7, [GOTOFF(eax,PD_ONEHALF)]
|
||||
vpsrad ymm2, ymm2, SCALEBITS
|
||||
vpsrad ymm7, ymm7, SCALEBITS
|
||||
|
||||
vpackssdw ymm5, ymm5, ymm6 ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
|
||||
vpackssdw ymm2, ymm2, ymm7 ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
|
||||
vpsubw ymm5, ymm5, ymm1 ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
|
||||
vpsubw ymm2, ymm2, ymm3 ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
|
||||
|
||||
vmovdqa YMMWORD [wk(2)], ymm5 ; wk(2)=(G-Y)H
|
||||
|
||||
mov al, 2 ; Yctr
|
||||
jmp short .Yloop_1st
|
||||
alignx 16, 7
|
||||
|
||||
.Yloop_2nd:
|
||||
vmovdqa ymm0, YMMWORD [wk(1)] ; ymm0=(R-Y)H
|
||||
vmovdqa ymm2, YMMWORD [wk(2)] ; ymm2=(G-Y)H
|
||||
vmovdqa ymm4, YMMWORD [wk(0)] ; ymm4=(B-Y)H
|
||||
alignx 16, 7
|
||||
|
||||
.Yloop_1st:
|
||||
vmovdqu ymm7, YMMWORD [esi] ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
|
||||
|
||||
vpcmpeqw ymm6, ymm6, ymm6
|
||||
vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
|
||||
vpand ymm6, ymm6, ymm7 ; ymm6=Y(02468ACEGIKMOQSU)=YE
|
||||
vpsrlw ymm7, ymm7, BYTE_BIT ; ymm7=Y(13579BDFHJLNPRTV)=YO
|
||||
|
||||
vmovdqa ymm1, ymm0 ; ymm1=ymm0=(R-Y)(L/H)
|
||||
vmovdqa ymm3, ymm2 ; ymm3=ymm2=(G-Y)(L/H)
|
||||
vmovdqa ymm5, ymm4 ; ymm5=ymm4=(B-Y)(L/H)
|
||||
|
||||
vpaddw ymm0, ymm0, ymm6 ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
|
||||
vpaddw ymm1, ymm1, ymm7 ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
|
||||
vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
|
||||
vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
|
||||
|
||||
vpaddw ymm2, ymm2, ymm6 ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
|
||||
vpaddw ymm3, ymm3, ymm7 ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
|
||||
vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
|
||||
vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
|
||||
|
||||
vpaddw ymm4, ymm4, ymm6 ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
|
||||
vpaddw ymm5, ymm5, ymm7 ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
|
||||
vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
|
||||
vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
|
||||
; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
|
||||
; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
|
||||
; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
|
||||
; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
|
||||
; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
|
||||
; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
|
||||
; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
|
||||
|
||||
vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
|
||||
; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
|
||||
vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
|
||||
; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
|
||||
vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
|
||||
; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
|
||||
|
||||
vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
|
||||
; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
|
||||
vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
|
||||
; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
|
||||
vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
|
||||
; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
|
||||
|
||||
vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
|
||||
; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
|
||||
|
||||
vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
|
||||
; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
|
||||
vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
|
||||
; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
|
||||
vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
|
||||
; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
|
||||
|
||||
vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
|
||||
; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
|
||||
vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
|
||||
; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
|
||||
|
||||
vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
|
||||
; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
|
||||
vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
|
||||
vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
|
||||
; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
|
||||
vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
|
||||
; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
|
||||
|
||||
vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
|
||||
; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
|
||||
vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
|
||||
; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
|
||||
vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
|
||||
; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
|
||||
vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
|
||||
|
||||
vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
|
||||
vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
|
||||
; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
|
||||
vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
|
||||
|
||||
vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
|
||||
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
|
||||
vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
|
||||
|
||||
cmp ecx, byte SIZEOF_YMMWORD
|
||||
jb short .column_st64
|
||||
|
||||
test edi, SIZEOF_YMMWORD-1
|
||||
jnz short .out1
|
||||
; --(aligned)-------------------
|
||||
vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
|
||||
.out0:
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_YMMWORD
|
||||
jz near .endcolumn
|
||||
|
||||
add esi, byte SIZEOF_YMMWORD ; inptr0
|
||||
dec al ; Yctr
|
||||
jnz near .Yloop_2nd
|
||||
|
||||
add ebx, byte SIZEOF_YMMWORD ; inptr1
|
||||
add edx, byte SIZEOF_YMMWORD ; inptr2
|
||||
jmp near .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
.column_st64:
|
||||
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp ecx, byte 2*SIZEOF_YMMWORD
|
||||
jb short .column_st32
|
||||
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
|
||||
add edi, byte 2*SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmF
|
||||
sub ecx, byte 2*SIZEOF_YMMWORD
|
||||
jmp short .column_st31
|
||||
.column_st32:
|
||||
cmp ecx, byte SIZEOF_YMMWORD
|
||||
jb short .column_st31
|
||||
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
|
||||
add edi, byte SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmD
|
||||
sub ecx, byte SIZEOF_YMMWORD
|
||||
jmp short .column_st31
|
||||
.column_st31:
|
||||
cmp ecx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st15
|
||||
vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
vperm2i128 ymmA, ymmA, ymmA, 1
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
.column_st15:
|
||||
; Store the lower 8 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jb short .column_st7
|
||||
vmovq XMM_MMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_MMWORD
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
vpsrldq xmmA, xmmA, SIZEOF_MMWORD
|
||||
.column_st7:
|
||||
; Store the lower 4 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp ecx, byte SIZEOF_DWORD
|
||||
jb short .column_st3
|
||||
vmovd XMM_DWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_DWORD
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
vpsrldq xmmA, xmmA, SIZEOF_DWORD
|
||||
.column_st3:
|
||||
; Store the lower 2 bytes of eax to the output when it has enough
|
||||
; space.
|
||||
vmovd eax, xmmA
|
||||
cmp ecx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov word [edi], ax
|
||||
add edi, byte SIZEOF_WORD
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
shr eax, 16
|
||||
.column_st1:
|
||||
; Store the lower 1 byte of eax to the output when it has enough
|
||||
; space.
|
||||
test ecx, ecx
|
||||
jz short .endcolumn
|
||||
mov byte [edi], al
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
%ifdef RGBX_FILLER_0XFF
|
||||
vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
|
||||
vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
|
||||
%else
|
||||
vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
|
||||
vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
|
||||
%endif
|
||||
; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
|
||||
; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
|
||||
; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
|
||||
; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
|
||||
; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
|
||||
; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
|
||||
; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
|
||||
; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
|
||||
|
||||
vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
|
||||
; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
|
||||
vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
|
||||
; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
|
||||
vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
|
||||
; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
|
||||
vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
|
||||
; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
|
||||
|
||||
vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
|
||||
; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
|
||||
vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
|
||||
; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
|
||||
vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
|
||||
; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
|
||||
vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
|
||||
; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
|
||||
|
||||
vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
|
||||
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
|
||||
vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
|
||||
; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
|
||||
vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
|
||||
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
|
||||
vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
|
||||
; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
|
||||
|
||||
vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
|
||||
; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||
vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
|
||||
; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||
vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
|
||||
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
|
||||
vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
|
||||
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
|
||||
|
||||
cmp ecx, byte SIZEOF_YMMWORD
|
||||
jb short .column_st64
|
||||
|
||||
test edi, SIZEOF_YMMWORD-1
|
||||
jnz short .out1
|
||||
; --(aligned)-------------------
|
||||
vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
|
||||
vmovntdq YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
|
||||
vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
|
||||
.out0:
|
||||
add edi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_YMMWORD
|
||||
jz near .endcolumn
|
||||
|
||||
add esi, byte SIZEOF_YMMWORD ; inptr0
|
||||
dec al
|
||||
jnz near .Yloop_2nd
|
||||
|
||||
add ebx, byte SIZEOF_YMMWORD ; inptr1
|
||||
add edx, byte SIZEOF_YMMWORD ; inptr2
|
||||
jmp near .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
.column_st64:
|
||||
cmp ecx, byte SIZEOF_YMMWORD/2
|
||||
jb short .column_st32
|
||||
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
|
||||
add edi, byte 2*SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmC
|
||||
vmovdqa ymmD, ymmH
|
||||
sub ecx, byte SIZEOF_YMMWORD/2
|
||||
.column_st32:
|
||||
cmp ecx, byte SIZEOF_YMMWORD/4
|
||||
jb short .column_st16
|
||||
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
|
||||
add edi, byte SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmD
|
||||
sub ecx, byte SIZEOF_YMMWORD/4
|
||||
.column_st16:
|
||||
cmp ecx, byte SIZEOF_YMMWORD/8
|
||||
jb short .column_st15
|
||||
vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
vperm2i128 ymmA, ymmA, ymmA, 1
|
||||
sub ecx, byte SIZEOF_YMMWORD/8
|
||||
.column_st15:
|
||||
; Store two pixels (8 bytes) of ymmA to the output when it has enough
|
||||
; space.
|
||||
cmp ecx, byte SIZEOF_YMMWORD/16
|
||||
jb short .column_st7
|
||||
vmovq MMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_YMMWORD/16*4
|
||||
sub ecx, byte SIZEOF_YMMWORD/16
|
||||
vpsrldq xmmA, SIZEOF_YMMWORD/16*4
|
||||
.column_st7:
|
||||
; Store one pixel (4 bytes) of ymmA to the output when it has enough
|
||||
; space.
|
||||
test ecx, ecx
|
||||
jz short .endcolumn
|
||||
vmovd XMM_DWORD [edi], xmmA
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
.endcolumn:
|
||||
sfence ; flush the write buffer
|
||||
|
||||
.return:
|
||||
vzeroupper
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; This routine contains no SIMD code of its own.  It builds a temporary
; three-entry plane table on the stack and calls
; jsimd_h2v1_merged_upsample_avx2 twice with it: once for the first output
; row and once more after advancing the luma row pointer and the output row
; pointer by one JSAMPROW each.  The two chroma plane pointers are passed
; unchanged to both calls.
;
; Stack layout while the calls are made (ebx = address of the plane table):
;   [ebx+2*SIZEOF_POINTER]  input_buf[2]
;   [ebx+1*SIZEOF_POINTER]  input_buf[1]
;   [ebx+0*SIZEOF_POINTER]  input_buf[0] + in_row_group_ctr*SIZEOF_JSAMPROW
;   [ebx-1*SIZEOF_POINTER]  output_buf argument of the h2v1 call
;   below that              in_row_group_ctr, plane table address, output_width
;
; NOTE(review): plane 0 is pre-offset by in_row_group_ctr rows; presumably
; the h2v1 routine indexes it by in_row_group_ctr again, which would select
; luma rows 2*ctr and 2*ctr+1 -- confirm against the h2v1 entry code.
;
; Register use: ebx, esi and edi are saved/restored here and must survive
; the first call because they are reused to patch the arguments for the
; second one.  eax, ecx and edx are not preserved.
;

%define output_width(b)      (b) + 8    ; JDIMENSION output_width
%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)

EXTN(jsimd_h2v2_merged_upsample_avx2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; output_width is a JDIMENSION, not a pointer; use the matching size
    ; macro, as the in_row_group_ctr load below does.
    mov         eax, JDIMENSION [output_width(ebp)]

    mov         edi, JSAMPIMAGE [input_buf(ebp)]
    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
    mov         edi, JSAMPARRAY [output_buf(ebp)]
    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]  ; pre-offset the luma plane

    ; Temporary plane table (pushed in reverse so entry 0 is at the lowest
    ; address).
    push        edx                     ; inptr2
    push        ebx                     ; inptr1
    push        esi                     ; inptr00
    mov         ebx, esp                ; ebx = address of the plane table

    ; Arguments for the h2v1 routine, last argument first.
    push        edi                     ; output_buf (outptr0)
    push        ecx                     ; in_row_group_ctr
    push        ebx                     ; input_buf
    push        eax                     ; output_width

    call        near EXTN(jsimd_h2v1_merged_upsample_avx2)

    ; Second row: bump the luma and output row pointers and patch them into
    ; the plane table / argument slot that are still on the stack.
    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
    mov         POINTER [ebx+0*SIZEOF_POINTER], esi
    mov         POINTER [ebx-1*SIZEOF_POINTER], edi

    call        near EXTN(jsimd_h2v1_merged_upsample_avx2)

    add         esp, byte 7*SIZEOF_DWORD  ; drop 4 arguments + 3 table entries

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
460
TMessagesProj/jni/mozjpeg/simd/i386/jdmrgext-mmx.asm
Normal file
460
TMessagesProj/jni/mozjpeg/simd/i386/jdmrgext-mmx.asm
Normal file
|
|
@ -0,0 +1,460 @@
|
|||
;
|
||||
; jdmrgext.asm - merged upsampling/color conversion (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v1_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
;                                JDIMENSION in_row_group_ctr,
;                                JSAMPARRAY output_buf);
;
; Overview of the loop structure:
;   .columnloop  loads 8 Cb and 8 Cr samples, removes the 128 bias and
;                computes the (R-Y), (G-Y), (B-Y) terms for the low four
;                (L) and the high four (H) chroma samples.  The L terms stay
;                in mm0/mm2/mm4, the H terms are parked in wk(0..2).
;   .Yloop_1st   consumes 8 luma samples with the L terms,
;   .Yloop_2nd   reloads the H terms and consumes the next 8 luma samples
;                (al counts the two passes).
;   .column_st*  store a final partial group of fewer than 8 pixels.
;
; Register roles inside the loops:
;   esi = luma row pointer, ebx = Cb row pointer, edx = Cr row pointer,
;   edi = output row pointer, ecx = output columns still to be written.
;

%define output_width(b)      (b) + 8    ; JDIMENSION output_width
%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM        3
%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_mmx)

EXTN(jsimd_h2v1_merged_upsample_mmx):
    ; Build an 8-byte aligned frame: the pre-alignment stack pointer is
    ; stored at [ebp] so the epilogue can get back to it.
    push        ebp
    mov         eax, esp                ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax
    mov         ebp, esp                ; ebp = aligned ebp
    lea         esp, [wk(0)]            ; reserve wk[WK_NUM]
    pushpic     eax                     ; make room for the GOT address
    push        ebx
;   (ecx and edx are not saved: they need not be preserved)
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address
    movpic      POINTER [gotptr], ebx   ; save GOT address

    ; Nothing to do for a zero-width row.
    mov         ecx, JDIMENSION [output_width(eax)]  ; col
    test        ecx, ecx
    jz          near .return

    push        ecx                     ; keep col while ecx is the row index

    ; Resolve the three input row pointers and the output row pointer.
    mov         edi, JSAMPIMAGE [input_buf(eax)]
    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
    mov         edi, JSAMPARRAY [output_buf(eax)]
    mov         edi, JSAMPROW [edi]     ; outptr

    pop         ecx                     ; col

    alignx      16, 7
.columnloop:
    ; eax is reused as Yctr (al) and as scratch in the tail stores, so the
    ; GOT address is reloaded on every pass.
    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)

    movq        mm6, MMWORD [ebx]       ; mm6=Cb(01234567)
    movq        mm7, MMWORD [edx]       ; mm7=Cr(01234567)

    pxor        mm1, mm1                ; mm1=(all 0's)
    pcmpeqw     mm3, mm3
    psllw       mm3, 7                  ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}

    ; Widen the chroma bytes to words.
    movq        mm4, mm6
    punpckhbw   mm6, mm1                ; mm6=Cb(4567)=CbH
    punpcklbw   mm4, mm1                ; mm4=Cb(0123)=CbL
    movq        mm0, mm7
    punpckhbw   mm7, mm1                ; mm7=Cr(4567)=CrH
    punpcklbw   mm0, mm1                ; mm0=Cr(0123)=CrL

    ; Adding 0xFF80 to each word subtracts 128 from the chroma samples.
    paddw       mm4, mm3
    paddw       mm6, mm3
    paddw       mm0, mm3
    paddw       mm7, mm3

    ; (Original)
    ; R = Y                + 1.40200 * Cr
    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
    ; B = Y + 1.77200 * Cb
    ;
    ; (This implementation)
    ; R = Y                + 0.40200 * Cr + Cr
    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    ; B = Y - 0.22800 * Cb + Cb + Cb

    movq        mm5, mm6                ; mm5=CbH
    movq        mm2, mm4                ; mm2=CbL
    paddw       mm6, mm6                ; mm6=2*CbH
    paddw       mm4, mm4                ; mm4=2*CbL
    movq        mm1, mm7                ; mm1=CrH
    movq        mm3, mm0                ; mm3=CrL
    paddw       mm7, mm7                ; mm7=2*CrH
    paddw       mm0, mm0                ; mm0=2*CrL

    pmulhw      mm6, [GOTOFF(eax,PW_MF0228)]  ; mm6=(2*CbH * -FIX(0.22800))
    pmulhw      mm4, [GOTOFF(eax,PW_MF0228)]  ; mm4=(2*CbL * -FIX(0.22800))
    pmulhw      mm7, [GOTOFF(eax,PW_F0402)]   ; mm7=(2*CrH * FIX(0.40200))
    pmulhw      mm0, [GOTOFF(eax,PW_F0402)]   ; mm0=(2*CrL * FIX(0.40200))

    ; Round and halve each product (one register at a time).
    paddw       mm6, [GOTOFF(eax,PW_ONE)]
    psraw       mm6, 1                  ; mm6=(CbH * -FIX(0.22800))
    paddw       mm4, [GOTOFF(eax,PW_ONE)]
    psraw       mm4, 1                  ; mm4=(CbL * -FIX(0.22800))
    paddw       mm7, [GOTOFF(eax,PW_ONE)]
    psraw       mm7, 1                  ; mm7=(CrH * FIX(0.40200))
    paddw       mm0, [GOTOFF(eax,PW_ONE)]
    psraw       mm0, 1                  ; mm0=(CrL * FIX(0.40200))

    ; Add back the integer multiples of Cb / Cr.
    paddw       mm6, mm5
    paddw       mm6, mm5                ; mm6=(CbH * FIX(1.77200))=(B-Y)H
    paddw       mm4, mm2
    paddw       mm4, mm2                ; mm4=(CbL * FIX(1.77200))=(B-Y)L
    paddw       mm7, mm1                ; mm7=(CrH * FIX(1.40200))=(R-Y)H
    paddw       mm0, mm3                ; mm0=(CrL * FIX(1.40200))=(R-Y)L

    movq        MMWORD [wk(0)], mm6     ; wk(0)=(B-Y)H
    movq        MMWORD [wk(1)], mm7     ; wk(1)=(R-Y)H

    ; Green term: interleave Cb/Cr words and multiply-accumulate them
    ; against the PW_MF0344_F0285 coefficient pairs.
    movq        mm6, mm5
    movq        mm7, mm2
    punpcklwd   mm5, mm1
    punpckhwd   mm6, mm1
    pmaddwd     mm5, [GOTOFF(eax,PW_MF0344_F0285)]
    pmaddwd     mm6, [GOTOFF(eax,PW_MF0344_F0285)]
    punpcklwd   mm2, mm3
    punpckhwd   mm7, mm3
    pmaddwd     mm2, [GOTOFF(eax,PW_MF0344_F0285)]
    pmaddwd     mm7, [GOTOFF(eax,PW_MF0344_F0285)]

    ; Round and descale each dword pair (one register at a time).
    paddd       mm5, [GOTOFF(eax,PD_ONEHALF)]
    psrad       mm5, SCALEBITS
    paddd       mm6, [GOTOFF(eax,PD_ONEHALF)]
    psrad       mm6, SCALEBITS
    paddd       mm2, [GOTOFF(eax,PD_ONEHALF)]
    psrad       mm2, SCALEBITS
    paddd       mm7, [GOTOFF(eax,PD_ONEHALF)]
    psrad       mm7, SCALEBITS

    packssdw    mm5, mm6                ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
    packssdw    mm2, mm7                ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
    psubw       mm5, mm1                ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
    psubw       mm2, mm3                ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L

    movq        MMWORD [wk(2)], mm5     ; wk(2)=(G-Y)H

    ; Two luma passes per chroma load.  The first pass uses the L terms that
    ; are already in mm0/mm2/mm4.
    mov         al, 2                   ; Yctr
    jmp         short .Yloop_1st
    alignx      16, 7

.Yloop_2nd:
    ; Second pass: fetch the H terms saved by .columnloop.
    movq        mm0, MMWORD [wk(1)]     ; mm0=(R-Y)H
    movq        mm2, MMWORD [wk(2)]     ; mm2=(G-Y)H
    movq        mm4, MMWORD [wk(0)]     ; mm4=(B-Y)H
    alignx      16, 7

.Yloop_1st:
    movq        mm7, MMWORD [esi]       ; mm7=Y(01234567)

    ; Split the luma bytes into even and odd samples, widened to words.
    pcmpeqw     mm6, mm6
    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
    pand        mm6, mm7                ; mm6=Y(0246)=YE
    psrlw       mm7, BYTE_BIT           ; mm7=Y(1357)=YO

    ; Each chroma term is shared by one even and one odd luma sample.
    movq        mm1, mm0                ; mm1=mm0=(R-Y)(L/H)
    movq        mm3, mm2                ; mm3=mm2=(G-Y)(L/H)
    movq        mm5, mm4                ; mm5=mm4=(B-Y)(L/H)

    ; Add luma and saturate to unsigned bytes (one register at a time).
    paddw       mm0, mm6                ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
    packuswb    mm0, mm0                ; mm0=(R0 R2 R4 R6 ** ** ** **)
    paddw       mm1, mm7                ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
    packuswb    mm1, mm1                ; mm1=(R1 R3 R5 R7 ** ** ** **)

    paddw       mm2, mm6                ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
    packuswb    mm2, mm2                ; mm2=(G0 G2 G4 G6 ** ** ** **)
    paddw       mm3, mm7                ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
    packuswb    mm3, mm3                ; mm3=(G1 G3 G5 G7 ** ** ** **)

    paddw       mm4, mm6                ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
    packuswb    mm4, mm4                ; mm4=(B0 B2 B4 B6 ** ** ** **)
    paddw       mm5, mm7                ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
    packuswb    mm5, mm5                ; mm5=(B1 B3 B5 B7 ** ** ** **)

%if RGB_PIXELSIZE == 3  ; ---------------

    ; In the diagrams below each entry is <component index><pixel index>.
    ;
    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
    ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)

    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
    punpcklbw   mmE, mmB                ; mmE=(20 01 22 03 24 05 26 07)
    punpcklbw   mmD, mmF                ; mmD=(11 21 13 23 15 25 17 27)

    movq        mmG, mmA
    movq        mmH, mmA
    punpcklwd   mmA, mmE                ; mmA=(00 10 20 01 02 12 22 03)
    punpckhwd   mmG, mmE                ; mmG=(04 14 24 05 06 16 26 07)

    psrlq       mmH, 2*BYTE_BIT         ; mmH=(02 12 04 14 06 16 -- --)
    psrlq       mmE, 2*BYTE_BIT         ; mmE=(22 03 24 05 26 07 -- --)

    movq        mmC, mmD
    movq        mmB, mmD
    punpcklwd   mmD, mmH                ; mmD=(11 21 02 12 13 23 04 14)
    punpckhwd   mmC, mmH                ; mmC=(15 25 06 16 17 27 -- --)

    psrlq       mmB, 2*BYTE_BIT         ; mmB=(13 23 15 25 17 27 -- --)

    movq        mmF, mmE
    punpcklwd   mmE, mmB                ; mmE=(22 03 13 23 24 05 15 25)
    punpckhwd   mmF, mmB                ; mmF=(26 07 17 27 -- -- -- --)

    punpckldq   mmA, mmD                ; mmA=(00 10 20 01 11 21 02 12)
    punpckldq   mmE, mmG                ; mmE=(22 03 13 23 04 14 24 05)
    punpckldq   mmC, mmF                ; mmC=(15 25 06 16 26 07 17 27)

    ; Fewer than 8 columns left: take the partial-store path.
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st16

    ; Full group: 8 pixels * 3 bytes = three quadword stores.
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC

    sub         ecx, byte SIZEOF_MMWORD
    jz          near .endcolumn

    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
    add         esi, byte SIZEOF_MMWORD  ; inptr0
    dec         al                      ; Yctr
    jnz         near .Yloop_2nd

    ; Both luma passes done: advance the chroma pointers and start over.
    add         ebx, byte SIZEOF_MMWORD  ; inptr1
    add         edx, byte SIZEOF_MMWORD  ; inptr2
    jmp         near .columnloop
    alignx      16, 7

.column_st16:
    ; From here on ecx counts remaining output bytes, not columns.
    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
    cmp         ecx, byte 2*SIZEOF_MMWORD
    jb          short .column_st8
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
    movq        mmA, mmC                ; remaining bytes come from mmC
    sub         ecx, byte 2*SIZEOF_MMWORD
    add         edi, byte 2*SIZEOF_MMWORD
    jmp         short .column_st4
.column_st8:
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st4
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        mmA, mmE                ; remaining bytes come from mmE
    sub         ecx, byte SIZEOF_MMWORD
    add         edi, byte SIZEOF_MMWORD
.column_st4:
    ; The last (up to 7) bytes go out through eax in 4/2/1-byte pieces.
    movd        eax, mmA
    cmp         ecx, byte SIZEOF_DWORD
    jb          short .column_st2
    mov         dword [edi+0*SIZEOF_DWORD], eax
    psrlq       mmA, DWORD_BIT
    movd        eax, mmA
    sub         ecx, byte SIZEOF_DWORD
    add         edi, byte SIZEOF_DWORD
.column_st2:
    cmp         ecx, byte SIZEOF_WORD
    jb          short .column_st1
    mov         word [edi+0*SIZEOF_WORD], ax
    shr         eax, WORD_BIT
    sub         ecx, byte SIZEOF_WORD
    add         edi, byte SIZEOF_WORD
.column_st1:
    cmp         ecx, byte SIZEOF_BYTE
    jb          short .endcolumn
    mov         byte [edi+0*SIZEOF_BYTE], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; The luma registers are no longer needed; they become the filler
    ; component (all ones or all zeros).
%ifdef RGBX_FILLER_0XFF
    pcmpeqb     mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
    pcmpeqb     mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
%else
    pxor        mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
    pxor        mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
%endif
    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
    ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)

    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
    punpcklbw   mmE, mmG                ; mmE=(20 30 22 32 24 34 26 36)
    punpcklbw   mmB, mmD                ; mmB=(01 11 03 13 05 15 07 17)
    punpcklbw   mmF, mmH                ; mmF=(21 31 23 33 25 35 27 37)

    movq        mmC, mmA
    punpcklwd   mmA, mmE                ; mmA=(00 10 20 30 02 12 22 32)
    punpckhwd   mmC, mmE                ; mmC=(04 14 24 34 06 16 26 36)
    movq        mmG, mmB
    punpcklwd   mmB, mmF                ; mmB=(01 11 21 31 03 13 23 33)
    punpckhwd   mmG, mmF                ; mmG=(05 15 25 35 07 17 27 37)

    movq        mmD, mmA
    punpckldq   mmA, mmB                ; mmA=(00 10 20 30 01 11 21 31)
    punpckhdq   mmD, mmB                ; mmD=(02 12 22 32 03 13 23 33)
    movq        mmH, mmC
    punpckldq   mmC, mmG                ; mmC=(04 14 24 34 05 15 25 35)
    punpckhdq   mmH, mmG                ; mmH=(06 16 26 36 07 17 27 37)

    ; Fewer than 8 columns left: take the partial-store path.
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st16

    ; Full group: 8 pixels * 4 bytes = four quadword stores.
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mmH

    sub         ecx, byte SIZEOF_MMWORD
    jz          short .endcolumn

    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
    add         esi, byte SIZEOF_MMWORD  ; inptr0
    dec         al                      ; Yctr
    jnz         near .Yloop_2nd

    ; Both luma passes done: advance the chroma pointers and start over.
    add         ebx, byte SIZEOF_MMWORD  ; inptr1
    add         edx, byte SIZEOF_MMWORD  ; inptr2
    jmp         near .columnloop
    alignx      16, 7

.column_st16:
    ; ecx still counts columns here; every quadword holds two pixels.
    cmp         ecx, byte SIZEOF_MMWORD/2
    jb          short .column_st8
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
    movq        mmA, mmC                ; shift the upper four pixels down
    movq        mmD, mmH
    sub         ecx, byte SIZEOF_MMWORD/2
    add         edi, byte 2*SIZEOF_MMWORD
.column_st8:
    cmp         ecx, byte SIZEOF_MMWORD/4
    jb          short .column_st4
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        mmA, mmD
    sub         ecx, byte SIZEOF_MMWORD/4
    add         edi, byte 1*SIZEOF_MMWORD
.column_st4:
    cmp         ecx, byte SIZEOF_MMWORD/8
    jb          short .endcolumn
    movd        dword [edi+0*SIZEOF_DWORD], mmA

%endif  ; RGB_PIXELSIZE ; ---------------

.endcolumn:
    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   (edx and ecx were not saved: they need not be preserved)
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
;                                JDIMENSION in_row_group_ctr,
;                                JSAMPARRAY output_buf);
;
; Thin driver around jsimd_h2v1_merged_upsample_mmx: a three-entry plane
; table is assembled on the stack and the h2v1 routine is called twice with
; it, the second time with the luma row pointer and the output row pointer
; each advanced by one JSAMPROW.  The chroma plane pointers are identical
; for both calls.
;
; The luma entry of the table is input_buf[0] already offset by
; in_row_group_ctr rows.  The h2v1 routine indexes each plane by
; in_row_group_ctr once more, so the two calls read luma rows
; 2*in_row_group_ctr and 2*in_row_group_ctr+1.
;
; ebx, esi and edi are saved on entry and restored on exit; the code relies
; on the h2v1 routine handing them back unchanged after the first call.
; eax, ecx and edx are not preserved.
;

%define output_width(b)      (b) + 8    ; JDIMENSION output_width
%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_mmx)

EXTN(jsimd_h2v2_merged_upsample_mmx):
    push        ebp
    mov         ebp, esp
    push        ebx
;   (ecx and edx are not saved: they need not be preserved)
    push        esi
    push        edi

    ; Gather the plane pointers; the luma plane is pre-offset by the row
    ; group counter.
    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
    mov         edi, JSAMPIMAGE [input_buf(ebp)]
    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
    mov         edi, JSAMPARRAY [output_buf(ebp)]
    mov         eax, JDIMENSION [output_width(ebp)]

    ; Plane table, pushed back to front so that entry 0 has the lowest
    ; address.
    push        edx                     ; inptr2
    push        ebx                     ; inptr1
    push        esi                     ; inptr00
    mov         ebx, esp                ; ebx = address of the plane table

    ; h2v1 arguments, last one first.  output_buf lands at
    ; [ebx-1*SIZEOF_POINTER].
    push        edi                     ; output_buf (outptr0)
    push        ecx                     ; in_row_group_ctr
    push        ebx                     ; input_buf
    push        eax                     ; output_width

    call        near EXTN(jsimd_h2v1_merged_upsample_mmx)

    ; Second row: the argument block is still on the stack, so only the
    ; output pointer slot and the luma table entry are patched.
    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
    mov         POINTER [ebx-1*SIZEOF_POINTER], edi
    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
    mov         POINTER [ebx+0*SIZEOF_POINTER], esi

    call        near EXTN(jsimd_h2v1_merged_upsample_mmx)

    add         esp, byte 7*SIZEOF_DWORD  ; 4 arguments + 3 table entries

    pop         edi
    pop         esi
;   (edx and ecx were not saved: they need not be preserved)
    pop         ebx
    pop         ebp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
517
TMessagesProj/jni/mozjpeg/simd/i386/jdmrgext-sse2.asm
Normal file
517
TMessagesProj/jni/mozjpeg/simd/i386/jdmrgext-sse2.asm
Normal file
|
|
@ -0,0 +1,517 @@
|
|||
;
|
||||
; jdmrgext.asm - merged upsampling/color conversion (SSE2)
|
||||
;
|
||||
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2012, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
|
||||
; JSAMPIMAGE input_buf,
|
||||
; JDIMENSION in_row_group_ctr,
|
||||
; JSAMPARRAY output_buf);
|
||||
;
|
||||
|
||||
%define output_width(b) (b) + 8 ; JDIMENSION output_width
|
||||
%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
|
||||
%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
|
||||
%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
|
||||
|
||||
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 3
|
||||
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
|
||||
|
||||
align 32
|
||||
; Register roles inside the column loop:
;   esi = inptr0 (Y row), ebx = inptr1 (Cb row), edx = inptr2 (Cr row),
;   edi = outptr, ecx = columns still to be written,
;   eax = GOT address while computing chroma terms, then al = Yctr.
; Each .columnloop pass reads 16 Cb and 16 Cr samples and derives the
; (R-Y), (G-Y) and (B-Y) terms for them; the low halves stay in xmm0/2/4 and
; the high halves are parked in wk(1)/wk(2)/wk(0).  The two .Yloop passes then
; each combine one 16-sample Y block with one half of those terms.
; The input rows are read with movdqa, which requires 16-byte alignment.
GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
|
||||
|
||||
EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4 ; leave room for the saved original ebp
|
||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [esp], eax ; store original ebp at the aligned frame base
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)] ; reserve wk[WK_NUM] below the aligned ebp
|
||||
pushpic eax ; make room for the GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
mov ecx, JDIMENSION [output_width(eax)] ; col
|
||||
test ecx, ecx
|
||||
jz near .return ; nothing to do when output_width == 0
|
||||
|
||||
push ecx ; park col while ecx holds in_row_group_ctr
|
||||
|
||||
mov edi, JSAMPIMAGE [input_buf(eax)] ; edi = input_buf
|
||||
mov ecx, JDIMENSION [in_row_group_ctr(eax)] ; ecx = in_row_group_ctr
|
||||
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0]
|
||||
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1]
|
||||
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2]
|
||||
mov edi, JSAMPARRAY [output_buf(eax)] ; edi = output_buf
|
||||
mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
|
||||
mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
|
||||
mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
|
||||
mov edi, JSAMPROW [edi] ; outptr
|
||||
|
||||
pop ecx ; col
|
||||
|
||||
alignx 16, 7
|
||||
.columnloop:
|
||||
movpic eax, POINTER [gotptr] ; load GOT address (eax)
|
||||
|
||||
movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
|
||||
movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
|
||||
|
||||
pxor xmm1, xmm1 ; xmm1=(all 0's)
|
||||
pcmpeqw xmm3, xmm3 ; xmm3=(all 1's)
|
||||
psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
||||
|
||||
movdqa xmm4, xmm6
|
||||
punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
|
||||
punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
|
||||
movdqa xmm0, xmm7
|
||||
punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
|
||||
punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
|
||||
|
||||
paddw xmm6, xmm3 ; CbH -= 128 (adding 0xFF80 == -128 as a signed word)
|
||||
paddw xmm4, xmm3 ; CbL -= 128
|
||||
paddw xmm7, xmm3 ; CrH -= 128
|
||||
paddw xmm0, xmm3 ; CrL -= 128
|
||||
|
||||
; (Original)
|
||||
; R = Y + 1.40200 * Cr
|
||||
; G = Y - 0.34414 * Cb - 0.71414 * Cr
|
||||
; B = Y + 1.77200 * Cb
|
||||
;
|
||||
; (This implementation)
|
||||
; R = Y + 0.40200 * Cr + Cr
|
||||
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
||||
; B = Y - 0.22800 * Cb + Cb + Cb
|
||||
|
||||
movdqa xmm5, xmm6 ; xmm5=CbH
|
||||
movdqa xmm2, xmm4 ; xmm2=CbL
|
||||
paddw xmm6, xmm6 ; xmm6=2*CbH
|
||||
paddw xmm4, xmm4 ; xmm4=2*CbL
|
||||
movdqa xmm1, xmm7 ; xmm1=CrH
|
||||
movdqa xmm3, xmm0 ; xmm3=CrL
|
||||
paddw xmm7, xmm7 ; xmm7=2*CrH
|
||||
paddw xmm0, xmm0 ; xmm0=2*CrL
|
||||
|
||||
pmulhw xmm6, [GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800))
|
||||
pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800))
|
||||
pmulhw xmm7, [GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200))
|
||||
pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200))
|
||||
|
||||
paddw xmm6, [GOTOFF(eax,PW_ONE)] ; +1 before the shift below, i.e. round to nearest
|
||||
paddw xmm4, [GOTOFF(eax,PW_ONE)]
|
||||
psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
|
||||
psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
|
||||
paddw xmm7, [GOTOFF(eax,PW_ONE)]
|
||||
paddw xmm0, [GOTOFF(eax,PW_ONE)]
|
||||
psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
|
||||
psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
|
||||
|
||||
paddw xmm6, xmm5
|
||||
paddw xmm4, xmm2
|
||||
paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
|
||||
paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
|
||||
paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
|
||||
paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
|
||||
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
|
||||
|
||||
movdqa xmm6, xmm5 ; xmm6=CbH
|
||||
movdqa xmm7, xmm2 ; xmm7=CbL
|
||||
punpcklwd xmm5, xmm1 ; interleave CbH/CrH words (lower four pairs)
|
||||
punpckhwd xmm6, xmm1 ; interleave CbH/CrH words (upper four pairs)
|
||||
pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
pmaddwd xmm6, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
punpcklwd xmm2, xmm3 ; interleave CbL/CrL words (lower four pairs)
|
||||
punpckhwd xmm7, xmm3 ; interleave CbL/CrL words (upper four pairs)
|
||||
pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
pmaddwd xmm7, [GOTOFF(eax,PW_MF0344_F0285)]
|
||||
|
||||
paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
|
||||
paddd xmm6, [GOTOFF(eax,PD_ONEHALF)]
|
||||
psrad xmm5, SCALEBITS
|
||||
psrad xmm6, SCALEBITS
|
||||
paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
|
||||
paddd xmm7, [GOTOFF(eax,PD_ONEHALF)]
|
||||
psrad xmm2, SCALEBITS
|
||||
psrad xmm7, SCALEBITS
|
||||
|
||||
packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
|
||||
packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
|
||||
psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
|
||||
psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
|
||||
|
||||
movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
|
||||
|
||||
mov al, 2 ; Yctr (overwrites the low byte of the GOT address; eax is reloaded at .columnloop)
|
||||
jmp short .Yloop_1st
|
||||
alignx 16, 7
|
||||
|
||||
.Yloop_2nd:
|
||||
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
|
||||
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
|
||||
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
|
||||
alignx 16, 7
|
||||
|
||||
.Yloop_1st:
|
||||
movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
|
||||
|
||||
pcmpeqw xmm6, xmm6
|
||||
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
|
||||
pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
|
||||
psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
|
||||
|
||||
movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
|
||||
movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
|
||||
movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
|
||||
|
||||
paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
|
||||
paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
|
||||
packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
|
||||
packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
|
||||
|
||||
paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
|
||||
paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
|
||||
packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
|
||||
packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
|
||||
|
||||
paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
|
||||
paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
|
||||
packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
|
||||
packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
|
||||
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
|
||||
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
||||
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
|
||||
|
||||
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
||||
punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
|
||||
punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
|
||||
|
||||
movdqa xmmG, xmmA
|
||||
movdqa xmmH, xmmA
|
||||
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
|
||||
punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
|
||||
|
||||
psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
|
||||
psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
|
||||
|
||||
movdqa xmmC, xmmD
|
||||
movdqa xmmB, xmmD
|
||||
punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
|
||||
punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
|
||||
|
||||
psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
|
||||
|
||||
movdqa xmmF, xmmE
|
||||
punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
|
||||
punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
|
||||
|
||||
pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
|
||||
movdqa xmmB, xmmE
|
||||
punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
|
||||
punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
|
||||
punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
|
||||
|
||||
pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
|
||||
movdqa xmmB, xmmF
|
||||
punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
|
||||
punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
|
||||
punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
|
||||
|
||||
punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
|
||||
punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
||||
|
||||
cmp ecx, byte SIZEOF_XMMWORD ; fewer than 16 columns left?
|
||||
jb short .column_st32
|
||||
|
||||
test edi, SIZEOF_XMMWORD-1 ; is outptr 16-byte aligned?
|
||||
jnz short .out1
|
||||
; --(aligned)-------------------
|
||||
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
||||
.out0:
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
jz near .endcolumn ; all columns written
|
||||
|
||||
add esi, byte SIZEOF_XMMWORD ; inptr0
|
||||
dec al ; Yctr
|
||||
jnz near .Yloop_2nd
|
||||
|
||||
add ebx, byte SIZEOF_XMMWORD ; inptr1
|
||||
add edx, byte SIZEOF_XMMWORD ; inptr2
|
||||
jmp near .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
.column_st32:
|
||||
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp ecx, byte 2*SIZEOF_XMMWORD
|
||||
jb short .column_st16
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA, xmmF
|
||||
sub ecx, byte 2*SIZEOF_XMMWORD
|
||||
jmp short .column_st15
|
||||
.column_st16:
|
||||
cmp ecx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st15
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA, xmmD
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
.column_st15:
|
||||
; Store the lower 8 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
jb short .column_st7
|
||||
movq XMM_MMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_MMWORD
|
||||
sub ecx, byte SIZEOF_MMWORD
|
||||
psrldq xmmA, SIZEOF_MMWORD
|
||||
.column_st7:
|
||||
; Store the lower 4 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp ecx, byte SIZEOF_DWORD
|
||||
jb short .column_st3
|
||||
movd XMM_DWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_DWORD
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
psrldq xmmA, SIZEOF_DWORD
|
||||
.column_st3:
|
||||
; Store the lower 2 bytes of eax to the output when it has enough
|
||||
; space.
|
||||
movd eax, xmmA
|
||||
cmp ecx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov word [edi], ax
|
||||
add edi, byte SIZEOF_WORD
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
shr eax, 16
|
||||
.column_st1:
|
||||
; Store the lower 1 byte of eax to the output when it has enough
|
||||
; space.
|
||||
test ecx, ecx
|
||||
jz short .endcolumn
|
||||
mov byte [edi], al
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
%ifdef RGBX_FILLER_0XFF
|
||||
pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
|
||||
pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
|
||||
%else
|
||||
pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
|
||||
pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
|
||||
%endif
|
||||
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
|
||||
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
|
||||
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
||||
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
|
||||
|
||||
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
||||
punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
|
||||
punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
|
||||
punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
|
||||
|
||||
movdqa xmmC, xmmA
|
||||
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
|
||||
punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
|
||||
movdqa xmmG, xmmB
|
||||
punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
|
||||
punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
|
||||
|
||||
movdqa xmmD, xmmA
|
||||
punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
|
||||
punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||
movdqa xmmH, xmmC
|
||||
punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
||||
punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||
|
||||
cmp ecx, byte SIZEOF_XMMWORD ; fewer than 16 columns left?
|
||||
jb short .column_st32
|
||||
|
||||
test edi, SIZEOF_XMMWORD-1 ; is outptr 16-byte aligned?
|
||||
jnz short .out1
|
||||
; --(aligned)-------------------
|
||||
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
||||
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
||||
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
||||
.out0:
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
jz near .endcolumn ; all columns written
|
||||
|
||||
add esi, byte SIZEOF_XMMWORD ; inptr0
|
||||
dec al ; Yctr
|
||||
jnz near .Yloop_2nd
|
||||
|
||||
add ebx, byte SIZEOF_XMMWORD ; inptr1
|
||||
add edx, byte SIZEOF_XMMWORD ; inptr2
|
||||
jmp near .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
.column_st32:
|
||||
cmp ecx, byte SIZEOF_XMMWORD/2 ; at least 8 pixels left?
|
||||
jb short .column_st16
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA, xmmC
|
||||
movdqa xmmD, xmmH
|
||||
sub ecx, byte SIZEOF_XMMWORD/2
|
||||
.column_st16:
|
||||
cmp ecx, byte SIZEOF_XMMWORD/4 ; at least 4 pixels left?
|
||||
jb short .column_st15
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA, xmmD
|
||||
sub ecx, byte SIZEOF_XMMWORD/4
|
||||
.column_st15:
|
||||
; Store two pixels (8 bytes) of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp ecx, byte SIZEOF_XMMWORD/8
|
||||
jb short .column_st7
|
||||
movq XMM_MMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD/8*4
|
||||
sub ecx, byte SIZEOF_XMMWORD/8
|
||||
psrldq xmmA, SIZEOF_XMMWORD/8*4
|
||||
.column_st7:
|
||||
; Store one pixel (4 bytes) of xmmA to the output when it has enough
|
||||
; space.
|
||||
test ecx, ecx
|
||||
jz short .endcolumn
|
||||
movd XMM_DWORD [edi], xmmA
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
.endcolumn:
|
||||
sfence ; flush the write buffer
|
||||
|
||||
.return:
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
|
||||
; JSAMPIMAGE input_buf,
|
||||
; JDIMENSION in_row_group_ctr,
|
||||
; JSAMPARRAY output_buf);
|
||||
;
|
||||
|
||||
%define output_width(b) (b) + 8 ; JDIMENSION output_width
|
||||
%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
|
||||
%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
|
||||
%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
|
||||
|
||||
align 32
|
||||
; h2v2 merged upsampling implemented as two calls to
; jsimd_h2v1_merged_upsample_sse2, one per output row.  The four argument
; slots are pushed once and stay on the stack between the calls.
;
; A temporary three-entry plane table {inptr0x, inptr1, inptr2} is built on
; the stack and passed as input_buf.  Entry 0 points at
; &input_buf[0][in_row_group_ctr]; the callee indexes it again with
; in_row_group_ctr, so the first call reads Y row 2*in_row_group_ctr and,
; after entry 0 is advanced by one JSAMPROW, the second call reads Y row
; 2*in_row_group_ctr+1.  The chroma entries are shared by both calls.
; The callee dereferences output_buf[0], so advancing the output_buf slot by
; one JSAMPROW makes the second call write the caller's output_buf[1].
GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
|
||||
|
||||
EXTN(jsimd_h2v2_merged_upsample_sse2):
|
||||
push ebp
|
||||
mov ebp, esp ; ebp = frame pointer; arguments start at ebp+8
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
mov eax, JDIMENSION [output_width(ebp)] ; eax = output_width (a JDIMENSION, not a pointer)
|
||||
|
||||
mov edi, JSAMPIMAGE [input_buf(ebp)] ; edi = input_buf
|
||||
mov ecx, JDIMENSION [in_row_group_ctr(ebp)] ; ecx = in_row_group_ctr
|
||||
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0]
|
||||
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1]
|
||||
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2]
|
||||
mov edi, JSAMPARRAY [output_buf(ebp)] ; edi = output_buf
|
||||
lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; esi = &input_buf[0][in_row_group_ctr]
|
||||
|
||||
push edx ; inptr2
|
||||
push ebx ; inptr1
|
||||
push esi ; inptr00
|
||||
mov ebx, esp ; ebx = address of the temporary plane table just pushed
|
||||
|
||||
push edi ; output_buf (outptr0)
|
||||
push ecx ; in_row_group_ctr
|
||||
push ebx ; input_buf
|
||||
push eax ; output_width
|
||||
|
||||
call near EXTN(jsimd_h2v1_merged_upsample_sse2) ; first output row
|
||||
|
||||
; The callee saves and restores ebx, esi and edi, so they are still valid here.
add esi, byte SIZEOF_JSAMPROW ; inptr01
|
||||
add edi, byte SIZEOF_JSAMPROW ; outptr1
|
||||
mov POINTER [ebx+0*SIZEOF_POINTER], esi ; plane table entry 0 = inptr01
|
||||
mov POINTER [ebx-1*SIZEOF_POINTER], edi ; output_buf argument slot = outptr1
|
||||
|
||||
call near EXTN(jsimd_h2v1_merged_upsample_sse2) ; second output row, same argument slots
|
||||
|
||||
add esp, byte 7*SIZEOF_DWORD ; drop 4 argument slots + 3 plane table entries
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
760
TMessagesProj/jni/mozjpeg/simd/i386/jdsample-avx2.asm
Normal file
760
TMessagesProj/jni/mozjpeg/simd/i386/jdsample-avx2.asm
Normal file
|
|
@ -0,0 +1,760 @@
|
|||
;
|
||||
; jdsample.asm - upsampling (AVX2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_fancy_upsample_avx2)
|
||||
|
||||
EXTN(jconst_fancy_upsample_avx2):
|
||||
|
||||
PW_ONE times 16 dw 1
|
||||
PW_TWO times 16 dw 2
|
||||
PW_THREE times 16 dw 3
|
||||
PW_SEVEN times 16 dw 7
|
||||
PW_EIGHT times 16 dw 8
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
|
||||
;
|
||||
; The upsampling algorithm is linear interpolation between pixel centers,
|
||||
; also known as a "triangle filter". This is a good compromise between
|
||||
; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
|
||||
; of the way between input pixel centers.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
|
||||
; JDIMENSION downsampled_width,
|
||||
; JSAMPARRAY input_data,
|
||||
; JSAMPARRAY *output_data_ptr);
|
||||
;
|
||||
|
||||
%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
|
||||
%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
|
||||
%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
|
||||
%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
|
||||
|
||||
align 32
|
||||
; Register roles inside the row loop:
;   eax = colctr (rounded up to a multiple of SIZEOF_YMMWORD per row),
;   ecx = rowctr, esi = inptr, edi = outptr, ebx = GOT address,
;   ymm0 = zero, ymm7 = left-neighbour carry, ymm6 = right-neighbour carry.
; Each .upsample pass turns 32 input samples into 64 output samples:
;   even output = (3 * cur + left  + 1) >> 2
;   odd  output = (3 * cur + right + 2) >> 2
; For the first block of a row the left neighbour is the first sample itself;
; for the last block the right neighbour is the block's own last byte.  When
; the width is not a multiple of 32 the last real sample is first duplicated
; one position past the end of the input row, which writes to the input row.
GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
|
||||
|
||||
EXTN(jsimd_h2v1_fancy_upsample_avx2):
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
pushpic ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
|
||||
test eax, eax
|
||||
jz near .return ; nothing to do for a zero-width row
|
||||
|
||||
mov ecx, INT [max_v_samp(ebp)] ; rowctr
|
||||
test ecx, ecx
|
||||
jz near .return ; nothing to do for zero rows
|
||||
|
||||
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
|
||||
mov edi, POINTER [output_data_ptr(ebp)]
|
||||
mov edi, JSAMPARRAY [edi] ; output_data
|
||||
alignx 16, 7
|
||||
.rowloop:
|
||||
push eax ; colctr
|
||||
push edi
|
||||
push esi
|
||||
|
||||
mov esi, JSAMPROW [esi] ; inptr
|
||||
mov edi, JSAMPROW [edi] ; outptr
|
||||
|
||||
test eax, SIZEOF_YMMWORD-1 ; is the width a multiple of 32?
|
||||
jz short .skip
|
||||
mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] ; dl = last real sample
|
||||
mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
|
||||
.skip:
|
||||
vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
|
||||
vpcmpeqb xmm7, xmm7, xmm7
|
||||
vpsrldq xmm7, xmm7, (SIZEOF_XMMWORD-1) ; (ff -- -- -- ... -- --) LSB is ff
|
||||
vpand ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm7=( 0 -- -- ... -- -- --): first sample acts as its own left neighbour
|
||||
|
||||
add eax, byte SIZEOF_YMMWORD-1
|
||||
and eax, byte -SIZEOF_YMMWORD ; round colctr up to a multiple of 32
|
||||
cmp eax, byte SIZEOF_YMMWORD
|
||||
ja short .columnloop ; more than one block in this row
|
||||
alignx 16, 7
|
||||
|
||||
.columnloop_last:
|
||||
vpcmpeqb xmm6, xmm6, xmm6
|
||||
vpslldq xmm6, xmm6, (SIZEOF_XMMWORD-1)
|
||||
vperm2i128 ymm6, ymm6, ymm6, 1 ; (---- ---- ... ---- ---- ff) MSB is ff
|
||||
vpand ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm6=(-- -- ... -- -- 31): last sample acts as its own right neighbour
|
||||
jmp short .upsample
|
||||
alignx 16, 7
|
||||
|
||||
.columnloop:
|
||||
vmovdqu ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD] ; ymm6=next block (32 33 ... 63)
|
||||
vperm2i128 ymm6, ymm0, ymm6, 0x20 ; move the next block's low lane into the high lane, zero the low lane
|
||||
vpslldq ymm6, ymm6, 15 ; ymm6=(-- -- ... -- -- 32)
|
||||
|
||||
.upsample:
|
||||
vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm1=( 0 1 2 ... 29 30 31)
|
||||
|
||||
vperm2i128 ymm2, ymm0, ymm1, 0x20
|
||||
vpalignr ymm2, ymm1, ymm2, 15 ; ymm2=(-- 0 1 ... 28 29 30)
|
||||
vperm2i128 ymm4, ymm0, ymm1, 0x03
|
||||
vpalignr ymm3, ymm4, ymm1, 1 ; ymm3=( 1 2 3 ... 30 31 --)
|
||||
|
||||
vpor ymm2, ymm2, ymm7 ; ymm2=(-1 0 1 ... 28 29 30)
|
||||
vpor ymm3, ymm3, ymm6 ; ymm3=( 1 2 3 ... 30 31 32)
|
||||
|
||||
vpsrldq ymm7, ymm4, (SIZEOF_XMMWORD-1) ; ymm7=(31 -- -- ... -- -- --)
|
||||
|
||||
vpunpckhbw ymm4, ymm1, ymm0 ; ymm4=( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
|
||||
vpunpcklbw ymm5, ymm1, ymm0 ; ymm5=( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
|
||||
vperm2i128 ymm1, ymm5, ymm4, 0x20 ; ymm1=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
|
||||
vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
|
||||
|
||||
vpunpckhbw ymm5, ymm2, ymm0 ; ymm5=( 7 8 9 10 11 12 13 14 23 24 25 26 27 28 29 30)
|
||||
vpunpcklbw ymm6, ymm2, ymm0 ; ymm6=(-1 0 1 2 3 4 5 6 15 16 17 18 19 20 21 22)
|
||||
vperm2i128 ymm2, ymm6, ymm5, 0x20 ; ymm2=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
|
||||
vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
|
||||
|
||||
vpunpckhbw ymm6, ymm3, ymm0 ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
|
||||
vpunpcklbw ymm0, ymm3, ymm0 ; ymm0=( 1 2 3 4 5 6 7 8 17 18 19 20 21 22 23 24)
|
||||
vperm2i128 ymm3, ymm0, ymm6, 0x20 ; ymm3=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
|
||||
vperm2i128 ymm6, ymm0, ymm6, 0x31 ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
|
||||
|
||||
vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
|
||||
|
||||
vpmullw ymm1, ymm1, [GOTOFF(ebx,PW_THREE)] ; ymm1=3*cur (low 16 samples)
|
||||
vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)] ; ymm4=3*cur (high 16 samples)
|
||||
vpaddw ymm2, ymm2, [GOTOFF(ebx,PW_ONE)] ; left + 1
|
||||
vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_ONE)]
|
||||
vpaddw ymm3, ymm3, [GOTOFF(ebx,PW_TWO)] ; right + 2
|
||||
vpaddw ymm6, ymm6, [GOTOFF(ebx,PW_TWO)]
|
||||
|
||||
vpaddw ymm2, ymm2, ymm1
|
||||
vpaddw ymm5, ymm5, ymm4
|
||||
vpsrlw ymm2, ymm2, 2 ; ymm2=OutLE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
|
||||
vpsrlw ymm5, ymm5, 2 ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
|
||||
vpaddw ymm3, ymm3, ymm1
|
||||
vpaddw ymm6, ymm6, ymm4
|
||||
vpsrlw ymm3, ymm3, 2 ; ymm3=OutLO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
|
||||
vpsrlw ymm6, ymm6, 2 ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
|
||||
|
||||
vpsllw ymm3, ymm3, BYTE_BIT ; move odd outputs into the high byte of each word
|
||||
vpsllw ymm6, ymm6, BYTE_BIT
|
||||
vpor ymm2, ymm2, ymm3 ; ymm2=OutL=( 0 1 2 ... 29 30 31)
|
||||
vpor ymm5, ymm5, ymm6 ; ymm5=OutH=(32 33 34 ... 61 62 63)
|
||||
|
||||
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
|
||||
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5
|
||||
|
||||
sub eax, byte SIZEOF_YMMWORD
|
||||
add esi, byte 1*SIZEOF_YMMWORD ; inptr
|
||||
add edi, byte 2*SIZEOF_YMMWORD ; outptr
|
||||
cmp eax, byte SIZEOF_YMMWORD
|
||||
ja near .columnloop ; more than one block left
|
||||
test eax, eax
|
||||
jnz near .columnloop_last ; exactly one block left
|
||||
|
||||
pop esi
|
||||
pop edi
|
||||
pop eax
|
||||
|
||||
add esi, byte SIZEOF_JSAMPROW ; input_data
|
||||
add edi, byte SIZEOF_JSAMPROW ; output_data
|
||||
dec ecx ; rowctr
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
vzeroupper
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
poppic ebx
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
|
||||
; Again a triangle filter; see comments for h2v1 case, above.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
|
||||
; JDIMENSION downsampled_width,
|
||||
; JSAMPARRAY input_data,
|
||||
; JSAMPARRAY *output_data_ptr);
|
||||
;
|
||||
|
||||
%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
|
||||
%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
|
||||
%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
|
||||
%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
|
||||
|
||||
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
|
||||
; ymmword wk[WK_NUM]
|
||||
%define WK_NUM 4
|
||||
%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
|
||||
|
||||
EXTN(jsimd_h2v2_fancy_upsample_avx2):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic eax ; make a room for GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
mov edx, eax ; edx = original ebp
|
||||
mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
|
||||
test eax, eax
|
||||
jz near .return
|
||||
|
||||
mov ecx, INT [max_v_samp(edx)] ; rowctr
|
||||
test ecx, ecx
|
||||
jz near .return
|
||||
|
||||
mov esi, JSAMPARRAY [input_data(edx)] ; input_data
|
||||
mov edi, POINTER [output_data_ptr(edx)]
|
||||
mov edi, JSAMPARRAY [edi] ; output_data
|
||||
alignx 16, 7
|
||||
.rowloop:
|
||||
push eax ; colctr
|
||||
push ecx
|
||||
push edi
|
||||
push esi
|
||||
|
||||
mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
|
||||
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
|
||||
mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
|
||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
|
||||
mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
|
||||
|
||||
test eax, SIZEOF_YMMWORD-1
|
||||
jz short .skip
|
||||
push edx
|
||||
mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
|
||||
mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
|
||||
mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
|
||||
pop edx
|
||||
.skip:
|
||||
; -- process the first column block
|
||||
|
||||
vmovdqu ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD] ; ymm0=row[ 0][0]
|
||||
vmovdqu ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0]
|
||||
vmovdqu ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0]
|
||||
|
||||
pushpic ebx
|
||||
movpic ebx, POINTER [gotptr] ; load GOT address
|
||||
|
||||
vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's)
|
||||
|
||||
vpunpckhbw ymm4, ymm0, ymm3 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
|
||||
vpunpcklbw ymm5, ymm0, ymm3 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
|
||||
vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
|
||||
vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
|
||||
|
||||
vpunpckhbw ymm5, ymm1, ymm3 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
|
||||
vpunpcklbw ymm6, ymm1, ymm3 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
|
||||
vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
|
||||
vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
|
||||
|
||||
vpunpckhbw ymm6, ymm2, ymm3 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
|
||||
vpunpcklbw ymm3, ymm2, ymm3 ; ymm3=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
|
||||
vperm2i128 ymm2, ymm3, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
|
||||
vperm2i128 ymm6, ymm3, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
|
||||
|
||||
vpmullw ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
|
||||
vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
|
||||
|
||||
vpcmpeqb xmm7, xmm7, xmm7
|
||||
vpsrldq xmm7, xmm7, (SIZEOF_XMMWORD-2) ; (ffff ---- ---- ... ---- ----) LSB is ffff
|
||||
|
||||
vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
|
||||
vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
|
||||
vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
|
||||
vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
|
||||
|
||||
vmovdqu YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1 ; temporarily save
|
||||
vmovdqu YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5 ; the intermediate data
|
||||
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
|
||||
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6
|
||||
|
||||
vpand ymm1, ymm1, ymm7 ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
|
||||
vpand ymm2, ymm2, ymm7 ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
|
||||
|
||||
vmovdqa YMMWORD [wk(0)], ymm1
|
||||
vmovdqa YMMWORD [wk(1)], ymm2
|
||||
|
||||
poppic ebx
|
||||
|
||||
add eax, byte SIZEOF_YMMWORD-1
|
||||
and eax, byte -SIZEOF_YMMWORD
|
||||
cmp eax, byte SIZEOF_YMMWORD
|
||||
ja short .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
.columnloop_last:
|
||||
; -- process the last column block
|
||||
|
||||
pushpic ebx
|
||||
movpic ebx, POINTER [gotptr] ; load GOT address
|
||||
|
||||
vpcmpeqb xmm1, xmm1, xmm1
|
||||
vpslldq xmm1, xmm1, (SIZEOF_XMMWORD-2)
|
||||
vperm2i128 ymm1, ymm1, ymm1, 1 ; (---- ---- ... ---- ---- ffff) MSB is ffff
|
||||
|
||||
vpand ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD]
|
||||
vpand ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD]
|
||||
|
||||
vmovdqa YMMWORD [wk(2)], ymm1 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
|
||||
vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
|
||||
|
||||
jmp near .upsample
|
||||
alignx 16, 7
|
||||
|
||||
.columnloop:
|
||||
; -- process the next column block
|
||||
|
||||
vmovdqu ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD] ; ymm0=row[ 0][1]
|
||||
vmovdqu ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1]
|
||||
vmovdqu ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1]
|
||||
|
||||
pushpic ebx
|
||||
movpic ebx, POINTER [gotptr] ; load GOT address
|
||||
|
||||
vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's)
|
||||
|
||||
vpunpckhbw ymm4, ymm0, ymm3 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
|
||||
vpunpcklbw ymm5, ymm0, ymm3 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
|
||||
vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
|
||||
vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
|
||||
|
||||
vpunpckhbw ymm5, ymm1, ymm3 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
|
||||
vpunpcklbw ymm6, ymm1, ymm3 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
|
||||
vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
|
||||
vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
|
||||
|
||||
vpunpckhbw ymm6, ymm2, ymm3 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
|
||||
vpunpcklbw ymm7, ymm2, ymm3 ; ymm7=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
|
||||
vperm2i128 ymm2, ymm7, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
|
||||
vperm2i128 ymm6, ymm7, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
|
||||
|
||||
vpmullw ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
|
||||
vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
|
||||
|
||||
vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
|
||||
vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
|
||||
vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
|
||||
vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
|
||||
|
||||
vmovdqu YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1 ; temporarily save
|
||||
vmovdqu YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5 ; the intermediate data
|
||||
vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2
|
||||
vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6
|
||||
|
||||
vperm2i128 ymm1, ymm3, ymm1, 0x20
|
||||
vpslldq ymm1, ymm1, 14 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
|
||||
vperm2i128 ymm2, ymm3, ymm2, 0x20
|
||||
vpslldq ymm2, ymm2, 14 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
|
||||
|
||||
vmovdqa YMMWORD [wk(2)], ymm1
|
||||
vmovdqa YMMWORD [wk(3)], ymm2
|
||||
|
||||
.upsample:
|
||||
; -- process the upper row
|
||||
|
||||
vmovdqu ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD] ; ymm7=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
|
||||
vmovdqu ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD] ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
|
||||
|
||||
vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
|
||||
|
||||
vperm2i128 ymm0, ymm1, ymm7, 0x03
|
||||
vpalignr ymm0, ymm0, ymm7, 2 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
|
||||
vperm2i128 ymm4, ymm1, ymm3, 0x20
|
||||
vpslldq ymm4, ymm4, 14 ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
|
||||
|
||||
vperm2i128 ymm5, ymm1, ymm7, 0x03
|
||||
vpsrldq ymm5, ymm5, 14 ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
|
||||
vperm2i128 ymm6, ymm1, ymm3, 0x20
|
||||
vpalignr ymm6, ymm3, ymm6, 14 ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
|
||||
|
||||
vpor ymm0, ymm0, ymm4 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
|
||||
vpor ymm5, ymm5, ymm6 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
|
||||
|
||||
vperm2i128 ymm2, ymm1, ymm3, 0x03
|
||||
vpalignr ymm2, ymm2, ymm3, 2 ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
|
||||
vperm2i128 ymm4, ymm1, ymm3, 0x03
|
||||
vpsrldq ymm4, ymm4, 14 ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
|
||||
vperm2i128 ymm1, ymm1, ymm7, 0x20
|
||||
vpalignr ymm1, ymm7, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
|
||||
|
||||
vpor ymm1, ymm1, YMMWORD [wk(0)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
|
||||
vpor ymm2, ymm2, YMMWORD [wk(2)] ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
|
||||
|
||||
vmovdqa YMMWORD [wk(0)], ymm4
|
||||
|
||||
vpmullw ymm7, ymm7, [GOTOFF(ebx,PW_THREE)]
|
||||
vpmullw ymm3, ymm3, [GOTOFF(ebx,PW_THREE)]
|
||||
vpaddw ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
|
||||
vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)]
|
||||
vpaddw ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)]
|
||||
vpaddw ymm2, [GOTOFF(ebx,PW_SEVEN)]
|
||||
|
||||
vpaddw ymm1, ymm1, ymm7
|
||||
vpaddw ymm5, ymm5, ymm3
|
||||
vpsrlw ymm1, ymm1, 4 ; ymm1=Out0LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
|
||||
vpsrlw ymm5, ymm5, 4 ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
|
||||
vpaddw ymm0, ymm0, ymm7
|
||||
vpaddw ymm2, ymm2, ymm3
|
||||
vpsrlw ymm0, ymm0, 4 ; ymm0=Out0LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
|
||||
vpsrlw ymm2, ymm2, 4 ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
|
||||
|
||||
vpsllw ymm0, ymm0, BYTE_BIT
|
||||
vpsllw ymm2, ymm2, BYTE_BIT
|
||||
vpor ymm1, ymm1, ymm0 ; ymm1=Out0L=( 0 1 2 ... 29 30 31)
|
||||
vpor ymm5, ymm5, ymm2 ; ymm5=Out0H=(32 33 34 ... 61 62 63)
|
||||
|
||||
vmovdqu YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1
|
||||
vmovdqu YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5
|
||||
|
||||
; -- process the lower row
|
||||
|
||||
vmovdqu ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD] ; ymm6=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
|
||||
vmovdqu ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD] ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
|
||||
|
||||
vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
|
||||
|
||||
vperm2i128 ymm7, ymm1, ymm6, 0x03
|
||||
vpalignr ymm7, ymm7, ymm6, 2 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
|
||||
vperm2i128 ymm3, ymm1, ymm4, 0x20
|
||||
vpslldq ymm3, ymm3, 14 ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
|
||||
|
||||
vperm2i128 ymm0, ymm1, ymm6, 0x03
|
||||
vpsrldq ymm0, ymm0, 14 ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
|
||||
vperm2i128 ymm2, ymm1, ymm4, 0x20
|
||||
vpalignr ymm2, ymm4, ymm2, 14 ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
|
||||
|
||||
vpor ymm7, ymm7, ymm3 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
|
||||
vpor ymm0, ymm0, ymm2 ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
|
||||
|
||||
vperm2i128 ymm5, ymm1, ymm4, 0x03
|
||||
vpalignr ymm5, ymm5, ymm4, 2 ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
|
||||
vperm2i128 ymm3, ymm1, ymm4, 0x03
|
||||
vpsrldq ymm3, ymm3, 14 ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
|
||||
vperm2i128 ymm1, ymm1, ymm6, 0x20
|
||||
vpalignr ymm1, ymm6, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
|
||||
|
||||
vpor ymm1, ymm1, YMMWORD [wk(1)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
|
||||
vpor ymm5, ymm5, YMMWORD [wk(3)] ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
|
||||
|
||||
vmovdqa YMMWORD [wk(1)], ymm3
|
||||
|
||||
vpmullw ymm6, ymm6, [GOTOFF(ebx,PW_THREE)]
|
||||
vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
|
||||
vpaddw ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
|
||||
vpaddw ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)]
|
||||
vpaddw ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)]
|
||||
vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)]
|
||||
|
||||
vpaddw ymm1, ymm1, ymm6
|
||||
vpaddw ymm0, ymm0, ymm4
|
||||
vpsrlw ymm1, ymm1, 4 ; ymm1=Out1LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
|
||||
vpsrlw ymm0, ymm0, 4 ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
|
||||
vpaddw ymm7, ymm7, ymm6
|
||||
vpaddw ymm5, ymm5, ymm4
|
||||
vpsrlw ymm7, ymm7, 4 ; ymm7=Out1LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
|
||||
vpsrlw ymm5, ymm5, 4 ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
|
||||
|
||||
vpsllw ymm7, ymm7, BYTE_BIT
|
||||
vpsllw ymm5, ymm5, BYTE_BIT
|
||||
vpor ymm1, ymm1, ymm7 ; ymm1=Out1L=( 0 1 2 ... 29 30 31)
|
||||
vpor ymm0, ymm0, ymm5 ; ymm0=Out1H=(32 33 34 ... 61 62 63)
|
||||
|
||||
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1
|
||||
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0
|
||||
|
||||
poppic ebx
|
||||
|
||||
sub eax, byte SIZEOF_YMMWORD
|
||||
add ecx, byte 1*SIZEOF_YMMWORD ; inptr1(above)
|
||||
add ebx, byte 1*SIZEOF_YMMWORD ; inptr0
|
||||
add esi, byte 1*SIZEOF_YMMWORD ; inptr1(below)
|
||||
add edx, byte 2*SIZEOF_YMMWORD ; outptr0
|
||||
add edi, byte 2*SIZEOF_YMMWORD ; outptr1
|
||||
cmp eax, byte SIZEOF_YMMWORD
|
||||
ja near .columnloop
|
||||
test eax, eax
|
||||
jnz near .columnloop_last
|
||||
|
||||
pop esi
|
||||
pop edi
|
||||
pop ecx
|
||||
pop eax
|
||||
|
||||
add esi, byte 1*SIZEOF_JSAMPROW ; input_data
|
||||
add edi, byte 2*SIZEOF_JSAMPROW ; output_data
|
||||
sub ecx, byte 2 ; rowctr
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
vzeroupper
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
|
||||
; It's still a box filter.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
|
||||
; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
|
||||
;
|
||||
|
||||
%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)

; Box-filter 2:1 horizontal upsampling: every input sample is written twice
; in a row to the output.  One output row is produced per input row.
;
; Arguments are read from the stack through ebp (see the %defines above).
;
; Register roles:
;   edx     = output_width rounded up to a multiple of SIZEOF_YMMWORD
;   ecx     = rows still to be processed
;   esi/edi = input/output row-pointer arrays in the row loop,
;             input/output sample pointers in the column loop
;   eax     = output columns still to be produced in the current row
;
; Clobbers eax, ecx, edx, ymm0, ymm1 and the flags; esi, edi and ebp are
; saved and restored; ebx is not touched.
;
; NOTE(review): stores always cover whole 32-byte groups, so each output row
; presumably has room for output_width rounded up to 32 bytes, and each input
; row is readable for half of that - confirm against the caller's buffers.

EXTN(jsimd_h2v1_upsample_avx2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; edx = padded output width; a result of 0 means there is nothing to do
    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (SIZEOF_YMMWORD-1)
    and         edx, -SIZEOF_YMMWORD
    jz          short .done

    mov         ecx, INT [max_v_samp(ebp)]  ; ecx = row counter
    test        ecx, ecx
    jz          short .done

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; esi = input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]              ; edi = output_data
    alignx      16, 7
.row_loop:
    push        edi                     ; keep the row-pointer cursors
    push        esi

    mov         eax, edx                ; eax = column counter
    mov         esi, JSAMPROW [esi]     ; esi = input sample pointer
    mov         edi, JSAMPROW [edi]     ; edi = output sample pointer
    alignx      16, 7
.col_loop:

    ; eax is always a non-zero multiple of SIZEOF_YMMWORD here.
    cmp         eax, byte SIZEOF_YMMWORD
    ja          near .wide_block

    ; Final 32 output columns: 16 input samples, each one duplicated.
    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0        ; xmm1 = samples 8..15 doubled
    vpunpcklbw  xmm0, xmm0, xmm0        ; xmm0 = samples 0..7 doubled

    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

    jmp         short .row_done

.wide_block:
    ; At least 64 output columns remain: 32 input samples at once.
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]

    ; 0xd8 orders the qwords as 0,2,1,3 so that the in-lane unpacks below
    ; emit the 64 output bytes in linear order.
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0        ; ymm1 = samples 16..31 doubled
    vpunpcklbw  ymm0, ymm0, ymm0        ; ymm0 = samples 0..15 doubled

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1

    sub         eax, byte 2*SIZEOF_YMMWORD
    jz          short .row_done

    add         edi, byte 2*SIZEOF_YMMWORD  ; advance output sample pointer
    add         esi, byte SIZEOF_YMMWORD    ; advance input sample pointer
    jmp         short .col_loop
    alignx      16, 7

.row_done:
    pop         esi
    pop         edi

    add         edi, byte SIZEOF_JSAMPROW   ; next output_data entry
    add         esi, byte SIZEOF_JSAMPROW   ; next input_data entry
    dec         ecx                         ; one row finished
    jg          short .row_loop

.done:
    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
|
||||
; It's still a box filter.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
|
||||
; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
|
||||
;
|
||||
|
||||
%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)

; Box-filter 2:1 horizontal and 2:1 vertical upsampling: every input sample
; is duplicated horizontally and the doubled row is stored to two consecutive
; output rows.  The row counter drops by 2 per input row.
;
; Arguments are read from the stack through ebp (see the %defines above).
;
; Register roles:
;   edx     = output_width rounded up to a multiple of SIZEOF_YMMWORD
;   ecx     = output rows still to be produced
;   esi     = input row-pointer array / input sample pointer
;   edi     = output row-pointer array / sample pointer of the second row
;   ebx     = sample pointer of the first output row of the pair
;   eax     = output columns still to be produced in the current row pair
;
; Clobbers eax, ecx, edx, ymm0, ymm1 and the flags; ebx, esi, edi and ebp
; are saved and restored.
;
; NOTE(review): stores always cover whole 32-byte groups, so the output rows
; presumably have room for output_width rounded up to 32 bytes - confirm
; against the caller's buffers.

EXTN(jsimd_h2v2_upsample_avx2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; edx = padded output width; a result of 0 means there is nothing to do
    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (SIZEOF_YMMWORD-1)
    and         edx, -SIZEOF_YMMWORD
    jz          near .done

    mov         ecx, INT [max_v_samp(ebp)]  ; ecx = row counter
    test        ecx, ecx
    jz          near .done

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; esi = input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]              ; edi = output_data
    alignx      16, 7
.row_loop:
    push        edi                     ; keep the row-pointer cursors
    push        esi

    mov         eax, edx                                ; column counter
    mov         esi, JSAMPROW [esi]                     ; input samples
    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; first output row
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; second output row
    alignx      16, 7
.col_loop:

    ; eax is always a non-zero multiple of SIZEOF_YMMWORD here.
    cmp         eax, byte SIZEOF_YMMWORD
    ja          short .wide_block

    ; Final 32 output columns: 16 input samples, each one duplicated.
    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0        ; xmm1 = samples 8..15 doubled
    vpunpcklbw  xmm0, xmm0, xmm0        ; xmm0 = samples 0..7 doubled

    ; the same doubled data goes to both rows of the pair
    vmovdqu     XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

    jmp         near .row_done

.wide_block:
    ; At least 64 output columns remain: 32 input samples at once.
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]

    ; 0xd8 orders the qwords as 0,2,1,3 so that the in-lane unpacks below
    ; emit the 64 output bytes in linear order.
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0        ; ymm1 = samples 16..31 doubled
    vpunpcklbw  ymm0, ymm0, ymm0        ; ymm0 = samples 0..15 doubled

    vmovdqu     YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1

    sub         eax, byte 2*SIZEOF_YMMWORD
    jz          short .row_done

    add         esi, byte SIZEOF_YMMWORD    ; advance input sample pointer
    add         ebx, 2*SIZEOF_YMMWORD       ; advance first-row pointer
    add         edi, 2*SIZEOF_YMMWORD       ; advance second-row pointer
    jmp         short .col_loop
    alignx      16, 7

.row_done:
    pop         esi
    pop         edi

    add         edi, byte 2*SIZEOF_JSAMPROW ; skip the two rows just written
    add         esi, byte 1*SIZEOF_JSAMPROW ; next input_data entry
    sub         ecx, byte 2                 ; two output rows finished
    jg          near .row_loop

.done:
    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
731
TMessagesProj/jni/mozjpeg/simd/i386/jdsample-mmx.asm
Normal file
731
TMessagesProj/jni/mozjpeg/simd/i386/jdsample-mmx.asm
Normal file
|
|
@ -0,0 +1,731 @@
|
|||
;
|
||||
; jdsample.asm - upsampling (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for the MMX fancy (triangle-filter) upsamplers.  Each entry
; is four 16-bit words, i.e. one MMWORD, so it can be used directly as the
; memory operand of pmullw/paddw on an mm register.
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_mmx)

EXTN(jconst_fancy_upsample_mmx):

PW_ONE:     times 4 dw 1                ; h2v1: bias added before the shift by 2
PW_TWO:     times 4 dw 2                ; h2v1: bias added before the shift by 2
PW_THREE:   times 4 dw 3                ; weight applied to the nearer sample
PW_SEVEN:   times 4 dw 7                ; h2v2: bias added before the shift by 4
PW_EIGHT:   times 4 dw 8                ; h2v2: bias added before the shift by 4

    alignz      32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
|
||||
;
|
||||
; The upsampling algorithm is linear interpolation between pixel centers,
|
||||
; also known as a "triangle filter". This is a good compromise between
|
||||
; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
|
||||
; of the way between input pixel centers.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor,
|
||||
; JDIMENSION downsampled_width,
|
||||
; JSAMPARRAY input_data,
|
||||
; JSAMPARRAY *output_data_ptr);
|
||||
;
|
||||
|
||||
%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx)

; Triangle-filter 2:1 horizontal upsampling, eight input samples per pass.
; For input sample s[i] with neighbours s[i-1] and s[i+1]:
;   out[2i]   = (3*s[i] + s[i-1] + 1) shifted right by 2
;   out[2i+1] = (3*s[i] + s[i+1] + 2) shifted right by 2
; The first sample of a row is its own left neighbour and the last sample of
; the final 8-sample block is its own right neighbour.
;
; Arguments are read from the stack through ebp (see the %defines above).
;
; Register roles:
;   eax     = input columns (kept across rows; rounded up to a multiple of
;             SIZEOF_MMWORD while a row is being processed)
;   ecx     = rows still to be processed
;   esi/edi = input/output row-pointer arrays in the row loop,
;             input/output sample pointers in the column loop
;   ebx     = GOT address used by GOTOFF() to reach the PW_* constants
;   mm0     = all zeros (unpack partner)
;   mm7     = left neighbour of the block's first sample, in the low byte
;   mm6     = right neighbour of the block's last sample, in the high byte
;
; Clobbers eax, ecx, edx (dl), mm0-mm7 and the flags.
;
; When the width is not a multiple of SIZEOF_MMWORD the routine stores one
; byte just past the last input sample (a copy of that sample).
; NOTE(review): this presumably relies on the caller padding the input and
; output rows to whole 8-sample / 16-sample groups - confirm.

EXTN(jsimd_h2v1_fancy_upsample_mmx):
    push        ebp
    mov         ebp, esp
    pushpic     ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address

    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; eax = column counter
    test        eax, eax
    jz          near .done

    mov         ecx, INT [max_v_samp(ebp)]  ; ecx = row counter
    test        ecx, ecx
    jz          near .done

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; esi = input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]              ; edi = output_data
    alignx      16, 7
.row_loop:
    push        eax                     ; unrounded column count
    push        edi
    push        esi

    mov         edi, JSAMPROW [edi]     ; edi = output sample pointer
    mov         esi, JSAMPROW [esi]     ; esi = input sample pointer

    ; Partial last block: repeat the final sample once beyond the row end.
    test        eax, SIZEOF_MMWORD-1
    jz          short .no_pad
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl
.no_pad:
    ; mm7 = first sample of the row in the low byte (its own left neighbour)
    pcmpeqb     mm7, mm7
    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT
    pand        mm7, MMWORD [esi+0*SIZEOF_MMWORD]
    pxor        mm0, mm0                ; mm0=(all 0's)

    ; round the column counter up to whole MMWORD blocks
    add         eax, byte SIZEOF_MMWORD-1
    and         eax, byte -SIZEOF_MMWORD
    cmp         eax, byte SIZEOF_MMWORD
    ja          short .next_block
    alignx      16, 7

.last_block:
    ; mm6 = last sample of this block in the high byte (own right neighbour)
    pcmpeqb     mm6, mm6
    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
    pand        mm6, MMWORD [esi+0*SIZEOF_MMWORD]
    jmp         short .filter
    alignx      16, 7

.next_block:
    ; mm6 = first sample of the following block, moved to the high byte
    movq        mm6, MMWORD [esi+1*SIZEOF_MMWORD]
    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT

.filter:
    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]  ; mm1=( 0 1 2 3 4 5 6 7)

    ; left-neighbour vector
    movq        mm2, mm1
    psllq       mm2, BYTE_BIT           ; mm2=( - 0 1 2 3 4 5 6)
    por         mm2, mm7                ; mm2=(-1 0 1 2 3 4 5 6)

    ; right-neighbour vector
    movq        mm3, mm1
    psrlq       mm3, BYTE_BIT           ; mm3=( 1 2 3 4 5 6 7 -)
    por         mm3, mm6                ; mm3=( 1 2 3 4 5 6 7 8)

    ; carry this block's last sample over as the next left neighbour
    movq        mm7, mm1
    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)

    ; centre samples widened to words and weighted by 3
    movq        mm4, mm1
    punpcklbw   mm1, mm0                ; mm1=( 0 1 2 3)
    punpckhbw   mm4, mm0                ; mm4=( 4 5 6 7)
    pmullw      mm1, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]

    ; even outputs: (left + 1 + 3*centre), shifted right by 2
    movq        mm5, mm2
    punpcklbw   mm2, mm0                ; mm2=(-1 0 1 2)
    punpckhbw   mm5, mm0                ; mm5=( 3 4 5 6)
    paddw       mm2, [GOTOFF(ebx,PW_ONE)]
    paddw       mm5, [GOTOFF(ebx,PW_ONE)]
    paddw       mm2, mm1
    paddw       mm5, mm4
    psrlw       mm2, 2                  ; mm2=OutLE=( 0 2 4 6)
    psrlw       mm5, 2                  ; mm5=OutHE=( 8 10 12 14)

    ; odd outputs: (right + 2 + 3*centre), shifted right by 2
    movq        mm6, mm3
    punpcklbw   mm3, mm0                ; mm3=( 1 2 3 4)
    punpckhbw   mm6, mm0                ; mm6=( 5 6 7 8)
    paddw       mm3, [GOTOFF(ebx,PW_TWO)]
    paddw       mm6, [GOTOFF(ebx,PW_TWO)]
    paddw       mm3, mm1
    paddw       mm6, mm4
    psrlw       mm3, 2                  ; mm3=OutLO=( 1 3 5 7)
    psrlw       mm6, 2                  ; mm6=OutHO=( 9 11 13 15)

    ; odd results go to the high byte of each word, even ones stay low
    psllw       mm3, BYTE_BIT
    psllw       mm6, BYTE_BIT
    por         mm2, mm3                ; mm2=OutL=( 0 1 2 3 4 5 6 7)
    por         mm5, mm6                ; mm5=OutH=( 8 9 10 11 12 13 14 15)

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm5

    add         esi, byte 1*SIZEOF_MMWORD   ; advance input sample pointer
    add         edi, byte 2*SIZEOF_MMWORD   ; advance output sample pointer
    sub         eax, byte SIZEOF_MMWORD
    cmp         eax, byte SIZEOF_MMWORD
    ja          near .next_block        ; more than one block left
    test        eax, eax
    jnz         near .last_block        ; exactly one block left

    pop         esi
    pop         edi
    pop         eax

    add         edi, byte SIZEOF_JSAMPROW   ; next output_data entry
    add         esi, byte SIZEOF_JSAMPROW   ; next input_data entry
    dec         ecx                         ; one row finished
    jg          near .row_loop

    emms                                ; empty MMX state

.done:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    poppic      ebx
    pop         ebp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
|
||||
; Again a triangle filter; see comments for h2v1 case, above.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor,
|
||||
; JDIMENSION downsampled_width,
|
||||
; JSAMPARRAY input_data,
|
||||
; JSAMPARRAY *output_data_ptr);
|
||||
;
|
||||
|
||||
%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
|
||||
%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
|
||||
%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
|
||||
%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
|
||||
|
||||
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
|
||||
%define WK_NUM 4
|
||||
%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx)
|
||||
|
||||
EXTN(jsimd_h2v2_fancy_upsample_mmx):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic eax ; make a room for GOT address
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
movpic POINTER [gotptr], ebx ; save GOT address
|
||||
|
||||
mov edx, eax ; edx = original ebp
|
||||
mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
|
||||
test eax, eax
|
||||
jz near .return
|
||||
|
||||
mov ecx, INT [max_v_samp(edx)] ; rowctr
|
||||
test ecx, ecx
|
||||
jz near .return
|
||||
|
||||
mov esi, JSAMPARRAY [input_data(edx)] ; input_data
|
||||
mov edi, POINTER [output_data_ptr(edx)]
|
||||
mov edi, JSAMPARRAY [edi] ; output_data
|
||||
alignx 16, 7
|
||||
.rowloop:
|
||||
push eax ; colctr
|
||||
push ecx
|
||||
push edi
|
||||
push esi
|
||||
|
||||
mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
|
||||
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
|
||||
mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
|
||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
|
||||
mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
|
||||
|
||||
test eax, SIZEOF_MMWORD-1
|
||||
jz short .skip
|
||||
push edx
|
||||
mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
|
||||
mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
|
||||
mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
|
||||
pop edx
|
||||
.skip:
|
||||
; -- process the first column block
|
||||
|
||||
movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0]
|
||||
movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0]
|
||||
movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0]
|
||||
|
||||
pushpic ebx
|
||||
movpic ebx, POINTER [gotptr] ; load GOT address
|
||||
|
||||
pxor mm3, mm3 ; mm3=(all 0's)
|
||||
movq mm4, mm0
|
||||
punpcklbw mm0, mm3 ; mm0=row[ 0][0]( 0 1 2 3)
|
||||
punpckhbw mm4, mm3 ; mm4=row[ 0][0]( 4 5 6 7)
|
||||
movq mm5, mm1
|
||||
punpcklbw mm1, mm3 ; mm1=row[-1][0]( 0 1 2 3)
|
||||
punpckhbw mm5, mm3 ; mm5=row[-1][0]( 4 5 6 7)
|
||||
movq mm6, mm2
|
||||
punpcklbw mm2, mm3 ; mm2=row[+1][0]( 0 1 2 3)
|
||||
punpckhbw mm6, mm3 ; mm6=row[+1][0]( 4 5 6 7)
|
||||
|
||||
pmullw mm0, [GOTOFF(ebx,PW_THREE)]
|
||||
pmullw mm4, [GOTOFF(ebx,PW_THREE)]
|
||||
|
||||
pcmpeqb mm7, mm7
|
||||
psrlq mm7, (SIZEOF_MMWORD-2)*BYTE_BIT
|
||||
|
||||
paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3)
|
||||
paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7)
|
||||
paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3)
|
||||
paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7)
|
||||
|
||||
movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save
|
||||
movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mm6
|
||||
|
||||
pand mm1, mm7 ; mm1=( 0 - - -)
|
||||
pand mm2, mm7 ; mm2=( 0 - - -)
|
||||
|
||||
movq MMWORD [wk(0)], mm1
|
||||
movq MMWORD [wk(1)], mm2
|
||||
|
||||
poppic ebx
|
||||
|
||||
add eax, byte SIZEOF_MMWORD-1
|
||||
and eax, byte -SIZEOF_MMWORD
|
||||
cmp eax, byte SIZEOF_MMWORD
|
||||
ja short .columnloop
|
||||
alignx 16, 7
|
||||
|
||||
.columnloop_last:
|
||||
; -- process the last column block
|
||||
|
||||
pushpic ebx
|
||||
movpic ebx, POINTER [gotptr] ; load GOT address
|
||||
|
||||
pcmpeqb mm1, mm1
|
||||
psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT
|
||||
movq mm2, mm1
|
||||
|
||||
pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7)
|
||||
pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7)
|
||||
|
||||
movq MMWORD [wk(2)], mm1
|
||||
movq MMWORD [wk(3)], mm2
|
||||
|
||||
jmp short .upsample
|
||||
alignx 16, 7
|
||||
|
||||
.columnloop:
|
||||
; -- process the next column block
|
||||
|
||||
movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1]
|
||||
movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1]
|
||||
movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1]
|
||||
|
||||
pushpic ebx
|
||||
movpic ebx, POINTER [gotptr] ; load GOT address
|
||||
|
||||
pxor mm3, mm3 ; mm3=(all 0's)
|
||||
movq mm4, mm0
|
||||
punpcklbw mm0, mm3 ; mm0=row[ 0][1]( 0 1 2 3)
|
||||
punpckhbw mm4, mm3 ; mm4=row[ 0][1]( 4 5 6 7)
|
||||
movq mm5, mm1
|
||||
punpcklbw mm1, mm3 ; mm1=row[-1][1]( 0 1 2 3)
|
||||
punpckhbw mm5, mm3 ; mm5=row[-1][1]( 4 5 6 7)
|
||||
movq mm6, mm2
|
||||
punpcklbw mm2, mm3 ; mm2=row[+1][1]( 0 1 2 3)
|
||||
punpckhbw mm6, mm3 ; mm6=row[+1][1]( 4 5 6 7)
|
||||
|
||||
pmullw mm0, [GOTOFF(ebx,PW_THREE)]
|
||||
pmullw mm4, [GOTOFF(ebx,PW_THREE)]
|
||||
|
||||
paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3)
|
||||
paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7)
|
||||
paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3)
|
||||
paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7)
|
||||
|
||||
movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save
|
||||
movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data
|
||||
movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
|
||||
movq MMWORD [edi+3*SIZEOF_MMWORD], mm6
|
||||
|
||||
psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0)
|
||||
psllq mm2, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0)
|
||||
|
||||
movq MMWORD [wk(2)], mm1
|
||||
movq MMWORD [wk(3)], mm2
|
||||
|
||||
.upsample:
|
||||
; -- process the upper row
|
||||
|
||||
movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3)
|
||||
movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7)
|
||||
|
||||
movq mm0, mm7
|
||||
movq mm4, mm3
|
||||
psrlq mm0, 2*BYTE_BIT ; mm0=( 1 2 3 -)
|
||||
psllq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4)
|
||||
movq mm5, mm7
|
||||
movq mm6, mm3
|
||||
psrlq mm5, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -)
|
||||
psllq mm6, 2*BYTE_BIT ; mm6=( - 4 5 6)
|
||||
|
||||
por mm0, mm4 ; mm0=( 1 2 3 4)
|
||||
por mm5, mm6 ; mm5=( 3 4 5 6)
|
||||
|
||||
movq mm1, mm7
|
||||
movq mm2, mm3
|
||||
psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2)
|
||||
psrlq mm2, 2*BYTE_BIT ; mm2=( 5 6 7 -)
|
||||
movq mm4, mm3
|
||||
psrlq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -)
|
||||
|
||||
por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2)
|
||||
por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8)
|
||||
|
||||
movq MMWORD [wk(0)], mm4
|
||||
|
||||
pmullw mm7, [GOTOFF(ebx,PW_THREE)]
|
||||
pmullw mm3, [GOTOFF(ebx,PW_THREE)]
|
||||
paddw mm1, [GOTOFF(ebx,PW_EIGHT)]
|
||||
paddw mm5, [GOTOFF(ebx,PW_EIGHT)]
|
||||
paddw mm0, [GOTOFF(ebx,PW_SEVEN)]
|
||||
paddw mm2, [GOTOFF(ebx,PW_SEVEN)]
|
||||
|
||||
paddw mm1, mm7
|
||||
paddw mm5, mm3
|
||||
psrlw mm1, 4 ; mm1=Out0LE=( 0 2 4 6)
|
||||
psrlw mm5, 4 ; mm5=Out0HE=( 8 10 12 14)
|
||||
paddw mm0, mm7
|
||||
paddw mm2, mm3
|
||||
psrlw mm0, 4 ; mm0=Out0LO=( 1 3 5 7)
|
||||
psrlw mm2, 4 ; mm2=Out0HO=( 9 11 13 15)
|
||||
|
||||
psllw mm0, BYTE_BIT
|
||||
psllw mm2, BYTE_BIT
|
||||
por mm1, mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7)
|
||||
por mm5, mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15)
|
||||
|
||||
movq MMWORD [edx+0*SIZEOF_MMWORD], mm1
|
||||
movq MMWORD [edx+1*SIZEOF_MMWORD], mm5
|
||||
|
||||
; -- process the lower row
|
||||
|
||||
movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3)
|
||||
movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7)
|
||||
|
||||
movq mm7, mm6
|
||||
movq mm3, mm4
|
||||
psrlq mm7, 2*BYTE_BIT ; mm7=( 1 2 3 -)
|
||||
psllq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4)
|
||||
movq mm0, mm6
|
||||
movq mm2, mm4
|
||||
psrlq mm0, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -)
|
||||
psllq mm2, 2*BYTE_BIT ; mm2=( - 4 5 6)
|
||||
|
||||
por mm7, mm3 ; mm7=( 1 2 3 4)
|
||||
por mm0, mm2 ; mm0=( 3 4 5 6)
|
||||
|
||||
movq mm1, mm6
|
||||
movq mm5, mm4
|
||||
psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2)
|
||||
psrlq mm5, 2*BYTE_BIT ; mm5=( 5 6 7 -)
|
||||
movq mm3, mm4
|
||||
psrlq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -)
|
||||
|
||||
por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2)
|
||||
por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8)
|
||||
|
||||
movq MMWORD [wk(1)], mm3
|
||||
|
||||
pmullw mm6, [GOTOFF(ebx,PW_THREE)]
|
||||
pmullw mm4, [GOTOFF(ebx,PW_THREE)]
|
||||
paddw mm1, [GOTOFF(ebx,PW_EIGHT)]
|
||||
paddw mm0, [GOTOFF(ebx,PW_EIGHT)]
|
||||
paddw mm7, [GOTOFF(ebx,PW_SEVEN)]
|
||||
paddw mm5, [GOTOFF(ebx,PW_SEVEN)]
|
||||
|
||||
paddw mm1, mm6
|
||||
paddw mm0, mm4
|
||||
psrlw mm1, 4 ; mm1=Out1LE=( 0 2 4 6)
|
||||
psrlw mm0, 4 ; mm0=Out1HE=( 8 10 12 14)
|
||||
paddw mm7, mm6
|
||||
paddw mm5, mm4
|
||||
psrlw mm7, 4 ; mm7=Out1LO=( 1 3 5 7)
|
||||
psrlw mm5, 4 ; mm5=Out1HO=( 9 11 13 15)
|
||||
|
||||
psllw mm7, BYTE_BIT
|
||||
psllw mm5, BYTE_BIT
|
||||
por mm1, mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7)
|
||||
por mm0, mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15)
|
||||
|
||||
movq MMWORD [edi+0*SIZEOF_MMWORD], mm1
|
||||
movq MMWORD [edi+1*SIZEOF_MMWORD], mm0
|
||||
|
||||
poppic ebx
|
||||
|
||||
sub eax, byte SIZEOF_MMWORD
|
||||
add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above)
|
||||
add ebx, byte 1*SIZEOF_MMWORD ; inptr0
|
||||
add esi, byte 1*SIZEOF_MMWORD ; inptr1(below)
|
||||
add edx, byte 2*SIZEOF_MMWORD ; outptr0
|
||||
add edi, byte 2*SIZEOF_MMWORD ; outptr1
|
||||
cmp eax, byte SIZEOF_MMWORD
|
||||
ja near .columnloop
|
||||
test eax, eax
|
||||
jnz near .columnloop_last
|
||||
|
||||
pop esi
|
||||
pop edi
|
||||
pop ecx
|
||||
pop eax
|
||||
|
||||
add esi, byte 1*SIZEOF_JSAMPROW ; input_data
|
||||
add edi, byte 2*SIZEOF_JSAMPROW ; output_data
|
||||
sub ecx, byte 2 ; rowctr
|
||||
jg near .rowloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
.return:
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Plain 2:1 horizontal / 1:1 vertical upsampling (MMX).
;
; This is a box filter: every input sample is simply emitted twice in the
; output row.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Register roles:
;   edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
;   ecx = rows left (starts at max_v_samp_factor)
;   eax = output bytes left in the current row
;   esi = input_data cursor; inside a row, the input sample pointer
;   edi = output_data cursor; inside a row, the output sample pointer
;
; Clobbers eax, ecx, edx and mm0-mm3.  esi, edi and ebp are saved and
; restored; ebx is not used.  Returns at once when the rounded width or the
; row count is zero.
;
; NOTE(review): because the width is rounded up, whole blocks are always
; stored, possibly past output_width.  The row buffers are presumably padded
; for this -- confirm with the caller.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx)

EXTN(jsimd_h2v1_upsample_mmx):
    push        ebp
    mov         ebp, esp
    push        esi                     ; ecx/edx need not be preserved
    push        edi

    ; edx = round_up(output_width, 2*SIZEOF_MMWORD); the AND sets ZF.
    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_MMWORD)-1
    and         edx, byte -(2*SIZEOF_MMWORD)
    jz          short .exit

    mov         ecx, INT [max_v_samp(ebp)]              ; ecx = row counter
    test        ecx, ecx
    jz          short .exit

    mov         esi, JSAMPARRAY [input_data(ebp)]       ; esi = input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                   ; edi = *output_data_ptr
    alignx      16, 7
.each_row:
    push        edi                     ; keep the row-array cursors
    push        esi

    mov         eax, edx                ; eax = output bytes to write this row
    mov         esi, JSAMPROW [esi]     ; esi = current input row
    mov         edi, JSAMPROW [edi]     ; edi = current output row
    alignx      16, 7
.each_block:
    ; First half of the unrolled body: one MMWORD in, two MMWORDs out.
    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm1, mm0
    punpcklbw   mm0, mm0                ; low-half bytes, each doubled
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
    punpckhbw   mm1, mm1                ; high-half bytes, each doubled
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .row_done

    ; Second half: same operation on the following input MMWORD.
    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]
    movq        mm3, mm2
    punpcklbw   mm2, mm2
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
    punpckhbw   mm3, mm3
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .row_done

    add         esi, byte 2*SIZEOF_MMWORD       ; advance input pointer
    add         edi, byte 4*SIZEOF_MMWORD       ; advance output pointer
    jmp         short .each_block
    alignx      16, 7

.row_done:
    pop         esi                     ; back to the row-array cursors
    pop         edi

    add         esi, byte SIZEOF_JSAMPROW       ; next input row pointer
    add         edi, byte SIZEOF_JSAMPROW       ; next output row pointer
    dec         ecx
    jg          short .each_row         ; loop while rows remain (signed > 0)

    emms                                ; leave MMX state before returning

.exit:
    pop         edi
    pop         esi
    pop         ebp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Plain 2:1 horizontal / 2:1 vertical upsampling (MMX).
;
; This is a box filter: every input sample is emitted twice horizontally, and
; the resulting row is stored into two consecutive output rows.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Register roles:
;   edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
;   ecx = output rows left (starts at max_v_samp_factor, drops by 2 per pass)
;   eax = output bytes left in the current row pair
;   esi = input_data cursor; inside a row, the input sample pointer
;   ebx = pointer into the first output row of the pair
;   edi = output_data cursor; inside a row, pointer into the second output row
;
; Clobbers eax, ecx, edx and mm0-mm3.  ebx, esi, edi and ebp are saved and
; restored.  Returns at once when the rounded width or the row count is zero.
;
; NOTE(review): because the width is rounded up, whole blocks are always
; stored, possibly past output_width.  The row buffers are presumably padded
; for this -- confirm with the caller.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx)

EXTN(jsimd_h2v2_upsample_mmx):
    push        ebp
    mov         ebp, esp
    push        ebx                     ; ecx/edx need not be preserved
    push        esi
    push        edi

    ; edx = round_up(output_width, 2*SIZEOF_MMWORD); the AND sets ZF.
    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_MMWORD)-1
    and         edx, byte -(2*SIZEOF_MMWORD)
    jz          near .exit

    mov         ecx, INT [max_v_samp(ebp)]              ; ecx = row counter
    test        ecx, ecx
    jz          short .exit

    mov         esi, JSAMPARRAY [input_data(ebp)]       ; esi = input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                   ; edi = *output_data_ptr
    alignx      16, 7
.each_row:
    push        edi                     ; keep the row-array cursors
    push        esi

    mov         eax, edx                ; eax = output bytes to write per row
    mov         esi, JSAMPROW [esi]                     ; input row
    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; first output row
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; second output row
    alignx      16, 7
.each_block:
    ; First half of the unrolled body: one MMWORD in, two MMWORDs out per row.
    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm1, mm0
    punpcklbw   mm0, mm0                ; low-half bytes, each doubled
    punpckhbw   mm1, mm1                ; high-half bytes, each doubled

    movq        MMWORD [ebx+0*SIZEOF_MMWORD], mm0
    movq        MMWORD [ebx+1*SIZEOF_MMWORD], mm1
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0       ; same data, row below
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .row_done

    ; Second half: same operation on the following input MMWORD.
    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]
    movq        mm3, mm2
    punpcklbw   mm2, mm2
    punpckhbw   mm3, mm3

    movq        MMWORD [ebx+2*SIZEOF_MMWORD], mm2
    movq        MMWORD [ebx+3*SIZEOF_MMWORD], mm3
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .row_done

    add         esi, byte 2*SIZEOF_MMWORD       ; advance input pointer
    add         ebx, byte 4*SIZEOF_MMWORD       ; advance first output row
    add         edi, byte 4*SIZEOF_MMWORD       ; advance second output row
    jmp         short .each_block
    alignx      16, 7

.row_done:
    pop         esi                     ; back to the row-array cursors
    pop         edi

    add         esi, byte 1*SIZEOF_JSAMPROW     ; one input row consumed
    add         edi, byte 2*SIZEOF_JSAMPROW     ; two output rows produced
    sub         ecx, byte 2
    jg          short .each_row         ; loop while rows remain (signed > 0)

    emms                                ; leave MMX state before returning

.exit:
    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
724
TMessagesProj/jni/mozjpeg/simd/i386/jdsample-sse2.asm
Normal file
724
TMessagesProj/jni/mozjpeg/simd/i386/jdsample-sse2.asm
Normal file
|
|
@ -0,0 +1,724 @@
|
|||
;
|
||||
; jdsample.asm - upsampling (SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
; Constant table for the SSE2 fancy (triangle-filter) upsamplers in this file.
; Each entry is eight identical 16-bit words.
;
;   PW_THREE        weight of the nearer sample (pmullw)
;   PW_ONE/PW_TWO   rounding terms added before the ">> 2" in the h2v1 filter
;   PW_SEVEN/EIGHT  rounding terms added before the ">> 4" in the h2v2 filter
;
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_sse2)

EXTN(jconst_fancy_upsample_sse2):

PW_ONE      times 8 dw  1
PW_TWO      times 8 dw  2
PW_THREE    times 8 dw  3
PW_SEVEN    times 8 dw  7
PW_EIGHT    times 8 dw  8

    alignz      32

; --------------------------------------------------------------------------
; Everything below is 32-bit code.
    SECTION     SEG_TEXT
    BITS        32
|
||||
;
; "Fancy" upsampling for 2:1 horizontal / 1:1 vertical (SSE2).
;
; A triangle filter: each output sample is a linear interpolation between two
; neighbouring input samples, weighted 3:1 toward the nearer one.
;
;     out[2*i]   = (3 * in[i] + in[i-1] + 1) >> 2
;     out[2*i+1] = (3 * in[i] + in[i+1] + 2) >> 2
;
; For the first sample of a row in[-1] is taken to be in[0]; at the right end
; the last sample is replicated in the same way.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Register roles:
;   eax  = input samples left in the row (rounded up to whole XMMWORDs)
;   ecx  = rows left
;   ebx  = GOT address from get_GOT, consumed by GOTOFF()
;   esi  = input sample pointer          edi = output sample pointer
;   xmm0 = all zeroes (for byte -> word unpacking)
;   xmm7 = left neighbour of the block's first sample, held in byte 0
;   xmm6 = right neighbour of the block's last sample, held in the top byte
;
; Clobbers eax, ecx, edx and xmm0-xmm7.  Returns at once when the width or the
; row count is zero.
;
; NOTE(review): movdqa is used on both rows, so the row buffers are assumed to
; be XMMWORD-aligned.  One byte may be written just past the end of each input
; row.  Confirm both with the caller.  pushpic/poppic/get_GOT/GOTOFF come from
; jsimdext.inc and presumably only emit code in PIC builds.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
    push        ebp
    mov         ebp, esp
    pushpic     ebx                     ; ecx/edx need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; ebx = GOT address

    mov         eax, JDIMENSION [downsamp_width(ebp)]   ; eax = column counter
    test        eax, eax
    jz          near .exit

    mov         ecx, INT [max_v_samp(ebp)]              ; ecx = row counter
    test        ecx, ecx
    jz          near .exit

    mov         esi, JSAMPARRAY [input_data(ebp)]       ; esi = input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                   ; edi = *output_data_ptr
    alignx      16, 7
.row:
    push        eax                     ; width is reloaded for every row
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; esi = current input row
    mov         edi, JSAMPROW [edi]     ; edi = current output row

    ; If the width is not a whole number of XMMWORDs, duplicate the last real
    ; sample one position to the right so it can act as in[width].
    test        eax, SIZEOF_XMMWORD-1
    jz          short .padded
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl
.padded:
    pxor        xmm0, xmm0              ; xmm0 = 0
    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-1)        ; mask selecting byte 0 only
    pand        xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; xmm7 = in[0] as in[-1]

    ; Round the column counter up to whole XMMWORDs.
    add         eax, byte SIZEOF_XMMWORD-1
    and         eax, byte -SIZEOF_XMMWORD
    cmp         eax, byte SIZEOF_XMMWORD
    ja          short .next_block       ; more than one block in this row
    alignx      16, 7

.last_block:
    ; No block follows: the right neighbour is this block's own top byte.
    pcmpeqb     xmm6, xmm6
    pslldq      xmm6, (SIZEOF_XMMWORD-1)        ; mask selecting the top byte
    pand        xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
    jmp         short .filter
    alignx      16, 7

.next_block:
    ; Right neighbour = first byte of the following block, moved to the top.
    movdqa      xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
    pslldq      xmm6, (SIZEOF_XMMWORD-1)

.filter:
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm1              ; xmm1=( 0  1  2 ... 13 14 15)
    pslldq      xmm2, 1                 ; xmm2=(--  0  1 ... 12 13 14)
    psrldq      xmm3, 1                 ; xmm3=( 1  2  3 ... 14 15 --)

    por         xmm2, xmm7              ; xmm2=(-1  0  1 ... 12 13 14)
    por         xmm3, xmm6              ; xmm3=( 1  2  3 ... 14 15 16)

    ; Carry this block's last sample over as the next block's left neighbour.
    movdqa      xmm7, xmm1
    psrldq      xmm7, (SIZEOF_XMMWORD-1)        ; xmm7=(15 -- -- ... -- -- --)

    ; Widen centre, left and right samples to 16-bit words (low/high halves).
    movdqa      xmm4, xmm1
    punpcklbw   xmm1, xmm0              ; xmm1=( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm0              ; xmm4=( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm2
    punpcklbw   xmm2, xmm0              ; xmm2=(-1  0  1  2  3  4  5  6)
    punpckhbw   xmm5, xmm0              ; xmm5=( 7  8  9 10 11 12 13 14)
    movdqa      xmm6, xmm3
    punpcklbw   xmm3, xmm0              ; xmm3=( 1  2  3  4  5  6  7  8)
    punpckhbw   xmm6, xmm0              ; xmm6=( 9 10 11 12 13 14 15 16)

    pmullw      xmm1, [GOTOFF(ebx,PW_THREE)]    ; 3 * centre
    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
    paddw       xmm2, [GOTOFF(ebx,PW_ONE)]      ; left  + 1
    paddw       xmm5, [GOTOFF(ebx,PW_ONE)]
    paddw       xmm3, [GOTOFF(ebx,PW_TWO)]      ; right + 2
    paddw       xmm6, [GOTOFF(ebx,PW_TWO)]

    paddw       xmm2, xmm1
    paddw       xmm5, xmm4
    psrlw       xmm2, 2                 ; even outputs  0  2 ... 14
    psrlw       xmm5, 2                 ; even outputs 16 18 ... 30
    paddw       xmm3, xmm1
    paddw       xmm6, xmm4
    psrlw       xmm3, 2                 ; odd outputs   1  3 ... 15
    psrlw       xmm6, 2                 ; odd outputs  17 19 ... 31

    ; Interleave: odd results go to the high byte of each word.
    psllw       xmm3, BYTE_BIT
    psllw       xmm6, BYTE_BIT
    por         xmm2, xmm3              ; xmm2 = outputs  0 .. 15
    por         xmm5, xmm6              ; xmm5 = outputs 16 .. 31

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5

    sub         eax, byte SIZEOF_XMMWORD
    add         esi, byte 1*SIZEOF_XMMWORD      ; one block of input consumed
    add         edi, byte 2*SIZEOF_XMMWORD      ; two blocks of output written
    cmp         eax, byte SIZEOF_XMMWORD
    ja          near .next_block        ; at least two blocks remain
    test        eax, eax
    jnz         near .last_block        ; exactly one block remains

    pop         esi                     ; back to the row-array cursors
    pop         edi
    pop         eax                     ; restore the original width

    add         esi, byte SIZEOF_JSAMPROW       ; next input row pointer
    add         edi, byte SIZEOF_JSAMPROW       ; next output row pointer
    dec         ecx
    jg          near .row               ; loop while rows remain (signed > 0)

.exit:
    pop         edi
    pop         esi
    poppic      ebx
    pop         ebp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; "Fancy" upsampling for 2:1 horizontal / 2:1 vertical (SSE2).
;
; A triangle filter applied in two passes per column block:
;
;   vertical:    Int0[i] = 3 * row[0][i] + row[-1][i]      (upper output row)
;                Int1[i] = 3 * row[0][i] + row[+1][i]      (lower output row)
;   horizontal:  out[2*i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;                out[2*i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
;
; The 16-bit intermediates are parked in the output rows themselves, since 16
; samples widened to words occupy exactly the 32 bytes they will produce. They
; are then overwritten with the final bytes.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Register roles inside a row:
;   eax = input samples left (rounded up to whole XMMWORDs)
;   ecx = row above       ebx = current row       esi = row below
;   edx = upper output row                        edi = lower output row
;   (ebx is temporarily swapped for the GOT address around GOTOFF() uses)
;
; Scratch slots on the aligned frame:
;   wk(0)/wk(1) = Int0/Int1 word to the left of the current block (word 0)
;   wk(2)/wk(3) = Int0/Int1 word to the right of the current block (top word)
;
; Clobbers eax, ecx, edx and xmm0-xmm7.  Returns at once when the width or the
; row count is zero.
;
; NOTE(review): input_data[-1] and input_data[+1] are dereferenced, so the
; caller must supply valid context rows.  movdqa requires XMMWORD-aligned
; rows.  One byte may be written just past the end of each input row.
; pushpic/poppic/movpic/get_GOT/GOTOFF come from jsimdext.inc and presumably
; only emit code in PIC builds.  Confirm all of these.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        4
%define gotptr        wk(0) - SIZEOF_POINTER    ; void *gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
    ; Build an XMMWORD-aligned frame.  [ebp] holds the unaligned frame address
    ; so the epilogue can get back to it.
    push        ebp
    mov         eax, esp                ; eax = unaligned frame ("original ebp")
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
    mov         [esp], eax
    mov         ebp, esp                ; ebp = aligned frame
    lea         esp, [wk(0)]            ; reserve wk[0..WK_NUM-1]
    pushpic     eax                     ; reserve the gotptr slot
    push        ebx                     ; ecx/edx need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; ebx = GOT address
    movpic      POINTER [gotptr], ebx   ; stash it; ebx is reused below

    mov         edx, eax                ; edx = unaligned frame, for arguments
    mov         eax, JDIMENSION [downsamp_width(edx)]   ; eax = column counter
    test        eax, eax
    jz          near .exit

    mov         ecx, INT [max_v_samp(edx)]              ; ecx = row counter
    test        ecx, ecx
    jz          near .exit

    mov         esi, JSAMPARRAY [input_data(edx)]       ; esi = input_data
    mov         edi, POINTER [output_data_ptr(edx)]
    mov         edi, JSAMPARRAY [edi]                   ; edi = *output_data_ptr
    alignx      16, 7
.row:
    push        eax                     ; width is reloaded for every row
    push        ecx                     ; ecx is reused as a row pointer
    push        edi
    push        esi

    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; row above
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; current row
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; row below
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; upper output row
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; lower output row

    ; If the width is not a whole number of XMMWORDs, duplicate the last real
    ; sample of each input row one position to the right.
    test        eax, SIZEOF_XMMWORD-1
    jz          short .padded
    push        edx                     ; dl is needed as a byte temporary
    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl
    pop         edx
.padded:
    ; ---- vertical pass, first column block -------------------------------

    movdqa      xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
    movdqa      xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
    movdqa      xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; ebx = GOT address

    pxor        xmm3, xmm3              ; xmm3 = 0, for byte -> word unpacking
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

    pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]    ; 3 * current row
    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]

    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-2)        ; mask selecting word 0 only

    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

    ; Park the intermediates in the output rows.
    movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6

    ; Left neighbour of the very first column is the first column itself.
    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)

    movdqa      XMMWORD [wk(0)], xmm1
    movdqa      XMMWORD [wk(1)], xmm2

    poppic      ebx

    ; Round the column counter up to whole XMMWORDs.
    add         eax, byte SIZEOF_XMMWORD-1
    and         eax, byte -SIZEOF_XMMWORD
    cmp         eax, byte SIZEOF_XMMWORD
    ja          short .next_block       ; more than one block in this row
    alignx      16, 7

.last_block:
    ; ---- last column block: right neighbour is the block's own top word --

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; ebx = GOT address

    pcmpeqb     xmm1, xmm1
    pslldq      xmm1, (SIZEOF_XMMWORD-2)        ; mask selecting the top word
    movdqa      xmm2, xmm1

    pand        xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
    pand        xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]

    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

    jmp         near .horizontal
    alignx      16, 7

.next_block:
    ; ---- vertical pass for the FOLLOWING column block ---------------------
    ; Its first word is needed as the right neighbour of the current block.

    movdqa      xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
    movdqa      xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; ebx = GOT address

    pxor        xmm3, xmm3              ; xmm3 = 0
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

    pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]    ; 3 * current row
    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]

    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

    ; Park the following block's intermediates in the output rows.
    movdqa      XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5
    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6

    pslldq      xmm1, (SIZEOF_XMMWORD-2)        ; xmm1=(-- -- -- -- -- -- --  0)
    pslldq      xmm2, (SIZEOF_XMMWORD-2)        ; xmm2=(-- -- -- -- -- -- --  0)

    movdqa      XMMWORD [wk(2)], xmm1
    movdqa      XMMWORD [wk(3)], xmm2

.horizontal:
    ; ---- horizontal pass, upper output row -------------------------------
    ; ebx holds the GOT address here; the row pointer is on the stack.

    movdqa      xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]

    movdqa      xmm0, xmm7              ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
    movdqa      xmm4, xmm3              ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
    psrldq      xmm0, 2                 ; xmm0=( 1  2  3  4  5  6  7 --)
    pslldq      xmm4, (SIZEOF_XMMWORD-2)        ; xmm4=(-- -- -- -- -- -- --  8)
    movdqa      xmm5, xmm7
    movdqa      xmm6, xmm3
    psrldq      xmm5, (SIZEOF_XMMWORD-2)        ; xmm5=( 7 -- -- -- -- -- -- --)
    pslldq      xmm6, 2                 ; xmm6=(--  8  9 10 11 12 13 14)

    por         xmm0, xmm4              ; xmm0=( 1  2  3  4  5  6  7  8)
    por         xmm5, xmm6              ; xmm5=( 7  8  9 10 11 12 13 14)

    movdqa      xmm1, xmm7
    movdqa      xmm2, xmm3
    pslldq      xmm1, 2                 ; xmm1=(--  0  1  2  3  4  5  6)
    psrldq      xmm2, 2                 ; xmm2=( 9 10 11 12 13 14 15 --)
    movdqa      xmm4, xmm3
    psrldq      xmm4, (SIZEOF_XMMWORD-2)        ; xmm4=(15 -- -- -- -- -- -- --)

    por         xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
    por         xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)

    movdqa      XMMWORD [wk(0)], xmm4   ; left neighbour for the next block

    pmullw      xmm7, [GOTOFF(ebx,PW_THREE)]    ; 3 * centre
    pmullw      xmm3, [GOTOFF(ebx,PW_THREE)]
    paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]    ; left  + 8
    paddw       xmm5, [GOTOFF(ebx,PW_EIGHT)]
    paddw       xmm0, [GOTOFF(ebx,PW_SEVEN)]    ; right + 7
    paddw       xmm2, [GOTOFF(ebx,PW_SEVEN)]

    paddw       xmm1, xmm7
    paddw       xmm5, xmm3
    psrlw       xmm1, 4                 ; even outputs  0  2 ... 14
    psrlw       xmm5, 4                 ; even outputs 16 18 ... 30
    paddw       xmm0, xmm7
    paddw       xmm2, xmm3
    psrlw       xmm0, 4                 ; odd outputs   1  3 ... 15
    psrlw       xmm2, 4                 ; odd outputs  17 19 ... 31

    ; Interleave: odd results go to the high byte of each word.
    psllw       xmm0, BYTE_BIT
    psllw       xmm2, BYTE_BIT
    por         xmm1, xmm0              ; xmm1 = upper-row outputs  0 .. 15
    por         xmm5, xmm2              ; xmm5 = upper-row outputs 16 .. 31

    movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5

    ; ---- horizontal pass, lower output row -------------------------------

    movdqa      xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
    movdqa      xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]

    movdqa      xmm7, xmm6              ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
    movdqa      xmm3, xmm4              ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
    psrldq      xmm7, 2                 ; xmm7=( 1  2  3  4  5  6  7 --)
    pslldq      xmm3, (SIZEOF_XMMWORD-2)        ; xmm3=(-- -- -- -- -- -- --  8)
    movdqa      xmm0, xmm6
    movdqa      xmm2, xmm4
    psrldq      xmm0, (SIZEOF_XMMWORD-2)        ; xmm0=( 7 -- -- -- -- -- -- --)
    pslldq      xmm2, 2                 ; xmm2=(--  8  9 10 11 12 13 14)

    por         xmm7, xmm3              ; xmm7=( 1  2  3  4  5  6  7  8)
    por         xmm0, xmm2              ; xmm0=( 7  8  9 10 11 12 13 14)

    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm4
    pslldq      xmm1, 2                 ; xmm1=(--  0  1  2  3  4  5  6)
    psrldq      xmm5, 2                 ; xmm5=( 9 10 11 12 13 14 15 --)
    movdqa      xmm3, xmm4
    psrldq      xmm3, (SIZEOF_XMMWORD-2)        ; xmm3=(15 -- -- -- -- -- -- --)

    por         xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
    por         xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)

    movdqa      XMMWORD [wk(1)], xmm3   ; left neighbour for the next block

    pmullw      xmm6, [GOTOFF(ebx,PW_THREE)]    ; 3 * centre
    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
    paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]    ; left  + 8
    paddw       xmm0, [GOTOFF(ebx,PW_EIGHT)]
    paddw       xmm7, [GOTOFF(ebx,PW_SEVEN)]    ; right + 7
    paddw       xmm5, [GOTOFF(ebx,PW_SEVEN)]

    paddw       xmm1, xmm6
    paddw       xmm0, xmm4
    psrlw       xmm1, 4                 ; even outputs  0  2 ... 14
    psrlw       xmm0, 4                 ; even outputs 16 18 ... 30
    paddw       xmm7, xmm6
    paddw       xmm5, xmm4
    psrlw       xmm7, 4                 ; odd outputs   1  3 ... 15
    psrlw       xmm5, 4                 ; odd outputs  17 19 ... 31

    psllw       xmm7, BYTE_BIT
    psllw       xmm5, BYTE_BIT
    por         xmm1, xmm7              ; xmm1 = lower-row outputs  0 .. 15
    por         xmm0, xmm5              ; xmm0 = lower-row outputs 16 .. 31

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0

    poppic      ebx                     ; ebx = current-row pointer again

    sub         eax, byte SIZEOF_XMMWORD
    add         ecx, byte 1*SIZEOF_XMMWORD      ; row above
    add         ebx, byte 1*SIZEOF_XMMWORD      ; current row
    add         esi, byte 1*SIZEOF_XMMWORD      ; row below
    add         edx, byte 2*SIZEOF_XMMWORD      ; upper output row
    add         edi, byte 2*SIZEOF_XMMWORD      ; lower output row
    cmp         eax, byte SIZEOF_XMMWORD
    ja          near .next_block        ; at least two blocks remain
    test        eax, eax
    jnz         near .last_block        ; exactly one block remains

    pop         esi                     ; back to the row-array cursors
    pop         edi
    pop         ecx                     ; restore the row counter
    pop         eax                     ; restore the original width

    add         esi, byte 1*SIZEOF_JSAMPROW     ; one input row consumed
    add         edi, byte 2*SIZEOF_JSAMPROW     ; two output rows produced
    sub         ecx, byte 2
    jg          near .row               ; loop while rows remain (signed > 0)

.exit:
    pop         edi
    pop         esi
    pop         ebx
    mov         esp, ebp                ; esp = aligned frame
    pop         esp                     ; esp = unaligned frame saved at [ebp]
    pop         ebp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Plain 2:1 horizontal / 1:1 vertical upsampling (SSE2).
;
; This is a box filter: every input sample is simply emitted twice in the
; output row.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Register roles:
;   edx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD
;   ecx = rows left (starts at max_v_samp_factor)
;   eax = output bytes left in the current row
;   esi = input_data cursor; inside a row, the input sample pointer
;   edi = output_data cursor; inside a row, the output sample pointer
;
; Clobbers eax, ecx, edx and xmm0-xmm3.  esi, edi and ebp are saved and
; restored; ebx is not used.  Returns at once when the rounded width or the
; row count is zero.
;
; NOTE(review): movdqa is used for every load and store, so both rows are
; assumed to be XMMWORD-aligned.  Because the width is rounded up, whole
; blocks are always stored, possibly past output_width.  Confirm both with
; the caller.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
    push        ebp
    mov         ebp, esp
    push        esi                     ; ecx/edx need not be preserved
    push        edi

    ; edx = round_up(output_width, 2*SIZEOF_XMMWORD); the AND sets ZF.
    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_XMMWORD)-1
    and         edx, byte -(2*SIZEOF_XMMWORD)
    jz          short .exit

    mov         ecx, INT [max_v_samp(ebp)]              ; ecx = row counter
    test        ecx, ecx
    jz          short .exit

    mov         esi, JSAMPARRAY [input_data(ebp)]       ; esi = input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                   ; edi = *output_data_ptr
    alignx      16, 7
.each_row:
    push        edi                     ; keep the row-array cursors
    push        esi

    mov         eax, edx                ; eax = output bytes to write this row
    mov         esi, JSAMPROW [esi]     ; esi = current input row
    mov         edi, JSAMPROW [edi]     ; edi = current output row
    alignx      16, 7
.each_block:
    ; First half of the unrolled body: one XMMWORD in, two XMMWORDs out.
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0              ; low-half bytes, each doubled
    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    punpckhbw   xmm1, xmm1              ; high-half bytes, each doubled
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

    sub         eax, byte 2*SIZEOF_XMMWORD
    jz          short .row_done

    ; Second half: same operation on the following input XMMWORD.
    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2
    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    punpckhbw   xmm3, xmm3
    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3

    sub         eax, byte 2*SIZEOF_XMMWORD
    jz          short .row_done

    add         esi, byte 2*SIZEOF_XMMWORD      ; advance input pointer
    add         edi, byte 4*SIZEOF_XMMWORD      ; advance output pointer
    jmp         short .each_block
    alignx      16, 7

.row_done:
    pop         esi                     ; back to the row-array cursors
    pop         edi

    add         esi, byte SIZEOF_JSAMPROW       ; next input row pointer
    add         edi, byte SIZEOF_JSAMPROW       ; next output row pointer
    dec         ecx
    jg          short .each_row         ; loop while rows remain (signed > 0)

.exit:
    pop         edi
    pop         esi
    pop         ebp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Plain 2:1 horizontal / 2:1 vertical upsampling (SSE2).
;
; This is a box filter: every input sample is emitted twice horizontally, and
; the resulting row is stored into two consecutive output rows.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Register roles:
;   edx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD
;   ecx = output rows left (starts at max_v_samp_factor, drops by 2 per pass)
;   eax = output bytes left in the current row pair
;   esi = input_data cursor; inside a row, the input sample pointer
;   ebx = pointer into the first output row of the pair
;   edi = output_data cursor; inside a row, pointer into the second output row
;
; Clobbers eax, ecx, edx and xmm0-xmm3.  ebx, esi, edi and ebp are saved and
; restored.  Returns at once when the rounded width or the row count is zero.
;
; NOTE(review): movdqa is used for every load and store, so all rows are
; assumed to be XMMWORD-aligned.  Because the width is rounded up, whole
; blocks are always stored, possibly past output_width.  Confirm both with
; the caller.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx                     ; ecx/edx need not be preserved
    push        esi
    push        edi

    ; edx = round_up(output_width, 2*SIZEOF_XMMWORD); the AND sets ZF.
    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_XMMWORD)-1
    and         edx, byte -(2*SIZEOF_XMMWORD)
    jz          near .exit

    mov         ecx, INT [max_v_samp(ebp)]              ; ecx = row counter
    test        ecx, ecx
    jz          near .exit

    mov         esi, JSAMPARRAY [input_data(ebp)]       ; esi = input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                   ; edi = *output_data_ptr
    alignx      16, 7
.each_row:
    push        edi                     ; keep the row-array cursors
    push        esi

    mov         eax, edx                ; eax = output bytes to write per row
    mov         esi, JSAMPROW [esi]                     ; input row
    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; first output row
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; second output row
    alignx      16, 7
.each_block:
    ; First half of the unrolled body: one XMMWORD in, two out per row.
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0              ; low-half bytes, each doubled
    punpckhbw   xmm1, xmm1              ; high-half bytes, each doubled

    movdqa      XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0    ; same data, row below
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

    sub         eax, byte 2*SIZEOF_XMMWORD
    jz          short .row_done

    ; Second half: same operation on the following input XMMWORD.
    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2
    punpckhbw   xmm3, xmm3

    movdqa      XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3

    sub         eax, byte 2*SIZEOF_XMMWORD
    jz          short .row_done

    add         esi, byte 2*SIZEOF_XMMWORD      ; advance input pointer
    add         ebx, byte 4*SIZEOF_XMMWORD      ; advance first output row
    add         edi, byte 4*SIZEOF_XMMWORD      ; advance second output row
    jmp         short .each_block
    alignx      16, 7

.row_done:
    pop         esi                     ; back to the row-array cursors
    pop         edi

    add         esi, byte 1*SIZEOF_JSAMPROW     ; one input row consumed
    add         edi, byte 2*SIZEOF_JSAMPROW     ; two output rows produced
    sub         ecx, byte 2
    jg          short .each_row         ; loop while rows remain (signed > 0)

.exit:
    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
318
TMessagesProj/jni/mozjpeg/simd/i386/jfdctflt-3dn.asm
Normal file
318
TMessagesProj/jni/mozjpeg/simd/i386/jfdctflt-3dn.asm
Normal file
|
|
@ -0,0 +1,318 @@
|
|||
;
|
||||
; jfdctflt-3dn.asm - floating-point FDCT (3DNow!)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a floating-point implementation of the forward DCT
|
||||
; (Discrete Cosine Transform). The following code is based directly on
|
||||
; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant pool for jsimd_fdct_float_3dnow.  Each multiplier is emitted twice
; ("times 2 dd") so that one 64-bit load supplies the same single-precision
; value to both lanes of an MMX/3DNow! register.
;
;   PD_0_382 ~= sin(pi/8)
;   PD_0_707 ~= cos(pi/4)
;   PD_0_541 ~= cos(pi/8) - sin(pi/8)
;   PD_1_306 ~= cos(pi/8) + sin(pi/8)
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fdct_float_3dnow)

EXTN(jconst_fdct_float_3dnow):

PD_0_382    times 2 dd 0.382683432365089771728460
PD_0_707    times 2 dd 0.707106781186547524400844
PD_0_541    times 2 dd 0.541196100146196984399723
PD_1_306    times 2 dd 1.306562964876376527856643

    alignz      32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform the forward DCT on one block of samples (in place).
;
; GLOBAL(void)
; jsimd_fdct_float_3dnow(FAST_FLOAT *data)
;
; Stack-argument entry point: after "push ebp" the argument lives at
; [frame base + 8].  The routine makes two passes over the block:
;   pass 1 walks the rows two at a time (DCTSIZE/2 iterations),
;   pass 2 walks the columns two at a time (DCTSIZE/2 iterations).
; Pass 1 stores each pair of results with the row/column lanes swapped
; (see the lane annotations); pass 2 swaps them back while loading.
;
; Registers: eax = unaligned frame base, edx = cursor into data,
;            ecx = loop counter, ebx = GOT base (via get_GOT),
;            mm0-mm7 = scratch.  ecx/edx need not be preserved; esi/edi
;            are unused.  "femms" is executed before returning.
; Scratch:   wk(0)/wk(1) hold tmp6/tmp7 while the even part is computed.

%define data(b)       (b) + 8           ; FAST_FLOAT *data

%define WK_NUM        2
%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]

    align       32
    GLOBAL_FUNCTION(jsimd_fdct_float_3dnow)

EXTN(jsimd_fdct_float_3dnow):
    push        ebp
    mov         eax, esp                ; eax = unaligned frame base (points at saved ebp)
    sub         esp, byte 4             ; leave a slot for the saved frame base
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax              ; keep the unaligned base for the epilogue
    mov         ebp, esp                ; ebp = aligned frame pointer
    lea         esp, [wk(0)]            ; reserve wk[0..WK_NUM-1] below ebp
    pushpic     ebx                     ; macro from the includes (pairs with poppic below)

    get_GOT     ebx                     ; get GOT address

    ; ------------------------------------------------------------------
    ; Pass 1: rows.  Each iteration handles rows r and r+1.
    ; ------------------------------------------------------------------

    mov         edx, POINTER [data(eax)]    ; edx = (FAST_FLOAT *) data
    mov         ecx, DCTSIZE/2
    alignx      16, 7
.pass1_loop:

    movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]    ; mm0 = (00 01)
    movq        mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]    ; mm1 = (10 11)
    movq        mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]    ; mm2 = (06 07)
    movq        mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]    ; mm3 = (16 17)

    movq        mm4, mm0                ; 2x2 transpose
    punpckldq   mm0, mm1                ; mm0 = (00 10) = data0
    punpckhdq   mm4, mm1                ; mm4 = (01 11) = data1
    movq        mm5, mm2                ; 2x2 transpose
    punpckldq   mm2, mm3                ; mm2 = (06 16) = data6
    punpckhdq   mm5, mm3                ; mm5 = (07 17) = data7

    movq        mm6, mm4
    movq        mm7, mm0
    pfsub       mm4, mm2                ; mm4 = data1 - data6 = tmp6
    pfsub       mm0, mm5                ; mm0 = data0 - data7 = tmp7
    pfadd       mm6, mm2                ; mm6 = data1 + data6 = tmp1
    pfadd       mm7, mm5                ; mm7 = data0 + data7 = tmp0

    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]    ; mm1 = (02 03)
    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]    ; mm3 = (12 13)
    movq        mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]    ; mm2 = (04 05)
    movq        mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]    ; mm5 = (14 15)

    movq        MMWORD [wk(0)], mm4     ; wk(0) = tmp6 (spilled until the odd part)
    movq        MMWORD [wk(1)], mm0     ; wk(1) = tmp7

    movq        mm4, mm1                ; 2x2 transpose
    punpckldq   mm1, mm3                ; mm1 = (02 12) = data2
    punpckhdq   mm4, mm3                ; mm4 = (03 13) = data3
    movq        mm0, mm2                ; 2x2 transpose
    punpckldq   mm2, mm5                ; mm2 = (04 14) = data4
    punpckhdq   mm0, mm5                ; mm0 = (05 15) = data5

    movq        mm3, mm4
    movq        mm5, mm1
    pfadd       mm4, mm2                ; mm4 = data3 + data4 = tmp3
    pfadd       mm1, mm0                ; mm1 = data2 + data5 = tmp2
    pfsub       mm3, mm2                ; mm3 = data3 - data4 = tmp4
    pfsub       mm5, mm0                ; mm5 = data2 - data5 = tmp5

    ; -- Even part

    movq        mm2, mm7
    movq        mm0, mm6
    pfsub       mm7, mm4                ; mm7 = tmp13
    pfsub       mm6, mm1                ; mm6 = tmp12
    pfadd       mm2, mm4                ; mm2 = tmp10
    pfadd       mm0, mm1                ; mm0 = tmp11

    pfadd       mm6, mm7
    pfmul       mm6, [GOTOFF(ebx,PD_0_707)] ; mm6 = z1

    movq        mm4, mm2
    movq        mm1, mm7
    pfsub       mm2, mm0                ; mm2 = data4
    pfsub       mm7, mm6                ; mm7 = data6
    pfadd       mm4, mm0                ; mm4 = data0
    pfadd       mm1, mm6                ; mm1 = data2

    movq        MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
    movq        MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1

    ; -- Odd part

    movq        mm0, MMWORD [wk(0)]     ; mm0 = tmp6
    movq        mm6, MMWORD [wk(1)]     ; mm6 = tmp7

    pfadd       mm3, mm5                ; mm3 = tmp10
    pfadd       mm5, mm0                ; mm5 = tmp11
    pfadd       mm0, mm6                ; mm0 = tmp12 (mm6 still tmp7)

    pfmul       mm5, [GOTOFF(ebx,PD_0_707)] ; mm5 = z3

    movq        mm2, mm3                ; mm2 = tmp10
    pfsub       mm3, mm0
    pfmul       mm3, [GOTOFF(ebx,PD_0_382)] ; mm3 = z5
    pfmul       mm2, [GOTOFF(ebx,PD_0_541)] ; mm2 = tmp10 * 0.541196100
    pfmul       mm0, [GOTOFF(ebx,PD_1_306)] ; mm0 = tmp12 * 1.306562965
    pfadd       mm2, mm3                ; mm2 = z2
    pfadd       mm0, mm3                ; mm0 = z4

    movq        mm7, mm6
    pfsub       mm6, mm5                ; mm6 = z13
    pfadd       mm7, mm5                ; mm7 = z11

    movq        mm4, mm6
    movq        mm1, mm7
    pfsub       mm6, mm2                ; mm6 = data3
    pfsub       mm7, mm0                ; mm7 = data7
    pfadd       mm4, mm2                ; mm4 = data5
    pfadd       mm1, mm0                ; mm1 = data1

    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
    movq        MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
    movq        MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1

    add         edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; advance two rows
    dec         ecx
    jnz         near .pass1_loop

    ; ------------------------------------------------------------------
    ; Pass 2: columns.  Each iteration handles columns c and c+1.
    ; ------------------------------------------------------------------

    mov         edx, POINTER [data(eax)]    ; edx = (FAST_FLOAT *) data
    mov         ecx, DCTSIZE/2
    alignx      16, 7
.pass2_loop:

    movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]    ; mm0 = (00 10)
    movq        mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]    ; mm1 = (01 11)
    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]    ; mm2 = (60 70)
    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]    ; mm3 = (61 71)

    movq        mm4, mm0                ; 2x2 transpose
    punpckldq   mm0, mm1                ; mm0 = (00 01) = data0
    punpckhdq   mm4, mm1                ; mm4 = (10 11) = data1
    movq        mm5, mm2                ; 2x2 transpose
    punpckldq   mm2, mm3                ; mm2 = (60 61) = data6
    punpckhdq   mm5, mm3                ; mm5 = (70 71) = data7

    movq        mm6, mm4
    movq        mm7, mm0
    pfsub       mm4, mm2                ; mm4 = data1 - data6 = tmp6
    pfsub       mm0, mm5                ; mm0 = data0 - data7 = tmp7
    pfadd       mm6, mm2                ; mm6 = data1 + data6 = tmp1
    pfadd       mm7, mm5                ; mm7 = data0 + data7 = tmp0

    movq        mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]    ; mm1 = (20 30)
    movq        mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]    ; mm3 = (21 31)
    movq        mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]    ; mm2 = (40 50)
    movq        mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]    ; mm5 = (41 51)

    movq        MMWORD [wk(0)], mm4     ; wk(0) = tmp6 (spilled until the odd part)
    movq        MMWORD [wk(1)], mm0     ; wk(1) = tmp7

    movq        mm4, mm1                ; 2x2 transpose
    punpckldq   mm1, mm3                ; mm1 = (20 21) = data2
    punpckhdq   mm4, mm3                ; mm4 = (30 31) = data3
    movq        mm0, mm2                ; 2x2 transpose
    punpckldq   mm2, mm5                ; mm2 = (40 41) = data4
    punpckhdq   mm0, mm5                ; mm0 = (50 51) = data5

    movq        mm3, mm4
    movq        mm5, mm1
    pfadd       mm4, mm2                ; mm4 = data3 + data4 = tmp3
    pfadd       mm1, mm0                ; mm1 = data2 + data5 = tmp2
    pfsub       mm3, mm2                ; mm3 = data3 - data4 = tmp4
    pfsub       mm5, mm0                ; mm5 = data2 - data5 = tmp5

    ; -- Even part

    movq        mm2, mm7
    movq        mm0, mm6
    pfsub       mm7, mm4                ; mm7 = tmp13
    pfsub       mm6, mm1                ; mm6 = tmp12
    pfadd       mm2, mm4                ; mm2 = tmp10
    pfadd       mm0, mm1                ; mm0 = tmp11

    pfadd       mm6, mm7
    pfmul       mm6, [GOTOFF(ebx,PD_0_707)] ; mm6 = z1

    movq        mm4, mm2
    movq        mm1, mm7
    pfsub       mm2, mm0                ; mm2 = data4
    pfsub       mm7, mm6                ; mm7 = data6
    pfadd       mm4, mm0                ; mm4 = data0
    pfadd       mm1, mm6                ; mm1 = data2

    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1

    ; -- Odd part

    movq        mm0, MMWORD [wk(0)]     ; mm0 = tmp6
    movq        mm6, MMWORD [wk(1)]     ; mm6 = tmp7

    pfadd       mm3, mm5                ; mm3 = tmp10
    pfadd       mm5, mm0                ; mm5 = tmp11
    pfadd       mm0, mm6                ; mm0 = tmp12 (mm6 still tmp7)

    pfmul       mm5, [GOTOFF(ebx,PD_0_707)] ; mm5 = z3

    movq        mm2, mm3                ; mm2 = tmp10
    pfsub       mm3, mm0
    pfmul       mm3, [GOTOFF(ebx,PD_0_382)] ; mm3 = z5
    pfmul       mm2, [GOTOFF(ebx,PD_0_541)] ; mm2 = tmp10 * 0.541196100
    pfmul       mm0, [GOTOFF(ebx,PD_1_306)] ; mm0 = tmp12 * 1.306562965
    pfadd       mm2, mm3                ; mm2 = z2
    pfadd       mm0, mm3                ; mm0 = z4

    movq        mm7, mm6
    pfsub       mm6, mm5                ; mm6 = z13
    pfadd       mm7, mm5                ; mm7 = z11

    movq        mm4, mm6
    movq        mm1, mm7
    pfsub       mm6, mm2                ; mm6 = data3
    pfsub       mm7, mm0                ; mm7 = data7
    pfadd       mm4, mm2                ; mm4 = data5
    pfadd       mm1, mm0                ; mm1 = data1

    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1

    add         edx, byte 2*SIZEOF_FAST_FLOAT   ; advance two columns
    dec         ecx
    jnz         near .pass2_loop

    femms                               ; empty MMX/3DNow! state

    poppic      ebx
    mov         esp, ebp                ; esp -> slot holding the unaligned frame base
    pop         esp                     ; esp = unaligned frame base
    pop         ebp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
369
TMessagesProj/jni/mozjpeg/simd/i386/jfdctflt-sse.asm
Normal file
369
TMessagesProj/jni/mozjpeg/simd/i386/jfdctflt-sse.asm
Normal file
|
|
@ -0,0 +1,369 @@
|
|||
;
|
||||
; jfdctflt-sse.asm - floating-point FDCT (SSE)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a floating-point implementation of the forward DCT
|
||||
; (Discrete Cosine Transform). The following code is based directly on
|
||||
; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; unpcklps2 dst, src
;   dst = (0 1 2 3), src = (4 5 6 7)  =>  dst = (0 1 4 5)
; Selector 0x44 = 01_00_01_00b: lanes 0,1 of dst followed by lanes 0,1 of src.
%macro unpcklps2 2
    shufps      %1, %2, 0x44
%endmacro
|
||||
|
||||
; unpckhps2 dst, src
;   dst = (0 1 2 3), src = (4 5 6 7)  =>  dst = (2 3 6 7)
; Selector 0xEE = 11_10_11_10b: lanes 2,3 of dst followed by lanes 2,3 of src.
%macro unpckhps2 2
    shufps      %1, %2, 0xEE
%endmacro
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant pool for jsimd_fdct_float_sse.  Each multiplier is emitted four
; times ("times 4 dd") so that one 128-bit memory operand supplies the same
; single-precision value to all four lanes of an XMM register.
;
;   PD_0_382 ~= sin(pi/8)
;   PD_0_707 ~= cos(pi/4)
;   PD_0_541 ~= cos(pi/8) - sin(pi/8)
;   PD_1_306 ~= cos(pi/8) + sin(pi/8)
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fdct_float_sse)

EXTN(jconst_fdct_float_sse):

PD_0_382    times 4 dd 0.382683432365089771728460
PD_0_707    times 4 dd 0.707106781186547524400844
PD_0_541    times 4 dd 0.541196100146196984399723
PD_1_306    times 4 dd 1.306562964876376527856643

    alignz      32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform the forward DCT on one block of samples (in place).
;
; GLOBAL(void)
; jsimd_fdct_float_sse(FAST_FLOAT *data)
;
; Stack-argument entry point: after "push ebp" the argument lives at
; [frame base + 8].  The routine makes two passes over the block:
;   pass 1 walks the rows four at a time (DCTSIZE/4 iterations),
;   pass 2 walks the columns four at a time (DCTSIZE/4 iterations).
; Each pass transposes a 4x4 tile in two phases (unpcklps/unpckhps, then the
; unpcklps2/unpckhps2 macros) before running the butterfly arithmetic.
;
; Registers: eax = unaligned frame base, edx = cursor into data,
;            ecx = loop counter, ebx = GOT base (via get_GOT),
;            xmm0-xmm7 = scratch.  ecx/edx need not be preserved; esi/edi
;            are unused.
; Scratch:   wk(0)/wk(1) first hold two half-transposed vectors, then
;            tmp6/tmp7 while the even part is computed.
; All block and wk accesses use movaps.

%define data(b)       (b) + 8           ; FAST_FLOAT *data

%define WK_NUM        2
%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]

    align       32
    GLOBAL_FUNCTION(jsimd_fdct_float_sse)

EXTN(jsimd_fdct_float_sse):
    push        ebp
    mov         eax, esp                ; eax = unaligned frame base (points at saved ebp)
    sub         esp, byte 4             ; leave a slot for the saved frame base
    and         esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
    mov         [esp], eax              ; keep the unaligned base for the epilogue
    mov         ebp, esp                ; ebp = aligned frame pointer
    lea         esp, [wk(0)]            ; reserve wk[0..WK_NUM-1] below ebp
    pushpic     ebx                     ; macro from the includes (pairs with poppic below)

    get_GOT     ebx                     ; get GOT address

    ; ------------------------------------------------------------------
    ; Pass 1: rows.  Each iteration handles rows r .. r+3.
    ; ------------------------------------------------------------------

    mov         edx, POINTER [data(eax)]    ; edx = (FAST_FLOAT *) data
    mov         ecx, DCTSIZE/4
    alignx      16, 7
.pass1_loop:

    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] ; xmm0 = (20 21 22 23)
    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] ; xmm1 = (30 31 32 33)
    movaps      xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)] ; xmm2 = (24 25 26 27)
    movaps      xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)] ; xmm3 = (34 35 36 37)

    movaps      xmm4, xmm0              ; transpose, phase 1
    unpcklps    xmm0, xmm1              ; xmm0 = (20 30 21 31)
    unpckhps    xmm4, xmm1              ; xmm4 = (22 32 23 33)
    movaps      xmm5, xmm2              ; transpose, phase 1
    unpcklps    xmm2, xmm3              ; xmm2 = (24 34 25 35)
    unpckhps    xmm5, xmm3              ; xmm5 = (26 36 27 37)

    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] ; xmm6 = (00 01 02 03)
    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] ; xmm7 = (10 11 12 13)
    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] ; xmm1 = (04 05 06 07)
    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] ; xmm3 = (14 15 16 17)

    movaps      XMMWORD [wk(0)], xmm4   ; wk(0) = (22 32 23 33)
    movaps      XMMWORD [wk(1)], xmm2   ; wk(1) = (24 34 25 35)

    movaps      xmm4, xmm6              ; transpose, phase 1
    unpcklps    xmm6, xmm7              ; xmm6 = (00 10 01 11)
    unpckhps    xmm4, xmm7              ; xmm4 = (02 12 03 13)
    movaps      xmm2, xmm1              ; transpose, phase 1
    unpcklps    xmm1, xmm3              ; xmm1 = (04 14 05 15)
    unpckhps    xmm2, xmm3              ; xmm2 = (06 16 07 17)

    movaps      xmm7, xmm6              ; transpose, phase 2
    unpcklps2   xmm6, xmm0              ; xmm6 = (00 10 20 30) = data0
    unpckhps2   xmm7, xmm0              ; xmm7 = (01 11 21 31) = data1
    movaps      xmm3, xmm2              ; transpose, phase 2
    unpcklps2   xmm2, xmm5              ; xmm2 = (06 16 26 36) = data6
    unpckhps2   xmm3, xmm5              ; xmm3 = (07 17 27 37) = data7

    movaps      xmm0, xmm7
    movaps      xmm5, xmm6
    subps       xmm7, xmm2              ; xmm7 = data1 - data6 = tmp6
    subps       xmm6, xmm3              ; xmm6 = data0 - data7 = tmp7
    addps       xmm0, xmm2              ; xmm0 = data1 + data6 = tmp1
    addps       xmm5, xmm3              ; xmm5 = data0 + data7 = tmp0

    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2 = (22 32 23 33)
    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3 = (24 34 25 35)
    movaps      XMMWORD [wk(0)], xmm7   ; wk(0) = tmp6 (spilled until the odd part)
    movaps      XMMWORD [wk(1)], xmm6   ; wk(1) = tmp7

    movaps      xmm7, xmm4              ; transpose, phase 2
    unpcklps2   xmm4, xmm2              ; xmm4 = (02 12 22 32) = data2
    unpckhps2   xmm7, xmm2              ; xmm7 = (03 13 23 33) = data3
    movaps      xmm6, xmm1              ; transpose, phase 2
    unpcklps2   xmm1, xmm3              ; xmm1 = (04 14 24 34) = data4
    unpckhps2   xmm6, xmm3              ; xmm6 = (05 15 25 35) = data5

    movaps      xmm2, xmm7
    movaps      xmm3, xmm4
    addps       xmm7, xmm1              ; xmm7 = data3 + data4 = tmp3
    addps       xmm4, xmm6              ; xmm4 = data2 + data5 = tmp2
    subps       xmm2, xmm1              ; xmm2 = data3 - data4 = tmp4
    subps       xmm3, xmm6              ; xmm3 = data2 - data5 = tmp5

    ; -- Even part

    movaps      xmm1, xmm5
    movaps      xmm6, xmm0
    subps       xmm5, xmm7              ; xmm5 = tmp13
    subps       xmm0, xmm4              ; xmm0 = tmp12
    addps       xmm1, xmm7              ; xmm1 = tmp10
    addps       xmm6, xmm4              ; xmm6 = tmp11

    addps       xmm0, xmm5
    mulps       xmm0, [GOTOFF(ebx,PD_0_707)]    ; xmm0 = z1

    movaps      xmm7, xmm1
    movaps      xmm4, xmm5
    subps       xmm1, xmm6              ; xmm1 = data4
    subps       xmm5, xmm0              ; xmm5 = data6
    addps       xmm7, xmm6              ; xmm7 = data0
    addps       xmm4, xmm0              ; xmm4 = data2

    movaps      XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
    movaps      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4

    ; -- Odd part

    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6 = tmp6
    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0 = tmp7

    addps       xmm2, xmm3              ; xmm2 = tmp10
    addps       xmm3, xmm6              ; xmm3 = tmp11
    addps       xmm6, xmm0              ; xmm6 = tmp12 (xmm0 still tmp7)

    mulps       xmm3, [GOTOFF(ebx,PD_0_707)]    ; xmm3 = z3

    movaps      xmm1, xmm2              ; xmm1 = tmp10
    subps       xmm2, xmm6
    mulps       xmm2, [GOTOFF(ebx,PD_0_382)]    ; xmm2 = z5
    mulps       xmm1, [GOTOFF(ebx,PD_0_541)]    ; xmm1 = tmp10 * 0.541196100
    mulps       xmm6, [GOTOFF(ebx,PD_1_306)]    ; xmm6 = tmp12 * 1.306562965
    addps       xmm1, xmm2              ; xmm1 = z2
    addps       xmm6, xmm2              ; xmm6 = z4

    movaps      xmm5, xmm0
    subps       xmm0, xmm3              ; xmm0 = z13
    addps       xmm5, xmm3              ; xmm5 = z11

    movaps      xmm7, xmm0
    movaps      xmm4, xmm5
    subps       xmm0, xmm1              ; xmm0 = data3
    subps       xmm5, xmm6              ; xmm5 = data7
    addps       xmm7, xmm1              ; xmm7 = data5
    addps       xmm4, xmm6              ; xmm4 = data1

    movaps      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
    movaps      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4

    add         edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT    ; advance four rows
    dec         ecx
    jnz         near .pass1_loop

    ; ------------------------------------------------------------------
    ; Pass 2: columns.  Each iteration handles columns c .. c+3.
    ; ------------------------------------------------------------------

    mov         edx, POINTER [data(eax)]    ; edx = (FAST_FLOAT *) data
    mov         ecx, DCTSIZE/4
    alignx      16, 7
.pass2_loop:

    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] ; xmm0 = (02 12 22 32)
    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] ; xmm1 = (03 13 23 33)
    movaps      xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] ; xmm2 = (42 52 62 72)
    movaps      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] ; xmm3 = (43 53 63 73)

    movaps      xmm4, xmm0              ; transpose, phase 1
    unpcklps    xmm0, xmm1              ; xmm0 = (02 03 12 13)
    unpckhps    xmm4, xmm1              ; xmm4 = (22 23 32 33)
    movaps      xmm5, xmm2              ; transpose, phase 1
    unpcklps    xmm2, xmm3              ; xmm2 = (42 43 52 53)
    unpckhps    xmm5, xmm3              ; xmm5 = (62 63 72 73)

    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] ; xmm6 = (00 10 20 30)
    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] ; xmm7 = (01 11 21 31)
    movaps      xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] ; xmm1 = (40 50 60 70)
    movaps      xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] ; xmm3 = (41 51 61 71)

    movaps      XMMWORD [wk(0)], xmm4   ; wk(0) = (22 23 32 33)
    movaps      XMMWORD [wk(1)], xmm2   ; wk(1) = (42 43 52 53)

    movaps      xmm4, xmm6              ; transpose, phase 1
    unpcklps    xmm6, xmm7              ; xmm6 = (00 01 10 11)
    unpckhps    xmm4, xmm7              ; xmm4 = (20 21 30 31)
    movaps      xmm2, xmm1              ; transpose, phase 1
    unpcklps    xmm1, xmm3              ; xmm1 = (40 41 50 51)
    unpckhps    xmm2, xmm3              ; xmm2 = (60 61 70 71)

    movaps      xmm7, xmm6              ; transpose, phase 2
    unpcklps2   xmm6, xmm0              ; xmm6 = (00 01 02 03) = data0
    unpckhps2   xmm7, xmm0              ; xmm7 = (10 11 12 13) = data1
    movaps      xmm3, xmm2              ; transpose, phase 2
    unpcklps2   xmm2, xmm5              ; xmm2 = (60 61 62 63) = data6
    unpckhps2   xmm3, xmm5              ; xmm3 = (70 71 72 73) = data7

    movaps      xmm0, xmm7
    movaps      xmm5, xmm6
    subps       xmm7, xmm2              ; xmm7 = data1 - data6 = tmp6
    subps       xmm6, xmm3              ; xmm6 = data0 - data7 = tmp7
    addps       xmm0, xmm2              ; xmm0 = data1 + data6 = tmp1
    addps       xmm5, xmm3              ; xmm5 = data0 + data7 = tmp0

    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2 = (22 23 32 33)
    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3 = (42 43 52 53)
    movaps      XMMWORD [wk(0)], xmm7   ; wk(0) = tmp6 (spilled until the odd part)
    movaps      XMMWORD [wk(1)], xmm6   ; wk(1) = tmp7

    movaps      xmm7, xmm4              ; transpose, phase 2
    unpcklps2   xmm4, xmm2              ; xmm4 = (20 21 22 23) = data2
    unpckhps2   xmm7, xmm2              ; xmm7 = (30 31 32 33) = data3
    movaps      xmm6, xmm1              ; transpose, phase 2
    unpcklps2   xmm1, xmm3              ; xmm1 = (40 41 42 43) = data4
    unpckhps2   xmm6, xmm3              ; xmm6 = (50 51 52 53) = data5

    movaps      xmm2, xmm7
    movaps      xmm3, xmm4
    addps       xmm7, xmm1              ; xmm7 = data3 + data4 = tmp3
    addps       xmm4, xmm6              ; xmm4 = data2 + data5 = tmp2
    subps       xmm2, xmm1              ; xmm2 = data3 - data4 = tmp4
    subps       xmm3, xmm6              ; xmm3 = data2 - data5 = tmp5

    ; -- Even part

    movaps      xmm1, xmm5
    movaps      xmm6, xmm0
    subps       xmm5, xmm7              ; xmm5 = tmp13
    subps       xmm0, xmm4              ; xmm0 = tmp12
    addps       xmm1, xmm7              ; xmm1 = tmp10
    addps       xmm6, xmm4              ; xmm6 = tmp11

    addps       xmm0, xmm5
    mulps       xmm0, [GOTOFF(ebx,PD_0_707)]    ; xmm0 = z1

    movaps      xmm7, xmm1
    movaps      xmm4, xmm5
    subps       xmm1, xmm6              ; xmm1 = data4
    subps       xmm5, xmm0              ; xmm5 = data6
    addps       xmm7, xmm6              ; xmm7 = data0
    addps       xmm4, xmm0              ; xmm4 = data2

    movaps      XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
    movaps      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4

    ; -- Odd part

    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6 = tmp6
    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0 = tmp7

    addps       xmm2, xmm3              ; xmm2 = tmp10
    addps       xmm3, xmm6              ; xmm3 = tmp11
    addps       xmm6, xmm0              ; xmm6 = tmp12 (xmm0 still tmp7)

    mulps       xmm3, [GOTOFF(ebx,PD_0_707)]    ; xmm3 = z3

    movaps      xmm1, xmm2              ; xmm1 = tmp10
    subps       xmm2, xmm6
    mulps       xmm2, [GOTOFF(ebx,PD_0_382)]    ; xmm2 = z5
    mulps       xmm1, [GOTOFF(ebx,PD_0_541)]    ; xmm1 = tmp10 * 0.541196100
    mulps       xmm6, [GOTOFF(ebx,PD_1_306)]    ; xmm6 = tmp12 * 1.306562965
    addps       xmm1, xmm2              ; xmm1 = z2
    addps       xmm6, xmm2              ; xmm6 = z4

    movaps      xmm5, xmm0
    subps       xmm0, xmm3              ; xmm0 = z13
    addps       xmm5, xmm3              ; xmm5 = z11

    movaps      xmm7, xmm0
    movaps      xmm4, xmm5
    subps       xmm0, xmm1              ; xmm0 = data3
    subps       xmm5, xmm6              ; xmm5 = data7
    addps       xmm7, xmm1              ; xmm7 = data5
    addps       xmm4, xmm6              ; xmm4 = data1

    movaps      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
    movaps      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4

    add         edx, byte 4*SIZEOF_FAST_FLOAT   ; advance four columns
    dec         ecx
    jnz         near .pass2_loop

    poppic      ebx
    mov         esp, ebp                ; esp -> slot holding the unaligned frame base
    pop         esp                     ; esp = unaligned frame base
    pop         ebp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
395
TMessagesProj/jni/mozjpeg/simd/i386/jfdctfst-mmx.asm
Normal file
395
TMessagesProj/jni/mozjpeg/simd/i386/jfdctfst-mmx.asm
Normal file
|
|
@ -0,0 +1,395 @@
|
|||
;
|
||||
; jfdctfst-mmx.asm - fast integer FDCT (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a fast, not so accurate integer implementation of
|
||||
; the forward DCT (Discrete Cosine Transform). The following code is
|
||||
; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
|
||||
; for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point multipliers for the fast integer FDCT, scaled by 2^CONST_BITS.
; For the default CONST_BITS == 8 the rounded values are spelled out; for any
; other setting they are derived from 30-bit fixed-point literals with a
; rounding right shift, because NASM cannot do compile-time arithmetic on
; floating-point constants.
%define CONST_BITS  8                   ; 14 is also OK.

%if CONST_BITS == 8
F_0_382     equ  98                     ; FIX(0.382683433)
F_0_541     equ 139                     ; FIX(0.541196100)
F_0_707     equ 181                     ; FIX(0.707106781)
F_1_306     equ 334                     ; FIX(1.306562965)
%else
; DESCALE(x, n): shift x right by n bits, rounding to nearest.
%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
F_0_382     equ DESCALE( 410903207, 30 - CONST_BITS)    ; FIX(0.382683433)
F_0_541     equ DESCALE( 581104887, 30 - CONST_BITS)    ; FIX(0.541196100)
F_0_707     equ DESCALE( 759250124, 30 - CONST_BITS)    ; FIX(0.707106781)
F_1_306     equ DESCALE(1402911301, 30 - CONST_BITS)    ; FIX(1.306562965)
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant pool for jsimd_fdct_ifast_mmx.  Each multiplier is emitted four
; times ("times 4 dw") so that one 64-bit memory operand supplies the same
; 16-bit value to all four lanes of an MMX register.
;
; Scaling constraints (from the original notes):
;   PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
;   CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
    SECTION     SEG_CONST

%define PRE_MULTIPLY_SCALE_BITS  2
%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)

    alignz      32
    GLOBAL_DATA(jconst_fdct_ifast_mmx)

EXTN(jconst_fdct_ifast_mmx):

PW_F0707    times 4 dw F_0_707 << CONST_SHIFT
PW_F0382    times 4 dw F_0_382 << CONST_SHIFT
PW_F0541    times 4 dw F_0_541 << CONST_SHIFT
PW_F1306    times 4 dw F_1_306 << CONST_SHIFT

    alignz      32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform the forward DCT on one block of samples.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_fdct_ifast_mmx(DCTELEM *data)
|
||||
;
|
||||
|
||||
%define data(b) (b) + 8 ; DCTELEM *data
|
||||
|
||||
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_fdct_ifast_mmx)
|
||||
|
||||
EXTN(jsimd_fdct_ifast_mmx):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
; push esi ; unused
|
||||
; push edi ; unused
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process rows.
|
||||
|
||||
mov edx, POINTER [data(eax)] ; (DCTELEM *)
|
||||
mov ecx, DCTSIZE/4
|
||||
alignx 16, 7
|
||||
.rowloop:
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; mm0=(20 21 22 23), mm2=(24 25 26 27)
|
||||
; mm1=(30 31 32 33), mm3=(34 35 36 37)
|
||||
|
||||
movq mm4, mm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm0, mm1 ; mm0=(20 30 21 31)
|
||||
punpckhwd mm4, mm1 ; mm4=(22 32 23 33)
|
||||
movq mm5, mm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm2, mm3 ; mm2=(24 34 25 35)
|
||||
punpckhwd mm5, mm3 ; mm5=(26 36 27 37)
|
||||
|
||||
movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; mm6=(00 01 02 03), mm1=(04 05 06 07)
|
||||
; mm7=(10 11 12 13), mm3=(14 15 16 17)
|
||||
|
||||
movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
|
||||
movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
|
||||
|
||||
movq mm4, mm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm6, mm7 ; mm6=(00 10 01 11)
|
||||
punpckhwd mm4, mm7 ; mm4=(02 12 03 13)
|
||||
movq mm2, mm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm1, mm3 ; mm1=(04 14 05 15)
|
||||
punpckhwd mm2, mm3 ; mm2=(06 16 07 17)
|
||||
|
||||
movq mm7, mm6 ; transpose coefficients(phase 2)
|
||||
punpckldq mm6, mm0 ; mm6=(00 10 20 30)=data0
|
||||
punpckhdq mm7, mm0 ; mm7=(01 11 21 31)=data1
|
||||
movq mm3, mm2 ; transpose coefficients(phase 2)
|
||||
punpckldq mm2, mm5 ; mm2=(06 16 26 36)=data6
|
||||
punpckhdq mm3, mm5 ; mm3=(07 17 27 37)=data7
|
||||
|
||||
movq mm0, mm7
|
||||
movq mm5, mm6
|
||||
psubw mm7, mm2 ; mm7=data1-data6=tmp6
|
||||
psubw mm6, mm3 ; mm6=data0-data7=tmp7
|
||||
paddw mm0, mm2 ; mm0=data1+data6=tmp1
|
||||
paddw mm5, mm3 ; mm5=data0+data7=tmp0
|
||||
|
||||
movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
|
||||
movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
|
||||
movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
|
||||
movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
|
||||
|
||||
movq mm7, mm4 ; transpose coefficients(phase 2)
|
||||
punpckldq mm4, mm2 ; mm4=(02 12 22 32)=data2
|
||||
punpckhdq mm7, mm2 ; mm7=(03 13 23 33)=data3
|
||||
movq mm6, mm1 ; transpose coefficients(phase 2)
|
||||
punpckldq mm1, mm3 ; mm1=(04 14 24 34)=data4
|
||||
punpckhdq mm6, mm3 ; mm6=(05 15 25 35)=data5
|
||||
|
||||
movq mm2, mm7
|
||||
movq mm3, mm4
|
||||
paddw mm7, mm1 ; mm7=data3+data4=tmp3
|
||||
paddw mm4, mm6 ; mm4=data2+data5=tmp2
|
||||
psubw mm2, mm1 ; mm2=data3-data4=tmp4
|
||||
psubw mm3, mm6 ; mm3=data2-data5=tmp5
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm1, mm5
|
||||
movq mm6, mm0
|
||||
psubw mm5, mm7 ; mm5=tmp13
|
||||
psubw mm0, mm4 ; mm0=tmp12
|
||||
paddw mm1, mm7 ; mm1=tmp10
|
||||
paddw mm6, mm4 ; mm6=tmp11
|
||||
|
||||
paddw mm0, mm5
|
||||
psllw mm0, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw mm0, [GOTOFF(ebx,PW_F0707)] ; mm0=z1
|
||||
|
||||
movq mm7, mm1
|
||||
movq mm4, mm5
|
||||
psubw mm1, mm6 ; mm1=data4
|
||||
psubw mm5, mm0 ; mm5=data6
|
||||
paddw mm7, mm6 ; mm7=data0
|
||||
paddw mm4, mm0 ; mm4=data2
|
||||
|
||||
movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
|
||||
movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
|
||||
movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
|
||||
movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm6, MMWORD [wk(0)] ; mm6=tmp6
|
||||
movq mm0, MMWORD [wk(1)] ; mm0=tmp7
|
||||
|
||||
paddw mm2, mm3 ; mm2=tmp10
|
||||
paddw mm3, mm6 ; mm3=tmp11
|
||||
paddw mm6, mm0 ; mm6=tmp12, mm0=tmp7
|
||||
|
||||
psllw mm2, PRE_MULTIPLY_SCALE_BITS
|
||||
psllw mm6, PRE_MULTIPLY_SCALE_BITS
|
||||
|
||||
psllw mm3, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw mm3, [GOTOFF(ebx,PW_F0707)] ; mm3=z3
|
||||
|
||||
movq mm1, mm2 ; mm1=tmp10
|
||||
psubw mm2, mm6
|
||||
pmulhw mm2, [GOTOFF(ebx,PW_F0382)] ; mm2=z5
|
||||
pmulhw mm1, [GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
|
||||
pmulhw mm6, [GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
|
||||
paddw mm1, mm2 ; mm1=z2
|
||||
paddw mm6, mm2 ; mm6=z4
|
||||
|
||||
movq mm5, mm0
|
||||
psubw mm0, mm3 ; mm0=z13
|
||||
paddw mm5, mm3 ; mm5=z11
|
||||
|
||||
movq mm7, mm0
|
||||
movq mm4, mm5
|
||||
psubw mm0, mm1 ; mm0=data3
|
||||
psubw mm5, mm6 ; mm5=data7
|
||||
paddw mm7, mm1 ; mm7=data5
|
||||
paddw mm4, mm6 ; mm4=data1
|
||||
|
||||
movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
|
||||
movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
|
||||
movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
|
||||
movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
|
||||
|
||||
add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
|
||||
dec ecx
|
||||
jnz near .rowloop
|
||||
|
||||
; ---- Pass 2: process columns.
|
||||
|
||||
mov edx, POINTER [data(eax)] ; (DCTELEM *)
|
||||
mov ecx, DCTSIZE/4
|
||||
alignx 16, 7
|
||||
.columnloop:
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; mm0=(02 12 22 32), mm2=(42 52 62 72)
|
||||
; mm1=(03 13 23 33), mm3=(43 53 63 73)
|
||||
|
||||
movq mm4, mm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm0, mm1 ; mm0=(02 03 12 13)
|
||||
punpckhwd mm4, mm1 ; mm4=(22 23 32 33)
|
||||
movq mm5, mm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm2, mm3 ; mm2=(42 43 52 53)
|
||||
punpckhwd mm5, mm3 ; mm5=(62 63 72 73)
|
||||
|
||||
movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; mm6=(00 10 20 30), mm1=(40 50 60 70)
|
||||
; mm7=(01 11 21 31), mm3=(41 51 61 71)
|
||||
|
||||
movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
|
||||
movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
|
||||
|
||||
movq mm4, mm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm6, mm7 ; mm6=(00 01 10 11)
|
||||
punpckhwd mm4, mm7 ; mm4=(20 21 30 31)
|
||||
movq mm2, mm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm1, mm3 ; mm1=(40 41 50 51)
|
||||
punpckhwd mm2, mm3 ; mm2=(60 61 70 71)
|
||||
|
||||
movq mm7, mm6 ; transpose coefficients(phase 2)
|
||||
punpckldq mm6, mm0 ; mm6=(00 01 02 03)=data0
|
||||
punpckhdq mm7, mm0 ; mm7=(10 11 12 13)=data1
|
||||
movq mm3, mm2 ; transpose coefficients(phase 2)
|
||||
punpckldq mm2, mm5 ; mm2=(60 61 62 63)=data6
|
||||
punpckhdq mm3, mm5 ; mm3=(70 71 72 73)=data7
|
||||
|
||||
movq mm0, mm7
|
||||
movq mm5, mm6
|
||||
psubw mm7, mm2 ; mm7=data1-data6=tmp6
|
||||
psubw mm6, mm3 ; mm6=data0-data7=tmp7
|
||||
paddw mm0, mm2 ; mm0=data1+data6=tmp1
|
||||
paddw mm5, mm3 ; mm5=data0+data7=tmp0
|
||||
|
||||
movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
|
||||
movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
|
||||
movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
|
||||
movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
|
||||
|
||||
movq mm7, mm4 ; transpose coefficients(phase 2)
|
||||
punpckldq mm4, mm2 ; mm4=(20 21 22 23)=data2
|
||||
punpckhdq mm7, mm2 ; mm7=(30 31 32 33)=data3
|
||||
movq mm6, mm1 ; transpose coefficients(phase 2)
|
||||
punpckldq mm1, mm3 ; mm1=(40 41 42 43)=data4
|
||||
punpckhdq mm6, mm3 ; mm6=(50 51 52 53)=data5
|
||||
|
||||
movq mm2, mm7
|
||||
movq mm3, mm4
|
||||
paddw mm7, mm1 ; mm7=data3+data4=tmp3
|
||||
paddw mm4, mm6 ; mm4=data2+data5=tmp2
|
||||
psubw mm2, mm1 ; mm2=data3-data4=tmp4
|
||||
psubw mm3, mm6 ; mm3=data2-data5=tmp5
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm1, mm5
|
||||
movq mm6, mm0
|
||||
psubw mm5, mm7 ; mm5=tmp13
|
||||
psubw mm0, mm4 ; mm0=tmp12
|
||||
paddw mm1, mm7 ; mm1=tmp10
|
||||
paddw mm6, mm4 ; mm6=tmp11
|
||||
|
||||
paddw mm0, mm5
|
||||
psllw mm0, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw mm0, [GOTOFF(ebx,PW_F0707)] ; mm0=z1
|
||||
|
||||
movq mm7, mm1
|
||||
movq mm4, mm5
|
||||
psubw mm1, mm6 ; mm1=data4
|
||||
psubw mm5, mm0 ; mm5=data6
|
||||
paddw mm7, mm6 ; mm7=data0
|
||||
paddw mm4, mm0 ; mm4=data2
|
||||
|
||||
movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
|
||||
movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
|
||||
movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
|
||||
movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm6, MMWORD [wk(0)] ; mm6=tmp6
|
||||
movq mm0, MMWORD [wk(1)] ; mm0=tmp7
|
||||
|
||||
paddw mm2, mm3 ; mm2=tmp10
|
||||
paddw mm3, mm6 ; mm3=tmp11
|
||||
paddw mm6, mm0 ; mm6=tmp12, mm0=tmp7
|
||||
|
||||
psllw mm2, PRE_MULTIPLY_SCALE_BITS
|
||||
psllw mm6, PRE_MULTIPLY_SCALE_BITS
|
||||
|
||||
psllw mm3, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw mm3, [GOTOFF(ebx,PW_F0707)] ; mm3=z3
|
||||
|
||||
movq mm1, mm2 ; mm1=tmp10
|
||||
psubw mm2, mm6
|
||||
pmulhw mm2, [GOTOFF(ebx,PW_F0382)] ; mm2=z5
|
||||
pmulhw mm1, [GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
|
||||
pmulhw mm6, [GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
|
||||
paddw mm1, mm2 ; mm1=z2
|
||||
paddw mm6, mm2 ; mm6=z4
|
||||
|
||||
movq mm5, mm0
|
||||
psubw mm0, mm3 ; mm0=z13
|
||||
paddw mm5, mm3 ; mm5=z11
|
||||
|
||||
movq mm7, mm0
|
||||
movq mm4, mm5
|
||||
psubw mm0, mm1 ; mm0=data3
|
||||
psubw mm5, mm6 ; mm5=data7
|
||||
paddw mm7, mm1 ; mm7=data5
|
||||
paddw mm4, mm6 ; mm4=data1
|
||||
|
||||
movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
|
||||
movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
|
||||
movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
|
||||
movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
|
||||
|
||||
add edx, byte 4*SIZEOF_DCTELEM
|
||||
dec ecx
|
||||
jnz near .columnloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
; pop edi ; unused
|
||||
; pop esi ; unused
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
poppic ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
403
TMessagesProj/jni/mozjpeg/simd/i386/jfdctfst-sse2.asm
Normal file
403
TMessagesProj/jni/mozjpeg/simd/i386/jfdctfst-sse2.asm
Normal file
|
|
@ -0,0 +1,403 @@
|
|||
;
|
||||
; jfdctfst.asm - fast integer FDCT (SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a fast, not so accurate integer implementation of
|
||||
; the forward DCT (Discrete Cosine Transform). The following code is
|
||||
; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
|
||||
; for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point multipliers for the fast FDCT: FIX(x) is x scaled by
; 2^CONST_BITS and rounded.  The CONST_BITS == 8 branch hard-codes the four
; values; the %else branch derives them with DESCALE() from the same factors
; pre-scaled by 2^30.
%define CONST_BITS 8 ; 14 is also OK.
|
||||
|
||||
%if CONST_BITS == 8
|
||||
F_0_382 equ 98 ; FIX(0.382683433)
|
||||
F_0_541 equ 139 ; FIX(0.541196100)
|
||||
F_0_707 equ 181 ; FIX(0.707106781)
|
||||
F_1_306 equ 334 ; FIX(1.306562965)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
; DESCALE(x, n) is a rounding right shift: add half of 2^n, then shift by n.
|
||||
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
|
||||
F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433)
|
||||
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781)
|
||||
F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for the SSE2 fast FDCT.  Each PW_* entry below is one
; 128-bit vector holding the same 16-bit multiplier eight times.
SECTION SEG_CONST
|
||||
|
||||
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
|
||||
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
; (the code shifts each operand left by PRE_MULTIPLY_SCALE_BITS and the
;  table entries are shifted left by CONST_SHIFT, so that the high word
;  kept by pmulhw is the product scaled back down by 2^CONST_BITS)
|
||||
|
||||
%define PRE_MULTIPLY_SCALE_BITS 2
|
||||
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_fdct_ifast_sse2)
|
||||
|
||||
EXTN(jconst_fdct_ifast_sse2):
|
||||
|
||||
PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
|
||||
PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
|
||||
PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
|
||||
PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform the forward DCT on one block of samples.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_fdct_ifast_sse2(DCTELEM *data)
|
||||
;
|
||||
|
||||
%define data(b) (b) + 8 ; DCTELEM *data
|
||||
|
||||
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
|
||||
|
||||
; jsimd_fdct_ifast_sse2: fast integer forward DCT of one block, in place.
;
; Frame:     ebp is realigned down to SIZEOF_XMMWORD; wk(0) and wk(1) are the
;            two xmmword scratch slots directly below it, and [ebp] keeps the
;            pre-alignment stack pointer that the epilogue pops back into esp.
; Registers: eax = stack pointer right after "push ebp" (used only to fetch
;            the argument), edx = data pointer (loaded once, never modified),
;            ebx = base register for the GOTOFF() constant operands,
;            xmm0-xmm7 = working registers.
; Memory:    the block is read at the start of pass 1 and written back only
;            during pass 2; pass-1 results stay in registers and wk[].
EXTN(jsimd_fdct_ifast_sse2):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic ebx
|
||||
; push ecx ; unused
|
||||
; push edx ; need not be preserved
|
||||
; push esi ; unused
|
||||
; push edi ; unused
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process rows.
|
||||
|
||||
mov edx, POINTER [data(eax)] ; (DCTELEM *)
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
|
||||
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
|
||||
|
||||
movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
|
||||
punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
|
||||
movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
|
||||
punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
|
||||
|
||||
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
|
||||
; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
|
||||
|
||||
; All eight xmm registers are live, so two phase-1 results are parked in wk[].
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
|
||||
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
|
||||
|
||||
movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
|
||||
punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
|
||||
movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
|
||||
punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
|
||||
punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
|
||||
movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
|
||||
punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
|
||||
|
||||
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
|
||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
|
||||
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
|
||||
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
|
||||
|
||||
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
|
||||
punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
|
||||
movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
|
||||
punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
|
||||
punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
|
||||
movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
|
||||
punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
|
||||
|
||||
movdqa xmm6, xmm1
|
||||
movdqa xmm3, xmm0
|
||||
psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
|
||||
psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
|
||||
paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
|
||||
paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
|
||||
|
||||
; Swap through wk[]: pull the remaining transpose inputs back in and park
; tmp6/tmp7 there until the odd part needs them.
movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
|
||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
|
||||
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
|
||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
|
||||
|
||||
movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
|
||||
punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
|
||||
movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
|
||||
punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
|
||||
|
||||
movdqa xmm2, xmm1
|
||||
movdqa xmm5, xmm7
|
||||
paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
|
||||
paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
|
||||
psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
|
||||
psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
|
||||
|
||||
; -- Even part
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
movdqa xmm0, xmm6
|
||||
psubw xmm3, xmm1 ; xmm3=tmp13
|
||||
psubw xmm6, xmm7 ; xmm6=tmp12
|
||||
paddw xmm4, xmm1 ; xmm4=tmp10
|
||||
paddw xmm0, xmm7 ; xmm0=tmp11
|
||||
|
||||
; z1 = (tmp12 + tmp13) * 0.707..., done as pre-shift + pmulhw (high word).
paddw xmm6, xmm3
|
||||
psllw xmm6, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw xmm6, [GOTOFF(ebx,PW_F0707)] ; xmm6=z1
|
||||
|
||||
movdqa xmm1, xmm4
|
||||
movdqa xmm7, xmm3
|
||||
psubw xmm4, xmm0 ; xmm4=data4
|
||||
psubw xmm3, xmm6 ; xmm3=data6
|
||||
paddw xmm1, xmm0 ; xmm1=data0
|
||||
paddw xmm7, xmm6 ; xmm7=data2
|
||||
|
||||
; Fetch tmp6/tmp7 for the odd part; data4/data6 take over the wk[] slots and
; stay there until pass 2 reloads them.
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
|
||||
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
|
||||
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
|
||||
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
|
||||
|
||||
; -- Odd part
|
||||
|
||||
paddw xmm2, xmm5 ; xmm2=tmp10
|
||||
paddw xmm5, xmm0 ; xmm5=tmp11
|
||||
paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
|
||||
|
||||
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
|
||||
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
|
||||
|
||||
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z3
|
||||
|
||||
movdqa xmm4, xmm2 ; xmm4=tmp10
|
||||
psubw xmm2, xmm0
|
||||
pmulhw xmm2, [GOTOFF(ebx,PW_F0382)] ; xmm2=z5
|
||||
pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
|
||||
pmulhw xmm0, [GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
|
||||
paddw xmm4, xmm2 ; xmm4=z2
|
||||
paddw xmm0, xmm2 ; xmm0=z4
|
||||
|
||||
movdqa xmm3, xmm6
|
||||
psubw xmm6, xmm5 ; xmm6=z13
|
||||
paddw xmm3, xmm5 ; xmm3=z11
|
||||
|
||||
movdqa xmm2, xmm6
|
||||
movdqa xmm5, xmm3
|
||||
psubw xmm6, xmm4 ; xmm6=data3
|
||||
psubw xmm3, xmm0 ; xmm3=data7
|
||||
paddw xmm2, xmm4 ; xmm2=data5
|
||||
paddw xmm5, xmm0 ; xmm5=data1
|
||||
|
||||
; ---- Pass 2: process columns.
|
||||
|
||||
; edx still holds the data pointer loaded in pass 1, so the reload below is
; left commented out.
; mov edx, POINTER [data(eax)] ; (DCTELEM *)
|
||||
|
||||
; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
|
||||
; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
|
||||
|
||||
movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
|
||||
punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
|
||||
movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
|
||||
punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
|
||||
|
||||
; Reload the pass-1 data4/data6 results that were parked in wk[].
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
|
||||
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
|
||||
|
||||
; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
|
||||
; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
|
||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
|
||||
|
||||
movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
|
||||
punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
|
||||
movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
|
||||
punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
|
||||
|
||||
movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
|
||||
punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
|
||||
movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
|
||||
punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
|
||||
|
||||
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
|
||||
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
|
||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
|
||||
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
|
||||
|
||||
movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
|
||||
punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
|
||||
movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
|
||||
punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
|
||||
|
||||
movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
|
||||
punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
|
||||
movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
|
||||
punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
|
||||
|
||||
movdqa xmm5, xmm6
|
||||
movdqa xmm3, xmm1
|
||||
psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
|
||||
psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
|
||||
paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
|
||||
paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
|
||||
|
||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
|
||||
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
|
||||
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
|
||||
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
|
||||
|
||||
movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
|
||||
punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
|
||||
movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
|
||||
punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
|
||||
|
||||
movdqa xmm7, xmm6
|
||||
movdqa xmm0, xmm2
|
||||
paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
|
||||
paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
|
||||
psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
|
||||
psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
|
||||
|
||||
; -- Even part
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
movdqa xmm1, xmm5
|
||||
psubw xmm3, xmm6 ; xmm3=tmp13
|
||||
psubw xmm5, xmm2 ; xmm5=tmp12
|
||||
paddw xmm4, xmm6 ; xmm4=tmp10
|
||||
paddw xmm1, xmm2 ; xmm1=tmp11
|
||||
|
||||
paddw xmm5, xmm3
|
||||
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z1
|
||||
|
||||
movdqa xmm6, xmm4
|
||||
movdqa xmm2, xmm3
|
||||
psubw xmm4, xmm1 ; xmm4=data4
|
||||
psubw xmm3, xmm5 ; xmm3=data6
|
||||
paddw xmm6, xmm1 ; xmm6=data0
|
||||
paddw xmm2, xmm5 ; xmm2=data2
|
||||
|
||||
; Even-numbered output rows are final: write them back to the block.
movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
|
||||
movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
|
||||
movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
|
||||
movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
|
||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
|
||||
|
||||
paddw xmm7, xmm0 ; xmm7=tmp10
|
||||
paddw xmm0, xmm1 ; xmm0=tmp11
|
||||
paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
|
||||
|
||||
psllw xmm7, PRE_MULTIPLY_SCALE_BITS
|
||||
psllw xmm1, PRE_MULTIPLY_SCALE_BITS
|
||||
|
||||
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw xmm0, [GOTOFF(ebx,PW_F0707)] ; xmm0=z3
|
||||
|
||||
movdqa xmm4, xmm7 ; xmm4=tmp10
|
||||
psubw xmm7, xmm1
|
||||
pmulhw xmm7, [GOTOFF(ebx,PW_F0382)] ; xmm7=z5
|
||||
pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
|
||||
pmulhw xmm1, [GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
|
||||
paddw xmm4, xmm7 ; xmm4=z2
|
||||
paddw xmm1, xmm7 ; xmm1=z4
|
||||
|
||||
movdqa xmm3, xmm5
|
||||
psubw xmm5, xmm0 ; xmm5=z13
|
||||
paddw xmm3, xmm0 ; xmm3=z11
|
||||
|
||||
movdqa xmm6, xmm5
|
||||
movdqa xmm2, xmm3
|
||||
psubw xmm5, xmm4 ; xmm5=data3
|
||||
psubw xmm3, xmm1 ; xmm3=data7
|
||||
paddw xmm6, xmm4 ; xmm6=data5
|
||||
paddw xmm2, xmm1 ; xmm2=data1
|
||||
|
||||
; Odd-numbered output rows.
movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
|
||||
movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
|
||||
movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
|
||||
movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
|
||||
|
||||
; pop edi ; unused
|
||||
; pop esi ; unused
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; unused
|
||||
poppic ebx
|
||||
; Undo the aligned frame: [ebp] holds the stack pointer saved in the
; prologue, so "pop esp" lands back on the saved-ebp slot.
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
331
TMessagesProj/jni/mozjpeg/simd/i386/jfdctint-avx2.asm
Normal file
331
TMessagesProj/jni/mozjpeg/simd/i386/jfdctint-avx2.asm
Normal file
|
|
@ -0,0 +1,331 @@
|
|||
;
|
||||
; jfdctint.asm - accurate integer FDCT (AVX2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, 2018, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a slow-but-accurate integer implementation of the
|
||||
; forward DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
|
||||
; more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Scaling parameters and fixed-point multipliers for the accurate FDCT.
; FIX(x) is x scaled by 2^CONST_BITS and rounded.  DESCALE_P1/DESCALE_P2 are
; the right-shift counts applied to the 32-bit products in pass 1 and pass 2.
%define CONST_BITS 13
|
||||
%define PASS1_BITS 2
|
||||
|
||||
%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
|
||||
%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
|
||||
|
||||
%if CONST_BITS == 13
|
||||
F_0_298 equ 2446 ; FIX(0.298631336)
|
||||
F_0_390 equ 3196 ; FIX(0.390180644)
|
||||
F_0_541 equ 4433 ; FIX(0.541196100)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_175 equ 9633 ; FIX(1.175875602)
|
||||
F_1_501 equ 12299 ; FIX(1.501321110)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_1_961 equ 16069 ; FIX(1.961570560)
|
||||
F_2_053 equ 16819 ; FIX(2.053119869)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_072 equ 25172 ; FIX(3.072711026)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
; The literals below are the same factors pre-scaled by 2^30; DESCALE() is a
; rounding right shift that brings them down to CONST_BITS fractional bits.
|
||||
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
|
||||
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
|
||||
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
|
||||
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
|
||||
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
|
||||
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
|
||||
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
|
||||
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
|
||||
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; In-place 8x8x16-bit matrix transpose using AVX2 instructions
|
||||
; %1-%4: Input/output registers
;        in:  each register holds two rows, row n in the low 128-bit lane
;             and row n+4 in the high lane (%1=rows 0/4 ... %4=rows 3/7)
;        out: each register holds two columns, but not in plain order:
;             %1=col1|col0, %2=col3|col2, %3=col4|col5, %4=col6|col7
;             (low lane first); see the phase 3 comments below
|
||||
; %5-%8: Temp registers
;        (overwritten; they hold the phase 1 results on exit)
|
||||
|
||||
%macro dotranspose 8
|
||||
; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
|
||||
; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
|
||||
; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
|
||||
; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
|
||||
|
||||
vpunpcklwd %5, %1, %2
|
||||
vpunpckhwd %6, %1, %2
|
||||
vpunpcklwd %7, %3, %4
|
||||
vpunpckhwd %8, %3, %4
|
||||
; transpose coefficients(phase 1)
|
||||
; %5=(00 10 01 11 02 12 03 13 40 50 41 51 42 52 43 53)
|
||||
; %6=(04 14 05 15 06 16 07 17 44 54 45 55 46 56 47 57)
|
||||
; %7=(20 30 21 31 22 32 23 33 60 70 61 71 62 72 63 73)
|
||||
; %8=(24 34 25 35 26 36 27 37 64 74 65 75 66 76 67 77)
|
||||
|
||||
vpunpckldq %1, %5, %7
|
||||
vpunpckhdq %2, %5, %7
|
||||
vpunpckldq %3, %6, %8
|
||||
vpunpckhdq %4, %6, %8
|
||||
; transpose coefficients(phase 2)
|
||||
; %1=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
|
||||
; %2=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
|
||||
; %3=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
|
||||
; %4=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
|
||||
|
||||
; vpermq reorders the four 64-bit quarters of each register:
;   0x8D picks source quarters (1,3,0,2), 0xD8 picks (0,2,1,3).
vpermq %1, %1, 0x8D
|
||||
vpermq %2, %2, 0x8D
|
||||
vpermq %3, %3, 0xD8
|
||||
vpermq %4, %4, 0xD8
|
||||
; transpose coefficients(phase 3)
|
||||
; %1=(01 11 21 31 41 51 61 71 00 10 20 30 40 50 60 70)
|
||||
; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
|
||||
; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
|
||||
; %4=(06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77)
|
||||
%endmacro
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; In-place 8x8x16-bit slow integer forward DCT using AVX2 instructions
|
||||
; %1-%4: Input/output registers
;        in:  %1=data1_0  %2=data3_2  %3=data4_5  %4=data6_7
;        out: %1=data0_4  %2=data3_1  %3=data2_6  %4=data7_5
;        ("a_b" means a in the low 128-bit lane and b in the high lane)
|
||||
; %5-%8: Temp registers
|
||||
; %9: Pass (1 or 2)
;        selects the shift/rounding for data0_4 and is pasted onto
;        PD_DESCALE_P / DESCALE_P to pick the per-pass descale constants
; ebx is the base register of every GOTOFF() operand below, so it must be
; set up before the macro is expanded.
|
||||
|
||||
%macro dodct 9
|
||||
vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7
|
||||
vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0
|
||||
vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2
|
||||
vpsubw %8, %2, %3 ; %8=data3_2-data4_5=tmp4_5
|
||||
|
||||
; -- Even part
|
||||
|
||||
; vperm2i128 with both sources equal and imm 0x01 swaps the two lanes.
vperm2i128 %6, %6, %6, 0x01 ; %6=tmp0_1
|
||||
vpaddw %1, %6, %7 ; %1=tmp0_1+tmp3_2=tmp10_11
|
||||
vpsubw %6, %6, %7 ; %6=tmp0_1-tmp3_2=tmp13_12
|
||||
|
||||
vperm2i128 %7, %1, %1, 0x01 ; %7=tmp11_10
|
||||
vpsignw %1, %1, [GOTOFF(ebx, PW_1_NEG1)] ; %1=tmp10_neg11
|
||||
vpaddw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11)
|
||||
%if %9 == 1
|
||||
vpsllw %1, %7, PASS1_BITS ; %1=data0_4
|
||||
%else
|
||||
vpaddw %7, %7, [GOTOFF(ebx, PW_DESCALE_P2X)]
|
||||
vpsraw %1, %7, PASS1_BITS ; %1=data0_4
|
||||
%endif
|
||||
|
||||
; (Original)
|
||||
; z1 = (tmp12 + tmp13) * 0.541196100;
|
||||
; data2 = z1 + tmp13 * 0.765366865;
|
||||
; data6 = z1 + tmp12 * -1.847759065;
|
||||
;
|
||||
; (This implementation)
|
||||
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
||||
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
||||
|
||||
vperm2i128 %7, %6, %6, 0x01 ; %7=tmp12_13
|
||||
vpunpcklwd %2, %6, %7
|
||||
vpunpckhwd %6, %6, %7
|
||||
vpmaddwd %2, %2, [GOTOFF(ebx, PW_F130_F054_MF130_F054)] ; %2=data2_6L
|
||||
vpmaddwd %6, %6, [GOTOFF(ebx, PW_F130_F054_MF130_F054)] ; %6=data2_6H
|
||||
|
||||
; Round and shift the 32-bit products, then pack back to 16-bit words.
vpaddd %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
|
||||
vpaddd %6, %6, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
|
||||
vpsrad %2, %2, DESCALE_P %+ %9
|
||||
vpsrad %6, %6, DESCALE_P %+ %9
|
||||
|
||||
vpackssdw %3, %2, %6 ; %3=data2_6
|
||||
|
||||
; -- Odd part
|
||||
|
||||
vpaddw %7, %8, %5 ; %7=tmp4_5+tmp6_7=z3_4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
vperm2i128 %2, %7, %7, 0x01 ; %2=z4_3
|
||||
vpunpcklwd %6, %7, %2
|
||||
vpunpckhwd %7, %7, %2
|
||||
vpmaddwd %6, %6, [GOTOFF(ebx, PW_MF078_F117_F078_F117)] ; %6=z3_4L
|
||||
vpmaddwd %7, %7, [GOTOFF(ebx, PW_MF078_F117_F078_F117)] ; %7=z3_4H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
|
||||
; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
|
||||
; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
|
||||
; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
|
||||
; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
|
||||
; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
|
||||
; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
|
||||
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
||||
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
||||
|
||||
vperm2i128 %4, %5, %5, 0x01 ; %4=tmp7_6
|
||||
vpunpcklwd %2, %8, %4
|
||||
vpunpckhwd %4, %8, %4
|
||||
vpmaddwd %2, %2, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)] ; %2=tmp4_5L
|
||||
vpmaddwd %4, %4, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)] ; %4=tmp4_5H
|
||||
|
||||
vpaddd %2, %2, %6 ; %2=data7_5L
|
||||
vpaddd %4, %4, %7 ; %4=data7_5H
|
||||
|
||||
vpaddd %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
|
||||
vpaddd %4, %4, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
|
||||
vpsrad %2, %2, DESCALE_P %+ %9
|
||||
vpsrad %4, %4, DESCALE_P %+ %9
|
||||
|
||||
vpackssdw %4, %2, %4 ; %4=data7_5
|
||||
|
||||
vperm2i128 %2, %8, %8, 0x01 ; %2=tmp5_4
|
||||
vpunpcklwd %8, %5, %2
|
||||
vpunpckhwd %5, %5, %2
|
||||
vpmaddwd %8, %8, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)] ; %8=tmp6_7L
|
||||
vpmaddwd %5, %5, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)] ; %5=tmp6_7H
|
||||
|
||||
vpaddd %8, %8, %6 ; %8=data3_1L
|
||||
vpaddd %5, %5, %7 ; %5=data3_1H
|
||||
|
||||
vpaddd %8, %8, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
|
||||
vpaddd %5, %5, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
|
||||
vpsrad %8, %8, DESCALE_P %+ %9
|
||||
vpsrad %5, %5, DESCALE_P %+ %9
|
||||
|
||||
vpackssdw %2, %8, %5 ; %2=data3_1
|
||||
%endmacro
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for jsimd_fdct_islow_avx2.
;
; Every PW_* entry below is one 32-byte vector.  "times 4 dw a, b" emits four
; (a, b) word pairs (16 bytes); two such runs back to back give the low and
; the high 128-bit half a different coefficient pair.  The dodct macro uses
; these as the memory operand of vpmaddwd, which multiplies interleaved
; 16-bit inputs by (a, b) and adds each pair of products into a 32-bit lane.
; The F_* multipliers and DESCALE_P1/DESCALE_P2/PASS1_BITS are defined
; earlier in the file, outside this block.
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_fdct_islow_avx2)
|
||||
|
||||
EXTN(jconst_fdct_islow_avx2):
|
||||
|
||||
; low half: (0.541 + 0.765, 0.541); high half: (0.541 - 1.847, 0.541)
PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
|
||||
times 4 dw (F_0_541 - F_1_847), F_0_541
|
||||
; low half: (1.175 - 1.961, 1.175); high half: (1.175 - 0.390, 1.175)
PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
|
||||
times 4 dw (F_1_175 - F_0_390), F_1_175
|
||||
; low half: (0.298 - 0.899, -0.899); high half: (2.053 - 2.562, -2.562)
PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899
|
||||
times 4 dw (F_2_053 - F_2_562), -F_2_562
|
||||
; low half: (3.072 - 2.562, -2.562); high half: (1.501 - 0.899, -0.899)
PW_F050_MF256_F060_MF089 times 4 dw (F_3_072 - F_2_562), -F_2_562
|
||||
times 4 dw (F_1_501 - F_0_899), -F_0_899
|
||||
; Rounding biases (half of 2^DESCALE_Pn) in all eight dword lanes.  dodct
; adds PD_DESCALE_P<n> right before its arithmetic shift by DESCALE_P<n>.
PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1)
|
||||
PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1)
|
||||
; Word-sized rounding bias for a shift by PASS1_BITS.
; NOTE(review): the consumer is not visible in this part of the file;
; presumably the even part of dodct in pass 2 -- confirm.
PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1)
|
||||
; Eight +1 words (low half) followed by eight -1 words (high half).
; NOTE(review): the consumer is not visible in this part of the file.
PW_1_NEG1 times 8 dw 1
|
||||
times 8 dw -1
|
||||
|
||||
; pad the section out to a 32-byte boundary
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform the forward DCT on one block of samples.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_fdct_islow_avx2(DCTELEM *data)
|
||||
;
|
||||
|
||||
; jsimd_fdct_islow_avx2 -- forward DCT of one block, done in place.
;
; Argument (32-bit stack frame, addressed from ebp after the prologue):
;   [ebp + 8] = DCTELEM *data          (see data(b) below)
; Register use:
;   ebx       = base register for GOTOFF() constant addressing; saved and
;               restored through pushpic/poppic
;   edx       = data pointer (not preserved)
;   ymm0-ymm7 = working set for dotranspose/dodct
; The four 32-byte loads at the top and the four 32-byte stores at the bottom
; use the same YMMBLOCK addresses, so the input is overwritten by the output.
; NOTE(review): pushpic, poppic, get_GOT, GOTOFF and YMMBLOCK come from an
; include that is not visible here; their exact expansion should be confirmed
; there.
%define data(b) (b) + 8 ; DCTELEM *data
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
|
||||
|
||||
EXTN(jsimd_fdct_islow_avx2):
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
pushpic ebx
|
||||
; push ecx ; unused
|
||||
; push edx ; need not be preserved
|
||||
; push esi ; unused
|
||||
; push edi ; unused
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process rows.
|
||||
|
||||
mov edx, POINTER [data(ebp)] ; (DCTELEM *)
|
||||
|
||||
; Unaligned loads: no alignment of *data is assumed here.
vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
|
||||
vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
|
||||
vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
|
||||
vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
|
||||
; ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
||||
; ymm5=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
||||
; ymm6=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
|
||||
; ymm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
|
||||
|
||||
; Regroup the 128-bit halves so each register pairs row r with row r+4
; (imm 0x20 = low halves of both sources, 0x31 = high halves of both).
vperm2i128 ymm0, ymm4, ymm6, 0x20
|
||||
vperm2i128 ymm1, ymm4, ymm6, 0x31
|
||||
vperm2i128 ymm2, ymm5, ymm7, 0x20
|
||||
vperm2i128 ymm3, ymm5, ymm7, 0x31
|
||||
; ymm0=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
|
||||
; ymm1=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
|
||||
; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
|
||||
; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
|
||||
|
||||
; dotranspose and dodct are macros defined earlier in this file.
dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
|
||||
|
||||
; The last macro argument is pasted onto DESCALE_P / PD_DESCALE_P inside
; dodct, so "1" selects the pass-1 rounding constant and shift count.
dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
|
||||
; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
|
||||
|
||||
; ---- Pass 2: process columns.
|
||||
|
||||
; Re-pair the odd outputs so the layout again matches what dotranspose
; was given in pass 1 (rows 3|7 and 1|5).
vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7
|
||||
vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5
|
||||
|
||||
dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
|
||||
|
||||
; Same macro, with "2" selecting the pass-2 rounding constant and shift.
dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
|
||||
; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
|
||||
|
||||
; Reassemble consecutive row pairs for the stores.  In the immediate,
; bits 1:0 pick the low result half and bits 5:4 the high result half
; (0/1 = low/high of the first source, 2/3 = low/high of the second).
vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1
|
||||
vperm2i128 ymm5, ymm2, ymm1, 0x20 ; ymm5=data2_3
|
||||
vperm2i128 ymm6, ymm0, ymm4, 0x31 ; ymm6=data4_5
|
||||
vperm2i128 ymm7, ymm2, ymm4, 0x21 ; ymm7=data6_7
|
||||
|
||||
vmovdqu YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], ymm3
|
||||
vmovdqu YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], ymm5
|
||||
vmovdqu YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], ymm6
|
||||
vmovdqu YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], ymm7
|
||||
|
||||
; Zero the upper ymm halves before returning, so that legacy-SSE code in
; the caller does not pay an AVX/SSE transition penalty.
vzeroupper
|
||||
; pop edi ; unused
|
||||
; pop esi ; unused
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; unused
|
||||
poppic ebx
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
620
TMessagesProj/jni/mozjpeg/simd/i386/jfdctint-mmx.asm
Normal file
620
TMessagesProj/jni/mozjpeg/simd/i386/jfdctint-mmx.asm
Normal file
|
|
@ -0,0 +1,620 @@
|
|||
;
|
||||
; jfdctint.asm - accurate integer FDCT (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a slow-but-accurate integer implementation of the
|
||||
; forward DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
|
||||
; more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point parameters for the MMX integer forward DCT.
;
; CONST_BITS: number of fractional bits in the F_* multipliers below, i.e.
;             F_x = round(x * 2^CONST_BITS).
; PASS1_BITS: extra fractional bits carried in the pass-1 output; pass 1
;             shifts its even outputs left by this amount and pass 2 shifts
;             them back right.
%define CONST_BITS 13
|
||||
%define PASS1_BITS 2
|
||||
|
||||
; Right-shift counts applied to the 32-bit pmaddwd results:
;   pass 1 removes CONST_BITS but keeps PASS1_BITS of extra precision,
;   pass 2 removes both.
%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
|
||||
%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
|
||||
|
||||
; Precomputed multipliers for the default CONST_BITS of 13.
%if CONST_BITS == 13
|
||||
F_0_298 equ 2446 ; FIX(0.298631336)
|
||||
F_0_390 equ 3196 ; FIX(0.390180644)
|
||||
F_0_541 equ 4433 ; FIX(0.541196100)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_175 equ 9633 ; FIX(1.175875602)
|
||||
F_1_501 equ 12299 ; FIX(1.501321110)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_1_961 equ 16069 ; FIX(1.961570560)
|
||||
F_2_053 equ 16819 ; FIX(2.053119869)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_072 equ 25172 ; FIX(3.072711026)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
; Fallback for other CONST_BITS values: the integer literals below are the
; same multipliers scaled by 2^30; DESCALE(x, n) adds half of 2^n and shifts
; right by n, rounding them down to CONST_BITS fractional bits.
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
|
||||
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
|
||||
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
|
||||
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
|
||||
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
|
||||
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
|
||||
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
|
||||
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
|
||||
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for jsimd_fdct_islow_mmx.
;
; Each PW_* entry is one 8-byte MMX operand: "times 2 dw a, b" emits the word
; pair (a, b) twice.  The function uses them as the memory operand of
; pmaddwd, which multiplies interleaved 16-bit inputs by (a, b) and adds each
; pair of products into one 32-bit lane.
; Naming: F = positive, MF = negative, and the digits are the magnitude of
; the multiplier times 100 (e.g. F130 = 0.541 + 0.765, MF130 = 0.541 - 1.847).
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_fdct_islow_mmx)
|
||||
|
||||
EXTN(jconst_fdct_islow_mmx):
|
||||
|
||||
; even part: data2 / data6
PW_F130_F054 times 2 dw (F_0_541 + F_0_765), F_0_541
|
||||
PW_F054_MF130 times 2 dw F_0_541, (F_0_541 - F_1_847)
|
||||
; odd part: z3 / z4
PW_MF078_F117 times 2 dw (F_1_175 - F_1_961), F_1_175
|
||||
PW_F117_F078 times 2 dw F_1_175, (F_1_175 - F_0_390)
|
||||
; odd part: tmp4 / tmp7
PW_MF060_MF089 times 2 dw (F_0_298 - F_0_899), -F_0_899
|
||||
PW_MF089_F060 times 2 dw -F_0_899, (F_1_501 - F_0_899)
|
||||
; odd part: tmp5 / tmp6
PW_MF050_MF256 times 2 dw (F_2_053 - F_2_562), -F_2_562
|
||||
PW_MF256_F050 times 2 dw -F_2_562, (F_3_072 - F_2_562)
|
||||
; Rounding biases: half of 2^DESCALE_Pn in both dword lanes, added right
; before the arithmetic shift by DESCALE_Pn.
PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1)
|
||||
PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1)
|
||||
; Word-sized rounding bias for the pass-2 shift of data0/data4 by PASS1_BITS.
PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS - 1)
|
||||
|
||||
; pad the section out to a 32-byte boundary
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform the forward DCT on one block of samples.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_fdct_islow_mmx(DCTELEM *data)
|
||||
;
|
||||
|
||||
; jsimd_fdct_islow_mmx -- forward DCT of one block, done in place (MMX).
;
; Argument (32-bit stack frame):
;   [frame + 8] = DCTELEM *data, where "frame" is the value of esp right
;   after "push ebp".  That value is kept in eax, so the argument is read as
;   data(eax), not data(ebp).
; Frame layout built by the prologue:
;   ebp        = frame pointer rounded down to an 8-byte boundary
;   [ebp + 0]  = saved unaligned frame value (original_ebp); the epilogue
;                reloads esp from it
;   wk(0..1)   = two 8-byte scratch slots directly below ebp
; Register use: ebx = base for GOTOFF() constant addressing, ecx = loop
; counter, edx = data pointer, mm0-mm7 = working set.
; eax is read again at the start of both passes, so nothing after the
; prologue may overwrite it.
; NOTE(review): pushpic, get_GOT and GOTOFF come from an include that is not
; visible here; confirm there that they leave eax intact.
%define data(b) (b) + 8 ; DCTELEM *data
|
||||
|
||||
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_fdct_islow_mmx)
|
||||
|
||||
EXTN(jsimd_fdct_islow_mmx):
|
||||
push ebp
|
||||
; eax = esp after the push, i.e. the value a plain "mov ebp, esp" frame
; would have put in ebp.
mov eax, esp ; eax = original ebp
|
||||
; Make room for the saved frame value, then round esp down so the wk()
; slots are 8-byte aligned.
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
; Reserve the WK_NUM scratch slots below ebp.
lea esp, [wk(0)]
|
||||
pushpic ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
; push esi ; unused
|
||||
; push edi ; unused
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; Pass 1 applies the 1-D DCT to the rows, four rows per trip through
; .rowloop: ecx counts DCTSIZE/4 trips and edx advances by four rows of
; DCTELEMs at the bottom of the loop.
; Each trip:
;   1. loads the 4x8 band and transposes it so that every MMX register holds
;      one input position (data0..data7) for all four rows;
;   2. forms the butterfly sums/differences tmp0..tmp7;
;   3. computes the even outputs (data0/4 by add/sub and a left shift by
;      PASS1_BITS, data2/6 by pmaddwd) and the odd outputs (data1/3/5/7 by
;      pmaddwd), descaling the products by DESCALE_P1;
;   4. stores output k of the four rows to MMBLOCK(k,0) and output k+4 to
;      MMBLOCK(k,1), for k = 0..3.
; wk(0)/wk(1) are reused three times per trip: for half-transposed input,
; then tmp6/tmp7, then z3L/z3H.
; NOTE(review): MMBLOCK is defined in an include that is not visible here;
; judging by the register-content comments it takes (row, 4-word group,
; base, element size) -- confirm.
; ---- Pass 1: process rows.
|
||||
|
||||
mov edx, POINTER [data(eax)] ; (DCTELEM *)
|
||||
mov ecx, DCTSIZE/4
|
||||
alignx 16, 7
|
||||
.rowloop:
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; mm0=(20 21 22 23), mm2=(24 25 26 27)
|
||||
; mm1=(30 31 32 33), mm3=(34 35 36 37)
|
||||
|
||||
movq mm4, mm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm0, mm1 ; mm0=(20 30 21 31)
|
||||
punpckhwd mm4, mm1 ; mm4=(22 32 23 33)
|
||||
movq mm5, mm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm2, mm3 ; mm2=(24 34 25 35)
|
||||
punpckhwd mm5, mm3 ; mm5=(26 36 27 37)
|
||||
|
||||
movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; mm6=(00 01 02 03), mm1=(04 05 06 07)
|
||||
; mm7=(10 11 12 13), mm3=(14 15 16 17)
|
||||
|
||||
; Only eight MMX registers: park two half-transposed values in scratch.
movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
|
||||
movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
|
||||
|
||||
movq mm4, mm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm6, mm7 ; mm6=(00 10 01 11)
|
||||
punpckhwd mm4, mm7 ; mm4=(02 12 03 13)
|
||||
movq mm2, mm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm1, mm3 ; mm1=(04 14 05 15)
|
||||
punpckhwd mm2, mm3 ; mm2=(06 16 07 17)
|
||||
|
||||
movq mm7, mm6 ; transpose coefficients(phase 2)
|
||||
punpckldq mm6, mm0 ; mm6=(00 10 20 30)=data0
|
||||
punpckhdq mm7, mm0 ; mm7=(01 11 21 31)=data1
|
||||
movq mm3, mm2 ; transpose coefficients(phase 2)
|
||||
punpckldq mm2, mm5 ; mm2=(06 16 26 36)=data6
|
||||
punpckhdq mm3, mm5 ; mm3=(07 17 27 37)=data7
|
||||
|
||||
movq mm0, mm7
|
||||
movq mm5, mm6
|
||||
psubw mm7, mm2 ; mm7=data1-data6=tmp6
|
||||
psubw mm6, mm3 ; mm6=data0-data7=tmp7
|
||||
paddw mm0, mm2 ; mm0=data1+data6=tmp1
|
||||
paddw mm5, mm3 ; mm5=data0+data7=tmp0
|
||||
|
||||
; Swap: fetch the parked transpose halves, park tmp6/tmp7 for the odd part.
movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
|
||||
movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
|
||||
movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
|
||||
movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
|
||||
|
||||
movq mm7, mm4 ; transpose coefficients(phase 2)
|
||||
punpckldq mm4, mm2 ; mm4=(02 12 22 32)=data2
|
||||
punpckhdq mm7, mm2 ; mm7=(03 13 23 33)=data3
|
||||
movq mm6, mm1 ; transpose coefficients(phase 2)
|
||||
punpckldq mm1, mm3 ; mm1=(04 14 24 34)=data4
|
||||
punpckhdq mm6, mm3 ; mm6=(05 15 25 35)=data5
|
||||
|
||||
movq mm2, mm7
|
||||
movq mm3, mm4
|
||||
paddw mm7, mm1 ; mm7=data3+data4=tmp3
|
||||
paddw mm4, mm6 ; mm4=data2+data5=tmp2
|
||||
psubw mm2, mm1 ; mm2=data3-data4=tmp4
|
||||
psubw mm3, mm6 ; mm3=data2-data5=tmp5
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm1, mm5
|
||||
movq mm6, mm0
|
||||
paddw mm5, mm7 ; mm5=tmp10
|
||||
paddw mm0, mm4 ; mm0=tmp11
|
||||
psubw mm1, mm7 ; mm1=tmp13
|
||||
psubw mm6, mm4 ; mm6=tmp12
|
||||
|
||||
movq mm7, mm5
|
||||
paddw mm5, mm0 ; mm5=tmp10+tmp11
|
||||
psubw mm7, mm0 ; mm7=tmp10-tmp11
|
||||
|
||||
; No multiply here, so scale up by 2^PASS1_BITS with a plain left shift.
psllw mm5, PASS1_BITS ; mm5=data0
|
||||
psllw mm7, PASS1_BITS ; mm7=data4
|
||||
|
||||
movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
|
||||
movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
|
||||
|
||||
; (Original)
|
||||
; z1 = (tmp12 + tmp13) * 0.541196100;
|
||||
; data2 = z1 + tmp13 * 0.765366865;
|
||||
; data6 = z1 + tmp12 * -1.847759065;
|
||||
;
|
||||
; (This implementation)
|
||||
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
||||
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
||||
|
||||
; Interleave tmp13/tmp12 words so one pmaddwd yields a*tmp13 + b*tmp12.
movq mm4, mm1 ; mm1=tmp13
|
||||
movq mm0, mm1
|
||||
punpcklwd mm4, mm6 ; mm6=tmp12
|
||||
punpckhwd mm0, mm6
|
||||
movq mm1, mm4
|
||||
movq mm6, mm0
|
||||
pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
|
||||
pmaddwd mm0, [GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
|
||||
pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
|
||||
pmaddwd mm6, [GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
|
||||
|
||||
; Round and descale the 32-bit products, then pack back to words.
paddd mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd mm0, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad mm4, DESCALE_P1
|
||||
psrad mm0, DESCALE_P1
|
||||
paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd mm6, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad mm1, DESCALE_P1
|
||||
psrad mm6, DESCALE_P1
|
||||
|
||||
packssdw mm4, mm0 ; mm4=data2
|
||||
packssdw mm1, mm6 ; mm1=data6
|
||||
|
||||
movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
|
||||
movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm5, MMWORD [wk(0)] ; mm5=tmp6
|
||||
movq mm7, MMWORD [wk(1)] ; mm7=tmp7
|
||||
|
||||
movq mm0, mm2 ; mm2=tmp4
|
||||
movq mm6, mm3 ; mm3=tmp5
|
||||
paddw mm0, mm5 ; mm0=z3
|
||||
paddw mm6, mm7 ; mm6=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
movq mm4, mm0
|
||||
movq mm1, mm0
|
||||
punpcklwd mm4, mm6
|
||||
punpckhwd mm1, mm6
|
||||
movq mm0, mm4
|
||||
movq mm6, mm1
|
||||
pmaddwd mm4, [GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
|
||||
pmaddwd mm1, [GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
|
||||
pmaddwd mm0, [GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
|
||||
pmaddwd mm6, [GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
|
||||
|
||||
; z4L/z4H stay in mm0/mm6; z3L/z3H go to scratch (tmp6/tmp7 are now in
; mm5/mm7, so the slots are free).
movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
|
||||
movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
|
||||
; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
|
||||
; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
|
||||
; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
|
||||
; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
|
||||
; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
|
||||
; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
|
||||
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
||||
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
||||
|
||||
; tmp4 (mm2) paired with tmp7 (mm7) -> data7 and data1.
movq mm4, mm2
|
||||
movq mm1, mm2
|
||||
punpcklwd mm4, mm7
|
||||
punpckhwd mm1, mm7
|
||||
movq mm2, mm4
|
||||
movq mm7, mm1
|
||||
pmaddwd mm4, [GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
|
||||
pmaddwd mm1, [GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
|
||||
pmaddwd mm2, [GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
|
||||
pmaddwd mm7, [GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
|
||||
|
||||
paddd mm4, MMWORD [wk(0)] ; mm4=data7L
|
||||
paddd mm1, MMWORD [wk(1)] ; mm1=data7H
|
||||
paddd mm2, mm0 ; mm2=data1L
|
||||
paddd mm7, mm6 ; mm7=data1H
|
||||
|
||||
paddd mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad mm4, DESCALE_P1
|
||||
psrad mm1, DESCALE_P1
|
||||
paddd mm2, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad mm2, DESCALE_P1
|
||||
psrad mm7, DESCALE_P1
|
||||
|
||||
packssdw mm4, mm1 ; mm4=data7
|
||||
packssdw mm2, mm7 ; mm2=data1
|
||||
|
||||
movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
|
||||
movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
|
||||
|
||||
; tmp5 (mm3) paired with tmp6 (mm5) -> data5 and data3.
movq mm1, mm3
|
||||
movq mm7, mm3
|
||||
punpcklwd mm1, mm5
|
||||
punpckhwd mm7, mm5
|
||||
movq mm3, mm1
|
||||
movq mm5, mm7
|
||||
pmaddwd mm1, [GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
|
||||
pmaddwd mm7, [GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
|
||||
pmaddwd mm3, [GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
|
||||
pmaddwd mm5, [GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
|
||||
|
||||
paddd mm1, mm0 ; mm1=data5L
|
||||
paddd mm7, mm6 ; mm7=data5H
|
||||
paddd mm3, MMWORD [wk(0)] ; mm3=data3L
|
||||
paddd mm5, MMWORD [wk(1)] ; mm5=data3H
|
||||
|
||||
paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad mm1, DESCALE_P1
|
||||
psrad mm7, DESCALE_P1
|
||||
paddd mm3, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd mm5, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad mm3, DESCALE_P1
|
||||
psrad mm5, DESCALE_P1
|
||||
|
||||
packssdw mm1, mm7 ; mm1=data5
|
||||
packssdw mm3, mm5 ; mm3=data3
|
||||
|
||||
movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
|
||||
movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
|
||||
|
||||
; Step to the next band of four rows; loop until ecx reaches zero.
add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
|
||||
dec ecx
|
||||
jnz near .rowloop
|
||||
|
||||
; Pass 2 applies the same 1-D DCT along the other dimension, four lines per
; trip through .columnloop: ecx counts DCTSIZE/4 trips and edx advances by
; 4*SIZEOF_DCTELEM (one 4-word group) at the bottom of the loop.
; It differs from pass 1 in three ways:
;   - the operands come from MMBLOCK(0..7,0), i.e. one 4-word group out of
;     each of the eight rows, instead of from four whole rows;
;   - data0/data4 are rounded with PW_DESCALE_P2X and shifted RIGHT by
;     PASS1_BITS, and the pmaddwd products are descaled with
;     PD_DESCALE_P2 / DESCALE_P2, which removes the extra PASS1_BITS of
;     precision that pass 1 left in;
;   - output k is stored to MMBLOCK(k,0) for k = 0..7.
; wk(0)/wk(1) are reused exactly as in pass 1.
; ---- Pass 2: process columns.
|
||||
|
||||
; Reload the block pointer; eax still holds the frame value from the prologue.
mov edx, POINTER [data(eax)] ; (DCTELEM *)
|
||||
mov ecx, DCTSIZE/4
|
||||
alignx 16, 7
|
||||
.columnloop:
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; mm0=(02 12 22 32), mm2=(42 52 62 72)
|
||||
; mm1=(03 13 23 33), mm3=(43 53 63 73)
|
||||
|
||||
movq mm4, mm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm0, mm1 ; mm0=(02 03 12 13)
|
||||
punpckhwd mm4, mm1 ; mm4=(22 23 32 33)
|
||||
movq mm5, mm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm2, mm3 ; mm2=(42 43 52 53)
|
||||
punpckhwd mm5, mm3 ; mm5=(62 63 72 73)
|
||||
|
||||
movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; mm6=(00 10 20 30), mm1=(40 50 60 70)
|
||||
; mm7=(01 11 21 31), mm3=(41 51 61 71)
|
||||
|
||||
; Only eight MMX registers: park two half-transposed values in scratch.
movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
|
||||
movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
|
||||
|
||||
movq mm4, mm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm6, mm7 ; mm6=(00 01 10 11)
|
||||
punpckhwd mm4, mm7 ; mm4=(20 21 30 31)
|
||||
movq mm2, mm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm1, mm3 ; mm1=(40 41 50 51)
|
||||
punpckhwd mm2, mm3 ; mm2=(60 61 70 71)
|
||||
|
||||
movq mm7, mm6 ; transpose coefficients(phase 2)
|
||||
punpckldq mm6, mm0 ; mm6=(00 01 02 03)=data0
|
||||
punpckhdq mm7, mm0 ; mm7=(10 11 12 13)=data1
|
||||
movq mm3, mm2 ; transpose coefficients(phase 2)
|
||||
punpckldq mm2, mm5 ; mm2=(60 61 62 63)=data6
|
||||
punpckhdq mm3, mm5 ; mm3=(70 71 72 73)=data7
|
||||
|
||||
movq mm0, mm7
|
||||
movq mm5, mm6
|
||||
psubw mm7, mm2 ; mm7=data1-data6=tmp6
|
||||
psubw mm6, mm3 ; mm6=data0-data7=tmp7
|
||||
paddw mm0, mm2 ; mm0=data1+data6=tmp1
|
||||
paddw mm5, mm3 ; mm5=data0+data7=tmp0
|
||||
|
||||
; Swap: fetch the parked transpose halves, park tmp6/tmp7 for the odd part.
movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
|
||||
movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
|
||||
movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
|
||||
movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
|
||||
|
||||
movq mm7, mm4 ; transpose coefficients(phase 2)
|
||||
punpckldq mm4, mm2 ; mm4=(20 21 22 23)=data2
|
||||
punpckhdq mm7, mm2 ; mm7=(30 31 32 33)=data3
|
||||
movq mm6, mm1 ; transpose coefficients(phase 2)
|
||||
punpckldq mm1, mm3 ; mm1=(40 41 42 43)=data4
|
||||
punpckhdq mm6, mm3 ; mm6=(50 51 52 53)=data5
|
||||
|
||||
movq mm2, mm7
|
||||
movq mm3, mm4
|
||||
paddw mm7, mm1 ; mm7=data3+data4=tmp3
|
||||
paddw mm4, mm6 ; mm4=data2+data5=tmp2
|
||||
psubw mm2, mm1 ; mm2=data3-data4=tmp4
|
||||
psubw mm3, mm6 ; mm3=data2-data5=tmp5
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm1, mm5
|
||||
movq mm6, mm0
|
||||
paddw mm5, mm7 ; mm5=tmp10
|
||||
paddw mm0, mm4 ; mm0=tmp11
|
||||
psubw mm1, mm7 ; mm1=tmp13
|
||||
psubw mm6, mm4 ; mm6=tmp12
|
||||
|
||||
movq mm7, mm5
|
||||
paddw mm5, mm0 ; mm5=tmp10+tmp11
|
||||
psubw mm7, mm0 ; mm7=tmp10-tmp11
|
||||
|
||||
; Round, then drop the PASS1_BITS of extra precision added in pass 1.
paddw mm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
|
||||
paddw mm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
|
||||
psraw mm5, PASS1_BITS ; mm5=data0
|
||||
psraw mm7, PASS1_BITS ; mm7=data4
|
||||
|
||||
movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
|
||||
movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
|
||||
|
||||
; (Original)
|
||||
; z1 = (tmp12 + tmp13) * 0.541196100;
|
||||
; data2 = z1 + tmp13 * 0.765366865;
|
||||
; data6 = z1 + tmp12 * -1.847759065;
|
||||
;
|
||||
; (This implementation)
|
||||
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
||||
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
||||
|
||||
; Interleave tmp13/tmp12 words so one pmaddwd yields a*tmp13 + b*tmp12.
movq mm4, mm1 ; mm1=tmp13
|
||||
movq mm0, mm1
|
||||
punpcklwd mm4, mm6 ; mm6=tmp12
|
||||
punpckhwd mm0, mm6
|
||||
movq mm1, mm4
|
||||
movq mm6, mm0
|
||||
pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
|
||||
pmaddwd mm0, [GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
|
||||
pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
|
||||
pmaddwd mm6, [GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
|
||||
|
||||
; Round and descale the 32-bit products, then pack back to words.
paddd mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd mm0, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad mm4, DESCALE_P2
|
||||
psrad mm0, DESCALE_P2
|
||||
paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd mm6, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad mm1, DESCALE_P2
|
||||
psrad mm6, DESCALE_P2
|
||||
|
||||
packssdw mm4, mm0 ; mm4=data2
|
||||
packssdw mm1, mm6 ; mm1=data6
|
||||
|
||||
movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
|
||||
movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm5, MMWORD [wk(0)] ; mm5=tmp6
|
||||
movq mm7, MMWORD [wk(1)] ; mm7=tmp7
|
||||
|
||||
movq mm0, mm2 ; mm2=tmp4
|
||||
movq mm6, mm3 ; mm3=tmp5
|
||||
paddw mm0, mm5 ; mm0=z3
|
||||
paddw mm6, mm7 ; mm6=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
movq mm4, mm0
|
||||
movq mm1, mm0
|
||||
punpcklwd mm4, mm6
|
||||
punpckhwd mm1, mm6
|
||||
movq mm0, mm4
|
||||
movq mm6, mm1
|
||||
pmaddwd mm4, [GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
|
||||
pmaddwd mm1, [GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
|
||||
pmaddwd mm0, [GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
|
||||
pmaddwd mm6, [GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
|
||||
|
||||
; z4L/z4H stay in mm0/mm6; z3L/z3H go to scratch (tmp6/tmp7 are now in
; mm5/mm7, so the slots are free).
movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
|
||||
movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
|
||||
; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
|
||||
; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
|
||||
; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
|
||||
; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
|
||||
; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
|
||||
; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
|
||||
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
||||
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
||||
|
||||
; tmp4 (mm2) paired with tmp7 (mm7) -> data7 and data1.
movq mm4, mm2
|
||||
movq mm1, mm2
|
||||
punpcklwd mm4, mm7
|
||||
punpckhwd mm1, mm7
|
||||
movq mm2, mm4
|
||||
movq mm7, mm1
|
||||
pmaddwd mm4, [GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
|
||||
pmaddwd mm1, [GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
|
||||
pmaddwd mm2, [GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
|
||||
pmaddwd mm7, [GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
|
||||
|
||||
paddd mm4, MMWORD [wk(0)] ; mm4=data7L
|
||||
paddd mm1, MMWORD [wk(1)] ; mm1=data7H
|
||||
paddd mm2, mm0 ; mm2=data1L
|
||||
paddd mm7, mm6 ; mm7=data1H
|
||||
|
||||
paddd mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad mm4, DESCALE_P2
|
||||
psrad mm1, DESCALE_P2
|
||||
paddd mm2, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad mm2, DESCALE_P2
|
||||
psrad mm7, DESCALE_P2
|
||||
|
||||
packssdw mm4, mm1 ; mm4=data7
|
||||
packssdw mm2, mm7 ; mm2=data1
|
||||
|
||||
movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
|
||||
movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
|
||||
|
||||
; tmp5 (mm3) paired with tmp6 (mm5) -> data5 and data3.
movq mm1, mm3
|
||||
movq mm7, mm3
|
||||
punpcklwd mm1, mm5
|
||||
punpckhwd mm7, mm5
|
||||
movq mm3, mm1
|
||||
movq mm5, mm7
|
||||
pmaddwd mm1, [GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
|
||||
pmaddwd mm7, [GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
|
||||
pmaddwd mm3, [GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
|
||||
pmaddwd mm5, [GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
|
||||
|
||||
paddd mm1, mm0 ; mm1=data5L
|
||||
paddd mm7, mm6 ; mm7=data5H
|
||||
paddd mm3, MMWORD [wk(0)] ; mm3=data3L
|
||||
paddd mm5, MMWORD [wk(1)] ; mm5=data3H
|
||||
|
||||
paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad mm1, DESCALE_P2
|
||||
psrad mm7, DESCALE_P2
|
||||
paddd mm3, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd mm5, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad mm3, DESCALE_P2
|
||||
psrad mm5, DESCALE_P2
|
||||
|
||||
packssdw mm1, mm7 ; mm1=data5
|
||||
packssdw mm3, mm5 ; mm3=data3
|
||||
|
||||
movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
|
||||
movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
|
||||
|
||||
; Step to the next 4-word group; loop until ecx reaches zero.
add edx, byte 4*SIZEOF_DCTELEM
|
||||
dec ecx
|
||||
jnz near .columnloop
|
||||
|
||||
; Epilogue of jsimd_fdct_islow_mmx: leave MMX state, restore ebx, then unwind
; the aligned frame that the prologue built.
emms ; empty MMX state
|
||||
|
||||
; pop edi ; unused
|
||||
; pop esi ; unused
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
poppic ebx
|
||||
; [ebp] holds the unaligned frame value stored by the prologue, so popping
; it into esp undoes both the alignment and the wk() reservation; the
; caller's ebp is then on top of the stack.
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
633
TMessagesProj/jni/mozjpeg/simd/i386/jfdctint-sse2.asm
Normal file
633
TMessagesProj/jni/mozjpeg/simd/i386/jfdctint-sse2.asm
Normal file
|
|
@ -0,0 +1,633 @@
|
|||
;
|
||||
; jfdctint.asm - accurate integer FDCT (SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a slow-but-accurate integer implementation of the
|
||||
; forward DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
|
||||
; more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point configuration for the slow-but-accurate integer FDCT.
; CONST_BITS is the number of fractional bits in the F_x_xxx multipliers
; (FIX(x) = x * 2^CONST_BITS, rounded).  PASS1_BITS is the number of extra
; fractional bits kept in the pass-1 results (pass 1 shifts its outputs left
; by PASS1_BITS; pass 2 removes them again).
%define CONST_BITS 13
|
||||
%define PASS1_BITS 2
|
||||
|
||||
; Right-shift counts applied to the 32-bit pmaddwd products:
; DESCALE_P1 (pass 1) removes CONST_BITS but leaves PASS1_BITS of scaling in;
; DESCALE_P2 (pass 2) removes both CONST_BITS and the PASS1_BITS scaling.
%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
|
||||
%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
|
||||
|
||||
; Precomputed multipliers for the default CONST_BITS == 13 case.
%if CONST_BITS == 13
|
||||
F_0_298 equ 2446 ; FIX(0.298631336)
|
||||
F_0_390 equ 3196 ; FIX(0.390180644)
|
||||
F_0_541 equ 4433 ; FIX(0.541196100)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_175 equ 9633 ; FIX(1.175875602)
|
||||
F_1_501 equ 12299 ; FIX(1.501321110)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_1_961 equ 16069 ; FIX(1.961570560)
|
||||
F_2_053 equ 16819 ; FIX(2.053119869)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_072 equ 25172 ; FIX(3.072711026)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
; Instead, each integer literal below is the constant pre-scaled by 2^30;
; DESCALE(x, 30 - CONST_BITS) rounds it down to CONST_BITS fractional bits.
|
||||
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
|
||||
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
|
||||
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
|
||||
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
|
||||
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
|
||||
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
|
||||
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
|
||||
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
|
||||
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for jsimd_fdct_islow_sse2, addressed via GOTOFF(ebx, ...).
;
; PW_* rows: a pair of 16-bit multipliers (a, b) repeated 4 times, i.e. one
; 16-byte operand.  They are used as the memory operand of pmaddwd against a
; register of interleaved word pairs (x, y) produced by punpcklwd/punpckhwd,
; so every 32-bit lane becomes x*a + y*b.
; PD_DESCALE_P1 / PD_DESCALE_P2: per-dword rounding bias (half of the
; divisor) added before the psrad by DESCALE_P1 / DESCALE_P2.
; PW_DESCALE_P2X: per-word rounding bias added before the psraw by
; PASS1_BITS in pass 2.
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_fdct_islow_sse2)
|
||||
|
||||
EXTN(jconst_fdct_islow_sse2):
|
||||
|
||||
PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
|
||||
PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847)
|
||||
PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
|
||||
PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390)
|
||||
PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899
|
||||
PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899)
|
||||
PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562
|
||||
PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562)
|
||||
PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1)
|
||||
PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1)
|
||||
PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1)
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform the forward DCT on one block of samples.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_fdct_islow_sse2(DCTELEM *data)
|
||||
;
; The block at *data is transformed in place: pass 1 reads it, pass 2 writes
; the results back to the same rows.
;
; ABI:    32-bit, arguments on the stack (data is at [entry esp + 4]).
; In:     data = pointer to the coefficient block.  All block accesses use
;         movdqa, so the block is assumed to be 16-byte aligned.
; Out:    none (void).
; Clobb:  eax, edx, xmm0-xmm7, flags.  ebx is used as the GOT base and is
;         saved/restored through pushpic/poppic; ebp and esp are restored.
; Stack:  the frame is realigned to SIZEOF_XMMWORD and WK_NUM xmmword
;         scratch slots (wk(0)..wk(WK_NUM-1)) are reserved below ebp.
|
||||
|
||||
; data(b): the argument, addressed relative to a register b that points at
; the saved ebp (return address is at b+4, first argument at b+8).
%define data(b) (b) + 8 ; DCTELEM *data
|
||||
|
||||
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 6
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
|
||||
|
||||
EXTN(jsimd_fdct_islow_sse2):
|
||||
push ebp
|
||||
; eax now points at the saved ebp; it takes the place of a conventional
; frame pointer, so data(eax) addresses the argument.
mov eax, esp ; eax = original ebp
|
||||
; Make room for one pointer, then round esp down to a 16-byte boundary.
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
; Remember the unaligned frame address at [ebp] so that the epilogue can
; recover it with "pop esp".
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
; Reserve the wk[] scratch array directly below the aligned ebp.
lea esp, [wk(0)]
|
||||
; pushpic/get_GOT are macros defined outside this file (presumably they
; only emit code in PIC builds -- TODO confirm against jsimdext.inc).
pushpic ebx
|
||||
; push ecx ; unused
|
||||
; push edx ; need not be preserved
|
||||
; push esi ; unused
|
||||
; push edi ; unused
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process rows.
;
; Load the eight xmmword rows of the block, transpose the 8x8 matrix of
; 16-bit coefficients in registers (three unpack phases: word, dword, qword)
; and form the first-stage sums and differences.  In the register comments
; "rc" means row r, column c of the block as stored in memory.  After the
; transpose each register holds one column, so the eight word lanes of every
; SIMD instruction work on the eight rows in parallel.
; wk(0)..wk(3) are used as spill slots.
;
; On exit: xmm3=tmp0, xmm6=tmp1, xmm7=tmp2, xmm1=tmp3,
;          xmm2=tmp4, xmm5=tmp5, wk(0)=tmp6, wk(1)=tmp7.
|
||||
|
||||
mov edx, POINTER [data(eax)] ; (DCTELEM *)
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
|
||||
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
|
||||
|
||||
movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
|
||||
punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
|
||||
movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
|
||||
punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
|
||||
|
||||
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
|
||||
|
||||
; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
|
||||
; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
|
||||
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
|
||||
|
||||
movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
|
||||
punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
|
||||
movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
|
||||
punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
|
||||
punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
|
||||
movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
|
||||
punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
|
||||
|
||||
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
|
||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
|
||||
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
|
||||
movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
|
||||
|
||||
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
|
||||
punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
|
||||
movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
|
||||
punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
|
||||
punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
|
||||
movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
|
||||
punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
|
||||
|
||||
movdqa xmm6, xmm1
|
||||
movdqa xmm3, xmm0
|
||||
psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
|
||||
psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
|
||||
paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
|
||||
paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
|
||||
|
||||
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
|
||||
movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
|
||||
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
|
||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
|
||||
|
||||
movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
|
||||
punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
|
||||
movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
|
||||
punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
|
||||
|
||||
movdqa xmm2, xmm1
|
||||
movdqa xmm5, xmm7
|
||||
paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
|
||||
paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
|
||||
psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
|
||||
psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
|
||||
|
||||
; -- Even part
;
; Pass 1, even-indexed outputs.
; In:  xmm3=tmp0, xmm6=tmp1, xmm7=tmp2, xmm1=tmp3.
; Out: wk(2)=data0, wk(3)=data4, wk(4)=data2, wk(5)=data6.
; xmm2 (tmp4) and xmm5 (tmp5) are not touched here; the odd part needs them.
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
movdqa xmm0, xmm6
|
||||
paddw xmm3, xmm1 ; xmm3=tmp10
|
||||
paddw xmm6, xmm7 ; xmm6=tmp11
|
||||
psubw xmm4, xmm1 ; xmm4=tmp13
|
||||
psubw xmm0, xmm7 ; xmm0=tmp12
|
||||
|
||||
movdqa xmm1, xmm3
|
||||
paddw xmm3, xmm6 ; xmm3=tmp10+tmp11
|
||||
psubw xmm1, xmm6 ; xmm1=tmp10-tmp11
|
||||
|
||||
; data0/data4 need no multiplication; they are only scaled up by
; 2^PASS1_BITS so that they match the scaling of the other pass-1 outputs.
psllw xmm3, PASS1_BITS ; xmm3=data0
|
||||
psllw xmm1, PASS1_BITS ; xmm1=data4
|
||||
|
||||
movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
|
||||
movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
|
||||
|
||||
; (Original)
|
||||
; z1 = (tmp12 + tmp13) * 0.541196100;
|
||||
; data2 = z1 + tmp13 * 0.765366865;
|
||||
; data6 = z1 + tmp12 * -1.847759065;
|
||||
;
|
||||
; (This implementation)
|
||||
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
||||
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
||||
|
||||
; Interleave (tmp13, tmp12) word pairs so that one pmaddwd per output
; computes the two-term sum in 32-bit precision (L = low 4 lanes, H = high).
movdqa xmm7, xmm4 ; xmm4=tmp13
|
||||
movdqa xmm6, xmm4
|
||||
punpcklwd xmm7, xmm0 ; xmm0=tmp12
|
||||
punpckhwd xmm6, xmm0
|
||||
movdqa xmm4, xmm7
|
||||
movdqa xmm0, xmm6
|
||||
pmaddwd xmm7, [GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L
|
||||
pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H
|
||||
pmaddwd xmm4, [GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L
|
||||
pmaddwd xmm0, [GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H
|
||||
|
||||
; Round (add half the divisor) and shift right by DESCALE_P1.
paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad xmm7, DESCALE_P1
|
||||
psrad xmm6, DESCALE_P1
|
||||
paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad xmm4, DESCALE_P1
|
||||
psrad xmm0, DESCALE_P1
|
||||
|
||||
; Pack the low/high dword halves back into eight signed-saturated words.
packssdw xmm7, xmm6 ; xmm7=data2
|
||||
packssdw xmm4, xmm0 ; xmm4=data6
|
||||
|
||||
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
|
||||
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
|
||||
|
||||
; -- Odd part
;
; Pass 1, odd-indexed outputs.
; In:  xmm2=tmp4, xmm5=tmp5, wk(0)=tmp6, wk(1)=tmp7.
; Out: xmm2=data1, xmm5=data3, xmm4=data5, xmm7=data7 (kept in registers
;      for pass 2).
; While this part runs, z3L/z3H live in wk(0)/wk(1) and z4L/z4H in xmm6/xmm0.
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
|
||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
|
||||
|
||||
movdqa xmm6, xmm2 ; xmm2=tmp4
|
||||
movdqa xmm0, xmm5 ; xmm5=tmp5
|
||||
paddw xmm6, xmm3 ; xmm6=z3
|
||||
paddw xmm0, xmm1 ; xmm0=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
; Interleave (z3, z4) word pairs; each pmaddwd yields a 32-bit two-term sum.
movdqa xmm7, xmm6
|
||||
movdqa xmm4, xmm6
|
||||
punpcklwd xmm7, xmm0
|
||||
punpckhwd xmm4, xmm0
|
||||
movdqa xmm6, xmm7
|
||||
movdqa xmm0, xmm4
|
||||
pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L
|
||||
pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H
|
||||
pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L
|
||||
pmaddwd xmm0, [GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H
|
||||
|
||||
; tmp6/tmp7 were already copied to xmm3/xmm1, so wk(0)/wk(1) can be reused.
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
|
||||
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
|
||||
; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
|
||||
; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
|
||||
; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
|
||||
; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
|
||||
; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
|
||||
; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
|
||||
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
||||
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
||||
|
||||
; First pair: interleave (tmp4, tmp7) to produce data7 and data1.
movdqa xmm7, xmm2
|
||||
movdqa xmm4, xmm2
|
||||
punpcklwd xmm7, xmm1
|
||||
punpckhwd xmm4, xmm1
|
||||
movdqa xmm2, xmm7
|
||||
movdqa xmm1, xmm4
|
||||
pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L
|
||||
pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H
|
||||
pmaddwd xmm2, [GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L
|
||||
pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H
|
||||
|
||||
paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
|
||||
paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
|
||||
paddd xmm2, xmm6 ; xmm2=data1L
|
||||
paddd xmm1, xmm0 ; xmm1=data1H
|
||||
|
||||
; Round and descale by DESCALE_P1, then pack to saturated words.
paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad xmm7, DESCALE_P1
|
||||
psrad xmm4, DESCALE_P1
|
||||
paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad xmm2, DESCALE_P1
|
||||
psrad xmm1, DESCALE_P1
|
||||
|
||||
packssdw xmm7, xmm4 ; xmm7=data7
|
||||
packssdw xmm2, xmm1 ; xmm2=data1
|
||||
|
||||
; Second pair: interleave (tmp5, tmp6) to produce data5 and data3.
movdqa xmm4, xmm5
|
||||
movdqa xmm1, xmm5
|
||||
punpcklwd xmm4, xmm3
|
||||
punpckhwd xmm1, xmm3
|
||||
movdqa xmm5, xmm4
|
||||
movdqa xmm3, xmm1
|
||||
pmaddwd xmm4, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L
|
||||
pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H
|
||||
pmaddwd xmm5, [GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L
|
||||
pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H
|
||||
|
||||
paddd xmm4, xmm6 ; xmm4=data5L
|
||||
paddd xmm1, xmm0 ; xmm1=data5H
|
||||
paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
|
||||
paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
|
||||
|
||||
paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad xmm4, DESCALE_P1
|
||||
psrad xmm1, DESCALE_P1
|
||||
paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]
|
||||
psrad xmm5, DESCALE_P1
|
||||
psrad xmm3, DESCALE_P1
|
||||
|
||||
packssdw xmm4, xmm1 ; xmm4=data5
|
||||
packssdw xmm5, xmm3 ; xmm5=data3
|
||||
|
||||
; ---- Pass 2: process columns.
;
; The pass-1 results are still in registers and wk[] (never written back to
; memory): wk(2)=col0, xmm2=col1, wk(4)=col2, xmm5=col3, wk(3)=col4,
; xmm4=col5, wk(5)=col6, xmm7=col7.  They are transposed back so that each
; register holds one row, then the first-stage sums/differences are formed.
;
; On exit: xmm7=tmp0, xmm2=tmp1, xmm4=tmp2, xmm5=tmp3,
;          xmm0=tmp4, xmm3=tmp5, wk(0)=tmp6, wk(1)=tmp7.
|
||||
|
||||
; edx still holds the data pointer loaded in pass 1 (no instruction in
; between writes edx), so the reload below is left commented out.
; mov edx, POINTER [data(eax)] ; (DCTELEM *)
|
||||
|
||||
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
|
||||
movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
|
||||
|
||||
; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
|
||||
; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
|
||||
|
||||
movdqa xmm1, xmm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
|
||||
punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
|
||||
movdqa xmm3, xmm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
|
||||
punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
|
||||
|
||||
movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
|
||||
movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
|
||||
|
||||
; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
|
||||
; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
|
||||
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
|
||||
|
||||
movdqa xmm0, xmm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
|
||||
punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
|
||||
movdqa xmm3, xmm5 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
|
||||
punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
|
||||
|
||||
movdqa xmm4, xmm2 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
|
||||
punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
|
||||
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
|
||||
punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
|
||||
|
||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
|
||||
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
|
||||
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
|
||||
movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
|
||||
|
||||
movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
|
||||
punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
|
||||
movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
|
||||
punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
|
||||
|
||||
movdqa xmm5, xmm6 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
|
||||
punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
|
||||
movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
|
||||
punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
|
||||
|
||||
movdqa xmm2, xmm5
|
||||
movdqa xmm7, xmm6
|
||||
psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6
|
||||
psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7
|
||||
paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1
|
||||
paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0
|
||||
|
||||
movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
|
||||
movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
|
||||
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
|
||||
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
||||
|
||||
movdqa xmm5, xmm4 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
|
||||
punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
|
||||
movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
|
||||
punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
movdqa xmm3, xmm4
|
||||
paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3
|
||||
paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2
|
||||
psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4
|
||||
psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5
|
||||
|
||||
; -- Even part
;
; Pass 2, even-indexed outputs; results are stored straight into the block
; at edx (rows 0, 4, 2 and 6).
; In:  xmm7=tmp0, xmm2=tmp1, xmm4=tmp2, xmm5=tmp3.
; xmm0 (tmp4) and xmm3 (tmp5) are not touched here; the odd part needs them.
|
||||
|
||||
movdqa xmm1, xmm7
|
||||
movdqa xmm6, xmm2
|
||||
paddw xmm7, xmm5 ; xmm7=tmp10
|
||||
paddw xmm2, xmm4 ; xmm2=tmp11
|
||||
psubw xmm1, xmm5 ; xmm1=tmp13
|
||||
psubw xmm6, xmm4 ; xmm6=tmp12
|
||||
|
||||
movdqa xmm5, xmm7
|
||||
paddw xmm7, xmm2 ; xmm7=tmp10+tmp11
|
||||
psubw xmm5, xmm2 ; xmm5=tmp10-tmp11
|
||||
|
||||
; data0/data4 only need the 2^PASS1_BITS scaling from pass 1 removed:
; add the word rounding bias, then arithmetic-shift right by PASS1_BITS.
paddw xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
|
||||
paddw xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
|
||||
psraw xmm7, PASS1_BITS ; xmm7=data0
|
||||
psraw xmm5, PASS1_BITS ; xmm5=data4
|
||||
|
||||
movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
|
||||
movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
|
||||
|
||||
; (Original)
|
||||
; z1 = (tmp12 + tmp13) * 0.541196100;
|
||||
; data2 = z1 + tmp13 * 0.765366865;
|
||||
; data6 = z1 + tmp12 * -1.847759065;
|
||||
;
|
||||
; (This implementation)
|
||||
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
||||
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
||||
|
||||
; Interleave (tmp13, tmp12) word pairs so that one pmaddwd per output
; computes the two-term sum in 32-bit precision (L = low 4 lanes, H = high).
movdqa xmm4, xmm1 ; xmm1=tmp13
|
||||
movdqa xmm2, xmm1
|
||||
punpcklwd xmm4, xmm6 ; xmm6=tmp12
|
||||
punpckhwd xmm2, xmm6
|
||||
movdqa xmm1, xmm4
|
||||
movdqa xmm6, xmm2
|
||||
pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L
|
||||
pmaddwd xmm2, [GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H
|
||||
pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L
|
||||
pmaddwd xmm6, [GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H
|
||||
|
||||
; Round (add half the divisor) and shift right by DESCALE_P2.
paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad xmm4, DESCALE_P2
|
||||
psrad xmm2, DESCALE_P2
|
||||
paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad xmm1, DESCALE_P2
|
||||
psrad xmm6, DESCALE_P2
|
||||
|
||||
; Pack the low/high dword halves back into eight signed-saturated words.
packssdw xmm4, xmm2 ; xmm4=data2
|
||||
packssdw xmm1, xmm6 ; xmm1=data6
|
||||
|
||||
movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
|
||||
movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
|
||||
|
||||
; -- Odd part
;
; Pass 2, odd-indexed outputs; results are stored straight into the block
; at edx (rows 7, 1, 5 and 3).
; In:  xmm0=tmp4, xmm3=tmp5, wk(0)=tmp6, wk(1)=tmp7.
; While this part runs, z3L/z3H live in wk(0)/wk(1) and z4L/z4H in xmm2/xmm6.
|
||||
|
||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
|
||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
|
||||
|
||||
movdqa xmm2, xmm0 ; xmm0=tmp4
|
||||
movdqa xmm6, xmm3 ; xmm3=tmp5
|
||||
paddw xmm2, xmm7 ; xmm2=z3
|
||||
paddw xmm6, xmm5 ; xmm6=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
; Interleave (z3, z4) word pairs; each pmaddwd yields a 32-bit two-term sum.
movdqa xmm4, xmm2
|
||||
movdqa xmm1, xmm2
|
||||
punpcklwd xmm4, xmm6
|
||||
punpckhwd xmm1, xmm6
|
||||
movdqa xmm2, xmm4
|
||||
movdqa xmm6, xmm1
|
||||
pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L
|
||||
pmaddwd xmm1, [GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H
|
||||
pmaddwd xmm2, [GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L
|
||||
pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H
|
||||
|
||||
; tmp6/tmp7 were already copied to xmm7/xmm5, so wk(0)/wk(1) can be reused.
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
|
||||
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
|
||||
; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
|
||||
; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
|
||||
; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
|
||||
; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
|
||||
; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
|
||||
; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
|
||||
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
||||
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
||||
|
||||
; First pair: interleave (tmp4, tmp7) to produce data7 and data1.
movdqa xmm4, xmm0
|
||||
movdqa xmm1, xmm0
|
||||
punpcklwd xmm4, xmm5
|
||||
punpckhwd xmm1, xmm5
|
||||
movdqa xmm0, xmm4
|
||||
movdqa xmm5, xmm1
|
||||
pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L
|
||||
pmaddwd xmm1, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H
|
||||
pmaddwd xmm0, [GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L
|
||||
pmaddwd xmm5, [GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H
|
||||
|
||||
paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
|
||||
paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
|
||||
paddd xmm0, xmm2 ; xmm0=data1L
|
||||
paddd xmm5, xmm6 ; xmm5=data1H
|
||||
|
||||
; Round and descale by DESCALE_P2, then pack to saturated words.
paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad xmm4, DESCALE_P2
|
||||
psrad xmm1, DESCALE_P2
|
||||
paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad xmm0, DESCALE_P2
|
||||
psrad xmm5, DESCALE_P2
|
||||
|
||||
packssdw xmm4, xmm1 ; xmm4=data7
|
||||
packssdw xmm0, xmm5 ; xmm0=data1
|
||||
|
||||
movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
|
||||
movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
|
||||
|
||||
; Second pair: interleave (tmp5, tmp6) to produce data5 and data3.
movdqa xmm1, xmm3
|
||||
movdqa xmm5, xmm3
|
||||
punpcklwd xmm1, xmm7
|
||||
punpckhwd xmm5, xmm7
|
||||
movdqa xmm3, xmm1
|
||||
movdqa xmm7, xmm5
|
||||
pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L
|
||||
pmaddwd xmm5, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H
|
||||
pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L
|
||||
pmaddwd xmm7, [GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H
|
||||
|
||||
paddd xmm1, xmm2 ; xmm1=data5L
|
||||
paddd xmm5, xmm6 ; xmm5=data5H
|
||||
paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
|
||||
paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
|
||||
|
||||
paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad xmm1, DESCALE_P2
|
||||
psrad xmm5, DESCALE_P2
|
||||
paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]
|
||||
psrad xmm3, DESCALE_P2
|
||||
psrad xmm7, DESCALE_P2
|
||||
|
||||
packssdw xmm1, xmm5 ; xmm1=data5
|
||||
packssdw xmm3, xmm7 ; xmm3=data3
|
||||
|
||||
movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
|
||||
movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
|
||||
|
||||
; Epilogue: undo the prologue in reverse order.  The commented-out pops
; mirror the commented-out pushes in the prologue (those registers were
; never saved).
; pop edi ; unused
|
||||
; pop esi ; unused
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; unused
|
||||
poppic ebx
|
||||
; Discard the wk[] scratch area.
mov esp, ebp ; esp <- aligned ebp
|
||||
; [ebp] holds the pre-alignment frame address stored by "mov [esp], eax" in
; the prologue; loading it into esp makes esp point at the saved ebp again.
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
451
TMessagesProj/jni/mozjpeg/simd/i386/jidctflt-3dn.asm
Normal file
451
TMessagesProj/jni/mozjpeg/simd/i386/jidctflt-3dn.asm
Normal file
|
|
@ -0,0 +1,451 @@
|
|||
;
|
||||
; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a floating-point implementation of the inverse DCT
|
||||
; (Discrete Cosine Transform). The following code is based directly on
|
||||
; the IJG's original jidctflt.c; see jidctflt.c for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_idct_float_3dnow)
|
||||
|
||||
; Constant table for jsimd_idct_float_3dnow.  Every entry is 8 bytes wide
; (2 dwords or 8 bytes), i.e. exactly one 64-bit MMX/3DNow! register, so
; the same value is applied to both packed lanes.
EXTN(jconst_idct_float_3dnow):
|
||||
|
||||
; ~ sqrt(2)
PD_1_414 times 2 dd 1.414213562373095048801689
|
||||
; ~ 2*cos(pi/8); multiplier that produces z5 in the odd part
PD_1_847 times 2 dd 1.847759065022573512256366
|
||||
; multiplier applied to z12 in the odd part
PD_1_082 times 2 dd 1.082392200292393968799446
|
||||
; multiplier applied to z10 in the odd part (stored positive; the code
; uses a reversed subtract, pfsubr, to form z5 - z10 * 2.613...)
PD_2_613 times 2 dd 2.613125929752753055713286
|
||||
; Rounding constant added to each result in pass 2.  100663296 = 3 * 2^25,
; so in single precision the sum has a unit-in-last-place of 8 and the low
; bits of its bit pattern hold the value divided by 8 and rounded
; (assumes IEEE single precision with round-to-nearest).
PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3)
|
||||
; Added to every packed output byte after signed saturation.
PB_CENTERJSAMP times 8 db CENTERJSAMPLE
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_float_3dnow(void *dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; Argument offsets.  (b) is a register holding the stack pointer value
; taken right after "push ebp" in the prologue, so [b+0] is the saved ebp,
; [b+4] the return address and [b+8] the first argument.
%define dct_table(b) (b) + 8 ; void *dct_table
|
||||
%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
|
||||
%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
|
||||
%define output_col(b) (b) + 20 ; JDIMENSION output_col
|
||||
|
||||
; Local frame, addressed relative to the aligned ebp:
;   [ebp]        slot holding the original (pre-alignment) stack pointer
;   wk(i)        WK_NUM scratch qwords directly below ebp; wk(0) is the
;                lowest address, wk(WK_NUM-1) ends at ebp
;   workspace    DCTSIZE2 FAST_FLOAT values directly below wk(0)
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
|
||||
; mmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
|
||||
; FAST_FLOAT workspace[DCTSIZE2]
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_idct_float_3dnow)
|
||||
|
||||
; Function entry.  Builds an 8-byte-aligned frame: the original stack
; pointer (which points at the saved ebp) is stored at [ebp], and the wk[]
; scratch slots plus the float workspace are reserved below ebp.
EXTN(jsimd_idct_float_3dnow):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
; Reserve one dword for the saved stack pointer, then round esp down to a
; multiple of SIZEOF_MMWORD.
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
; [aligned esp] = original stack pointer; read back as [original_ebp] and
; popped into esp by the epilogue.
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
; Move esp below wk[] and the workspace so later pushes cannot overlap them.
lea esp, [workspace]
|
||||
; ebx, esi and edi are saved here and restored before returning.
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process columns from input, store into work array.
|
||||
|
||||
; The reload below is commented out: eax is expected to still hold the
; original stack pointer from the prologue.
; NOTE(review): this relies on get_GOT leaving eax untouched -- confirm
; against the macro definition.
; mov eax, [original_ebp]
|
||||
mov edx, POINTER [dct_table(eax)] ; quantptr
|
||||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
lea edi, [workspace] ; FAST_FLOAT *wsptr
|
||||
; The column loop runs DCTSIZE/2 times; each pass advances inptr by
; 2*SIZEOF_JCOEF, i.e. it handles two columns at once.
mov ecx, DCTSIZE/2 ; ctr
|
||||
alignx 16, 7
|
||||
; Top of the pass-1 loop.  Unless NO_ZERO_COLUMN_TEST_FLOAT_3DNOW is
; defined, first check whether rows 1..7 of the current column pair are all
; zero; if so, only the DC term has to be dequantized and replicated.
.columnloop:
|
||||
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
|
||||
; Quick test: OR together the row-1 and row-2 dwords.
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz short .columnDCT
|
||||
|
||||
; Full test of rows 3..7.  ebx is borrowed as a second accumulator, so the
; GOT pointer it holds is saved and restored around the test.
pushpic ebx ; save GOT address
|
||||
mov ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
mov eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
or ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
or ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, ebx
|
||||
; The jnz below consumes the flags set by "or eax, ebx".
; NOTE(review): this assumes poppic expands to something that does not
; modify flags (e.g. a plain pop) -- confirm against the macro definition.
poppic ebx ; restore GOT address
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
; Load the row-0 dword for this column pair.
movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
; Duplicate each 16-bit word into a dword and shift right arithmetically to
; sign-extend it (shift count presumably 16), then convert to float.
punpcklwd mm0, mm0
|
||||
psrad mm0, (DWORD_BIT-WORD_BIT)
|
||||
pi2fd mm0, mm0
|
||||
|
||||
; Dequantize: multiply by the row-0 entries of the multiplier table.
pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
; Broadcast: mm0 = (low, low), mm1 = (high, high).
movq mm1, mm0
|
||||
punpckldq mm0, mm0
|
||||
punpckhdq mm1, mm1
|
||||
|
||||
; Fill workspace block-row 0 with the first column's value and block-row 1
; with the second column's value (four qword stores each).
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
|
||||
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
|
||||
movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
|
||||
movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
|
||||
movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
|
||||
movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
|
||||
movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
|
||||
jmp near .nextcolumn
|
||||
alignx 16, 7
|
||||
%endif
|
||||
; Full pass-1 computation for one pair of columns.  Each input row is
; loaded as one dword, widened to two sign-extended 32-bit integers
; (punpcklwd + psrad), converted to float (pi2fd) and multiplied by the
; matching entries of the table at edx.  The results are written to the
; workspace at edi, transposed in 2x2 tiles.
.columnDCT:
|
||||
|
||||
; -- Even part
|
||||
|
||||
; Rows 0, 2, 4 and 6.
movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
punpcklwd mm0, mm0
|
||||
punpcklwd mm1, mm1
|
||||
psrad mm0, (DWORD_BIT-WORD_BIT)
|
||||
psrad mm1, (DWORD_BIT-WORD_BIT)
|
||||
pi2fd mm0, mm0
|
||||
pi2fd mm1, mm1
|
||||
|
||||
pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
pfmul mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
punpcklwd mm2, mm2
|
||||
punpcklwd mm3, mm3
|
||||
psrad mm2, (DWORD_BIT-WORD_BIT)
|
||||
psrad mm3, (DWORD_BIT-WORD_BIT)
|
||||
pi2fd mm2, mm2
|
||||
pi2fd mm3, mm3
|
||||
|
||||
pfmul mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
pfmul mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
movq mm4, mm0
|
||||
movq mm5, mm1
|
||||
pfsub mm0, mm2 ; mm0=tmp11
|
||||
pfsub mm1, mm3
|
||||
pfadd mm4, mm2 ; mm4=tmp10
|
||||
pfadd mm5, mm3 ; mm5=tmp13
|
||||
|
||||
pfmul mm1, [GOTOFF(ebx,PD_1_414)]
|
||||
pfsub mm1, mm5 ; mm1=tmp12
|
||||
|
||||
movq mm6, mm4
|
||||
movq mm7, mm0
|
||||
pfsub mm4, mm5 ; mm4=tmp3
|
||||
pfsub mm0, mm1 ; mm0=tmp2
|
||||
pfadd mm6, mm5 ; mm6=tmp0
|
||||
pfadd mm7, mm1 ; mm7=tmp1
|
||||
|
||||
; tmp2 and tmp3 are spilled to the wk[] slots because all eight MMX
; registers are needed by the odd part; tmp0/tmp1 stay in mm6/mm7.
movq MMWORD [wk(1)], mm4 ; tmp3
|
||||
movq MMWORD [wk(0)], mm0 ; tmp2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
; Rows 1, 3, 5 and 7.
movd mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
punpcklwd mm2, mm2
|
||||
punpcklwd mm3, mm3
|
||||
psrad mm2, (DWORD_BIT-WORD_BIT)
|
||||
psrad mm3, (DWORD_BIT-WORD_BIT)
|
||||
pi2fd mm2, mm2
|
||||
pi2fd mm3, mm3
|
||||
|
||||
pfmul mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
pfmul mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
punpcklwd mm5, mm5
|
||||
punpcklwd mm1, mm1
|
||||
psrad mm5, (DWORD_BIT-WORD_BIT)
|
||||
psrad mm1, (DWORD_BIT-WORD_BIT)
|
||||
pi2fd mm5, mm5
|
||||
pi2fd mm1, mm1
|
||||
|
||||
pfmul mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
pfmul mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
movq mm4, mm2
|
||||
movq mm0, mm5
|
||||
pfadd mm2, mm1 ; mm2=z11
|
||||
pfadd mm5, mm3 ; mm5=z13
|
||||
pfsub mm4, mm1 ; mm4=z12
|
||||
pfsub mm0, mm3 ; mm0=z10
|
||||
|
||||
movq mm1, mm2
|
||||
pfsub mm2, mm5
|
||||
pfadd mm1, mm5 ; mm1=tmp7
|
||||
|
||||
pfmul mm2, [GOTOFF(ebx,PD_1_414)] ; mm2=tmp11
|
||||
|
||||
movq mm3, mm0
|
||||
pfadd mm0, mm4
|
||||
pfmul mm0, [GOTOFF(ebx,PD_1_847)] ; mm0=z5
|
||||
pfmul mm3, [GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930)
|
||||
pfmul mm4, [GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200)
|
||||
; pfsubr is the reversed subtract: mm3 = mm0 - mm3.
pfsubr mm3, mm0 ; mm3=tmp12
|
||||
pfsub mm4, mm0 ; mm4=tmp10
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
pfsub mm3, mm1 ; mm3=tmp6
|
||||
movq mm5, mm6
|
||||
movq mm0, mm7
|
||||
pfadd mm6, mm1 ; mm6=data0=(00 01)
|
||||
pfadd mm7, mm3 ; mm7=data1=(10 11)
|
||||
pfsub mm5, mm1 ; mm5=data7=(70 71)
|
||||
pfsub mm0, mm3 ; mm0=data6=(60 61)
|
||||
pfsub mm2, mm3 ; mm2=tmp5
|
||||
|
||||
; Interleave pairs of output rows so that each stored qword holds two
; consecutive rows of a single column.
movq mm1, mm6 ; transpose coefficients
|
||||
punpckldq mm6, mm7 ; mm6=(00 10)
|
||||
punpckhdq mm1, mm7 ; mm1=(01 11)
|
||||
movq mm3, mm0 ; transpose coefficients
|
||||
punpckldq mm0, mm5 ; mm0=(60 70)
|
||||
punpckhdq mm3, mm5 ; mm3=(61 71)
|
||||
|
||||
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
|
||||
movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
|
||||
movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
|
||||
|
||||
; Reload the even-part values spilled earlier.
movq mm7, MMWORD [wk(0)] ; mm7=tmp2
|
||||
movq mm5, MMWORD [wk(1)] ; mm5=tmp3
|
||||
|
||||
pfadd mm4, mm2 ; mm4=tmp4
|
||||
movq mm6, mm7
|
||||
movq mm1, mm5
|
||||
pfadd mm7, mm2 ; mm7=data2=(20 21)
|
||||
pfadd mm5, mm4 ; mm5=data4=(40 41)
|
||||
pfsub mm6, mm2 ; mm6=data5=(50 51)
|
||||
pfsub mm1, mm4 ; mm1=data3=(30 31)
|
||||
|
||||
movq mm0, mm7 ; transpose coefficients
|
||||
punpckldq mm7, mm1 ; mm7=(20 30)
|
||||
punpckhdq mm0, mm1 ; mm0=(21 31)
|
||||
movq mm3, mm5 ; transpose coefficients
|
||||
punpckldq mm5, mm6 ; mm5=(40 50)
|
||||
punpckhdq mm3, mm6 ; mm3=(41 51)
|
||||
|
||||
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
|
||||
movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
|
||||
movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
|
||||
movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
|
||||
|
||||
; End of one pass-1 iteration: step the input, multiplier-table and
; workspace pointers to the next pair of columns and loop until ctr is 0.
.nextcolumn:
|
||||
add esi, byte 2*SIZEOF_JCOEF ; coef_block
|
||||
add edx, byte 2*SIZEOF_FLOAT_MULT_TYPE ; quantptr
|
||||
add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
|
||||
dec ecx ; ctr
|
||||
jnz near .columnloop
|
||||
|
||||
; -- Prefetch the next coefficient block
|
||||
|
||||
; After the loop esi has advanced by DCTSIZE coefficients in total
; (DCTSIZE/2 iterations of 2 each), so with DCTSIZE == 8 (presumably) the
; offset (DCTSIZE2-8)*SIZEOF_JCOEF lands DCTSIZE2 coefficients past the
; start of the current block.  Four hints, 32 bytes apart, are issued.
prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
|
||||
prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
|
||||
prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
|
||||
prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
; eax was overwritten during pass 1, so the original stack pointer is
; reloaded from its slot at [ebp] before the arguments are read.
mov eax, [original_ebp]
|
||||
lea esi, [workspace] ; FAST_FLOAT *wsptr
|
||||
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
|
||||
; From here on eax holds output_col and is used as the index into each
; output row.
mov eax, JDIMENSION [output_col(eax)]
|
||||
; Each pass-2 iteration writes two output rows.
mov ecx, DCTSIZE/2 ; ctr
|
||||
alignx 16, 7
|
||||
; Pass-2 loop body.  Reads float values from the workspace at esi, applies
; the same even/odd butterfly as pass 1, converts the results to integers
; with the PD_RNDINT_MAGIC addition, packs them to bytes, adds
; PB_CENTERJSAMP and stores 8 bytes into each of two output rows.
.rowloop:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
movq mm4, mm0
|
||||
movq mm5, mm1
|
||||
pfsub mm0, mm2 ; mm0=tmp11
|
||||
pfsub mm1, mm3
|
||||
pfadd mm4, mm2 ; mm4=tmp10
|
||||
pfadd mm5, mm3 ; mm5=tmp13
|
||||
|
||||
pfmul mm1, [GOTOFF(ebx,PD_1_414)]
|
||||
pfsub mm1, mm5 ; mm1=tmp12
|
||||
|
||||
movq mm6, mm4
|
||||
movq mm7, mm0
|
||||
pfsub mm4, mm5 ; mm4=tmp3
|
||||
pfsub mm0, mm1 ; mm0=tmp2
|
||||
pfadd mm6, mm5 ; mm6=tmp0
|
||||
pfadd mm7, mm1 ; mm7=tmp1
|
||||
|
||||
; Spill tmp2/tmp3 to the wk[] slots; they are reloaded in the output stage.
movq MMWORD [wk(1)], mm4 ; tmp3
|
||||
movq MMWORD [wk(0)], mm0 ; tmp2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
movq mm4, mm2
|
||||
movq mm0, mm5
|
||||
pfadd mm2, mm1 ; mm2=z11
|
||||
pfadd mm5, mm3 ; mm5=z13
|
||||
pfsub mm4, mm1 ; mm4=z12
|
||||
pfsub mm0, mm3 ; mm0=z10
|
||||
|
||||
movq mm1, mm2
|
||||
pfsub mm2, mm5
|
||||
pfadd mm1, mm5 ; mm1=tmp7
|
||||
|
||||
pfmul mm2, [GOTOFF(ebx,PD_1_414)] ; mm2=tmp11
|
||||
|
||||
movq mm3, mm0
|
||||
pfadd mm0, mm4
|
||||
pfmul mm0, [GOTOFF(ebx,PD_1_847)] ; mm0=z5
|
||||
pfmul mm3, [GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930)
|
||||
pfmul mm4, [GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200)
|
||||
; pfsubr is the reversed subtract: mm3 = mm0 - mm3.
pfsubr mm3, mm0 ; mm3=tmp12
|
||||
pfsub mm4, mm0 ; mm4=tmp10
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
pfsub mm3, mm1 ; mm3=tmp6
|
||||
movq mm5, mm6
|
||||
movq mm0, mm7
|
||||
pfadd mm6, mm1 ; mm6=data0=(00 10)
|
||||
pfadd mm7, mm3 ; mm7=data1=(01 11)
|
||||
pfsub mm5, mm1 ; mm5=data7=(07 17)
|
||||
pfsub mm0, mm3 ; mm0=data6=(06 16)
|
||||
pfsub mm2, mm3 ; mm2=tmp5
|
||||
|
||||
; mm1 = rounding constant; mm3 = mask that keeps the low word of each dword
; (all ones shifted right by WORD_BIT).
movq mm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm1=[PD_RNDINT_MAGIC]
|
||||
pcmpeqd mm3, mm3
|
||||
psrld mm3, WORD_BIT ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
|
||||
|
||||
; Float-to-int by addition: after adding the constant, the low word of each
; dword holds the descaled, rounded integer (the "**" words are discarded).
pfadd mm6, mm1 ; mm6=roundint(data0/8)=(00 ** 10 **)
|
||||
pfadd mm7, mm1 ; mm7=roundint(data1/8)=(01 ** 11 **)
|
||||
pfadd mm0, mm1 ; mm0=roundint(data6/8)=(06 ** 16 **)
|
||||
pfadd mm5, mm1 ; mm5=roundint(data7/8)=(07 ** 17 **)
|
||||
|
||||
; Merge two results per register: mask one to its low words, shift the
; other into the high words, then OR them together.
pand mm6, mm3 ; mm6=(00 -- 10 --)
|
||||
pslld mm7, WORD_BIT ; mm7=(-- 01 -- 11)
|
||||
pand mm0, mm3 ; mm0=(06 -- 16 --)
|
||||
pslld mm5, WORD_BIT ; mm5=(-- 07 -- 17)
|
||||
por mm6, mm7 ; mm6=(00 01 10 11)
|
||||
por mm0, mm5 ; mm0=(06 07 16 17)
|
||||
|
||||
; Reload the even-part values spilled earlier.
movq mm1, MMWORD [wk(0)] ; mm1=tmp2
|
||||
movq mm3, MMWORD [wk(1)] ; mm3=tmp3
|
||||
|
||||
pfadd mm4, mm2 ; mm4=tmp4
|
||||
movq mm7, mm1
|
||||
movq mm5, mm3
|
||||
pfadd mm1, mm2 ; mm1=data2=(02 12)
|
||||
pfadd mm3, mm4 ; mm3=data4=(04 14)
|
||||
pfsub mm7, mm2 ; mm7=data5=(05 15)
|
||||
pfsub mm5, mm4 ; mm5=data3=(03 13)
|
||||
|
||||
; Same rounding and merge sequence for data2..data5.
movq mm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm2=[PD_RNDINT_MAGIC]
|
||||
pcmpeqd mm4, mm4
|
||||
psrld mm4, WORD_BIT ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
|
||||
|
||||
pfadd mm3, mm2 ; mm3=roundint(data4/8)=(04 ** 14 **)
|
||||
pfadd mm7, mm2 ; mm7=roundint(data5/8)=(05 ** 15 **)
|
||||
pfadd mm1, mm2 ; mm1=roundint(data2/8)=(02 ** 12 **)
|
||||
pfadd mm5, mm2 ; mm5=roundint(data3/8)=(03 ** 13 **)
|
||||
|
||||
pand mm3, mm4 ; mm3=(04 -- 14 --)
|
||||
pslld mm7, WORD_BIT ; mm7=(-- 05 -- 15)
|
||||
pand mm1, mm4 ; mm1=(02 -- 12 --)
|
||||
pslld mm5, WORD_BIT ; mm5=(-- 03 -- 13)
|
||||
por mm3, mm7 ; mm3=(04 05 14 15)
|
||||
por mm1, mm5 ; mm1=(02 03 12 13)
|
||||
|
||||
movq mm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm2=[PB_CENTERJSAMP]
|
||||
|
||||
; Pack words to bytes with signed saturation, then add the centering
; constant to every byte (wrapping byte add).
packsswb mm6, mm3 ; mm6=(00 01 10 11 04 05 14 15)
|
||||
packsswb mm1, mm0 ; mm1=(02 03 12 13 06 07 16 17)
|
||||
paddb mm6, mm2
|
||||
paddb mm1, mm2
|
||||
|
||||
movq mm4, mm6 ; transpose coefficients(phase 2)
|
||||
punpcklwd mm6, mm1 ; mm6=(00 01 02 03 10 11 12 13)
|
||||
punpckhwd mm4, mm1 ; mm4=(04 05 06 07 14 15 16 17)
|
||||
|
||||
movq mm7, mm6 ; transpose coefficients(phase 3)
|
||||
punpckldq mm6, mm4 ; mm6=(00 01 02 03 04 05 06 07)
|
||||
punpckhdq mm7, mm4 ; mm7=(10 11 12 13 14 15 16 17)
|
||||
|
||||
; ebx is borrowed as the second row pointer, so the GOT address it holds is
; saved and restored around the two stores.
pushpic ebx ; save GOT address
|
||||
|
||||
; edx / ebx = the two output row pointers; eax = output_col.
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
|
||||
movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
|
||||
|
||||
poppic ebx ; restore GOT address
|
||||
|
||||
; Advance to the next pair of rows.
add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr
|
||||
add edi, byte 2*SIZEOF_JSAMPROW
|
||||
dec ecx ; ctr
|
||||
jnz near .rowloop
|
||||
|
||||
; Epilogue: leave MMX/3DNow! state, restore the registers pushed in the
; prologue (in reverse order) and undo the aligned frame.
femms ; empty MMX/3DNow! state
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
; [ebp] holds the stack pointer value saved by the prologue, so popping it
; into esp returns to the unaligned frame where the caller's ebp was pushed.
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
571
TMessagesProj/jni/mozjpeg/simd/i386/jidctflt-sse.asm
Normal file
571
TMessagesProj/jni/mozjpeg/simd/i386/jidctflt-sse.asm
Normal file
|
|
@ -0,0 +1,571 @@
|
|||
;
|
||||
; jidctflt.asm - floating-point IDCT (SSE & MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a floating-point implementation of the inverse DCT
|
||||
; (Discrete Cosine Transform). The following code is based directly on
|
||||
; the IJG's original jidctflt.c; see jidctflt.c for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; unpcklps2 dst, src
; Combines the low two floats of dst (kept in place) with the low two
; floats of src (placed in the upper half of dst).  shufps selector 0x44
; picks elements 0,1 from the first operand and 0,1 from the second.
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
|
||||
shufps %1, %2, 0x44
|
||||
%endmacro
|
||||
|
||||
; unpckhps2 dst, src
; Combines the high two floats of dst (moved to the lower half) with the
; high two floats of src (placed in the upper half of dst).  shufps
; selector 0xEE picks elements 2,3 from the first operand and 2,3 from the
; second.
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
|
||||
shufps %1, %2, 0xEE
|
||||
%endmacro
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_idct_float_sse)
|
||||
|
||||
; Constant table for jsimd_idct_float_sse.  The PD_* entries are 4 dwords
; (16 bytes, one XMM register) each; PB_CENTERJSAMP is 8 bytes.
EXTN(jconst_idct_float_sse):
|
||||
|
||||
; ~ sqrt(2)
PD_1_414 times 4 dd 1.414213562373095048801689
|
||||
; ~ 2*cos(pi/8); multiplier that produces z5 in the odd part
PD_1_847 times 4 dd 1.847759065022573512256366
|
||||
; multiplier applied to z12 in the odd part
PD_1_082 times 4 dd 1.082392200292393968799446
|
||||
; multiplier applied to z10 in the odd part; stored negated so the code can
; form tmp12 with an addps (z10 * -2.613... + z5)
PD_M2_613 times 4 dd -2.613125929752753055713286
|
||||
; descale factor applied to the pass-2 results before integer conversion
PD_0_125 times 4 dd 0.125 ; 1/8
|
||||
PB_CENTERJSAMP times 8 db CENTERJSAMPLE
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_float_sse(void *dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; Argument offsets.  (b) is a register holding the stack pointer value
; taken right after "push ebp" in the prologue, so [b+0] is the saved ebp,
; [b+4] the return address and [b+8] the first argument.
%define dct_table(b) (b) + 8 ; void *dct_table
|
||||
%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
|
||||
%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
|
||||
%define output_col(b) (b) + 20 ; JDIMENSION output_col
|
||||
|
||||
; Local frame, addressed relative to the aligned ebp:
;   [ebp]        slot holding the original (pre-alignment) stack pointer
;   wk(i)        WK_NUM scratch xmmwords directly below ebp; wk(0) is the
;                lowest address, wk(WK_NUM-1) ends at ebp
;   workspace    DCTSIZE2 FAST_FLOAT values directly below wk(0)
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
|
||||
; FAST_FLOAT workspace[DCTSIZE2]
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_idct_float_sse)
|
||||
|
||||
EXTN(jsimd_idct_float_sse):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [workspace]
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process columns from input, store into work array.
|
||||
|
||||
; mov eax, [original_ebp]
|
||||
mov edx, POINTER [dct_table(eax)] ; quantptr
|
||||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
lea edi, [workspace] ; FAST_FLOAT *wsptr
|
||||
mov ecx, DCTSIZE/4 ; ctr
|
||||
alignx 16, 7
|
||||
.columnloop:
|
||||
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
|
||||
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz near .columnDCT
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, mm0
|
||||
packsswb mm1, mm1
|
||||
movd eax, mm1
|
||||
test eax, eax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
punpckhwd mm1, mm0 ; mm1=(** 02 ** 03)
|
||||
punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
|
||||
psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03)
|
||||
psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01)
|
||||
cvtpi2ps xmm3, mm1 ; xmm3=(02 03 ** **)
|
||||
cvtpi2ps xmm0, mm0 ; xmm0=(00 01 ** **)
|
||||
movlhps xmm0, xmm3 ; xmm0=in0=(00 01 02 03)
|
||||
|
||||
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
movaps xmm1, xmm0
|
||||
movaps xmm2, xmm0
|
||||
movaps xmm3, xmm0
|
||||
|
||||
shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
|
||||
shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
|
||||
shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
|
||||
shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
|
||||
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
|
||||
movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
|
||||
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
|
||||
movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
|
||||
movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
|
||||
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
jmp near .nextcolumn
|
||||
alignx 16, 7
|
||||
%endif
|
||||
.columnDCT:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
punpckhwd mm4, mm0 ; mm4=(** 02 ** 03)
|
||||
punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
|
||||
punpckhwd mm5, mm1 ; mm5=(** 22 ** 23)
|
||||
punpcklwd mm1, mm1 ; mm1=(20 20 21 21)
|
||||
|
||||
psrad mm4, (DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03)
|
||||
psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01)
|
||||
cvtpi2ps xmm4, mm4 ; xmm4=(02 03 ** **)
|
||||
cvtpi2ps xmm0, mm0 ; xmm0=(00 01 ** **)
|
||||
psrad mm5, (DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23)
|
||||
psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21)
|
||||
cvtpi2ps xmm5, mm5 ; xmm5=(22 23 ** **)
|
||||
cvtpi2ps xmm1, mm1 ; xmm1=(20 21 ** **)
|
||||
|
||||
punpckhwd mm6, mm2 ; mm6=(** 42 ** 43)
|
||||
punpcklwd mm2, mm2 ; mm2=(40 40 41 41)
|
||||
punpckhwd mm7, mm3 ; mm7=(** 62 ** 63)
|
||||
punpcklwd mm3, mm3 ; mm3=(60 60 61 61)
|
||||
|
||||
psrad mm6, (DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43)
|
||||
psrad mm2, (DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41)
|
||||
cvtpi2ps xmm6, mm6 ; xmm6=(42 43 ** **)
|
||||
cvtpi2ps xmm2, mm2 ; xmm2=(40 41 ** **)
|
||||
psrad mm7, (DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63)
|
||||
psrad mm3, (DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61)
|
||||
cvtpi2ps xmm7, mm7 ; xmm7=(62 63 ** **)
|
||||
cvtpi2ps xmm3, mm3 ; xmm3=(60 61 ** **)
|
||||
|
||||
movlhps xmm0, xmm4 ; xmm0=in0=(00 01 02 03)
|
||||
movlhps xmm1, xmm5 ; xmm1=in2=(20 21 22 23)
|
||||
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
movlhps xmm2, xmm6 ; xmm2=in4=(40 41 42 43)
|
||||
movlhps xmm3, xmm7 ; xmm3=in6=(60 61 62 63)
|
||||
mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
movaps xmm4, xmm0
|
||||
movaps xmm5, xmm1
|
||||
subps xmm0, xmm2 ; xmm0=tmp11
|
||||
subps xmm1, xmm3
|
||||
addps xmm4, xmm2 ; xmm4=tmp10
|
||||
addps xmm5, xmm3 ; xmm5=tmp13
|
||||
|
||||
mulps xmm1, [GOTOFF(ebx,PD_1_414)]
|
||||
subps xmm1, xmm5 ; xmm1=tmp12
|
||||
|
||||
movaps xmm6, xmm4
|
||||
movaps xmm7, xmm0
|
||||
subps xmm4, xmm5 ; xmm4=tmp3
|
||||
subps xmm0, xmm1 ; xmm0=tmp2
|
||||
addps xmm6, xmm5 ; xmm6=tmp0
|
||||
addps xmm7, xmm1 ; xmm7=tmp1
|
||||
|
||||
movaps XMMWORD [wk(1)], xmm4 ; tmp3
|
||||
movaps XMMWORD [wk(0)], xmm0 ; tmp2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
punpckhwd mm6, mm4 ; mm6=(** 12 ** 13)
|
||||
punpcklwd mm4, mm4 ; mm4=(10 10 11 11)
|
||||
punpckhwd mm2, mm0 ; mm2=(** 32 ** 33)
|
||||
punpcklwd mm0, mm0 ; mm0=(30 30 31 31)
|
||||
|
||||
psrad mm6, (DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13)
|
||||
psrad mm4, (DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11)
|
||||
cvtpi2ps xmm4, mm6 ; xmm4=(12 13 ** **)
|
||||
cvtpi2ps xmm2, mm4 ; xmm2=(10 11 ** **)
|
||||
psrad mm2, (DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33)
|
||||
psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31)
|
||||
cvtpi2ps xmm0, mm2 ; xmm0=(32 33 ** **)
|
||||
cvtpi2ps xmm3, mm0 ; xmm3=(30 31 ** **)
|
||||
|
||||
punpckhwd mm7, mm5 ; mm7=(** 52 ** 53)
|
||||
punpcklwd mm5, mm5 ; mm5=(50 50 51 51)
|
||||
punpckhwd mm3, mm1 ; mm3=(** 72 ** 73)
|
||||
punpcklwd mm1, mm1 ; mm1=(70 70 71 71)
|
||||
|
||||
movlhps xmm2, xmm4 ; xmm2=in1=(10 11 12 13)
|
||||
movlhps xmm3, xmm0 ; xmm3=in3=(30 31 32 33)
|
||||
|
||||
psrad mm7, (DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53)
|
||||
psrad mm5, (DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51)
|
||||
cvtpi2ps xmm4, mm7 ; xmm4=(52 53 ** **)
|
||||
cvtpi2ps xmm5, mm5 ; xmm5=(50 51 ** **)
|
||||
psrad mm3, (DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73)
|
||||
psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71)
|
||||
cvtpi2ps xmm0, mm3 ; xmm0=(72 73 ** **)
|
||||
cvtpi2ps xmm1, mm1 ; xmm1=(70 71 ** **)
|
||||
|
||||
mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
movlhps xmm5, xmm4 ; xmm5=in5=(50 51 52 53)
|
||||
movlhps xmm1, xmm0 ; xmm1=in7=(70 71 72 73)
|
||||
mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
movaps xmm4, xmm2
|
||||
movaps xmm0, xmm5
|
||||
addps xmm2, xmm1 ; xmm2=z11
|
||||
addps xmm5, xmm3 ; xmm5=z13
|
||||
subps xmm4, xmm1 ; xmm4=z12
|
||||
subps xmm0, xmm3 ; xmm0=z10
|
||||
|
||||
movaps xmm1, xmm2
|
||||
subps xmm2, xmm5
|
||||
addps xmm1, xmm5 ; xmm1=tmp7
|
||||
|
||||
mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
|
||||
|
||||
movaps xmm3, xmm0
|
||||
addps xmm0, xmm4
|
||||
mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
|
||||
mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
|
||||
mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
|
||||
addps xmm3, xmm0 ; xmm3=tmp12
|
||||
subps xmm4, xmm0 ; xmm4=tmp10
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
subps xmm3, xmm1 ; xmm3=tmp6
|
||||
movaps xmm5, xmm6
|
||||
movaps xmm0, xmm7
|
||||
addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
|
||||
addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
|
||||
subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
|
||||
subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
|
||||
subps xmm2, xmm3 ; xmm2=tmp5
|
||||
|
||||
movaps xmm1, xmm6 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
|
||||
unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
|
||||
movaps xmm3, xmm0 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
|
||||
unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
|
||||
|
||||
movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
|
||||
movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
|
||||
|
||||
movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
|
||||
movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
|
||||
|
||||
addps xmm4, xmm2 ; xmm4=tmp4
|
||||
movaps xmm0, xmm7
|
||||
movaps xmm3, xmm5
|
||||
addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
|
||||
addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
|
||||
subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
|
||||
subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
|
||||
|
||||
movaps xmm2, xmm7 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
|
||||
unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
|
||||
movaps xmm4, xmm5 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
|
||||
unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
|
||||
|
||||
movaps xmm3, xmm6 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
|
||||
unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
|
||||
movaps xmm0, xmm1 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
|
||||
unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
|
||||
|
||||
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
|
||||
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
|
||||
movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
|
||||
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
|
||||
|
||||
movaps xmm6, xmm5 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
|
||||
unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
|
||||
movaps xmm3, xmm4 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
|
||||
unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
|
||||
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
|
||||
movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
|
||||
movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
|
||||
.nextcolumn:
|
||||
add esi, byte 4*SIZEOF_JCOEF ; coef_block
|
||||
add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
|
||||
add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
|
||||
dec ecx ; ctr
|
||||
jnz near .columnloop
|
||||
|
||||
; -- Prefetch the next coefficient block
|
||||
|
||||
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
|
||||
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
|
||||
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
|
||||
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
mov eax, [original_ebp]
|
||||
lea esi, [workspace] ; FAST_FLOAT *wsptr
|
||||
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [output_col(eax)]
|
||||
mov ecx, DCTSIZE/4 ; ctr
|
||||
alignx 16, 7
|
||||
.rowloop:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
movaps xmm4, xmm0
|
||||
movaps xmm5, xmm1
|
||||
subps xmm0, xmm2 ; xmm0=tmp11
|
||||
subps xmm1, xmm3
|
||||
addps xmm4, xmm2 ; xmm4=tmp10
|
||||
addps xmm5, xmm3 ; xmm5=tmp13
|
||||
|
||||
mulps xmm1, [GOTOFF(ebx,PD_1_414)]
|
||||
subps xmm1, xmm5 ; xmm1=tmp12
|
||||
|
||||
movaps xmm6, xmm4
|
||||
movaps xmm7, xmm0
|
||||
subps xmm4, xmm5 ; xmm4=tmp3
|
||||
subps xmm0, xmm1 ; xmm0=tmp2
|
||||
addps xmm6, xmm5 ; xmm6=tmp0
|
||||
addps xmm7, xmm1 ; xmm7=tmp1
|
||||
|
||||
movaps XMMWORD [wk(1)], xmm4 ; tmp3
|
||||
movaps XMMWORD [wk(0)], xmm0 ; tmp2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
movaps xmm4, xmm2
|
||||
movaps xmm0, xmm5
|
||||
addps xmm2, xmm1 ; xmm2=z11
|
||||
addps xmm5, xmm3 ; xmm5=z13
|
||||
subps xmm4, xmm1 ; xmm4=z12
|
||||
subps xmm0, xmm3 ; xmm0=z10
|
||||
|
||||
movaps xmm1, xmm2
|
||||
subps xmm2, xmm5
|
||||
addps xmm1, xmm5 ; xmm1=tmp7
|
||||
|
||||
mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
|
||||
|
||||
movaps xmm3, xmm0
|
||||
addps xmm0, xmm4
|
||||
mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
|
||||
mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
|
||||
mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
|
||||
addps xmm3, xmm0 ; xmm3=tmp12
|
||||
subps xmm4, xmm0 ; xmm4=tmp10
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
subps xmm3, xmm1 ; xmm3=tmp6
|
||||
movaps xmm5, xmm6
|
||||
movaps xmm0, xmm7
|
||||
addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
|
||||
addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
|
||||
subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
|
||||
subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
|
||||
subps xmm2, xmm3 ; xmm2=tmp5
|
||||
|
||||
movaps xmm1, [GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125]
|
||||
|
||||
mulps xmm6, xmm1 ; descale(1/8)
|
||||
mulps xmm7, xmm1 ; descale(1/8)
|
||||
mulps xmm5, xmm1 ; descale(1/8)
|
||||
mulps xmm0, xmm1 ; descale(1/8)
|
||||
|
||||
movhlps xmm3, xmm6
|
||||
movhlps xmm1, xmm7
|
||||
cvtps2pi mm0, xmm6 ; round to int32, mm0=data0L=(00 10)
|
||||
cvtps2pi mm1, xmm7 ; round to int32, mm1=data1L=(01 11)
|
||||
cvtps2pi mm2, xmm3 ; round to int32, mm2=data0H=(20 30)
|
||||
cvtps2pi mm3, xmm1 ; round to int32, mm3=data1H=(21 31)
|
||||
packssdw mm0, mm2 ; mm0=data0=(00 10 20 30)
|
||||
packssdw mm1, mm3 ; mm1=data1=(01 11 21 31)
|
||||
|
||||
movhlps xmm6, xmm5
|
||||
movhlps xmm7, xmm0
|
||||
cvtps2pi mm4, xmm5 ; round to int32, mm4=data7L=(07 17)
|
||||
cvtps2pi mm5, xmm0 ; round to int32, mm5=data6L=(06 16)
|
||||
cvtps2pi mm6, xmm6 ; round to int32, mm6=data7H=(27 37)
|
||||
cvtps2pi mm7, xmm7 ; round to int32, mm7=data6H=(26 36)
|
||||
packssdw mm4, mm6 ; mm4=data7=(07 17 27 37)
|
||||
packssdw mm5, mm7 ; mm5=data6=(06 16 26 36)
|
||||
|
||||
packsswb mm0, mm5 ; mm0=(00 10 20 30 06 16 26 36)
|
||||
packsswb mm1, mm4 ; mm1=(01 11 21 31 07 17 27 37)
|
||||
|
||||
movaps xmm3, XMMWORD [wk(0)] ; xmm3=tmp2
|
||||
movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
|
||||
|
||||
movaps xmm6, [GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125]
|
||||
|
||||
addps xmm4, xmm2 ; xmm4=tmp4
|
||||
movaps xmm5, xmm3
|
||||
movaps xmm0, xmm1
|
||||
addps xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
|
||||
addps xmm1, xmm4 ; xmm1=data4=(04 14 24 34)
|
||||
subps xmm5, xmm2 ; xmm5=data5=(05 15 25 35)
|
||||
subps xmm0, xmm4 ; xmm0=data3=(03 13 23 33)
|
||||
|
||||
mulps xmm3, xmm6 ; descale(1/8)
|
||||
mulps xmm1, xmm6 ; descale(1/8)
|
||||
mulps xmm5, xmm6 ; descale(1/8)
|
||||
mulps xmm0, xmm6 ; descale(1/8)
|
||||
|
||||
movhlps xmm7, xmm3
|
||||
movhlps xmm2, xmm1
|
||||
cvtps2pi mm2, xmm3 ; round to int32, mm2=data2L=(02 12)
|
||||
cvtps2pi mm3, xmm1 ; round to int32, mm3=data4L=(04 14)
|
||||
cvtps2pi mm6, xmm7 ; round to int32, mm6=data2H=(22 32)
|
||||
cvtps2pi mm7, xmm2 ; round to int32, mm7=data4H=(24 34)
|
||||
packssdw mm2, mm6 ; mm2=data2=(02 12 22 32)
|
||||
packssdw mm3, mm7 ; mm3=data4=(04 14 24 34)
|
||||
|
||||
movhlps xmm4, xmm5
|
||||
movhlps xmm6, xmm0
|
||||
cvtps2pi mm5, xmm5 ; round to int32, mm5=data5L=(05 15)
|
||||
cvtps2pi mm4, xmm0 ; round to int32, mm4=data3L=(03 13)
|
||||
cvtps2pi mm6, xmm4 ; round to int32, mm6=data5H=(25 35)
|
||||
cvtps2pi mm7, xmm6 ; round to int32, mm7=data3H=(23 33)
|
||||
packssdw mm5, mm6 ; mm5=data5=(05 15 25 35)
|
||||
packssdw mm4, mm7 ; mm4=data3=(03 13 23 33)
|
||||
|
||||
movq mm6, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP]
|
||||
|
||||
packsswb mm2, mm3 ; mm2=(02 12 22 32 04 14 24 34)
|
||||
packsswb mm4, mm5 ; mm4=(03 13 23 33 05 15 25 35)
|
||||
|
||||
paddb mm0, mm6
|
||||
paddb mm1, mm6
|
||||
paddb mm2, mm6
|
||||
paddb mm4, mm6
|
||||
|
||||
movq mm7, mm0 ; transpose coefficients(phase 1)
|
||||
punpcklbw mm0, mm1 ; mm0=(00 01 10 11 20 21 30 31)
|
||||
punpckhbw mm7, mm1 ; mm7=(06 07 16 17 26 27 36 37)
|
||||
movq mm3, mm2 ; transpose coefficients(phase 1)
|
||||
punpcklbw mm2, mm4 ; mm2=(02 03 12 13 22 23 32 33)
|
||||
punpckhbw mm3, mm4 ; mm3=(04 05 14 15 24 25 34 35)
|
||||
|
||||
movq mm5, mm0 ; transpose coefficients(phase 2)
|
||||
punpcklwd mm0, mm2 ; mm0=(00 01 02 03 10 11 12 13)
|
||||
punpckhwd mm5, mm2 ; mm5=(20 21 22 23 30 31 32 33)
|
||||
movq mm6, mm3 ; transpose coefficients(phase 2)
|
||||
punpcklwd mm3, mm7 ; mm3=(04 05 06 07 14 15 16 17)
|
||||
punpckhwd mm6, mm7 ; mm6=(24 25 26 27 34 35 36 37)
|
||||
|
||||
movq mm1, mm0 ; transpose coefficients(phase 3)
|
||||
punpckldq mm0, mm3 ; mm0=(00 01 02 03 04 05 06 07)
|
||||
punpckhdq mm1, mm3 ; mm1=(10 11 12 13 14 15 16 17)
|
||||
movq mm4, mm5 ; transpose coefficients(phase 3)
|
||||
punpckldq mm5, mm6 ; mm5=(20 21 22 23 24 25 26 27)
|
||||
punpckhdq mm4, mm6 ; mm4=(30 31 32 33 34 35 36 37)
|
||||
|
||||
pushpic ebx ; save GOT address
|
||||
|
||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
|
||||
movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
|
||||
mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
|
||||
mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
|
||||
movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
|
||||
movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
|
||||
|
||||
poppic ebx ; restore GOT address
|
||||
|
||||
add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
|
||||
add edi, byte 4*SIZEOF_JSAMPROW
|
||||
dec ecx ; ctr
|
||||
jnz near .rowloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
497
TMessagesProj/jni/mozjpeg/simd/i386/jidctflt-sse2.asm
Normal file
497
TMessagesProj/jni/mozjpeg/simd/i386/jidctflt-sse2.asm
Normal file
|
|
@ -0,0 +1,497 @@
|
|||
;
|
||||
; jidctflt.asm - floating-point IDCT (SSE & SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a floating-point implementation of the inverse DCT
|
||||
; (Discrete Cosine Transform). The following code is based directly on
|
||||
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; unpcklps2 dst, src
; Combines the low halves of two packed-single registers: SHUFPS selector
; 0x44 keeps elements 0-1 of %1 where they are and copies elements 0-1 of %2
; into elements 2-3 of %1.  Only %1 is modified.
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
|
||||
shufps %1, %2, 0x44
|
||||
%endmacro
|
||||
|
||||
; unpckhps2 dst, src
; Combines the high halves of two packed-single registers: SHUFPS selector
; 0xEE moves elements 2-3 of %1 down to elements 0-1 and copies elements 2-3
; of %2 into elements 2-3 of %1.  Only %1 is modified.
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
|
||||
shufps %1, %2, 0xEE
|
||||
%endmacro
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for jsimd_idct_float_sse2.  Each PD_* entry is one
; single-precision value replicated four times so it can be used directly as a
; packed (4 x float) memory operand; the code reaches them via GOTOFF(ebx,...).
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_idct_float_sse2)
|
||||
|
||||
EXTN(jconst_idct_float_sse2):
|
||||
|
||||
; ~sqrt(2); scales (in2 - in6) in the even part and (z11 - z13) in the odd part.
PD_1_414 times 4 dd 1.414213562373095048801689
|
||||
; Scales (z10 + z12) to form z5 in the odd part.
PD_1_847 times 4 dd 1.847759065022573512256366
|
||||
; Multiplier applied to z12 in the odd part.
PD_1_082 times 4 dd 1.082392200292393968799446
|
||||
; Multiplier applied to z10 in the odd part (note the negative sign).
PD_M2_613 times 4 dd -2.613125929752753055713286
|
||||
; 100663296.0 = 1.5 * 2^26.  Adding it to a float leaves round(x / 8) in the
; low bits of the result's bit pattern (the ulp at that magnitude is 8), which
; is how pass 2 descales and converts to integer without a cvt instruction.
; Only the low 16 bits of each lane are kept afterwards.
PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
|
||||
; Byte-wise bias added (paddb) to the saturated signed samples in pass 2.
; CENTERJSAMPLE is defined outside this file.
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; Stack-argument accessors.  (b) must be the value esp had immediately after
; the prologue's `push ebp` (the prologue keeps it in eax and also stores it at
; [original_ebp]): [b+0] = caller's ebp, [b+4] = return address, [b+8] onward
; = the four arguments in declaration order.
%define dct_table(b) (b) + 8 ; void *dct_table
|
||||
%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
|
||||
%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
|
||||
%define output_col(b) (b) + 20 ; JDIMENSION output_col
|
||||
|
||||
; Local frame, addressed relative to the 16-byte-aligned ebp set up in the
; prologue:
;   [ebp + 0]                    saved pre-alignment stack pointer
;   wk(WK_NUM-1) .. wk(0)        XMMWORD scratch slots directly below ebp
;   workspace                    DCTSIZE2 FAST_FLOATs directly below wk(0)
; WK_NUM is defined after wk(i); this works because %define bodies are
; expanded at the point of use.
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
|
||||
; FAST_FLOAT workspace[DCTSIZE2]
|
||||
|
||||
; --------------------------------------------------------------------------
; jsimd_idct_float_sse2 -- entry point and prologue.
;
; Arguments are read from the stack through the dct_table()/coef_block()/
; output_buf()/output_col() accessors.  The prologue builds an XMMWORD-aligned
; frame so that wk() and workspace can be accessed with movaps, and saves the
; registers this routine must preserve (ebx, esi, edi; ebp is restored in the
; epilogue).  ecx and edx are used freely.
;
; On exit from this prologue:
;   eax = stack pointer as it was right after `push ebp` (argument base)
;   ebp = aligned frame base, [ebp] = that same saved pointer
;   ebx = base register used by GOTOFF() to address the PD_*/PB_* constants
; --------------------------------------------------------------------------
align 32
|
||||
GLOBAL_FUNCTION(jsimd_idct_float_sse2)
|
||||
|
||||
EXTN(jsimd_idct_float_sse2):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
; Reserve one pointer-sized slot, then round esp down to an XMMWORD boundary;
; the slot ends up at the aligned address and receives the saved pointer.
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
; Move esp below wk[] and workspace[] so the pushes that follow cannot
; overwrite them.
lea esp, [workspace]
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
; get_GOT is a macro defined outside this file; it must leave eax untouched,
; because pass 1 still uses eax as the argument base.
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process columns from input, store into work array.
|
||||
|
||||
; Register roles for pass 1:
;   edx = quantptr (dequantization multipliers), esi = inptr (coefficients),
;   edi = wsptr (float workspace), ecx = loop counter (DCTSIZE/4 iterations,
;   four columns per iteration), ebx = constant base for GOTOFF().
; eax still holds the argument base from the prologue, which is why the reload
; below is commented out.
; mov eax, [original_ebp]
|
||||
mov edx, POINTER [dct_table(eax)] ; quantptr
|
||||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
lea edi, [workspace] ; FAST_FLOAT *wsptr
|
||||
mov ecx, DCTSIZE/4 ; ctr
|
||||
alignx 16, 7
|
||||
.columnloop:
|
||||
; Optional shortcut (compiled out when NO_ZERO_COLUMN_TEST_FLOAT_SSE is
; defined): if every AC coefficient of the four columns handled in this
; iteration is zero, the IDCT of each column is just its dequantized DC term.
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
|
||||
; Cheap early-out: OR one dword from row 1 with one from row 2 and take the
; full-DCT path at once if any bit is set.  eax is clobbered from here on.
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz near .columnDCT
|
||||
|
||||
; Full test: OR together the four coefficients of rows 1..7.
movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm1, xmm2
|
||||
por xmm3, xmm4
|
||||
por xmm5, xmm6
|
||||
por xmm1, xmm3
|
||||
por xmm5, xmm7
|
||||
por xmm1, xmm5
|
||||
; Saturating word->byte pack keeps "non-zero" non-zero, so the four OR-ed
; words collapse into the low 32 bits and can be tested in one go.
packsswb xmm1, xmm1
|
||||
movd eax, xmm1
|
||||
test eax, eax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
; Sign-extend the four 16-bit DC terms to 32 bits (duplicate each word, then
; arithmetic shift right), convert to float and dequantize.
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||
psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
|
||||
cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
|
||||
|
||||
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
movaps xmm1, xmm0
|
||||
movaps xmm2, xmm0
|
||||
movaps xmm3, xmm0
|
||||
|
||||
; Broadcast each column's DC value across a whole register.
shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
|
||||
shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
|
||||
shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
|
||||
shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
|
||||
|
||||
; Each broadcast value fills both XMMWORD halves (block columns 0 and 1) of
; its workspace row, matching the transposed layout .columnDCT writes.
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
|
||||
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
|
||||
movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
|
||||
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
|
||||
movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
|
||||
movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
|
||||
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
jmp near .nextcolumn
|
||||
alignx 16, 7
|
||||
%endif
|
||||
; Full pass-1 IDCT for four columns at a time.
; In:  esi = coefficients, edx = dequantization multipliers, edi = workspace,
;      ebx = constant base for GOTOFF().
; Every coefficient row is loaded as four 16-bit values, sign-extended to
; 32 bits (punpcklwd + psrad), converted to float and multiplied by the
; matching multiplier row.  All eight xmm registers are live at the peak, so
; tmp2/tmp3 (and later two partially transposed rows) are parked in wk(0) and
; wk(1).  Results are transposed before being stored to the workspace.
.columnDCT:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||
punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
|
||||
psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
|
||||
psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
|
||||
cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
|
||||
cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
|
||||
|
||||
punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
|
||||
punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
|
||||
psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
|
||||
psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
|
||||
cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
|
||||
cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
|
||||
|
||||
; Dequantize.
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
movaps xmm4, xmm0
|
||||
movaps xmm5, xmm1
|
||||
subps xmm0, xmm2 ; xmm0=tmp11
|
||||
subps xmm1, xmm3
|
||||
addps xmm4, xmm2 ; xmm4=tmp10
|
||||
addps xmm5, xmm3 ; xmm5=tmp13
|
||||
|
||||
mulps xmm1, [GOTOFF(ebx,PD_1_414)]
|
||||
subps xmm1, xmm5 ; xmm1=tmp12
|
||||
|
||||
movaps xmm6, xmm4
|
||||
movaps xmm7, xmm0
|
||||
subps xmm4, xmm5 ; xmm4=tmp3
|
||||
subps xmm0, xmm1 ; xmm0=tmp2
|
||||
addps xmm6, xmm5 ; xmm6=tmp0
|
||||
addps xmm7, xmm1 ; xmm7=tmp1
|
||||
|
||||
; Spill tmp2/tmp3; tmp0/tmp1 stay in xmm6/xmm7 across the odd part.
movaps XMMWORD [wk(1)], xmm4 ; tmp3
|
||||
movaps XMMWORD [wk(0)], xmm0 ; tmp2
|
||||
|
||||
; -- Odd part
; Note: the names tmp10/tmp11/tmp12 below are reused for new values and are
; not the even-part quantities of the same name.
|
||||
|
||||
movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
|
||||
punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
|
||||
psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
|
||||
psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
|
||||
cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
|
||||
cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
|
||||
|
||||
punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
|
||||
punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
|
||||
psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
|
||||
psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
|
||||
cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
|
||||
cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
|
||||
|
||||
; Dequantize.
mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
movaps xmm4, xmm2
|
||||
movaps xmm0, xmm5
|
||||
addps xmm2, xmm1 ; xmm2=z11
|
||||
addps xmm5, xmm3 ; xmm5=z13
|
||||
subps xmm4, xmm1 ; xmm4=z12
|
||||
subps xmm0, xmm3 ; xmm0=z10
|
||||
|
||||
movaps xmm1, xmm2
|
||||
subps xmm2, xmm5
|
||||
addps xmm1, xmm5 ; xmm1=tmp7
|
||||
|
||||
mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
|
||||
|
||||
movaps xmm3, xmm0
|
||||
addps xmm0, xmm4
|
||||
mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
|
||||
mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
|
||||
mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
|
||||
addps xmm3, xmm0 ; xmm3=tmp12
|
||||
subps xmm4, xmm0 ; xmm4=tmp10
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
subps xmm3, xmm1 ; xmm3=tmp6
|
||||
movaps xmm5, xmm6
|
||||
movaps xmm0, xmm7
|
||||
addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
|
||||
addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
|
||||
subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
|
||||
subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
|
||||
subps xmm2, xmm3 ; xmm2=tmp5
|
||||
|
||||
movaps xmm1, xmm6 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
|
||||
unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
|
||||
movaps xmm3, xmm0 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
|
||||
unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
|
||||
|
||||
; Swap: fetch tmp2/tmp3 out of wk[] and reuse the same slots for the
; half-transposed rows 6/7.
movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
|
||||
movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
|
||||
|
||||
movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
|
||||
movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
|
||||
|
||||
addps xmm4, xmm2 ; xmm4=tmp4
|
||||
movaps xmm0, xmm7
|
||||
movaps xmm3, xmm5
|
||||
addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
|
||||
addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
|
||||
subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
|
||||
subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
|
||||
|
||||
movaps xmm2, xmm7 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
|
||||
unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
|
||||
movaps xmm4, xmm5 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
|
||||
unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
|
||||
|
||||
movaps xmm3, xmm6 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
|
||||
unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
|
||||
movaps xmm0, xmm1 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
|
||||
unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
|
||||
|
||||
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
|
||||
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
|
||||
|
||||
; First XMMWORD of each of the four workspace rows: results of rows 0..3.
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
|
||||
movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
|
||||
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
|
||||
|
||||
movaps xmm6, xmm5 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
|
||||
unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
|
||||
movaps xmm3, xmm4 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
|
||||
unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
|
||||
|
||||
; Second XMMWORD of each of the four workspace rows: results of rows 4..7.
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
|
||||
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
|
||||
movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
|
||||
movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
|
||||
; Common loop tail (also the target of the zero-AC shortcut): step the input
; and multiplier pointers four columns to the right and the workspace pointer
; four full rows down, then loop until ecx reaches zero.
.nextcolumn:
|
||||
add esi, byte 4*SIZEOF_JCOEF ; coef_block
|
||||
add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
|
||||
add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
|
||||
dec ecx ; ctr
|
||||
jnz near .columnloop
|
||||
|
||||
; -- Prefetch the next coefficient block
; esi has already been advanced by the column loop, so the (DCTSIZE2-8)
; offset is what brings the address to one full block past the start of the
; current one (assumes the loop advanced esi by 8 coefficients in total, i.e.
; DCTSIZE == 8 -- DCTSIZE is defined outside this file).  Four non-temporal
; prefetches cover 4*32 bytes starting there.
|
||||
|
||||
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
|
||||
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
|
||||
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
|
||||
prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
; Register roles for pass 2:
;   esi = wsptr (float workspace), edi = pointer into the output row-pointer
;   array, eax = output_col (sample offset within each output row),
;   ecx = loop counter (DCTSIZE/4 iterations, four output rows each).
; eax was clobbered in pass 1, so the argument base is reloaded from the frame
; and then overwritten with output_col once output_buf has been fetched.
mov eax, [original_ebp]
|
||||
lea esi, [workspace] ; FAST_FLOAT *wsptr
|
||||
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [output_col(eax)]
|
||||
mov ecx, DCTSIZE/4 ; ctr
|
||||
alignx 16, 7
|
||||
; Pass-2 loop body: one iteration turns four workspace rows into four output
; rows of eight samples each.
; In:  esi = workspace, edi = output row-pointer array, eax = output_col,
;      ebx = constant base for GOTOFF(), ecx = remaining iterations.
; The butterfly mirrors pass 1 but reads floats directly.  Results are then
; descaled/rounded with PD_RNDINT_MAGIC, narrowed to bytes with saturation,
; biased by PB_CENTERJSAMP, transposed and stored.
.rowloop:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
movaps xmm4, xmm0
|
||||
movaps xmm5, xmm1
|
||||
subps xmm0, xmm2 ; xmm0=tmp11
|
||||
subps xmm1, xmm3
|
||||
addps xmm4, xmm2 ; xmm4=tmp10
|
||||
addps xmm5, xmm3 ; xmm5=tmp13
|
||||
|
||||
mulps xmm1, [GOTOFF(ebx,PD_1_414)]
|
||||
subps xmm1, xmm5 ; xmm1=tmp12
|
||||
|
||||
movaps xmm6, xmm4
|
||||
movaps xmm7, xmm0
|
||||
subps xmm4, xmm5 ; xmm4=tmp3
|
||||
subps xmm0, xmm1 ; xmm0=tmp2
|
||||
addps xmm6, xmm5 ; xmm6=tmp0
|
||||
addps xmm7, xmm1 ; xmm7=tmp1
|
||||
|
||||
; Spill tmp2/tmp3; tmp0/tmp1 stay in xmm6/xmm7 across the odd part.
movaps XMMWORD [wk(1)], xmm4 ; tmp3
|
||||
movaps XMMWORD [wk(0)], xmm0 ; tmp2
|
||||
|
||||
; -- Odd part
; Note: the names tmp10/tmp11/tmp12 below are reused for new values and are
; not the even-part quantities of the same name.
|
||||
|
||||
movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
movaps xmm4, xmm2
|
||||
movaps xmm0, xmm5
|
||||
addps xmm2, xmm1 ; xmm2=z11
|
||||
addps xmm5, xmm3 ; xmm5=z13
|
||||
subps xmm4, xmm1 ; xmm4=z12
|
||||
subps xmm0, xmm3 ; xmm0=z10
|
||||
|
||||
movaps xmm1, xmm2
|
||||
subps xmm2, xmm5
|
||||
addps xmm1, xmm5 ; xmm1=tmp7
|
||||
|
||||
mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
|
||||
|
||||
movaps xmm3, xmm0
|
||||
addps xmm0, xmm4
|
||||
mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
|
||||
mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
|
||||
mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
|
||||
addps xmm3, xmm0 ; xmm3=tmp12
|
||||
subps xmm4, xmm0 ; xmm4=tmp10
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
subps xmm3, xmm1 ; xmm3=tmp6
|
||||
movaps xmm5, xmm6
|
||||
movaps xmm0, xmm7
|
||||
addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
|
||||
addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
|
||||
subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
|
||||
subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
|
||||
subps xmm2, xmm3 ; xmm2=tmp5
|
||||
|
||||
; Descale and round without a convert instruction: adding the magic constant
; leaves round(x/8) in the low 16 bits of each 32-bit lane.  xmm3 becomes a
; per-lane 0x0000FFFF mask (all-ones shifted right by WORD_BIT).
movaps xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]
|
||||
pcmpeqd xmm3, xmm3
|
||||
psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
|
||||
|
||||
addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
|
||||
addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
|
||||
addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
|
||||
addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
|
||||
|
||||
; Interleave two result columns per register as 16-bit words: the even
; column is masked into the low word, the odd column shifted into the high
; word, then the two are OR-ed together.
pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
|
||||
pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
|
||||
pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
|
||||
pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
|
||||
por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
|
||||
por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
|
||||
|
||||
movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
|
||||
movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
|
||||
|
||||
addps xmm4, xmm2 ; xmm4=tmp4
|
||||
movaps xmm7, xmm1
|
||||
movaps xmm5, xmm3
|
||||
addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
|
||||
addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
|
||||
subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
|
||||
subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
|
||||
|
||||
; Same round / mask / interleave sequence for columns 2..5; the constant and
; mask are rebuilt because their previous registers were reused.
movaps xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]
|
||||
pcmpeqd xmm4, xmm4
|
||||
psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
|
||||
|
||||
addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
|
||||
addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
|
||||
addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
|
||||
addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
|
||||
|
||||
pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
|
||||
pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
|
||||
pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
|
||||
pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
|
||||
por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
|
||||
por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
|
||||
|
||||
movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
|
||||
|
||||
; Narrow the signed words to signed bytes with saturation, then add the
; per-byte centre bias (wrapping add).
packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
|
||||
packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
|
||||
paddb xmm6, xmm2
|
||||
paddb xmm1, xmm2
|
||||
|
||||
movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
|
||||
punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
||||
punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
||||
|
||||
movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
|
||||
punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
||||
punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
||||
|
||||
; movq stores only the low 64 bits, so swap the 64-bit halves to bring output
; rows 1 and 3 into the low half of their own registers.
pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
||||
pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
||||
|
||||
; ebx is borrowed as a second row pointer below, so its current value is
; saved and restored around the stores.
pushpic ebx ; save GOT address
|
||||
|
||||
; Store order is row 0, row 2, row 1, row 3: each movq writes eight samples at
; column offset eax of the row whose pointer was just loaded from [edi+k*..].
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
|
||||
movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
|
||||
mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
|
||||
movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
|
||||
|
||||
poppic ebx ; restore GOT address
|
||||
|
||||
; Advance four floats along the workspace rows and four entries in the
; row-pointer array, then loop until ecx reaches zero.
add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
|
||||
add edi, byte 4*SIZEOF_JSAMPROW
|
||||
dec ecx ; ctr
|
||||
jnz near .rowloop
|
||||
|
||||
; Epilogue: undo the prologue in reverse order.  The three pops restore the
; registers pushed after the frame was built; esp is then pointed at the
; aligned frame base, whose first slot holds the stack pointer saved before
; alignment, so `pop esp` reinstates it and `pop ebp` restores the caller's
; ebp before returning.
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
499
TMessagesProj/jni/mozjpeg/simd/i386/jidctfst-mmx.asm
Normal file
499
TMessagesProj/jni/mozjpeg/simd/i386/jidctfst-mmx.asm
Normal file
|
|
@ -0,0 +1,499 @@
|
|||
;
|
||||
; jidctfst.asm - fast integer IDCT (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a fast, not so accurate integer implementation of
|
||||
; the inverse DCT (Discrete Cosine Transform). The following code is
|
||||
; based directly on the IJG's original jidctfst.c; see the jidctfst.c
|
||||
; for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point configuration for the fast integer IDCT (MMX version).
; CONST_BITS is the number of fractional bits carried by the F_* multipliers
; defined below (each is the real constant times 2^CONST_BITS, rounded).
%define CONST_BITS 8 ; 14 is also OK.
|
||||
; PASS1_BITS is removed again, together with 3 more bits, by the final
; "psraw ..., (PASS1_BITS+3)" descale in pass 2 of the IDCT routine.
%define PASS1_BITS 2
|
||||
|
||||
; IFAST_SCALE_BITS is not defined in this file (it comes from the included
; headers); refuse to assemble if it disagrees with PASS1_BITS.
%if IFAST_SCALE_BITS != PASS1_BITS
|
||||
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
|
||||
%endif
|
||||
|
||||
; Multipliers. For the default CONST_BITS == 8 they are spelled out as
; literals (constant * 256, rounded to nearest).
%if CONST_BITS == 8
|
||||
F_1_082 equ 277 ; FIX(1.082392200)
|
||||
F_1_414 equ 362 ; FIX(1.414213562)
|
||||
F_1_847 equ 473 ; FIX(1.847759065)
|
||||
F_2_613 equ 669 ; FIX(2.613125930)
|
||||
F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
; So the constants are given pre-scaled by 2^30 and DESCALE(x, n) shifts them
; right by n bits with round-to-nearest (adds half of 2^n before shifting).
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
|
||||
F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200)
|
||||
F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562)
|
||||
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
|
||||
F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930)
|
||||
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Read-only constant table used by jsimd_idct_ifast_mmx. The code reaches the
; entries through GOTOFF(ebx, <label>).
SECTION SEG_CONST
|
||||
|
||||
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
|
||||
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
|
||||
|
||||
; The multiplicand is shifted left by PRE_MULTIPLY_SCALE_BITS before pmulhw and
; the multiplier is stored shifted left by CONST_SHIFT, so that keeping only
; the high 16 bits of the product yields value * constant.
%define PRE_MULTIPLY_SCALE_BITS 2
|
||||
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_idct_ifast_mmx)
|
||||
|
||||
EXTN(jconst_idct_ifast_mmx):
|
||||
|
||||
; Each PW_* entry is one multiplier replicated into four 16-bit words
; (one MMWORD). The 2.613 constant is stored as -(2.613 - 1): with
; CONST_BITS == 8, F_2_613 << CONST_SHIFT would be 669 * 64 = 42816, which
; does not fit in a signed 16-bit word; the code subtracts the remaining
; 1 * z10 separately.
PW_F1414 times 4 dw F_1_414 << CONST_SHIFT
|
||||
PW_F1847 times 4 dw F_1_847 << CONST_SHIFT
|
||||
PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT
|
||||
PW_F1082 times 4 dw F_1_082 << CONST_SHIFT
|
||||
; Eight bytes of CENTERJSAMPLE, added to the packed output bytes in pass 2.
PB_CENTERJSAMP times 8 db CENTERJSAMPLE
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_ifast_mmx(void *dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; Stack-argument accessors. (b) is a register holding the address of the
; saved ebp pushed on entry, so the first argument lives at (b) + 8 (above the
; saved ebp and the return address).
%define dct_table(b) (b) + 8 ; void *dct_table
|
||||
%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
|
||||
%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
|
||||
%define output_col(b) (b) + 20 ; JDIMENSION output_col
|
||||
|
||||
; Local frame, addressed relative to the aligned ebp set up in the prologue.
; [original_ebp] is the slot where the prologue stores the unaligned frame
; pointer used for argument access and for restoring esp on exit.
%define original_ebp ebp + 0
|
||||
; wk(i) is the i-th MMWORD scratch slot; the WK_NUM slots sit directly below
; ebp (wk(WK_NUM-1) is the one adjacent to ebp).
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
|
||||
; mmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
; workspace is the DCTSIZE2-element JCOEF buffer placed just below wk(0); it
; carries the pass 1 results into pass 2.
%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF
|
||||
; JCOEF workspace[DCTSIZE2]
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_idct_ifast_mmx)
|
||||
|
||||
; jsimd_idct_ifast_mmx(dct_table, coef_block, output_buf, output_col)
;
; Dequantizes one block of coefficients and applies the fast integer inverse
; DCT in two passes: pass 1 works on columns and writes to a stack workspace,
; pass 2 works on the workspace rows and stores samples through output_buf.
; Arguments are read from the stack via the dct_table()/coef_block()/
; output_buf()/output_col() accessors. ebx, esi and edi are saved and
; restored; eax, ecx and edx are used as scratch and not preserved.
EXTN(jsimd_idct_ifast_mmx):
|
||||
; Prologue: build a frame whose base (ebp) is aligned to SIZEOF_MMWORD while
; keeping the unaligned frame pointer reachable at [ebp].
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
; Reserve one dword for the saved pointer, then round esp down.
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
; Store the unaligned frame pointer at the aligned base ([original_ebp]).
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
; Allocate wk[] and the JCOEF workspace below ebp in one step.
lea esp, [workspace]
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process columns from input, store into work array.
|
||||
|
||||
; eax was loaded with the frame pointer in the prologue and has not been
; written since, which is why the reload below is commented out.
; NOTE(review): this relies on get_GOT leaving eax intact - the macro is
; defined outside this file; confirm.
; mov eax, [original_ebp]
|
||||
mov edx, POINTER [dct_table(eax)] ; quantptr
|
||||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
lea edi, [workspace] ; JCOEF *wsptr
|
||||
; Each loop iteration handles four columns (four 16-bit words per MMX
; register), so DCTSIZE/4 iterations cover the block.
mov ecx, DCTSIZE/4 ; ctr
|
||||
alignx 16, 7
|
||||
; Pass 1 loop head. On entry: esi = input coefficients, edx = quantization
; table, edi = workspace write pointer, ecx = remaining iterations.
; Unless NO_ZERO_COLUMN_TEST_IFAST_MMX is defined, first test whether every AC
; term of the four columns handled this iteration is zero; if so the column
; IDCT reduces to replicating the dequantized DC term and the full transform
; at .columnDCT is skipped.
.columnloop:
|
||||
%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
|
||||
; Cheap scalar pre-check on one dword each from rows 1 and 2: any non-zero
; bit there already rules out the shortcut.
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz short .columnDCT
|
||||
|
||||
; Full check: OR together rows 1..7 of the four columns.
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, mm0
|
||||
; Squeeze the four words into four bytes so one 32-bit test covers them all;
; signed saturation keeps every non-zero word non-zero.
packsswb mm1, mm1
|
||||
movd eax, mm1
|
||||
test eax, eax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
; Dequantize the DC row (row 0) only.
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
|
||||
; Broadcast each of the four DC words across a whole register.
movq mm2, mm0 ; mm0=in0=(00 01 02 03)
|
||||
punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
|
||||
punpckhwd mm2, mm2 ; mm2=(02 02 03 03)
|
||||
|
||||
movq mm1, mm0
|
||||
punpckldq mm0, mm0 ; mm0=(00 00 00 00)
|
||||
punpckhdq mm1, mm1 ; mm1=(01 01 01 01)
|
||||
movq mm3, mm2
|
||||
punpckldq mm2, mm2 ; mm2=(02 02 02 02)
|
||||
punpckhdq mm3, mm3 ; mm3=(03 03 03 03)
|
||||
|
||||
; Write each broadcast value to both MMWORD slots of workspace rows 0..3,
; the same eight locations the .columnDCT path fills.
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
|
||||
movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
|
||||
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
|
||||
movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
|
||||
jmp near .nextcolumn
|
||||
alignx 16, 7
|
||||
%endif
|
||||
; Pass 1 body: full 1-D IDCT on four columns at once.
; Inputs: esi = coefficients, edx = quantization multipliers, edi = workspace
; write pointer, ebx = GOT base for the PW_* constants.
; Every input row is dequantized with pmullw as it is loaded. The eight result
; rows (data0..data7) are transposed with punpck* and stored into workspace
; rows 0..3, two MMWORDs per row. All eight MMX registers are in use, so
; wk(0)/wk(1) serve as spill slots.
.columnDCT:
|
||||
|
||||
; -- Even part
|
||||
|
||||
; Load and dequantize rows 0, 2, 4, 6.
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
|
||||
movq mm4, mm0
|
||||
movq mm5, mm1
|
||||
psubw mm0, mm2 ; mm0=tmp11
|
||||
psubw mm1, mm3
|
||||
paddw mm4, mm2 ; mm4=tmp10
|
||||
paddw mm5, mm3 ; mm5=tmp13
|
||||
|
||||
; Fixed-point multiply by 1.414: pre-shift left, then pmulhw keeps the high
; 16 bits of the product with the pre-scaled constant.
psllw mm1, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw mm1, [GOTOFF(ebx,PW_F1414)]
|
||||
psubw mm1, mm5 ; mm1=tmp12
|
||||
|
||||
movq mm6, mm4
|
||||
movq mm7, mm0
|
||||
psubw mm4, mm5 ; mm4=tmp3
|
||||
psubw mm0, mm1 ; mm0=tmp2
|
||||
paddw mm6, mm5 ; mm6=tmp0
|
||||
paddw mm7, mm1 ; mm7=tmp1
|
||||
|
||||
; Spill tmp3/tmp2 to free mm4 and mm0 for the odd part.
movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
|
||||
movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
; Load and dequantize rows 1, 3, 5, 7.
movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
|
||||
movq mm4, mm2
|
||||
movq mm0, mm5
|
||||
psubw mm2, mm1 ; mm2=z12
|
||||
psubw mm5, mm3 ; mm5=z10
|
||||
paddw mm4, mm1 ; mm4=z11
|
||||
paddw mm0, mm3 ; mm0=z13
|
||||
|
||||
; Keep an unshifted copy of z10: it is subtracted once below to complete the
; split 2.613 multiply.
movq mm1, mm5 ; mm1=z10(unscaled)
|
||||
psllw mm2, PRE_MULTIPLY_SCALE_BITS
|
||||
psllw mm5, PRE_MULTIPLY_SCALE_BITS
|
||||
|
||||
movq mm3, mm4
|
||||
psubw mm4, mm0
|
||||
paddw mm3, mm0 ; mm3=tmp7
|
||||
|
||||
psllw mm4, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw mm4, [GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
|
||||
|
||||
; To avoid overflow...
|
||||
;
|
||||
; (Original)
|
||||
; tmp12 = -2.613125930 * z10 + z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp12 = (-1.613125930 - 1) * z10 + z5;
|
||||
; = -1.613125930 * z10 - z10 + z5;
|
||||
|
||||
movq mm0, mm5
|
||||
paddw mm5, mm2
|
||||
pmulhw mm5, [GOTOFF(ebx,PW_F1847)] ; mm5=z5
|
||||
pmulhw mm0, [GOTOFF(ebx,PW_MF1613)]
|
||||
pmulhw mm2, [GOTOFF(ebx,PW_F1082)]
|
||||
psubw mm0, mm1
|
||||
psubw mm2, mm5 ; mm2=tmp10
|
||||
paddw mm0, mm5 ; mm0=tmp12
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
psubw mm0, mm3 ; mm0=tmp6
|
||||
movq mm1, mm6
|
||||
movq mm5, mm7
|
||||
paddw mm6, mm3 ; mm6=data0=(00 01 02 03)
|
||||
paddw mm7, mm0 ; mm7=data1=(10 11 12 13)
|
||||
psubw mm1, mm3 ; mm1=data7=(70 71 72 73)
|
||||
psubw mm5, mm0 ; mm5=data6=(60 61 62 63)
|
||||
psubw mm4, mm0 ; mm4=tmp5
|
||||
|
||||
movq mm3, mm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm6, mm7 ; mm6=(00 10 01 11)
|
||||
punpckhwd mm3, mm7 ; mm3=(02 12 03 13)
|
||||
movq mm0, mm5 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm5, mm1 ; mm5=(60 70 61 71)
|
||||
punpckhwd mm0, mm1 ; mm0=(62 72 63 73)
|
||||
|
||||
; Swap through the spill slots: reload tmp2/tmp3, park the interleaved
; rows 6/7 in their place.
movq mm7, MMWORD [wk(0)] ; mm7=tmp2
|
||||
movq mm1, MMWORD [wk(1)] ; mm1=tmp3
|
||||
|
||||
movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71)
|
||||
movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73)
|
||||
|
||||
paddw mm2, mm4 ; mm2=tmp4
|
||||
movq mm5, mm7
|
||||
movq mm0, mm1
|
||||
paddw mm7, mm4 ; mm7=data2=(20 21 22 23)
|
||||
paddw mm1, mm2 ; mm1=data4=(40 41 42 43)
|
||||
psubw mm5, mm4 ; mm5=data5=(50 51 52 53)
|
||||
psubw mm0, mm2 ; mm0=data3=(30 31 32 33)
|
||||
|
||||
movq mm4, mm7 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm7, mm0 ; mm7=(20 30 21 31)
|
||||
punpckhwd mm4, mm0 ; mm4=(22 32 23 33)
|
||||
movq mm2, mm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm1, mm5 ; mm1=(40 50 41 51)
|
||||
punpckhwd mm2, mm5 ; mm2=(42 52 43 53)
|
||||
|
||||
movq mm0, mm6 ; transpose coefficients(phase 2)
|
||||
punpckldq mm6, mm7 ; mm6=(00 10 20 30)
|
||||
punpckhdq mm0, mm7 ; mm0=(01 11 21 31)
|
||||
movq mm5, mm3 ; transpose coefficients(phase 2)
|
||||
punpckldq mm3, mm4 ; mm3=(02 12 22 32)
|
||||
punpckhdq mm5, mm4 ; mm5=(03 13 23 33)
|
||||
|
||||
movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71)
|
||||
movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73)
|
||||
|
||||
; First MMWORD of workspace rows 0..3: elements from input rows 0..3.
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
|
||||
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
|
||||
|
||||
movq mm6, mm1 ; transpose coefficients(phase 2)
|
||||
punpckldq mm1, mm7 ; mm1=(40 50 60 70)
|
||||
punpckhdq mm6, mm7 ; mm6=(41 51 61 71)
|
||||
movq mm0, mm2 ; transpose coefficients(phase 2)
|
||||
punpckldq mm2, mm4 ; mm2=(42 52 62 72)
|
||||
punpckhdq mm0, mm4 ; mm0=(43 53 63 73)
|
||||
|
||||
; Second MMWORD of workspace rows 0..3: elements from input rows 4..7.
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
|
||||
movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
|
||||
movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
|
||||
|
||||
; End of one pass 1 iteration: step the input and quantization pointers four
; columns to the right and the workspace pointer four rows down, then loop
; until ecx reaches zero.
.nextcolumn:
|
||||
add esi, byte 4*SIZEOF_JCOEF ; coef_block
|
||||
add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr
|
||||
add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
|
||||
dec ecx ; ctr
|
||||
jnz near .columnloop
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
; Reload the unaligned frame pointer saved by the prologue to reach the
; stack arguments again; eax is then reused to hold output_col.
mov eax, [original_ebp]
|
||||
lea esi, [workspace] ; JCOEF *wsptr
|
||||
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [output_col(eax)]
|
||||
; Each pass 2 iteration produces four output rows, so DCTSIZE/4 iterations.
mov ecx, DCTSIZE/4 ; ctr
|
||||
alignx 16, 7
|
||||
; Pass 2 loop: 1-D IDCT over the workspace, four output rows per iteration.
; Inputs: esi = workspace read pointer, edi = pointer into the array of output
; row pointers, eax = output_col, ecx = remaining iterations, ebx = GOT base.
; The arithmetic mirrors pass 1 but needs no dequantization (the workspace is
; already dequantized). Results are descaled by PASS1_BITS+3, packed to bytes
; with signed saturation, biased by CENTERJSAMPLE, transposed back and stored
; as eight samples into each of four output rows.
.rowloop:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
movq mm4, mm0
|
||||
movq mm5, mm1
|
||||
psubw mm0, mm2 ; mm0=tmp11
|
||||
psubw mm1, mm3
|
||||
paddw mm4, mm2 ; mm4=tmp10
|
||||
paddw mm5, mm3 ; mm5=tmp13
|
||||
|
||||
; Fixed-point multiply by 1.414 (pre-shift, then keep the high product word).
psllw mm1, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw mm1, [GOTOFF(ebx,PW_F1414)]
|
||||
psubw mm1, mm5 ; mm1=tmp12
|
||||
|
||||
movq mm6, mm4
|
||||
movq mm7, mm0
|
||||
psubw mm4, mm5 ; mm4=tmp3
|
||||
psubw mm0, mm1 ; mm0=tmp2
|
||||
paddw mm6, mm5 ; mm6=tmp0
|
||||
paddw mm7, mm1 ; mm7=tmp1
|
||||
|
||||
; Spill tmp3/tmp2 to free registers for the odd part.
movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
|
||||
movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
movq mm4, mm2
|
||||
movq mm0, mm5
|
||||
psubw mm2, mm1 ; mm2=z12
|
||||
psubw mm5, mm3 ; mm5=z10
|
||||
paddw mm4, mm1 ; mm4=z11
|
||||
paddw mm0, mm3 ; mm0=z13
|
||||
|
||||
; Unshifted z10 is kept for the "- z10" term of the split 2.613 multiply.
movq mm1, mm5 ; mm1=z10(unscaled)
|
||||
psllw mm2, PRE_MULTIPLY_SCALE_BITS
|
||||
psllw mm5, PRE_MULTIPLY_SCALE_BITS
|
||||
|
||||
movq mm3, mm4
|
||||
psubw mm4, mm0
|
||||
paddw mm3, mm0 ; mm3=tmp7
|
||||
|
||||
psllw mm4, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw mm4, [GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
|
||||
|
||||
; To avoid overflow...
|
||||
;
|
||||
; (Original)
|
||||
; tmp12 = -2.613125930 * z10 + z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp12 = (-1.613125930 - 1) * z10 + z5;
|
||||
; = -1.613125930 * z10 - z10 + z5;
|
||||
|
||||
movq mm0, mm5
|
||||
paddw mm5, mm2
|
||||
pmulhw mm5, [GOTOFF(ebx,PW_F1847)] ; mm5=z5
|
||||
pmulhw mm0, [GOTOFF(ebx,PW_MF1613)]
|
||||
pmulhw mm2, [GOTOFF(ebx,PW_F1082)]
|
||||
psubw mm0, mm1
|
||||
psubw mm2, mm5 ; mm2=tmp10
|
||||
paddw mm0, mm5 ; mm0=tmp12
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
psubw mm0, mm3 ; mm0=tmp6
|
||||
movq mm1, mm6
|
||||
movq mm5, mm7
|
||||
paddw mm6, mm3 ; mm6=data0=(00 10 20 30)
|
||||
paddw mm7, mm0 ; mm7=data1=(01 11 21 31)
|
||||
psraw mm6, (PASS1_BITS+3) ; descale
|
||||
psraw mm7, (PASS1_BITS+3) ; descale
|
||||
psubw mm1, mm3 ; mm1=data7=(07 17 27 37)
|
||||
psubw mm5, mm0 ; mm5=data6=(06 16 26 36)
|
||||
psraw mm1, (PASS1_BITS+3) ; descale
|
||||
psraw mm5, (PASS1_BITS+3) ; descale
|
||||
psubw mm4, mm0 ; mm4=tmp5
|
||||
|
||||
; Narrow to bytes with signed saturation, two result columns per register.
packsswb mm6, mm5 ; mm6=(00 10 20 30 06 16 26 36)
|
||||
packsswb mm7, mm1 ; mm7=(01 11 21 31 07 17 27 37)
|
||||
|
||||
movq mm3, MMWORD [wk(0)] ; mm3=tmp2
|
||||
movq mm0, MMWORD [wk(1)] ; mm0=tmp3
|
||||
|
||||
paddw mm2, mm4 ; mm2=tmp4
|
||||
movq mm5, mm3
|
||||
movq mm1, mm0
|
||||
paddw mm3, mm4 ; mm3=data2=(02 12 22 32)
|
||||
paddw mm0, mm2 ; mm0=data4=(04 14 24 34)
|
||||
psraw mm3, (PASS1_BITS+3) ; descale
|
||||
psraw mm0, (PASS1_BITS+3) ; descale
|
||||
psubw mm5, mm4 ; mm5=data5=(05 15 25 35)
|
||||
psubw mm1, mm2 ; mm1=data3=(03 13 23 33)
|
||||
psraw mm5, (PASS1_BITS+3) ; descale
|
||||
psraw mm1, (PASS1_BITS+3) ; descale
|
||||
|
||||
movq mm4, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP]
|
||||
|
||||
packsswb mm3, mm0 ; mm3=(02 12 22 32 04 14 24 34)
|
||||
packsswb mm1, mm5 ; mm1=(03 13 23 33 05 15 25 35)
|
||||
|
||||
; Re-bias the signed bytes by CENTERJSAMPLE (byte-wise wrapping add).
paddb mm6, mm4
|
||||
paddb mm7, mm4
|
||||
paddb mm3, mm4
|
||||
paddb mm1, mm4
|
||||
|
||||
movq mm2, mm6 ; transpose coefficients(phase 1)
|
||||
punpcklbw mm6, mm7 ; mm6=(00 01 10 11 20 21 30 31)
|
||||
punpckhbw mm2, mm7 ; mm2=(06 07 16 17 26 27 36 37)
|
||||
movq mm0, mm3 ; transpose coefficients(phase 1)
|
||||
punpcklbw mm3, mm1 ; mm3=(02 03 12 13 22 23 32 33)
|
||||
punpckhbw mm0, mm1 ; mm0=(04 05 14 15 24 25 34 35)
|
||||
|
||||
movq mm5, mm6 ; transpose coefficients(phase 2)
|
||||
punpcklwd mm6, mm3 ; mm6=(00 01 02 03 10 11 12 13)
|
||||
punpckhwd mm5, mm3 ; mm5=(20 21 22 23 30 31 32 33)
|
||||
movq mm4, mm0 ; transpose coefficients(phase 2)
|
||||
punpcklwd mm0, mm2 ; mm0=(04 05 06 07 14 15 16 17)
|
||||
punpckhwd mm4, mm2 ; mm4=(24 25 26 27 34 35 36 37)
|
||||
|
||||
movq mm7, mm6 ; transpose coefficients(phase 3)
|
||||
punpckldq mm6, mm0 ; mm6=(00 01 02 03 04 05 06 07)
|
||||
punpckhdq mm7, mm0 ; mm7=(10 11 12 13 14 15 16 17)
|
||||
movq mm1, mm5 ; transpose coefficients(phase 3)
|
||||
punpckldq mm5, mm4 ; mm5=(20 21 22 23 24 25 26 27)
|
||||
punpckhdq mm1, mm4 ; mm1=(30 31 32 33 34 35 36 37)
|
||||
|
||||
; ebx is borrowed as a second row pointer below, so the GOT base it holds is
; saved around the stores.
pushpic ebx ; save GOT address
|
||||
|
||||
; Fetch the four output row pointers and store eight samples into each row
; at byte offset eax (output_col).
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
|
||||
movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
|
||||
mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
|
||||
mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
|
||||
movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
|
||||
movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
|
||||
|
||||
poppic ebx ; restore GOT address
|
||||
|
||||
; Advance to the next four workspace columns / next four output rows.
add esi, byte 4*SIZEOF_JCOEF ; wsptr
|
||||
add edi, byte 4*SIZEOF_JSAMPROW
|
||||
dec ecx ; ctr
|
||||
jnz near .rowloop
|
||||
|
||||
; Epilogue of jsimd_idct_ifast_mmx: leave MMX mode, restore the saved
; registers in reverse push order and unwind the aligned frame.
emms ; empty MMX state
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
; Drop wk[]/workspace, then reload esp from the slot at [ebp] where the
; prologue stored the unaligned frame pointer; that leaves the caller's ebp
; on top of the stack for the final pop.
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
501
TMessagesProj/jni/mozjpeg/simd/i386/jidctfst-sse2.asm
Normal file
501
TMessagesProj/jni/mozjpeg/simd/i386/jidctfst-sse2.asm
Normal file
|
|
@ -0,0 +1,501 @@
|
|||
;
|
||||
; jidctfst.asm - fast integer IDCT (SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a fast, not so accurate integer implementation of
|
||||
; the inverse DCT (Discrete Cosine Transform). The following code is
|
||||
; based directly on the IJG's original jidctfst.c; see the jidctfst.c
|
||||
; for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point configuration for the fast integer IDCT (SSE2 version).
; CONST_BITS is the number of fractional bits carried by the F_* multipliers
; defined below (each is the real constant times 2^CONST_BITS, rounded).
%define CONST_BITS 8 ; 14 is also OK.
|
||||
; PASS1_BITS is removed again, together with 3 more bits, by the final
; "psraw ..., (PASS1_BITS+3)" descale in pass 2 of the IDCT routine.
%define PASS1_BITS 2
|
||||
|
||||
; IFAST_SCALE_BITS is not defined in this file (it comes from the included
; headers); refuse to assemble if it disagrees with PASS1_BITS.
%if IFAST_SCALE_BITS != PASS1_BITS
|
||||
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
|
||||
%endif
|
||||
|
||||
; Multipliers. For the default CONST_BITS == 8 they are spelled out as
; literals (constant * 256, rounded to nearest).
%if CONST_BITS == 8
|
||||
F_1_082 equ 277 ; FIX(1.082392200)
|
||||
F_1_414 equ 362 ; FIX(1.414213562)
|
||||
F_1_847 equ 473 ; FIX(1.847759065)
|
||||
F_2_613 equ 669 ; FIX(2.613125930)
|
||||
F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
; So the constants are given pre-scaled by 2^30 and DESCALE(x, n) shifts them
; right by n bits with round-to-nearest (adds half of 2^n before shifting).
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
|
||||
F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200)
|
||||
F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562)
|
||||
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
|
||||
F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930)
|
||||
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Read-only constant table used by jsimd_idct_ifast_sse2. The code reaches the
; entries through GOTOFF(ebx, <label>).
SECTION SEG_CONST
|
||||
|
||||
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
|
||||
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
|
||||
|
||||
; The multiplicand is shifted left by PRE_MULTIPLY_SCALE_BITS before pmulhw and
; the multiplier is stored shifted left by CONST_SHIFT, so that keeping only
; the high 16 bits of the product yields value * constant.
%define PRE_MULTIPLY_SCALE_BITS 2
|
||||
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_idct_ifast_sse2)
|
||||
|
||||
EXTN(jconst_idct_ifast_sse2):
|
||||
|
||||
; Each PW_* entry is one multiplier replicated into eight 16-bit words
; (one XMMWORD). The 2.613 constant is stored as -(2.613 - 1): with
; CONST_BITS == 8, F_2_613 << CONST_SHIFT would be 669 * 64 = 42816, which
; does not fit in a signed 16-bit word; the code subtracts the remaining
; 1 * z10 separately.
PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
|
||||
PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
|
||||
PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
|
||||
PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
|
||||
; Sixteen bytes of CENTERJSAMPLE (one XMMWORD).
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; Stack-argument accessors. (b) is a register holding the address of the
; saved ebp pushed on entry, so the first argument lives at (b) + 8 (above the
; saved ebp and the return address).
%define dct_table(b) (b) + 8 ; void *dct_table
|
||||
%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
|
||||
%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
|
||||
%define output_col(b) (b) + 20 ; JDIMENSION output_col
|
||||
|
||||
; Local frame, addressed relative to the aligned ebp set up in the prologue.
; [original_ebp] is the slot where the prologue stores the unaligned frame
; pointer used for argument access.
%define original_ebp ebp + 0
|
||||
; wk(i) is the i-th XMMWORD scratch slot; the WK_NUM slots sit directly below
; ebp (wk(WK_NUM-1) is the one adjacent to ebp).
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)
|
||||
|
||||
EXTN(jsimd_idct_ifast_sse2):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic ebx
|
||||
; push ecx ; unused
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
|
||||
; mov eax, [original_ebp]
|
||||
mov edx, POINTER [dct_table(eax)] ; quantptr
|
||||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
|
||||
%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
|
||||
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz near .columnDCT
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm1, xmm0
|
||||
packsswb xmm1, xmm1
|
||||
packsswb xmm1, xmm1
|
||||
movd eax, xmm1
|
||||
test eax, eax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
|
||||
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||
punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
|
||||
|
||||
pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
|
||||
pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
|
||||
pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
|
||||
pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
|
||||
pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
|
||||
pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
|
||||
pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
|
||||
pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
|
||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
|
||||
jmp near .column_end
|
||||
alignx 16, 7
|
||||
%endif
|
||||
.columnDCT:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
|
||||
movdqa xmm4, xmm0
|
||||
movdqa xmm5, xmm1
|
||||
psubw xmm0, xmm2 ; xmm0=tmp11
|
||||
psubw xmm1, xmm3
|
||||
paddw xmm4, xmm2 ; xmm4=tmp10
|
||||
paddw xmm5, xmm3 ; xmm5=tmp13
|
||||
|
||||
psllw xmm1, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw xmm1, [GOTOFF(ebx,PW_F1414)]
|
||||
psubw xmm1, xmm5 ; xmm1=tmp12
|
||||
|
||||
movdqa xmm6, xmm4
|
||||
movdqa xmm7, xmm0
|
||||
psubw xmm4, xmm5 ; xmm4=tmp3
|
||||
psubw xmm0, xmm1 ; xmm0=tmp2
|
||||
paddw xmm6, xmm5 ; xmm6=tmp0
|
||||
paddw xmm7, xmm1 ; xmm7=tmp1
|
||||
|
||||
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
|
||||
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
|
||||
movdqa xmm4, xmm2
|
||||
movdqa xmm0, xmm5
|
||||
psubw xmm2, xmm1 ; xmm2=z12
|
||||
psubw xmm5, xmm3 ; xmm5=z10
|
||||
paddw xmm4, xmm1 ; xmm4=z11
|
||||
paddw xmm0, xmm3 ; xmm0=z13
|
||||
|
||||
movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
|
||||
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
|
||||
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
|
||||
|
||||
movdqa xmm3, xmm4
|
||||
psubw xmm4, xmm0
|
||||
paddw xmm3, xmm0 ; xmm3=tmp7
|
||||
|
||||
psllw xmm4, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw xmm4, [GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11
|
||||
|
||||
; To avoid overflow...
|
||||
;
|
||||
; (Original)
|
||||
; tmp12 = -2.613125930 * z10 + z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp12 = (-1.613125930 - 1) * z10 + z5;
|
||||
; = -1.613125930 * z10 - z10 + z5;
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
paddw xmm5, xmm2
|
||||
pmulhw xmm5, [GOTOFF(ebx,PW_F1847)] ; xmm5=z5
|
||||
pmulhw xmm0, [GOTOFF(ebx,PW_MF1613)]
|
||||
pmulhw xmm2, [GOTOFF(ebx,PW_F1082)]
|
||||
psubw xmm0, xmm1
|
||||
psubw xmm2, xmm5 ; xmm2=tmp10
|
||||
paddw xmm0, xmm5 ; xmm0=tmp12
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
psubw xmm0, xmm3 ; xmm0=tmp6
|
||||
movdqa xmm1, xmm6
|
||||
movdqa xmm5, xmm7
|
||||
paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
|
||||
paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
|
||||
psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
|
||||
psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
|
||||
psubw xmm4, xmm0 ; xmm4=tmp5
|
||||
|
||||
movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
|
||||
punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
|
||||
movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
|
||||
punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
|
||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
|
||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
|
||||
|
||||
paddw xmm2, xmm4 ; xmm2=tmp4
|
||||
movdqa xmm5, xmm7
|
||||
movdqa xmm0, xmm1
|
||||
paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
|
||||
paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
|
||||
psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
|
||||
psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
|
||||
|
||||
movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
|
||||
punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
|
||||
movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
|
||||
punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
|
||||
|
||||
movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
|
||||
punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
|
||||
movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
|
||||
punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
|
||||
|
||||
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
|
||||
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
|
||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
|
||||
punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
|
||||
movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
|
||||
punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
|
||||
|
||||
movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
|
||||
punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
|
||||
movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
|
||||
punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
|
||||
|
||||
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
|
||||
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
|
||||
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
|
||||
|
||||
movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
|
||||
punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
|
||||
movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
|
||||
punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
|
||||
.column_end:
|
||||
|
||||
; -- Prefetch the next coefficient block
|
||||
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
mov eax, [original_ebp]
|
||||
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [output_col(eax)]
|
||||
|
||||
; -- Even part
|
||||
|
||||
; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
|
||||
|
||||
movdqa xmm2, xmm6
|
||||
movdqa xmm0, xmm5
|
||||
psubw xmm6, xmm1 ; xmm6=tmp11
|
||||
psubw xmm5, xmm3
|
||||
paddw xmm2, xmm1 ; xmm2=tmp10
|
||||
paddw xmm0, xmm3 ; xmm0=tmp13
|
||||
|
||||
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw xmm5, [GOTOFF(ebx,PW_F1414)]
|
||||
psubw xmm5, xmm0 ; xmm5=tmp12
|
||||
|
||||
movdqa xmm1, xmm2
|
||||
movdqa xmm3, xmm6
|
||||
psubw xmm2, xmm0 ; xmm2=tmp3
|
||||
psubw xmm6, xmm5 ; xmm6=tmp2
|
||||
paddw xmm1, xmm0 ; xmm1=tmp0
|
||||
paddw xmm3, xmm5 ; xmm3=tmp1
|
||||
|
||||
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
|
||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
|
||||
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm6, xmm4
|
||||
psubw xmm0, xmm7 ; xmm0=z12
|
||||
psubw xmm4, xmm5 ; xmm4=z10
|
||||
paddw xmm2, xmm7 ; xmm2=z11
|
||||
paddw xmm6, xmm5 ; xmm6=z13
|
||||
|
||||
movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
|
||||
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
|
||||
psllw xmm4, PRE_MULTIPLY_SCALE_BITS
|
||||
|
||||
movdqa xmm5, xmm2
|
||||
psubw xmm2, xmm6
|
||||
paddw xmm5, xmm6 ; xmm5=tmp7
|
||||
|
||||
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw xmm2, [GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11
|
||||
|
||||
; To avoid overflow...
|
||||
;
|
||||
; (Original)
|
||||
; tmp12 = -2.613125930 * z10 + z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp12 = (-1.613125930 - 1) * z10 + z5;
|
||||
; = -1.613125930 * z10 - z10 + z5;
|
||||
|
||||
movdqa xmm6, xmm4
|
||||
paddw xmm4, xmm0
|
||||
pmulhw xmm4, [GOTOFF(ebx,PW_F1847)] ; xmm4=z5
|
||||
pmulhw xmm6, [GOTOFF(ebx,PW_MF1613)]
|
||||
pmulhw xmm0, [GOTOFF(ebx,PW_F1082)]
|
||||
psubw xmm6, xmm7
|
||||
psubw xmm0, xmm4 ; xmm0=tmp10
|
||||
paddw xmm6, xmm4 ; xmm6=tmp12
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
psubw xmm6, xmm5 ; xmm6=tmp6
|
||||
movdqa xmm7, xmm1
|
||||
movdqa xmm4, xmm3
|
||||
paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
|
||||
paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
|
||||
psraw xmm1, (PASS1_BITS+3) ; descale
|
||||
psraw xmm3, (PASS1_BITS+3) ; descale
|
||||
psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
|
||||
psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
|
||||
psraw xmm7, (PASS1_BITS+3) ; descale
|
||||
psraw xmm4, (PASS1_BITS+3) ; descale
|
||||
psubw xmm2, xmm6 ; xmm2=tmp5
|
||||
|
||||
packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||
packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
|
||||
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
|
||||
|
||||
paddw xmm0, xmm2 ; xmm0=tmp4
|
||||
movdqa xmm4, xmm5
|
||||
movdqa xmm7, xmm6
|
||||
paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
|
||||
paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
|
||||
psraw xmm5, (PASS1_BITS+3) ; descale
|
||||
psraw xmm6, (PASS1_BITS+3) ; descale
|
||||
psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
|
||||
psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
|
||||
psraw xmm4, (PASS1_BITS+3) ; descale
|
||||
psraw xmm7, (PASS1_BITS+3) ; descale
|
||||
|
||||
movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
|
||||
|
||||
packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
|
||||
packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
|
||||
|
||||
paddb xmm1, xmm2
|
||||
paddb xmm3, xmm2
|
||||
paddb xmm5, xmm2
|
||||
paddb xmm7, xmm2
|
||||
|
||||
movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
|
||||
punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
|
||||
punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
|
||||
movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
|
||||
punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
|
||||
punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
|
||||
|
||||
movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
|
||||
punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
||||
punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
|
||||
movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
|
||||
punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
||||
punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
|
||||
|
||||
movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
|
||||
punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
||||
punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
||||
movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
|
||||
punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
|
||||
punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
|
||||
|
||||
pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
||||
pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
||||
pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
|
||||
pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
|
||||
|
||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
|
||||
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
|
||||
mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
|
||||
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
|
||||
|
||||
mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
|
||||
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
|
||||
mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
|
||||
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; unused
|
||||
poppic ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
453
TMessagesProj/jni/mozjpeg/simd/i386/jidctint-avx2.asm
Normal file
453
TMessagesProj/jni/mozjpeg/simd/i386/jidctint-avx2.asm
Normal file
|
|
@ -0,0 +1,453 @@
|
|||
;
|
||||
; jidctint.asm - accurate integer IDCT (AVX2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, 2018, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a slow-but-accurate integer implementation of the
|
||||
; inverse DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jidctint.c; see the jidctint.c for
|
||||
; more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point scaling parameters for this IDCT:
;   CONST_BITS = number of fractional bits in the F_x_xxx multipliers below
;   PASS1_BITS = extra fractional bits left in the pass 1 result
;                (pass 1 shifts right by CONST_BITS - PASS1_BITS only)
%define CONST_BITS 13
|
||||
%define PASS1_BITS 2
|
||||
|
||||
; Right-shift counts for the end of pass 1 / pass 2.  The dodct macro picks
; one of them by pasting its pass number onto "DESCALE_P".
%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
|
||||
%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
|
||||
|
||||
; F_a_bcd = FIX(a.bcd...) = the constant scaled by 2^CONST_BITS and rounded.
; The common CONST_BITS == 13 case uses precomputed literals.
%if CONST_BITS == 13
|
||||
F_0_298 equ 2446 ; FIX(0.298631336)
|
||||
F_0_390 equ 3196 ; FIX(0.390180644)
|
||||
F_0_541 equ 4433 ; FIX(0.541196100)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_175 equ 9633 ; FIX(1.175875602)
|
||||
F_1_501 equ 12299 ; FIX(1.501321110)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_1_961 equ 16069 ; FIX(1.961570560)
|
||||
F_2_053 equ 16819 ; FIX(2.053119869)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_072 equ 25172 ; FIX(3.072711026)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
; Instead, each constant is written as an integer scaled by 2^30 and
; DESCALE() shifts it down to CONST_BITS fractional bits, adding half of the
; discarded range first so the result is rounded rather than truncated.
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
|
||||
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
|
||||
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
|
||||
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
|
||||
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
|
||||
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
|
||||
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
|
||||
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
|
||||
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions
|
||||
; %1-%4: Input/output registers
|
||||
; %5-%8: Temp registers
|
||||
|
||||
%macro dotranspose 8
|
||||
; Transposes the 8x8 block of 16-bit values held two rows per ymm register
; in %1-%4.  %5-%8 are overwritten as scratch.  Element "rc" in the layout
; comments is row r, column c of the result; each layout lists the sixteen
; words of a register from lowest to highest.
;
; Layout of the INPUT registers on entry:
; %1=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71)
|
||||
; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
|
||||
; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
|
||||
; %4=(07 17 27 37 47 57 67 77 06 16 26 36 46 56 66 76)
|
||||
|
||||
; Reorder the four 64-bit quarters of each input: 0xD8 selects quarters
; (0,2,1,3); 0x72 selects (2,0,3,1), which also undoes the reversed row order
; of %2 and %4.
vpermq %5, %1, 0xD8
|
||||
vpermq %6, %2, 0x72
|
||||
vpermq %7, %3, 0xD8
|
||||
vpermq %8, %4, 0x72
|
||||
; transpose coefficients(phase 1)
|
||||
; %5=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
|
||||
; %6=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
|
||||
; %7=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
|
||||
; %8=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
|
||||
|
||||
; Interleave 16-bit words (the unpack instructions work within each 128-bit
; lane independently).
vpunpcklwd %1, %5, %6
|
||||
vpunpckhwd %2, %5, %6
|
||||
vpunpcklwd %3, %7, %8
|
||||
vpunpckhwd %4, %7, %8
|
||||
; transpose coefficients(phase 2)
|
||||
; %1=(00 02 10 12 20 22 30 32 40 42 50 52 60 62 70 72)
|
||||
; %2=(01 03 11 13 21 23 31 33 41 43 51 53 61 63 71 73)
|
||||
; %3=(04 06 14 16 24 26 34 36 44 46 54 56 64 66 74 76)
|
||||
; %4=(05 07 15 17 25 27 35 37 45 47 55 57 65 67 75 77)
|
||||
|
||||
; Second word interleave: each register now holds four consecutive columns of
; two rows per lane.
vpunpcklwd %5, %1, %2
|
||||
vpunpcklwd %6, %3, %4
|
||||
vpunpckhwd %7, %1, %2
|
||||
vpunpckhwd %8, %3, %4
|
||||
; transpose coefficients(phase 3)
|
||||
; %5=(00 01 02 03 10 11 12 13 40 41 42 43 50 51 52 53)
|
||||
; %6=(04 05 06 07 14 15 16 17 44 45 46 47 54 55 56 57)
|
||||
; %7=(20 21 22 23 30 31 32 33 60 61 62 63 70 71 72 73)
|
||||
; %8=(24 25 26 27 34 35 36 37 64 65 66 67 74 75 76 77)
|
||||
|
||||
; Join the 64-bit halves so that every lane holds one complete row.
vpunpcklqdq %1, %5, %6
|
||||
vpunpckhqdq %2, %5, %6
|
||||
vpunpcklqdq %3, %7, %8
|
||||
vpunpckhqdq %4, %7, %8
|
||||
; transpose coefficients(phase 4)
|
||||
; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
|
||||
; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
|
||||
; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
|
||||
; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
|
||||
%endmacro
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; In-place 8x8x16-bit slow integer inverse DCT using AVX2 instructions
|
||||
; %1-%4: Input/output registers
|
||||
; %5-%8: Temp registers
|
||||
; %9-%12: Temp storage operands (written and re-read with vmovdqu); %13: Pass (1 or 2)
|
||||
|
||||
%macro dodct 13
|
||||
; One 1-D pass of the slow-but-accurate integer IDCT over the block in %1-%4.
; Parameters, as used by the body below:
;   %1-%4   in:  in0_4, in3_1, in2_6, in7_5 (two rows per ymm register)
;           out: data0_1, data3_2, data4_5, data7_6
;   %5-%8   scratch registers (clobbered)
;   %9-%12  scratch operands that park tmp10_11L, tmp10_11H, tmp13_12L and
;           tmp13_12H; each is written once and re-read once with vmovdqu
;   %13     pass number; pasted onto "PD_DESCALE_P" / "DESCALE_P" to select
;           the rounding constant and the final shift count
; In the comments a name of the form x_y means "x in the low 128-bit lane,
; y in the high lane"; the suffixes L/H mark the 32-bit products derived from
; the low/high four words of each lane.
;
; -- Even part
|
||||
|
||||
; (Original)
|
||||
; z1 = (z2 + z3) * 0.541196100;
|
||||
; tmp2 = z1 + z3 * -1.847759065;
|
||||
; tmp3 = z1 + z2 * 0.765366865;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
||||
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
||||
|
||||
; Pair in2 with in6 (and in6 with in2 in the other lane) so that one vpmaddwd
; per half yields both tmp3 and tmp2.
vperm2i128 %6, %3, %3, 0x01 ; %6=in6_2
|
||||
vpunpcklwd %5, %3, %6 ; %5=in26_62L
|
||||
vpunpckhwd %6, %3, %6 ; %6=in26_62H
|
||||
vpmaddwd %5, %5, [GOTOFF(ebx,PW_F130_F054_MF130_F054)] ; %5=tmp3_2L
|
||||
vpmaddwd %6, %6, [GOTOFF(ebx,PW_F130_F054_MF130_F054)] ; %6=tmp3_2H
|
||||
|
||||
; PW_1_NEG1 is +1 in the low lane and -1 in the high lane, so vpsignw leaves
; in0 alone and negates in4; adding the lane-swapped copy then gives in0+in4
; in the low lane and in0-in4 in the high lane.
vperm2i128 %7, %1, %1, 0x01 ; %7=in4_0
|
||||
vpsignw %1, %1, [GOTOFF(ebx,PW_1_NEG1)]
|
||||
vpaddw %7, %7, %1 ; %7=(in0+in4)_(in0-in4)
|
||||
|
||||
; Widen to 32 bits and scale by 2^CONST_BITS: interleaving with zero puts each
; word in the upper half of a dword, and the arithmetic right shift by
; (16-CONST_BITS) sign-extends it while leaving it shifted left by CONST_BITS.
vpxor %1, %1, %1
|
||||
vpunpcklwd %8, %1, %7 ; %8=tmp0_1L
|
||||
vpunpckhwd %1, %1, %7 ; %1=tmp0_1H
|
||||
vpsrad %8, %8, (16-CONST_BITS) ; vpsrad %8,16 & vpslld %8,CONST_BITS
|
||||
vpsrad %1, %1, (16-CONST_BITS) ; vpsrad %1,16 & vpslld %1,CONST_BITS
|
||||
|
||||
; Park the four even-part sums/differences in %9-%12; %3 is used as the
; staging register for each store.
vpsubd %3, %8, %5
|
||||
vmovdqu %11, %3 ; %11=tmp0_1L-tmp3_2L=tmp13_12L
|
||||
vpaddd %3, %8, %5
|
||||
vmovdqu %9, %3 ; %9=tmp0_1L+tmp3_2L=tmp10_11L
|
||||
vpsubd %3, %1, %6
|
||||
vmovdqu %12, %3 ; %12=tmp0_1H-tmp3_2H=tmp13_12H
|
||||
vpaddd %3, %1, %6
|
||||
vmovdqu %10, %3 ; %10=tmp0_1H+tmp3_2H=tmp10_11H
|
||||
|
||||
; -- Odd part
|
||||
|
||||
vpaddw %1, %4, %2 ; %1=in7_5+in3_1=z3_4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
vperm2i128 %8, %1, %1, 0x01 ; %8=z4_3
|
||||
vpunpcklwd %7, %1, %8 ; %7=z34_43L
|
||||
vpunpckhwd %8, %1, %8 ; %8=z34_43H
|
||||
vpmaddwd %7, %7, [GOTOFF(ebx,PW_MF078_F117_F078_F117)] ; %7=z3_4L
|
||||
vpmaddwd %8, %8, [GOTOFF(ebx,PW_MF078_F117_F078_F117)] ; %8=z3_4H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
|
||||
; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
|
||||
; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; tmp0 += z1 + z3; tmp1 += z2 + z4;
|
||||
; tmp2 += z2 + z3; tmp3 += z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
|
||||
; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
|
||||
; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
|
||||
; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
|
||||
; tmp0 += z3; tmp1 += z4;
|
||||
; tmp2 += z3; tmp3 += z4;
|
||||
|
||||
vperm2i128 %2, %2, %2, 0x01 ; %2=in1_3
|
||||
vpunpcklwd %3, %4, %2 ; %3=in71_53L
|
||||
vpunpckhwd %4, %4, %2 ; %4=in71_53H
|
||||
|
||||
vpmaddwd %5, %3, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)] ; %5=tmp0_1L
|
||||
vpmaddwd %6, %4, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)] ; %6=tmp0_1H
|
||||
vpaddd %5, %5, %7 ; %5=tmp0_1L+z3_4L=tmp0_1L
|
||||
vpaddd %6, %6, %8 ; %6=tmp0_1H+z3_4H=tmp0_1H
|
||||
|
||||
; tmp3/tmp2 need z4/z3 (the opposite lane order), hence the lane swap of
; %7/%8 before the add.
vpmaddwd %3, %3, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)] ; %3=tmp3_2L
|
||||
vpmaddwd %4, %4, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)] ; %4=tmp3_2H
|
||||
vperm2i128 %7, %7, %7, 0x01 ; %7=z4_3L
|
||||
vperm2i128 %8, %8, %8, 0x01 ; %8=z4_3H
|
||||
vpaddd %7, %3, %7 ; %7=tmp3_2L+z4_3L=tmp3_2L
|
||||
vpaddd %8, %4, %8 ; %8=tmp3_2H+z4_3H=tmp3_2H
|
||||
|
||||
; -- Final output stage
|
||||
; Each result is rounded (add PD_DESCALE_P<pass>), shifted right by
; DESCALE_P<pass>, and packed back to 16 bits with signed saturation.
|
||||
|
||||
vmovdqu %3, %9
|
||||
vmovdqu %4, %10
|
||||
|
||||
vpaddd %1, %3, %7 ; %1=tmp10_11L+tmp3_2L=data0_1L
|
||||
vpaddd %2, %4, %8 ; %2=tmp10_11H+tmp3_2H=data0_1H
|
||||
vpaddd %1, %1, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
|
||||
vpaddd %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
|
||||
vpsrad %1, %1, DESCALE_P %+ %13
|
||||
vpsrad %2, %2, DESCALE_P %+ %13
|
||||
vpackssdw %1, %1, %2 ; %1=data0_1
|
||||
|
||||
vpsubd %3, %3, %7 ; %3=tmp10_11L-tmp3_2L=data7_6L
|
||||
vpsubd %4, %4, %8 ; %4=tmp10_11H-tmp3_2H=data7_6H
|
||||
vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
|
||||
vpaddd %4, %4, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
|
||||
vpsrad %3, %3, DESCALE_P %+ %13
|
||||
vpsrad %4, %4, DESCALE_P %+ %13
|
||||
vpackssdw %4, %3, %4 ; %4=data7_6
|
||||
|
||||
vmovdqu %7, %11
|
||||
vmovdqu %8, %12
|
||||
|
||||
vpaddd %2, %7, %5 ; %2=tmp13_12L+tmp0_1L=data3_2L
|
||||
vpaddd %3, %8, %6 ; %3=tmp13_12H+tmp0_1H=data3_2H
|
||||
vpaddd %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
|
||||
vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
|
||||
vpsrad %2, %2, DESCALE_P %+ %13
|
||||
vpsrad %3, %3, DESCALE_P %+ %13
|
||||
vpackssdw %2, %2, %3 ; %2=data3_2
|
||||
|
||||
vpsubd %3, %7, %5 ; %3=tmp13_12L-tmp0_1L=data4_5L
|
||||
vpsubd %6, %8, %6 ; %6=tmp13_12H-tmp0_1H=data4_5H
|
||||
vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
|
||||
vpaddd %6, %6, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
|
||||
vpsrad %3, %3, DESCALE_P %+ %13
|
||||
vpsrad %6, %6, DESCALE_P %+ %13
|
||||
vpackssdw %3, %3, %6 ; %3=data4_5
|
||||
%endmacro
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant tables for jsimd_idct_islow_avx2.  Every table is 32 bytes (one
; ymm register); where two "times" lines follow a label, the first fills the
; low 128-bit lane and the second the high lane.
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_idct_islow_avx2)
|
||||
|
||||
EXTN(jconst_idct_islow_avx2):
|
||||
|
||||
; Even part multiplier pairs for vpmaddwd on (in2,in6 | in6,in2):
;   low lane  -> tmp3 = in2*(0.541+0.765) + in6*0.541
;   high lane -> tmp2 = in6*(0.541-1.847) + in2*0.541
PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
|
||||
times 4 dw (F_0_541 - F_1_847), F_0_541
|
||||
; Odd part: multiplier pairs producing z3 (low lane) and z4 (high lane).
PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
|
||||
times 4 dw (F_1_175 - F_0_390), F_1_175
|
||||
; Odd part: multiplier pairs producing tmp0 (low lane) and tmp1 (high lane).
PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899
|
||||
times 4 dw (F_2_053 - F_2_562), -F_2_562
|
||||
; Odd part: multiplier pairs producing tmp3 (low lane) and tmp2 (high lane).
PW_MF089_F060_MF256_F050 times 4 dw -F_0_899, (F_1_501 - F_0_899)
|
||||
times 4 dw -F_2_562, (F_3_072 - F_2_562)
|
||||
; Rounding terms (half of the divisor) added before the descaling right
; shift of pass 1 / pass 2.
PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1)
|
||||
PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1)
|
||||
; Byte bias added to the packed signed output samples.
PB_CENTERJSAMP times 32 db CENTERJSAMPLE
|
||||
; vpsignw mask: +1 keeps the low lane, -1 negates the high lane.
PW_1_NEG1 times 8 dw 1
|
||||
times 8 dw -1
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
; Arguments are read from the stack (32-bit code).  ebx, esi, edi and ebp are
; saved and restored; eax, edx and the xmm/ymm registers are overwritten.
|
||||
|
||||
; Stack argument accessors; (b) is the address of the saved ebp, so the first
; argument sits at (b)+8, above the saved ebp and the return address.
%define dct_table(b) (b) + 8 ; void *dct_table
|
||||
%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
|
||||
%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
|
||||
%define output_col(b) (b) + 20 ; JDIMENSION output_col
|
||||
|
||||
; [original_ebp] holds the pre-alignment stack pointer (see the prologue);
; wk(0)..wk(WK_NUM-1) are ymm-sized scratch slots just below the aligned ebp.
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
|
||||
; ymmword wk[WK_NUM]
|
||||
%define WK_NUM 4
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
|
||||
|
||||
EXTN(jsimd_idct_islow_avx2):
|
||||
; Prologue: save ebp, build a 16-byte-aligned frame whose [ebp] slot records
; the unaligned stack pointer, then reserve the wk[] area below it.  The wk[]
; slots are only touched with vmovdqu, so they do not need 32-byte alignment.
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic ebx
|
||||
; push ecx ; unused
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
; NOTE(review): pushpic/poppic/get_GOT/GOTOFF are macros defined outside this
; file; presumably they set up ebx for position-independent access to the
; constant tables -- confirm in jsimdext.inc.
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process columns.
|
||||
|
||||
; eax still holds the value stored in [original_ebp], so no reload is needed.
; mov eax, [original_ebp]
|
||||
mov edx, POINTER [dct_table(eax)] ; quantptr
|
||||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
|
||||
; Shortcut for blocks whose AC coefficients are all zero: pass 1 then reduces
; to scaling and replicating the dequantized first row.
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
|
||||
; Cheap early exit: test one dword from each of blocks 1 and 2 first
; (presumably the first two coefficients of rows 1 and 2 -- DWBLOCK is defined
; elsewhere).
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz near .columnDCT
|
||||
|
||||
; Full test: OR together blocks 1 through 7 of the coefficient array.
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
vpor xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
vpor xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
vpor xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
vpor xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
vpor xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
vpor xmm1, xmm1, xmm0
|
||||
; Two saturating packs fold the 128-bit OR result into the low 32 bits; a
; non-zero word can never saturate to zero, so eax == 0 iff every word was 0.
vpacksswb xmm1, xmm1, xmm1
|
||||
vpacksswb xmm1, xmm1, xmm1
|
||||
movd eax, xmm1
|
||||
test eax, eax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
; Dequantize block 0 only and apply the pass 1 output scaling.
movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
vpmullw xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
vpsllw xmm5, xmm5, PASS1_BITS
|
||||
|
||||
; Duplicate each word, then broadcast one dword per lane so that every output
; register holds two constant rows in the layout pass 2 expects.
vpunpcklwd xmm4, xmm5, xmm5 ; xmm4=(00 00 01 01 02 02 03 03)
|
||||
vpunpckhwd xmm5, xmm5, xmm5 ; xmm5=(04 04 05 05 06 06 07 07)
|
||||
vinserti128 ymm4, ymm4, xmm5, 1
|
||||
|
||||
vpshufd ymm0, ymm4, 0x00 ; ymm0=col0_4=(00 00 00 00 00 00 00 00 04 04 04 04 04 04 04 04)
|
||||
vpshufd ymm1, ymm4, 0x55 ; ymm1=col1_5=(01 01 01 01 01 01 01 01 05 05 05 05 05 05 05 05)
|
||||
vpshufd ymm2, ymm4, 0xAA ; ymm2=col2_6=(02 02 02 02 02 02 02 02 06 06 06 06 06 06 06 06)
|
||||
vpshufd ymm3, ymm4, 0xFF ; ymm3=col3_7=(03 03 03 03 03 03 03 03 07 07 07 07 07 07 07 07)
|
||||
|
||||
jmp near .column_end
|
||||
alignx 16, 7
|
||||
%endif
|
||||
.columnDCT:
|
||||
|
||||
; Load the whole 8x8 block (two rows per ymm) and dequantize it by multiplying
; with the table at edx.
vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; ymm4=in0_1
|
||||
vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,esi,SIZEOF_JCOEF)] ; ymm5=in2_3
|
||||
vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,esi,SIZEOF_JCOEF)] ; ymm6=in4_5
|
||||
vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,esi,SIZEOF_JCOEF)] ; ymm7=in6_7
|
||||
vpmullw ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
vpmullw ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
vpmullw ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
vpmullw ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; Regroup the rows into the lane pairing the dodct macro consumes.
vperm2i128 ymm0, ymm4, ymm6, 0x20 ; ymm0=in0_4
|
||||
vperm2i128 ymm1, ymm5, ymm4, 0x31 ; ymm1=in3_1
|
||||
vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6
|
||||
vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5
|
||||
|
||||
; NOTE(review): the wk slots are passed with an XMMWORD size keyword but the
; macro stores full ymm registers into them with vmovdqu.  The slots are
; SIZEOF_YMMWORD apart, so the space is there; confirm the size keyword is
; intentional/accepted by the assembler.
dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 1
|
||||
; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
|
||||
|
||||
dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
|
||||
; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
|
||||
|
||||
.column_end:
|
||||
|
||||
; -- Prefetch the next coefficient block
|
||||
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
|
||||
|
||||
; ---- Pass 2: process rows.
|
||||
|
||||
; eax was clobbered above, so reload the argument base before fetching the
; output pointer and column offset.
mov eax, [original_ebp]
|
||||
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [output_col(eax)]
|
||||
|
||||
; Regroup data1_5/data3_7 into the in3_1/in7_5 pairing that dodct expects;
; ymm0 (0_4) and ymm2 (2_6) are already in the right form.
vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm4=in7_5
|
||||
vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1
|
||||
|
||||
dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 2
|
||||
; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
|
||||
|
||||
dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
|
||||
; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
|
||||
|
||||
; Narrow to bytes with signed saturation, then add the sample center offset.
vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45
|
||||
vpacksswb ymm1, ymm2, ymm4 ; ymm1=data23_67
|
||||
vpaddb ymm0, ymm0, [GOTOFF(ebx,PB_CENTERJSAMP)]
|
||||
vpaddb ymm1, ymm1, [GOTOFF(ebx,PB_CENTERJSAMP)]
|
||||
|
||||
; Split into 128-bit halves; each xmm register then holds two output rows
; (even row in the low 8 bytes, odd row in the high 8 bytes).
vextracti128 xmm6, ymm1, 1 ; xmm6=data67
|
||||
vextracti128 xmm4, ymm0, 1 ; xmm4=data45
|
||||
vextracti128 xmm2, ymm1, 0 ; xmm2=data23
|
||||
vextracti128 xmm0, ymm0, 0 ; xmm0=data01
|
||||
|
||||
; Swap the 64-bit halves so the odd rows land in the low 8 bytes, where movq
; can store them.
vpshufd xmm1, xmm0, 0x4E ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
||||
vpshufd xmm3, xmm2, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
||||
vpshufd xmm5, xmm4, 0x4E ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
|
||||
vpshufd xmm7, xmm6, 0x4E ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
|
||||
|
||||
; Clear the upper ymm halves before the legacy-encoded SSE stores below and
; before returning to the caller.
vzeroupper
|
||||
|
||||
; Store 8 bytes to each of the 8 output rows at column offset eax.
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm0
|
||||
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
|
||||
|
||||
mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
|
||||
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
|
||||
|
||||
mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov esi, JSAMPROW [edi+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
|
||||
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
|
||||
|
||||
mov edx, JSAMPROW [edi+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
|
||||
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
|
||||
|
||||
; Epilogue: restore the saved registers, then unwind the aligned frame via
; the pointer stored at [ebp].
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; unused
|
||||
poppic ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
851
TMessagesProj/jni/mozjpeg/simd/i386/jidctint-mmx.asm
Normal file
851
TMessagesProj/jni/mozjpeg/simd/i386/jidctint-mmx.asm
Normal file
|
|
@ -0,0 +1,851 @@
|
|||
;
|
||||
; jidctint.asm - accurate integer IDCT (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a slow-but-accurate integer implementation of the
|
||||
; inverse DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jidctint.c; see the jidctint.c for
|
||||
; more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point parameters of the integer IDCT.
; CONST_BITS is the number of fractional bits carried by the F_* multipliers
; below; PASS1_BITS is the number of extra fractional bits that the pass-1
; results keep (pass 1 descales by CONST_BITS - PASS1_BITS only).
%define CONST_BITS 13
|
||||
%define PASS1_BITS 2
|
||||
|
||||
; Shift counts applied (via psrad, after adding the matching PD_DESCALE_P*
; rounding bias) to the 32-bit sums at the end of pass 1 and pass 2.
%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
|
||||
%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
|
||||
|
||||
; F_a_bbb is the real constant a.bbb... scaled by 2^CONST_BITS and rounded to
; an integer (the FIX() value quoted on each line).  The first branch holds
; the values precomputed for CONST_BITS == 13.
%if CONST_BITS == 13
|
||||
F_0_298 equ 2446 ; FIX(0.298631336)
|
||||
F_0_390 equ 3196 ; FIX(0.390180644)
|
||||
F_0_541 equ 4433 ; FIX(0.541196100)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_175 equ 9633 ; FIX(1.175875602)
|
||||
F_1_501 equ 12299 ; FIX(1.501321110)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_1_961 equ 16069 ; FIX(1.961570560)
|
||||
F_2_053 equ 16819 ; FIX(2.053119869)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_072 equ 25172 ; FIX(3.072711026)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
; Instead, each constant is given as an integer scaled by 2^30 and DESCALE
; reduces it to CONST_BITS fractional bits, rounding to nearest by adding
; half of the discarded range before the shift.
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
|
||||
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
|
||||
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
|
||||
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
|
||||
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
|
||||
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
|
||||
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
|
||||
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
|
||||
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Read-only constant table used by jsimd_idct_islow_mmx.
;
; PW_*  : a pair of 16-bit multipliers (c0, c1) repeated twice, i.e. one
;         8-byte MMX word.  The routine interleaves two inputs with
;         punpck{l,h}wd and applies pmaddwd, which yields a*c0 + b*c1 for
;         two (a, b) word pairs at once.
; PD_DESCALE_P1 / PD_DESCALE_P2 : rounding bias 1 << (shift - 1), added
;         before the arithmetic right shift by DESCALE_P1 / DESCALE_P2.
; PB_CENTERJSAMP : CENTERJSAMPLE in each of the 8 bytes; added with paddb
;         to the packed output samples in pass 2.
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_idct_islow_mmx)
|
||||
|
||||
EXTN(jconst_idct_islow_mmx):
|
||||
|
||||
PW_F130_F054 times 2 dw (F_0_541 + F_0_765), F_0_541
|
||||
PW_F054_MF130 times 2 dw F_0_541, (F_0_541 - F_1_847)
|
||||
PW_MF078_F117 times 2 dw (F_1_175 - F_1_961), F_1_175
|
||||
PW_F117_F078 times 2 dw F_1_175, (F_1_175 - F_0_390)
|
||||
PW_MF060_MF089 times 2 dw (F_0_298 - F_0_899), -F_0_899
|
||||
PW_MF089_F060 times 2 dw -F_0_899, (F_1_501 - F_0_899)
|
||||
PW_MF050_MF256 times 2 dw (F_2_053 - F_2_562), -F_2_562
|
||||
PW_MF256_F050 times 2 dw -F_2_562, (F_3_072 - F_2_562)
|
||||
PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1)
|
||||
PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1)
|
||||
PB_CENTERJSAMP times 8 db CENTERJSAMPLE
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_islow_mmx(void *dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; Argument offsets relative to the frame address b, i.e. the value of esp
; right after the prologue's `push ebp`: [b+0] is the saved ebp and [b+4] the
; return address, so the first stack argument is at b+8.
%define dct_table(b) (b) + 8 ; void *dct_table
|
||||
%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
|
||||
%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
|
||||
%define output_col(b) (b) + 20 ; JDIMENSION output_col
|
||||
|
||||
; Local frame layout.  ebp is the 8-byte-aligned frame base set up by the
; prologue, and [original_ebp] holds the unaligned frame address b.
; wk(0)..wk(WK_NUM-1) are MMX-word scratch slots ending just below ebp;
; the workspace array lies directly below wk(0).
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
|
||||
; mmword wk[WK_NUM]
|
||||
%define WK_NUM 12
|
||||
%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF
|
||||
; JCOEF workspace[DCTSIZE2]
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_idct_islow_mmx)
|
||||
|
||||
; Entry point.  The prologue builds an MMWORD-aligned frame: the unaligned
; frame address (esp right after `push ebp`) is kept in eax and also stored
; at [ebp], so the stack arguments stay reachable and esp can be restored on
; exit.  ebx, esi and edi are saved; ecx and edx are not.
EXTN(jsimd_idct_islow_mmx):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
; Step down 4 bytes before aligning so that the aligned slot written below
; can never overlap the ebp value just pushed.
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
; Reserve wk[] and workspace[] below the aligned frame base.
lea esp, [workspace]
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process columns from input, store into work array.
|
||||
|
||||
; The reload below is commented out because eax still holds the frame address
; loaded in the prologue (assumes get_GOT leaves eax intact; that macro is
; defined outside this file -- TODO confirm).
; mov eax, [original_ebp]
|
||||
mov edx, POINTER [dct_table(eax)] ; quantptr
|
||||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
lea edi, [workspace] ; JCOEF *wsptr
|
||||
; Each loop iteration handles a group of 4 columns.
mov ecx, DCTSIZE/4 ; ctr
|
||||
alignx 16, 7
|
||||
; Pass-1 loop head for one group of 4 columns.
;   esi = input coefficients, edx = dequantization multipliers,
;   edi = workspace output, ecx = groups remaining.
; Unless NO_ZERO_COLUMN_TEST_ISLOW_MMX is defined, first test whether every
; AC term of the group is zero and, if so, take the DC-only shortcut.
; eax is used as scratch here, so it no longer holds the frame address.
.columnloop:
|
||||
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
|
||||
; Cheap early-out: a nonzero dword at the start of row 1 or row 2 already
; rules the shortcut out (DWBLOCK is defined elsewhere; presumably it
; addresses a dword within the given row -- verify against jdct.inc).
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz short .columnDCT
|
||||
|
||||
; OR together the MMX words of blocks 1..7 (the AC rows of these 4 columns).
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, mm0
|
||||
; Squeeze the four words into four bytes so one 32-bit test covers them all;
; signed saturation maps every nonzero word to a nonzero byte.
packsswb mm1, mm1
|
||||
movd eax, mm1
|
||||
test eax, eax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
; Dequantize the DC row only.
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; Scale up to the precision that the full pass-1 path produces.
psllw mm0, PASS1_BITS
|
||||
|
||||
; Broadcast each of the four DC values across a whole MMX word.
movq mm2, mm0 ; mm0=in0=(00 01 02 03)
|
||||
punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
|
||||
punpckhwd mm2, mm2 ; mm2=(02 02 03 03)
|
||||
|
||||
movq mm1, mm0
|
||||
punpckldq mm0, mm0 ; mm0=(00 00 00 00)
|
||||
punpckhdq mm1, mm1 ; mm1=(01 01 01 01)
|
||||
movq mm3, mm2
|
||||
punpckldq mm2, mm2 ; mm2=(02 02 02 02)
|
||||
punpckhdq mm3, mm3 ; mm3=(03 03 03 03)
|
||||
|
||||
; Each broadcast value fills both halves (0 and 1) of its workspace row.
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
|
||||
movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
|
||||
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
|
||||
movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
|
||||
jmp near .nextcolumn
|
||||
alignx 16, 7
|
||||
%endif
|
||||
.columnDCT:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; (Original)
|
||||
; z1 = (z2 + z3) * 0.541196100;
|
||||
; tmp2 = z1 + z3 * -1.847759065;
|
||||
; tmp3 = z1 + z2 * 0.765366865;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
||||
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
||||
|
||||
movq mm4, mm1 ; mm1=in2=z2
|
||||
movq mm5, mm1
|
||||
punpcklwd mm4, mm3 ; mm3=in6=z3
|
||||
punpckhwd mm5, mm3
|
||||
movq mm1, mm4
|
||||
movq mm3, mm5
|
||||
pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L
|
||||
pmaddwd mm5, [GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H
|
||||
pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L
|
||||
pmaddwd mm3, [GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H
|
||||
|
||||
movq mm6, mm0
|
||||
paddw mm0, mm2 ; mm0=in0+in4
|
||||
psubw mm6, mm2 ; mm6=in0-in4
|
||||
|
||||
pxor mm7, mm7
|
||||
pxor mm2, mm2
|
||||
punpcklwd mm7, mm0 ; mm7=tmp0L
|
||||
punpckhwd mm2, mm0 ; mm2=tmp0H
|
||||
psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
|
||||
psrad mm2, (16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS
|
||||
|
||||
movq mm0, mm7
|
||||
paddd mm7, mm4 ; mm7=tmp10L
|
||||
psubd mm0, mm4 ; mm0=tmp13L
|
||||
movq mm4, mm2
|
||||
paddd mm2, mm5 ; mm2=tmp10H
|
||||
psubd mm4, mm5 ; mm4=tmp13H
|
||||
|
||||
movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L
|
||||
movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H
|
||||
movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L
|
||||
movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H
|
||||
|
||||
pxor mm5, mm5
|
||||
pxor mm7, mm7
|
||||
punpcklwd mm5, mm6 ; mm5=tmp1L
|
||||
punpckhwd mm7, mm6 ; mm7=tmp1H
|
||||
psrad mm5, (16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS
|
||||
psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
|
||||
|
||||
movq mm2, mm5
|
||||
paddd mm5, mm1 ; mm5=tmp11L
|
||||
psubd mm2, mm1 ; mm2=tmp12L
|
||||
movq mm0, mm7
|
||||
paddd mm7, mm3 ; mm7=tmp11H
|
||||
psubd mm0, mm3 ; mm0=tmp12H
|
||||
|
||||
movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L
|
||||
movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H
|
||||
movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L
|
||||
movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
movq mm5, mm6
|
||||
movq mm7, mm4
|
||||
paddw mm5, mm3 ; mm5=z3
|
||||
paddw mm7, mm1 ; mm7=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
movq mm2, mm5
|
||||
movq mm0, mm5
|
||||
punpcklwd mm2, mm7
|
||||
punpckhwd mm0, mm7
|
||||
movq mm5, mm2
|
||||
movq mm7, mm0
|
||||
pmaddwd mm2, [GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L
|
||||
pmaddwd mm0, [GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H
|
||||
pmaddwd mm5, [GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L
|
||||
pmaddwd mm7, [GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H
|
||||
|
||||
movq MMWORD [wk(10)], mm2 ; wk(10)=z3L
|
||||
movq MMWORD [wk(11)], mm0 ; wk(11)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
|
||||
; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
|
||||
; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; tmp0 += z1 + z3; tmp1 += z2 + z4;
|
||||
; tmp2 += z2 + z3; tmp3 += z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
|
||||
; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
|
||||
; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
|
||||
; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
|
||||
; tmp0 += z3; tmp1 += z4;
|
||||
; tmp2 += z3; tmp3 += z4;
|
||||
|
||||
movq mm2, mm3
|
||||
movq mm0, mm3
|
||||
punpcklwd mm2, mm4
|
||||
punpckhwd mm0, mm4
|
||||
movq mm3, mm2
|
||||
movq mm4, mm0
|
||||
pmaddwd mm2, [GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L
|
||||
pmaddwd mm0, [GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H
|
||||
pmaddwd mm3, [GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L
|
||||
pmaddwd mm4, [GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H
|
||||
|
||||
paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L
|
||||
paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H
|
||||
paddd mm3, mm5 ; mm3=tmp3L
|
||||
paddd mm4, mm7 ; mm4=tmp3H
|
||||
|
||||
movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L
|
||||
movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H
|
||||
|
||||
movq mm2, mm1
|
||||
movq mm0, mm1
|
||||
punpcklwd mm2, mm6
|
||||
punpckhwd mm0, mm6
|
||||
movq mm1, mm2
|
||||
movq mm6, mm0
|
||||
pmaddwd mm2, [GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L
|
||||
pmaddwd mm0, [GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H
|
||||
pmaddwd mm1, [GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L
|
||||
pmaddwd mm6, [GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H
|
||||
|
||||
paddd mm2, mm5 ; mm2=tmp1L
|
||||
paddd mm0, mm7 ; mm0=tmp1H
|
||||
paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L
|
||||
paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H
|
||||
|
||||
movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L
|
||||
movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movq mm5, MMWORD [wk(0)] ; mm5=tmp10L
|
||||
movq mm7, MMWORD [wk(1)] ; mm7=tmp10H
|
||||
|
||||
movq mm2, mm5
|
||||
movq mm0, mm7
|
||||
paddd mm5, mm3 ; mm5=data0L
|
||||
paddd mm7, mm4 ; mm7=data0H
|
||||
psubd mm2, mm3 ; mm2=data7L
|
||||
psubd mm0, mm4 ; mm0=data7H
|
||||
|
||||
movq mm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1]
|
||||
|
||||
paddd mm5, mm3
|
||||
paddd mm7, mm3
|
||||
psrad mm5, DESCALE_P1
|
||||
psrad mm7, DESCALE_P1
|
||||
paddd mm2, mm3
|
||||
paddd mm0, mm3
|
||||
psrad mm2, DESCALE_P1
|
||||
psrad mm0, DESCALE_P1
|
||||
|
||||
packssdw mm5, mm7 ; mm5=data0=(00 01 02 03)
|
||||
packssdw mm2, mm0 ; mm2=data7=(70 71 72 73)
|
||||
|
||||
movq mm4, MMWORD [wk(4)] ; mm4=tmp11L
|
||||
movq mm3, MMWORD [wk(5)] ; mm3=tmp11H
|
||||
|
||||
movq mm7, mm4
|
||||
movq mm0, mm3
|
||||
paddd mm4, mm1 ; mm4=data1L
|
||||
paddd mm3, mm6 ; mm3=data1H
|
||||
psubd mm7, mm1 ; mm7=data6L
|
||||
psubd mm0, mm6 ; mm0=data6H
|
||||
|
||||
movq mm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1]
|
||||
|
||||
paddd mm4, mm1
|
||||
paddd mm3, mm1
|
||||
psrad mm4, DESCALE_P1
|
||||
psrad mm3, DESCALE_P1
|
||||
paddd mm7, mm1
|
||||
paddd mm0, mm1
|
||||
psrad mm7, DESCALE_P1
|
||||
psrad mm0, DESCALE_P1
|
||||
|
||||
packssdw mm4, mm3 ; mm4=data1=(10 11 12 13)
|
||||
packssdw mm7, mm0 ; mm7=data6=(60 61 62 63)
|
||||
|
||||
movq mm6, mm5 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm5, mm4 ; mm5=(00 10 01 11)
|
||||
punpckhwd mm6, mm4 ; mm6=(02 12 03 13)
|
||||
movq mm1, mm7 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm7, mm2 ; mm7=(60 70 61 71)
|
||||
punpckhwd mm1, mm2 ; mm1=(62 72 63 73)
|
||||
|
||||
movq mm3, MMWORD [wk(6)] ; mm3=tmp12L
|
||||
movq mm0, MMWORD [wk(7)] ; mm0=tmp12H
|
||||
movq mm4, MMWORD [wk(10)] ; mm4=tmp1L
|
||||
movq mm2, MMWORD [wk(11)] ; mm2=tmp1H
|
||||
|
||||
movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 01 11)
|
||||
movq MMWORD [wk(1)], mm6 ; wk(1)=(02 12 03 13)
|
||||
movq MMWORD [wk(4)], mm7 ; wk(4)=(60 70 61 71)
|
||||
movq MMWORD [wk(5)], mm1 ; wk(5)=(62 72 63 73)
|
||||
|
||||
movq mm5, mm3
|
||||
movq mm6, mm0
|
||||
paddd mm3, mm4 ; mm3=data2L
|
||||
paddd mm0, mm2 ; mm0=data2H
|
||||
psubd mm5, mm4 ; mm5=data5L
|
||||
psubd mm6, mm2 ; mm6=data5H
|
||||
|
||||
movq mm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1]
|
||||
|
||||
paddd mm3, mm7
|
||||
paddd mm0, mm7
|
||||
psrad mm3, DESCALE_P1
|
||||
psrad mm0, DESCALE_P1
|
||||
paddd mm5, mm7
|
||||
paddd mm6, mm7
|
||||
psrad mm5, DESCALE_P1
|
||||
psrad mm6, DESCALE_P1
|
||||
|
||||
packssdw mm3, mm0 ; mm3=data2=(20 21 22 23)
|
||||
packssdw mm5, mm6 ; mm5=data5=(50 51 52 53)
|
||||
|
||||
movq mm1, MMWORD [wk(2)] ; mm1=tmp13L
|
||||
movq mm4, MMWORD [wk(3)] ; mm4=tmp13H
|
||||
movq mm2, MMWORD [wk(8)] ; mm2=tmp0L
|
||||
movq mm7, MMWORD [wk(9)] ; mm7=tmp0H
|
||||
|
||||
movq mm0, mm1
|
||||
movq mm6, mm4
|
||||
paddd mm1, mm2 ; mm1=data3L
|
||||
paddd mm4, mm7 ; mm4=data3H
|
||||
psubd mm0, mm2 ; mm0=data4L
|
||||
psubd mm6, mm7 ; mm6=data4H
|
||||
|
||||
movq mm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1]
|
||||
|
||||
paddd mm1, mm2
|
||||
paddd mm4, mm2
|
||||
psrad mm1, DESCALE_P1
|
||||
psrad mm4, DESCALE_P1
|
||||
paddd mm0, mm2
|
||||
paddd mm6, mm2
|
||||
psrad mm0, DESCALE_P1
|
||||
psrad mm6, DESCALE_P1
|
||||
|
||||
packssdw mm1, mm4 ; mm1=data3=(30 31 32 33)
|
||||
packssdw mm0, mm6 ; mm0=data4=(40 41 42 43)
|
||||
|
||||
movq mm7, MMWORD [wk(0)] ; mm7=(00 10 01 11)
|
||||
movq mm2, MMWORD [wk(1)] ; mm2=(02 12 03 13)
|
||||
|
||||
movq mm4, mm3 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm3, mm1 ; mm3=(20 30 21 31)
|
||||
punpckhwd mm4, mm1 ; mm4=(22 32 23 33)
|
||||
movq mm6, mm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm0, mm5 ; mm0=(40 50 41 51)
|
||||
punpckhwd mm6, mm5 ; mm6=(42 52 43 53)
|
||||
|
||||
movq mm1, mm7 ; transpose coefficients(phase 2)
|
||||
punpckldq mm7, mm3 ; mm7=(00 10 20 30)
|
||||
punpckhdq mm1, mm3 ; mm1=(01 11 21 31)
|
||||
movq mm5, mm2 ; transpose coefficients(phase 2)
|
||||
punpckldq mm2, mm4 ; mm2=(02 12 22 32)
|
||||
punpckhdq mm5, mm4 ; mm5=(03 13 23 33)
|
||||
|
||||
movq mm3, MMWORD [wk(4)] ; mm3=(60 70 61 71)
|
||||
movq mm4, MMWORD [wk(5)] ; mm4=(62 72 63 73)
|
||||
|
||||
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
|
||||
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
|
||||
|
||||
movq mm7, mm0 ; transpose coefficients(phase 2)
|
||||
punpckldq mm0, mm3 ; mm0=(40 50 60 70)
|
||||
punpckhdq mm7, mm3 ; mm7=(41 51 61 71)
|
||||
movq mm1, mm6 ; transpose coefficients(phase 2)
|
||||
punpckldq mm6, mm4 ; mm6=(42 52 62 72)
|
||||
punpckhdq mm1, mm4 ; mm1=(43 53 63 73)
|
||||
|
||||
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
|
||||
movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
|
||||
movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
|
||||
|
||||
; End of one pass-1 iteration: advance the input and quantization pointers by
; 4 columns and the workspace pointer by 4 rows of DCTSIZE coefficients, then
; loop until all DCTSIZE/4 groups are done.
.nextcolumn:
|
||||
add esi, byte 4*SIZEOF_JCOEF ; coef_block
|
||||
add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr
|
||||
add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
|
||||
dec ecx ; ctr
|
||||
jnz near .columnloop
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
; eax was used as scratch by the pass-1 zero test, so reload the frame
; address before fetching the remaining arguments.  On loop entry:
;   esi = workspace, edi = output_buf (array of row pointers),
;   eax = output_col, ecx = groups of 4 output rows remaining.
mov eax, [original_ebp]
|
||||
lea esi, [workspace] ; JCOEF *wsptr
|
||||
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [output_col(eax)]
|
||||
mov ecx, DCTSIZE/4 ; ctr
|
||||
alignx 16, 7
|
||||
.rowloop:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
; (Original)
|
||||
; z1 = (z2 + z3) * 0.541196100;
|
||||
; tmp2 = z1 + z3 * -1.847759065;
|
||||
; tmp3 = z1 + z2 * 0.765366865;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
||||
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
||||
|
||||
movq mm4, mm1 ; mm1=in2=z2
|
||||
movq mm5, mm1
|
||||
punpcklwd mm4, mm3 ; mm3=in6=z3
|
||||
punpckhwd mm5, mm3
|
||||
movq mm1, mm4
|
||||
movq mm3, mm5
|
||||
pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L
|
||||
pmaddwd mm5, [GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H
|
||||
pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L
|
||||
pmaddwd mm3, [GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H
|
||||
|
||||
movq mm6, mm0
|
||||
paddw mm0, mm2 ; mm0=in0+in4
|
||||
psubw mm6, mm2 ; mm6=in0-in4
|
||||
|
||||
pxor mm7, mm7
|
||||
pxor mm2, mm2
|
||||
punpcklwd mm7, mm0 ; mm7=tmp0L
|
||||
punpckhwd mm2, mm0 ; mm2=tmp0H
|
||||
psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
|
||||
psrad mm2, (16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS
|
||||
|
||||
movq mm0, mm7
|
||||
paddd mm7, mm4 ; mm7=tmp10L
|
||||
psubd mm0, mm4 ; mm0=tmp13L
|
||||
movq mm4, mm2
|
||||
paddd mm2, mm5 ; mm2=tmp10H
|
||||
psubd mm4, mm5 ; mm4=tmp13H
|
||||
|
||||
movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L
|
||||
movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H
|
||||
movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L
|
||||
movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H
|
||||
|
||||
pxor mm5, mm5
|
||||
pxor mm7, mm7
|
||||
punpcklwd mm5, mm6 ; mm5=tmp1L
|
||||
punpckhwd mm7, mm6 ; mm7=tmp1H
|
||||
psrad mm5, (16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS
|
||||
psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
|
||||
|
||||
movq mm2, mm5
|
||||
paddd mm5, mm1 ; mm5=tmp11L
|
||||
psubd mm2, mm1 ; mm2=tmp12L
|
||||
movq mm0, mm7
|
||||
paddd mm7, mm3 ; mm7=tmp11H
|
||||
psubd mm0, mm3 ; mm0=tmp12H
|
||||
|
||||
movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L
|
||||
movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H
|
||||
movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L
|
||||
movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
movq mm5, mm6
|
||||
movq mm7, mm4
|
||||
paddw mm5, mm3 ; mm5=z3
|
||||
paddw mm7, mm1 ; mm7=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
movq mm2, mm5
|
||||
movq mm0, mm5
|
||||
punpcklwd mm2, mm7
|
||||
punpckhwd mm0, mm7
|
||||
movq mm5, mm2
|
||||
movq mm7, mm0
|
||||
pmaddwd mm2, [GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L
|
||||
pmaddwd mm0, [GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H
|
||||
pmaddwd mm5, [GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L
|
||||
pmaddwd mm7, [GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H
|
||||
|
||||
movq MMWORD [wk(10)], mm2 ; wk(10)=z3L
|
||||
movq MMWORD [wk(11)], mm0 ; wk(11)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
|
||||
; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
|
||||
; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; tmp0 += z1 + z3; tmp1 += z2 + z4;
|
||||
; tmp2 += z2 + z3; tmp3 += z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
|
||||
; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
|
||||
; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
|
||||
; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
|
||||
; tmp0 += z3; tmp1 += z4;
|
||||
; tmp2 += z3; tmp3 += z4;
|
||||
|
||||
movq mm2, mm3
|
||||
movq mm0, mm3
|
||||
punpcklwd mm2, mm4
|
||||
punpckhwd mm0, mm4
|
||||
movq mm3, mm2
|
||||
movq mm4, mm0
|
||||
pmaddwd mm2, [GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L
|
||||
pmaddwd mm0, [GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H
|
||||
pmaddwd mm3, [GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L
|
||||
pmaddwd mm4, [GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H
|
||||
|
||||
paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L
|
||||
paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H
|
||||
paddd mm3, mm5 ; mm3=tmp3L
|
||||
paddd mm4, mm7 ; mm4=tmp3H
|
||||
|
||||
movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L
|
||||
movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H
|
||||
|
||||
movq mm2, mm1
|
||||
movq mm0, mm1
|
||||
punpcklwd mm2, mm6
|
||||
punpckhwd mm0, mm6
|
||||
movq mm1, mm2
|
||||
movq mm6, mm0
|
||||
pmaddwd mm2, [GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L
|
||||
pmaddwd mm0, [GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H
|
||||
pmaddwd mm1, [GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L
|
||||
pmaddwd mm6, [GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H
|
||||
|
||||
paddd mm2, mm5 ; mm2=tmp1L
|
||||
paddd mm0, mm7 ; mm0=tmp1H
|
||||
paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L
|
||||
paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H
|
||||
|
||||
movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L
|
||||
movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movq mm5, MMWORD [wk(0)] ; mm5=tmp10L
|
||||
movq mm7, MMWORD [wk(1)] ; mm7=tmp10H
|
||||
|
||||
movq mm2, mm5
|
||||
movq mm0, mm7
|
||||
paddd mm5, mm3 ; mm5=data0L
|
||||
paddd mm7, mm4 ; mm7=data0H
|
||||
psubd mm2, mm3 ; mm2=data7L
|
||||
psubd mm0, mm4 ; mm0=data7H
|
||||
|
||||
movq mm3, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2]
|
||||
|
||||
paddd mm5, mm3
|
||||
paddd mm7, mm3
|
||||
psrad mm5, DESCALE_P2
|
||||
psrad mm7, DESCALE_P2
|
||||
paddd mm2, mm3
|
||||
paddd mm0, mm3
|
||||
psrad mm2, DESCALE_P2
|
||||
psrad mm0, DESCALE_P2
|
||||
|
||||
packssdw mm5, mm7 ; mm5=data0=(00 10 20 30)
|
||||
packssdw mm2, mm0 ; mm2=data7=(07 17 27 37)
|
||||
|
||||
movq mm4, MMWORD [wk(4)] ; mm4=tmp11L
|
||||
movq mm3, MMWORD [wk(5)] ; mm3=tmp11H
|
||||
|
||||
movq mm7, mm4
|
||||
movq mm0, mm3
|
||||
paddd mm4, mm1 ; mm4=data1L
|
||||
paddd mm3, mm6 ; mm3=data1H
|
||||
psubd mm7, mm1 ; mm7=data6L
|
||||
psubd mm0, mm6 ; mm0=data6H
|
||||
|
||||
movq mm1, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2]
|
||||
|
||||
paddd mm4, mm1
|
||||
paddd mm3, mm1
|
||||
psrad mm4, DESCALE_P2
|
||||
psrad mm3, DESCALE_P2
|
||||
paddd mm7, mm1
|
||||
paddd mm0, mm1
|
||||
psrad mm7, DESCALE_P2
|
||||
psrad mm0, DESCALE_P2
|
||||
|
||||
packssdw mm4, mm3 ; mm4=data1=(01 11 21 31)
|
||||
packssdw mm7, mm0 ; mm7=data6=(06 16 26 36)
|
||||
|
||||
packsswb mm5, mm7 ; mm5=(00 10 20 30 06 16 26 36)
|
||||
packsswb mm4, mm2 ; mm4=(01 11 21 31 07 17 27 37)
|
||||
|
||||
movq mm6, MMWORD [wk(6)] ; mm6=tmp12L
|
||||
movq mm1, MMWORD [wk(7)] ; mm1=tmp12H
|
||||
movq mm3, MMWORD [wk(10)] ; mm3=tmp1L
|
||||
movq mm0, MMWORD [wk(11)] ; mm0=tmp1H
|
||||
|
||||
movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 20 30 06 16 26 36)
|
||||
movq MMWORD [wk(1)], mm4 ; wk(1)=(01 11 21 31 07 17 27 37)
|
||||
|
||||
movq mm7, mm6
|
||||
movq mm2, mm1
|
||||
paddd mm6, mm3 ; mm6=data2L
|
||||
paddd mm1, mm0 ; mm1=data2H
|
||||
psubd mm7, mm3 ; mm7=data5L
|
||||
psubd mm2, mm0 ; mm2=data5H
|
||||
|
||||
movq mm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2]
|
||||
|
||||
paddd mm6, mm5
|
||||
paddd mm1, mm5
|
||||
psrad mm6, DESCALE_P2
|
||||
psrad mm1, DESCALE_P2
|
||||
paddd mm7, mm5
|
||||
paddd mm2, mm5
|
||||
psrad mm7, DESCALE_P2
|
||||
psrad mm2, DESCALE_P2
|
||||
|
||||
packssdw mm6, mm1 ; mm6=data2=(02 12 22 32)
|
||||
packssdw mm7, mm2 ; mm7=data5=(05 15 25 35)
|
||||
|
||||
movq mm4, MMWORD [wk(2)] ; mm4=tmp13L
|
||||
movq mm3, MMWORD [wk(3)] ; mm3=tmp13H
|
||||
movq mm0, MMWORD [wk(8)] ; mm0=tmp0L
|
||||
movq mm5, MMWORD [wk(9)] ; mm5=tmp0H
|
||||
|
||||
movq mm1, mm4
|
||||
movq mm2, mm3
|
||||
paddd mm4, mm0 ; mm4=data3L
|
||||
paddd mm3, mm5 ; mm3=data3H
|
||||
psubd mm1, mm0 ; mm1=data4L
|
||||
psubd mm2, mm5 ; mm2=data4H
|
||||
|
||||
movq mm0, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2]
|
||||
|
||||
paddd mm4, mm0
|
||||
paddd mm3, mm0
|
||||
psrad mm4, DESCALE_P2
|
||||
psrad mm3, DESCALE_P2
|
||||
paddd mm1, mm0
|
||||
paddd mm2, mm0
|
||||
psrad mm1, DESCALE_P2
|
||||
psrad mm2, DESCALE_P2
|
||||
|
||||
movq mm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm5=[PB_CENTERJSAMP]
|
||||
|
||||
packssdw mm4, mm3 ; mm4=data3=(03 13 23 33)
|
||||
packssdw mm1, mm2 ; mm1=data4=(04 14 24 34)
|
||||
|
||||
movq mm0, MMWORD [wk(0)] ; mm0=(00 10 20 30 06 16 26 36)
|
||||
movq mm3, MMWORD [wk(1)] ; mm3=(01 11 21 31 07 17 27 37)
|
||||
|
||||
packsswb mm6, mm1 ; mm6=(02 12 22 32 04 14 24 34)
|
||||
packsswb mm4, mm7 ; mm4=(03 13 23 33 05 15 25 35)
|
||||
|
||||
paddb mm0, mm5
|
||||
paddb mm3, mm5
|
||||
paddb mm6, mm5
|
||||
paddb mm4, mm5
|
||||
|
||||
movq mm2, mm0 ; transpose coefficients(phase 1)
|
||||
punpcklbw mm0, mm3 ; mm0=(00 01 10 11 20 21 30 31)
|
||||
punpckhbw mm2, mm3 ; mm2=(06 07 16 17 26 27 36 37)
|
||||
movq mm1, mm6 ; transpose coefficients(phase 1)
|
||||
punpcklbw mm6, mm4 ; mm6=(02 03 12 13 22 23 32 33)
|
||||
punpckhbw mm1, mm4 ; mm1=(04 05 14 15 24 25 34 35)
|
||||
|
||||
movq mm7, mm0 ; transpose coefficients(phase 2)
|
||||
punpcklwd mm0, mm6 ; mm0=(00 01 02 03 10 11 12 13)
|
||||
punpckhwd mm7, mm6 ; mm7=(20 21 22 23 30 31 32 33)
|
||||
movq mm5, mm1 ; transpose coefficients(phase 2)
|
||||
punpcklwd mm1, mm2 ; mm1=(04 05 06 07 14 15 16 17)
|
||||
punpckhwd mm5, mm2 ; mm5=(24 25 26 27 34 35 36 37)
|
||||
|
||||
movq mm3, mm0 ; transpose coefficients(phase 3)
|
||||
punpckldq mm0, mm1 ; mm0=(00 01 02 03 04 05 06 07)
|
||||
punpckhdq mm3, mm1 ; mm3=(10 11 12 13 14 15 16 17)
|
||||
movq mm4, mm7 ; transpose coefficients(phase 3)
|
||||
punpckldq mm7, mm5 ; mm7=(20 21 22 23 24 25 26 27)
|
||||
punpckhdq mm4, mm5 ; mm4=(30 31 32 33 34 35 36 37)
|
||||
|
||||
pushpic ebx ; save GOT address
|
||||
|
||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
|
||||
movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
|
||||
mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
|
||||
mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
|
||||
movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
|
||||
movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
|
||||
|
||||
poppic ebx ; restore GOT address
|
||||
|
||||
add esi, byte 4*SIZEOF_JCOEF ; wsptr
|
||||
add edi, byte 4*SIZEOF_JSAMPROW
|
||||
dec ecx ; ctr
|
||||
jnz near .rowloop
|
||||
|
||||
; Epilogue: leave MMX state, restore the registers saved by the prologue in
; reverse push order, then unwind the aligned frame.
emms ; empty MMX state
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
; [ebp] holds the unaligned frame address stored by the prologue; popping it
; into esp leaves esp pointing at the caller's saved ebp.
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
858
TMessagesProj/jni/mozjpeg/simd/i386/jidctint-sse2.asm
Normal file
858
TMessagesProj/jni/mozjpeg/simd/i386/jidctint-sse2.asm
Normal file
|
|
@ -0,0 +1,858 @@
|
|||
;
|
||||
; jidctint.asm - accurate integer IDCT (SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a slow-but-accurate integer implementation of the
|
||||
; inverse DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jidctint.c; see jidctint.c for
|
||||
; more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point parameters.  The F_* constants below are FIX(x): x scaled by
; 2^CONST_BITS and rounded to an integer (e.g. 0.541196100 * 8192 ~= 4433).
%define CONST_BITS 13
|
||||
; Extra scaling bits kept in the pass-1 output: pass 1 descales by
; CONST_BITS - PASS1_BITS only, and the DC-only shortcut shifts left by
; PASS1_BITS.
%define PASS1_BITS 2
|
||||
|
||||
; Right-shift counts used with psrad at the end of pass 1 and pass 2.
; DESCALE_P2 removes CONST_BITS, the PASS1_BITS carried over from pass 1 and
; 3 further bits (presumably the IDCT's 1/8 normalization -- confirm against
; jidctint.c).
%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
|
||||
%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
|
||||
|
||||
; Precomputed table for the default CONST_BITS of 13.
%if CONST_BITS == 13
|
||||
F_0_298 equ 2446 ; FIX(0.298631336)
|
||||
F_0_390 equ 3196 ; FIX(0.390180644)
|
||||
F_0_541 equ 4433 ; FIX(0.541196100)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_175 equ 9633 ; FIX(1.175875602)
|
||||
F_1_501 equ 12299 ; FIX(1.501321110)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_1_961 equ 16069 ; FIX(1.961570560)
|
||||
F_2_053 equ 16819 ; FIX(2.053119869)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_072 equ 25172 ; FIX(3.072711026)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
; The integer literals below are therefore the same constants pre-scaled by
; 2^30; DESCALE(x, n) rounds x to nearest while shifting right by n bits,
; leaving CONST_BITS fractional bits.
|
||||
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
|
||||
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
|
||||
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
|
||||
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
|
||||
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
|
||||
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
|
||||
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
|
||||
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
|
||||
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for jsimd_idct_islow_sse2; the routine addresses each entry
; as [GOTOFF(ebx,<label>)].
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_idct_islow_sse2)
|
||||
|
||||
EXTN(jconst_idct_islow_sse2):
|
||||
|
||||
; PW_* entries: one xmmword each, holding a pair of 16-bit multipliers
; repeated four times.  They are consumed by pmaddwd on operands whose words
; were interleaved with punpcklwd/punpckhwd, so every dword result is
; a*first + b*second.  Label parts give the approximate multiplier:
; Fabc ~= +a.bc, MFabc ~= -a.bc (e.g. F130 = 0.541 + 0.765).
PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
|
||||
PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847)
|
||||
PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
|
||||
PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390)
|
||||
PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899
|
||||
PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899)
|
||||
PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562
|
||||
PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562)
|
||||
; PD_DESCALE_Pn: rounding bias (half of 2^DESCALE_Pn) in each dword, added
; with paddd immediately before the psrad by DESCALE_Pn.
PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1)
|
||||
PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1)
|
||||
; PB_CENTERJSAMP: CENTERJSAMPLE in every byte, added with paddb to the
; byte-packed pass-2 results before they are stored.
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; Stack arguments, addressed relative to the original (unaligned) frame
; pointer b: [b+0] is the saved ebp, [b+4] the return address, and the
; arguments follow from [b+8] in prototype order.
%define dct_table(b) (b) + 8 ; void *dct_table
|
||||
%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
|
||||
%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
|
||||
%define output_col(b) (b) + 20 ; JDIMENSION output_col
|
||||
|
||||
; Slot at the aligned ebp in which the prologue stores the original frame
; pointer (reloaded with "mov eax, [original_ebp]").
%define original_ebp ebp + 0
|
||||
; Address of the i-th xmmword work slot; the slots lie directly below the
; aligned ebp, wk(WK_NUM - 1) being the one nearest to it.
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 12
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
|
||||
|
||||
; Entry point.  The prologue builds a 16-byte-aligned frame: ebp becomes the
; aligned frame pointer, [ebp] (= original_ebp) keeps the unaligned frame
; pointer needed to reach the stack arguments, and the wk[] work area is
; reserved directly below ebp.
EXTN(jsimd_idct_islow_sse2):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original (unaligned) frame pointer; [eax] = caller's ebp
|
||||
sub esp, byte 4 ; make room for one dword below the saved ebp
|
||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [esp], eax ; stash the original frame pointer at [aligned ebp]
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)] ; reserve wk[WK_NUM] below the aligned ebp
|
||||
pushpic ebx
|
||||
; push ecx ; unused
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
|
||||
; The reload below is disabled: the code relies on eax still holding the
; original frame pointer that the prologue put there.
|
||||
; mov eax, [original_ebp]
|
||||
mov edx, POINTER [dct_table(eax)] ; quantptr
|
||||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
|
||||
|
||||
; Shortcut for blocks whose rows 1..7 are entirely zero.  It can be compiled
; out by defining NO_ZERO_COLUMN_TEST_ISLOW_SSE2.
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
|
||||
; Cheap early-out: one dword from each of rows 1 and 2 (presumably the two
; leading coefficients of each row -- DWBLOCK is defined in the includes).
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz near .columnDCT
|
||||
|
||||
|
||||
; Full test: OR rows 1..7 together, word by word.
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm1, xmm0
|
||||
; Fold the eight OR-ed words into the low four bytes.  Signed saturation
; never turns a nonzero value into zero, so eax == 0 iff every word was zero.
packsswb xmm1, xmm1
|
||||
packsswb xmm1, xmm1
|
||||
movd eax, xmm1
|
||||
test eax, eax
|
||||
jnz short .columnDCT
|
||||
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
; Only row 0 contributes: dequantize it and scale it up by PASS1_BITS.
|
||||
movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
|
||||
psllw xmm5, PASS1_BITS
|
||||
|
||||
|
||||
; Broadcast each of the eight values across a whole register so that the
; result has the same layout .columnDCT leaves behind.
movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
|
||||
punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
|
||||
punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
|
||||
|
||||
|
||||
pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
|
||||
pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
|
||||
pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
|
||||
pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
|
||||
pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
|
||||
pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
|
||||
pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
|
||||
pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
|
||||
|
||||
|
||||
; Odd columns go to wk(8..11); even columns stay in xmm7/xmm1/xmm0/xmm2.
movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
|
||||
movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
|
||||
movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
|
||||
movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
|
||||
jmp near .column_end
|
||||
alignx 16, 7
|
||||
%endif
|
||||
.columnDCT:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; (Original)
|
||||
; z1 = (z2 + z3) * 0.541196100;
|
||||
; tmp2 = z1 + z3 * -1.847759065;
|
||||
; tmp3 = z1 + z2 * 0.765366865;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
||||
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
||||
|
||||
movdqa xmm4, xmm1 ; xmm1=in2=z2
|
||||
movdqa xmm5, xmm1
|
||||
punpcklwd xmm4, xmm3 ; xmm3=in6=z3
|
||||
punpckhwd xmm5, xmm3
|
||||
movdqa xmm1, xmm4
|
||||
movdqa xmm3, xmm5
|
||||
pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L
|
||||
pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
|
||||
pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
|
||||
pmaddwd xmm3, [GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H
|
||||
|
||||
movdqa xmm6, xmm0
|
||||
paddw xmm0, xmm2 ; xmm0=in0+in4
|
||||
psubw xmm6, xmm2 ; xmm6=in0-in4
|
||||
|
||||
pxor xmm7, xmm7
|
||||
pxor xmm2, xmm2
|
||||
punpcklwd xmm7, xmm0 ; xmm7=tmp0L
|
||||
punpckhwd xmm2, xmm0 ; xmm2=tmp0H
|
||||
psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
|
||||
psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
|
||||
|
||||
movdqa xmm0, xmm7
|
||||
paddd xmm7, xmm4 ; xmm7=tmp10L
|
||||
psubd xmm0, xmm4 ; xmm0=tmp13L
|
||||
movdqa xmm4, xmm2
|
||||
paddd xmm2, xmm5 ; xmm2=tmp10H
|
||||
psubd xmm4, xmm5 ; xmm4=tmp13H
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
|
||||
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
|
||||
movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
|
||||
movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
|
||||
|
||||
pxor xmm5, xmm5
|
||||
pxor xmm7, xmm7
|
||||
punpcklwd xmm5, xmm6 ; xmm5=tmp1L
|
||||
punpckhwd xmm7, xmm6 ; xmm7=tmp1H
|
||||
psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
|
||||
psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
|
||||
|
||||
movdqa xmm2, xmm5
|
||||
paddd xmm5, xmm1 ; xmm5=tmp11L
|
||||
psubd xmm2, xmm1 ; xmm2=tmp12L
|
||||
movdqa xmm0, xmm7
|
||||
paddd xmm7, xmm3 ; xmm7=tmp11H
|
||||
psubd xmm0, xmm3 ; xmm0=tmp12H
|
||||
|
||||
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
|
||||
movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
|
||||
movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
|
||||
movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
movdqa xmm5, xmm6
|
||||
movdqa xmm7, xmm4
|
||||
paddw xmm5, xmm3 ; xmm5=z3
|
||||
paddw xmm7, xmm1 ; xmm7=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
movdqa xmm2, xmm5
|
||||
movdqa xmm0, xmm5
|
||||
punpcklwd xmm2, xmm7
|
||||
punpckhwd xmm0, xmm7
|
||||
movdqa xmm5, xmm2
|
||||
movdqa xmm7, xmm0
|
||||
pmaddwd xmm2, [GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L
|
||||
pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H
|
||||
pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
|
||||
pmaddwd xmm7, [GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H
|
||||
|
||||
movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
|
||||
movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
|
||||
; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
|
||||
; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; tmp0 += z1 + z3; tmp1 += z2 + z4;
|
||||
; tmp2 += z2 + z3; tmp3 += z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
|
||||
; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
|
||||
; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
|
||||
; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
|
||||
; tmp0 += z3; tmp1 += z4;
|
||||
; tmp2 += z3; tmp3 += z4;
|
||||
|
||||
movdqa xmm2, xmm3
|
||||
movdqa xmm0, xmm3
|
||||
punpcklwd xmm2, xmm4
|
||||
punpckhwd xmm0, xmm4
|
||||
movdqa xmm3, xmm2
|
||||
movdqa xmm4, xmm0
|
||||
pmaddwd xmm2, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L
|
||||
pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H
|
||||
pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L
|
||||
pmaddwd xmm4, [GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H
|
||||
|
||||
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
|
||||
paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
|
||||
paddd xmm3, xmm5 ; xmm3=tmp3L
|
||||
paddd xmm4, xmm7 ; xmm4=tmp3H
|
||||
|
||||
movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
|
||||
movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
|
||||
|
||||
movdqa xmm2, xmm1
|
||||
movdqa xmm0, xmm1
|
||||
punpcklwd xmm2, xmm6
|
||||
punpckhwd xmm0, xmm6
|
||||
movdqa xmm1, xmm2
|
||||
movdqa xmm6, xmm0
|
||||
pmaddwd xmm2, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L
|
||||
pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H
|
||||
pmaddwd xmm1, [GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L
|
||||
pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
|
||||
|
||||
paddd xmm2, xmm5 ; xmm2=tmp1L
|
||||
paddd xmm0, xmm7 ; xmm0=tmp1H
|
||||
paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
|
||||
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
|
||||
|
||||
movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
|
||||
movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
|
||||
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
|
||||
|
||||
movdqa xmm2, xmm5
|
||||
movdqa xmm0, xmm7
|
||||
paddd xmm5, xmm3 ; xmm5=data0L
|
||||
paddd xmm7, xmm4 ; xmm7=data0H
|
||||
psubd xmm2, xmm3 ; xmm2=data7L
|
||||
psubd xmm0, xmm4 ; xmm0=data7H
|
||||
|
||||
movdqa xmm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1]
|
||||
|
||||
paddd xmm5, xmm3
|
||||
paddd xmm7, xmm3
|
||||
psrad xmm5, DESCALE_P1
|
||||
psrad xmm7, DESCALE_P1
|
||||
paddd xmm2, xmm3
|
||||
paddd xmm0, xmm3
|
||||
psrad xmm2, DESCALE_P1
|
||||
psrad xmm0, DESCALE_P1
|
||||
|
||||
packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
|
||||
packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
|
||||
|
||||
movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
|
||||
movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
|
||||
|
||||
movdqa xmm7, xmm4
|
||||
movdqa xmm0, xmm3
|
||||
paddd xmm4, xmm1 ; xmm4=data1L
|
||||
paddd xmm3, xmm6 ; xmm3=data1H
|
||||
psubd xmm7, xmm1 ; xmm7=data6L
|
||||
psubd xmm0, xmm6 ; xmm0=data6H
|
||||
|
||||
movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1]
|
||||
|
||||
paddd xmm4, xmm1
|
||||
paddd xmm3, xmm1
|
||||
psrad xmm4, DESCALE_P1
|
||||
psrad xmm3, DESCALE_P1
|
||||
paddd xmm7, xmm1
|
||||
paddd xmm0, xmm1
|
||||
psrad xmm7, DESCALE_P1
|
||||
psrad xmm0, DESCALE_P1
|
||||
|
||||
packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
|
||||
packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
|
||||
|
||||
movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
|
||||
punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
|
||||
movdqa xmm1, xmm7 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
|
||||
punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
|
||||
movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
|
||||
movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
|
||||
movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
|
||||
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
|
||||
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
|
||||
movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa xmm5, xmm3
|
||||
movdqa xmm6, xmm0
|
||||
paddd xmm3, xmm4 ; xmm3=data2L
|
||||
paddd xmm0, xmm2 ; xmm0=data2H
|
||||
psubd xmm5, xmm4 ; xmm5=data5L
|
||||
psubd xmm6, xmm2 ; xmm6=data5H
|
||||
|
||||
movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1]
|
||||
|
||||
paddd xmm3, xmm7
|
||||
paddd xmm0, xmm7
|
||||
psrad xmm3, DESCALE_P1
|
||||
psrad xmm0, DESCALE_P1
|
||||
paddd xmm5, xmm7
|
||||
paddd xmm6, xmm7
|
||||
psrad xmm5, DESCALE_P1
|
||||
psrad xmm6, DESCALE_P1
|
||||
|
||||
packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
|
||||
packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
|
||||
|
||||
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
|
||||
movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
|
||||
movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
|
||||
movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
|
||||
|
||||
movdqa xmm0, xmm1
|
||||
movdqa xmm6, xmm4
|
||||
paddd xmm1, xmm2 ; xmm1=data3L
|
||||
paddd xmm4, xmm7 ; xmm4=data3H
|
||||
psubd xmm0, xmm2 ; xmm0=data4L
|
||||
psubd xmm6, xmm7 ; xmm6=data4H
|
||||
|
||||
movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1]
|
||||
|
||||
paddd xmm1, xmm2
|
||||
paddd xmm4, xmm2
|
||||
psrad xmm1, DESCALE_P1
|
||||
psrad xmm4, DESCALE_P1
|
||||
paddd xmm0, xmm2
|
||||
paddd xmm6, xmm2
|
||||
psrad xmm0, DESCALE_P1
|
||||
psrad xmm6, DESCALE_P1
|
||||
|
||||
packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
|
||||
packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
|
||||
|
||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
|
||||
movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
|
||||
|
||||
movdqa xmm4, xmm3 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
|
||||
punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
|
||||
movdqa xmm6, xmm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
|
||||
punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
|
||||
|
||||
movdqa xmm1, xmm7 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
|
||||
punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
|
||||
movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
|
||||
punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
|
||||
movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
|
||||
movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa xmm2, xmm0 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
|
||||
punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
|
||||
movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
|
||||
punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
|
||||
|
||||
movdqa xmm3, xmm7 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
|
||||
punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
|
||||
movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
|
||||
punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
|
||||
|
||||
movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
|
||||
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
|
||||
movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
|
||||
|
||||
movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
|
||||
punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
|
||||
movdqa xmm4, xmm2 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
|
||||
punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
|
||||
movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
|
||||
; Both pass-1 paths arrive here with col0/col2/col4/col6 in
; xmm7/xmm1/xmm0/xmm2 and col1/col3/col5/col7 in wk(8)/wk(9)/wk(10)/wk(11).
.column_end:
|
||||
|
||||
; -- Prefetch the next coefficient block
|
||||
|
||||
; esi still holds inptr; the four hints are spaced 32 bytes apart, starting
; DCTSIZE2*SIZEOF_JCOEF bytes past it.
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
|
||||
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
; eax may have been overwritten by the zero-column test in pass 1, so the
; original frame pointer is reloaded before the arguments are fetched.  After
; the last load eax holds output_col, not the frame pointer.
|
||||
mov eax, [original_ebp]
|
||||
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [output_col(eax)]
|
||||
|
||||
; -- Even part
|
||||
|
||||
; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
|
||||
|
||||
; (Original)
|
||||
; z1 = (z2 + z3) * 0.541196100;
|
||||
; tmp2 = z1 + z3 * -1.847759065;
|
||||
; tmp3 = z1 + z2 * 0.765366865;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
||||
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
||||
|
||||
movdqa xmm6, xmm1 ; xmm1=in2=z2
|
||||
movdqa xmm5, xmm1
|
||||
punpcklwd xmm6, xmm2 ; xmm2=in6=z3
|
||||
punpckhwd xmm5, xmm2
|
||||
movdqa xmm1, xmm6
|
||||
movdqa xmm2, xmm5
|
||||
pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L
|
||||
pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
|
||||
pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
|
||||
pmaddwd xmm2, [GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H
|
||||
|
||||
movdqa xmm3, xmm7
|
||||
paddw xmm7, xmm0 ; xmm7=in0+in4
|
||||
psubw xmm3, xmm0 ; xmm3=in0-in4
|
||||
|
||||
pxor xmm4, xmm4
|
||||
pxor xmm0, xmm0
|
||||
punpcklwd xmm4, xmm7 ; xmm4=tmp0L
|
||||
punpckhwd xmm0, xmm7 ; xmm0=tmp0H
|
||||
psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
|
||||
psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
|
||||
|
||||
movdqa xmm7, xmm4
|
||||
paddd xmm4, xmm6 ; xmm4=tmp10L
|
||||
psubd xmm7, xmm6 ; xmm7=tmp13L
|
||||
movdqa xmm6, xmm0
|
||||
paddd xmm0, xmm5 ; xmm0=tmp10H
|
||||
psubd xmm6, xmm5 ; xmm6=tmp13H
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
|
||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
|
||||
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
|
||||
movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
|
||||
|
||||
pxor xmm5, xmm5
|
||||
pxor xmm4, xmm4
|
||||
punpcklwd xmm5, xmm3 ; xmm5=tmp1L
|
||||
punpckhwd xmm4, xmm3 ; xmm4=tmp1H
|
||||
psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
|
||||
psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
paddd xmm5, xmm1 ; xmm5=tmp11L
|
||||
psubd xmm0, xmm1 ; xmm0=tmp12L
|
||||
movdqa xmm7, xmm4
|
||||
paddd xmm4, xmm2 ; xmm4=tmp11H
|
||||
psubd xmm7, xmm2 ; xmm7=tmp12H
|
||||
|
||||
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
|
||||
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
|
||||
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
|
||||
movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
|
||||
movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
|
||||
movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
|
||||
movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
|
||||
|
||||
movdqa xmm5, xmm6
|
||||
movdqa xmm4, xmm3
|
||||
paddw xmm5, xmm1 ; xmm5=z3
|
||||
paddw xmm4, xmm2 ; xmm4=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
movdqa xmm7, xmm5
|
||||
punpcklwd xmm0, xmm4
|
||||
punpckhwd xmm7, xmm4
|
||||
movdqa xmm5, xmm0
|
||||
movdqa xmm4, xmm7
|
||||
pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L
|
||||
pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H
|
||||
pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
|
||||
pmaddwd xmm4, [GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H
|
||||
|
||||
movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
|
||||
movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
|
||||
; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
|
||||
; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; tmp0 += z1 + z3; tmp1 += z2 + z4;
|
||||
; tmp2 += z2 + z3; tmp3 += z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
|
||||
; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
|
||||
; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
|
||||
; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
|
||||
; tmp0 += z3; tmp1 += z4;
|
||||
; tmp2 += z3; tmp3 += z4;
|
||||
|
||||
movdqa xmm0, xmm1
|
||||
movdqa xmm7, xmm1
|
||||
punpcklwd xmm0, xmm3
|
||||
punpckhwd xmm7, xmm3
|
||||
movdqa xmm1, xmm0
|
||||
movdqa xmm3, xmm7
|
||||
pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L
|
||||
pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H
|
||||
pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L
|
||||
pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H
|
||||
|
||||
paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
|
||||
paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
|
||||
paddd xmm1, xmm5 ; xmm1=tmp3L
|
||||
paddd xmm3, xmm4 ; xmm3=tmp3H
|
||||
|
||||
movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
|
||||
movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
|
||||
|
||||
movdqa xmm0, xmm2
|
||||
movdqa xmm7, xmm2
|
||||
punpcklwd xmm0, xmm6
|
||||
punpckhwd xmm7, xmm6
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm6, xmm7
|
||||
pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L
|
||||
pmaddwd xmm7, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H
|
||||
pmaddwd xmm2, [GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L
|
||||
pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
|
||||
|
||||
paddd xmm0, xmm5 ; xmm0=tmp1L
|
||||
paddd xmm7, xmm4 ; xmm7=tmp1H
|
||||
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
|
||||
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
|
||||
|
||||
movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
|
||||
movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
|
||||
movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
movdqa xmm7, xmm4
|
||||
paddd xmm5, xmm1 ; xmm5=data0L
|
||||
paddd xmm4, xmm3 ; xmm4=data0H
|
||||
psubd xmm0, xmm1 ; xmm0=data7L
|
||||
psubd xmm7, xmm3 ; xmm7=data7H
|
||||
|
||||
movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2]
|
||||
|
||||
paddd xmm5, xmm1
|
||||
paddd xmm4, xmm1
|
||||
psrad xmm5, DESCALE_P2
|
||||
psrad xmm4, DESCALE_P2
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm7, xmm1
|
||||
psrad xmm0, DESCALE_P2
|
||||
psrad xmm7, DESCALE_P2
|
||||
|
||||
packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
|
||||
packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
|
||||
movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
movdqa xmm7, xmm1
|
||||
paddd xmm3, xmm2 ; xmm3=data1L
|
||||
paddd xmm1, xmm6 ; xmm1=data1H
|
||||
psubd xmm4, xmm2 ; xmm4=data6L
|
||||
psubd xmm7, xmm6 ; xmm7=data6H
|
||||
|
||||
movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2]
|
||||
|
||||
paddd xmm3, xmm2
|
||||
paddd xmm1, xmm2
|
||||
psrad xmm3, DESCALE_P2
|
||||
psrad xmm1, DESCALE_P2
|
||||
paddd xmm4, xmm2
|
||||
paddd xmm7, xmm2
|
||||
psrad xmm4, DESCALE_P2
|
||||
psrad xmm7, DESCALE_P2
|
||||
|
||||
packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
|
||||
packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
|
||||
|
||||
packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||
packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
|
||||
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
|
||||
movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
|
||||
movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa xmm4, xmm6
|
||||
movdqa xmm0, xmm2
|
||||
paddd xmm6, xmm1 ; xmm6=data2L
|
||||
paddd xmm2, xmm7 ; xmm2=data2H
|
||||
psubd xmm4, xmm1 ; xmm4=data5L
|
||||
psubd xmm0, xmm7 ; xmm0=data5H
|
||||
|
||||
movdqa xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2]
|
||||
|
||||
paddd xmm6, xmm5
|
||||
paddd xmm2, xmm5
|
||||
psrad xmm6, DESCALE_P2
|
||||
psrad xmm2, DESCALE_P2
|
||||
paddd xmm4, xmm5
|
||||
paddd xmm0, xmm5
|
||||
psrad xmm4, DESCALE_P2
|
||||
psrad xmm0, DESCALE_P2
|
||||
|
||||
packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
|
||||
packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
|
||||
movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
|
||||
movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
|
||||
movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
|
||||
|
||||
movdqa xmm2, xmm3
|
||||
movdqa xmm0, xmm1
|
||||
paddd xmm3, xmm7 ; xmm3=data3L
|
||||
paddd xmm1, xmm5 ; xmm1=data3H
|
||||
psubd xmm2, xmm7 ; xmm2=data4L
|
||||
psubd xmm0, xmm5 ; xmm0=data4H
|
||||
|
||||
movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2]
|
||||
|
||||
paddd xmm3, xmm7
|
||||
paddd xmm1, xmm7
|
||||
psrad xmm3, DESCALE_P2
|
||||
psrad xmm1, DESCALE_P2
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm0, xmm7
|
||||
psrad xmm2, DESCALE_P2
|
||||
psrad xmm0, DESCALE_P2
|
||||
|
||||
movdqa xmm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP]
|
||||
|
||||
packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
|
||||
packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
|
||||
|
||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||
|
||||
packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
|
||||
packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
|
||||
|
||||
paddb xmm7, xmm5
|
||||
paddb xmm1, xmm5
|
||||
paddb xmm6, xmm5
|
||||
paddb xmm3, xmm5
|
||||
|
||||
movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
|
||||
punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
|
||||
punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
|
||||
movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
|
||||
punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
|
||||
punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
|
||||
|
||||
movdqa xmm4, xmm7 ; transpose coefficients(phase 2)
|
||||
punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
||||
punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
|
||||
movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
|
||||
punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
||||
punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
|
||||
|
||||
movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
|
||||
punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
||||
punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
||||
movdqa xmm3, xmm4 ; transpose coefficients(phase 3)
|
||||
punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
|
||||
punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
|
||||
|
||||
pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
||||
pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
||||
pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
|
||||
pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
|
||||
|
||||
; Store the eight output rows, 8 samples each, at column offset eax within
; the row pointers taken from the array at edi.  movq writes only the low
; 8 bytes of a register: xmm7/xmm1/xmm4/xmm3 carry rows 0/2/4/6 there, and
; the half-swapped copies xmm6/xmm0/xmm2/xmm5 carry rows 1/3/5/7.
; esi is reused as a scratch row pointer; inptr is no longer needed.
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
|
||||
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
|
||||
mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
|
||||
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
|
||||
|
||||
|
||||
mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
|
||||
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
|
||||
mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
|
||||
movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
|
||||
|
||||
|
||||
; Epilogue: undo the prologue in reverse order.
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; unused
|
||||
poppic ebx
|
||||
; Drop the wk[] area, then load esp from the slot at the aligned ebp, where
; the prologue stored the original frame pointer.
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
704
TMessagesProj/jni/mozjpeg/simd/i386/jidctred-mmx.asm
Normal file
704
TMessagesProj/jni/mozjpeg/simd/i386/jidctred-mmx.asm
Normal file
|
|
@ -0,0 +1,704 @@
|
|||
;
|
||||
; jidctred.asm - reduced-size IDCT (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains inverse-DCT routines that produce reduced-size
|
||||
; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
|
||||
; The following code is based directly on the IJG's original jidctred.c;
|
||||
; see jidctred.c for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define CONST_BITS 13
|
||||
%define PASS1_BITS 2
|
||||
|
||||
%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1)
|
||||
%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1)
|
||||
%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2)
|
||||
%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2)
|
||||
|
||||
; Fixed-point multipliers used by the reduced-size IDCT routines below.
; Each F_a_bcd symbol is the real constant named in its FIX() comment,
; scaled by 2^CONST_BITS and rounded to an integer.  The literal values in
; the first branch are only valid when CONST_BITS is 13.
%if CONST_BITS == 13
|
||||
F_0_211 equ 1730 ; FIX(0.211164243)
|
||||
F_0_509 equ 4176 ; FIX(0.509795579)
|
||||
F_0_601 equ 4926 ; FIX(0.601344887)
|
||||
F_0_720 equ 5906 ; FIX(0.720959822)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_850 equ 6967 ; FIX(0.850430095)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_061 equ 8697 ; FIX(1.061594337)
|
||||
F_1_272 equ 10426 ; FIX(1.272758580)
|
||||
F_1_451 equ 11893 ; FIX(1.451774981)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_2_172 equ 17799 ; FIX(2.172734803)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_624 equ 29692 ; FIX(3.624509785)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
; Fallback for any other CONST_BITS: the integer literals below are the same
; constants pre-scaled by 2^30; DESCALE(x, n) is a rounding right shift by n
; bits, so shifting by (30 - CONST_BITS) leaves CONST_BITS fractional bits.
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
|
||||
F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243)
|
||||
F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579)
|
||||
F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887)
|
||||
F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822)
|
||||
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095)
|
||||
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337)
|
||||
F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580)
|
||||
F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981)
|
||||
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
|
||||
F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803)
|
||||
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Read-only operand table for the MMX reduced-size IDCT routines.
; Naming: PW_ = packed words, PD_ = packed dwords, PB_ = packed bytes;
; "Fnnn" / "MFnnn" name the leading digits of a positive / negated F_*
; multiplier.  Every PW_ entry repeats one (a, b) word pair twice, giving a
; single 8-byte operand for pmaddwd, which then yields x*a + y*b for each
; interleaved (x, y) input pair.
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_idct_red_mmx)
|
||||
|
||||
EXTN(jconst_idct_red_mmx):
|
||||
|
||||
PW_F184_MF076 times 2 dw F_1_847, -F_0_765 ; 4x4 even part: (in2, in6) pairs
|
||||
PW_F256_F089 times 2 dw F_2_562, F_0_899 ; 4x4 odd part, tmp2: (in1, in3) pairs
|
||||
PW_F106_MF217 times 2 dw F_1_061, -F_2_172 ; 4x4 odd part, tmp0: (in1, in3) pairs
|
||||
PW_MF060_MF050 times 2 dw -F_0_601, -F_0_509 ; 4x4 odd part, tmp2: (in5, in7) pairs
|
||||
PW_F145_MF021 times 2 dw F_1_451, -F_0_211 ; 4x4 odd part, tmp0: (in5, in7) pairs
|
||||
PW_F362_MF127 times 2 dw F_3_624, -F_1_272 ; 2x2 odd part: (in1, in3) pairs
|
||||
PW_F085_MF072 times 2 dw F_0_850, -F_0_720 ; 2x2 odd part: (in5, in7) pairs
|
||||
PD_DESCALE_P1_4 times 2 dd 1 << (DESCALE_P1_4 - 1) ; rounding bias added before psrad DESCALE_P1_4
|
||||
PD_DESCALE_P2_4 times 2 dd 1 << (DESCALE_P2_4 - 1) ; rounding bias added before psrad DESCALE_P2_4
|
||||
PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2 - 1) ; rounding bias added before psrad DESCALE_P1_2
|
||||
PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2 - 1) ; rounding bias added before psrad DESCALE_P2_2
|
||||
PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; added with paddb after the final packsswb
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients,
|
||||
; producing a reduced-size 4x4 output block.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_4x4_mmx(void *dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; Stack-argument offsets.  (b) is the address where "push ebp" stored the
; caller's ebp, so (b)+4 is the return address and the first argument is at
; (b)+8.
%define dct_table(b) (b) + 8 ; void *dct_table
|
||||
%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
|
||||
%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
|
||||
%define output_col(b) (b) + 20 ; JDIMENSION output_col
|
||||
|
||||
; Locals, all addressed from the realigned ebp established in the prologue:
;   [original_ebp] - pointer to the unaligned frame (base for the arguments)
;   wk(i)          - WK_NUM mmword spill slots immediately below ebp
;   workspace      - DCTSIZE2 JCOEFs below wk[]; written by pass 1 and read
;                    back by pass 2
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
|
||||
; mmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF
|
||||
; JCOEF workspace[DCTSIZE2]
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_idct_4x4_mmx)
|
||||
|
||||
EXTN(jsimd_idct_4x4_mmx):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
sub esp, byte 4 ; room for the saved frame-pointer slot
|
||||
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
|
||||
mov [esp], eax ; this slot is [original_ebp] once ebp = esp
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [workspace] ; reserve wk[] and workspace[] below ebp
|
||||
pushpic ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process columns from input, store into work array.
|
||||
|
||||
; eax is expected to still hold the original-ebp pointer set in the prologue,
; which is why the reload below is commented out.
; mov eax, [original_ebp]
|
||||
mov edx, POINTER [dct_table(eax)] ; quantptr
|
||||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
lea edi, [workspace] ; JCOEF *wsptr
|
||||
mov ecx, DCTSIZE/4 ; ctr
|
||||
alignx 16, 7
|
||||
; Each iteration transforms four adjacent columns (one mmword per coefficient
; row) and writes four transposed workspace rows.  Coefficient row 4 is never
; read by this routine.
.columnloop:
|
||||
%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
|
||||
; Shortcut: when rows 1,2,3,5,6,7 are all zero for these four columns, only
; the DC term has to be dequantized and replicated.  The scalar OR below is a
; cheap early-out before the full mmword test.
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz short .columnDCT
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
por mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
por mm0, mm1
|
||||
packsswb mm0, mm0 ; saturation keeps every non-zero word non-zero
|
||||
movd eax, mm0 ; eax = one byte per column, 0 if all AC terms are 0
|
||||
test eax, eax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
psllw mm0, PASS1_BITS ; same scale as the descaled output of the full path
|
||||
|
||||
movq mm2, mm0 ; mm0=in0=(00 01 02 03)
|
||||
punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
|
||||
punpckhwd mm2, mm2 ; mm2=(02 02 03 03)
|
||||
|
||||
movq mm1, mm0
|
||||
punpckldq mm0, mm0 ; mm0=(00 00 00 00)
|
||||
punpckhdq mm1, mm1 ; mm1=(01 01 01 01)
|
||||
movq mm3, mm2
|
||||
punpckldq mm2, mm2 ; mm2=(02 02 02 02)
|
||||
punpckhdq mm3, mm3 ; mm3=(03 03 03 03)
|
||||
|
||||
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
|
||||
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
|
||||
jmp near .nextcolumn
|
||||
alignx 16, 7
|
||||
%endif
|
||||
.columnDCT:
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
movq mm4, mm0
|
||||
movq mm5, mm0
|
||||
punpcklwd mm4, mm1 ; interleave in1/in3 words for pmaddwd
|
||||
punpckhwd mm5, mm1
|
||||
movq mm0, mm4
|
||||
movq mm1, mm5
|
||||
pmaddwd mm4, [GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
|
||||
pmaddwd mm5, [GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
|
||||
pmaddwd mm0, [GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
|
||||
pmaddwd mm1, [GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
|
||||
|
||||
movq mm6, mm2
|
||||
movq mm7, mm2
|
||||
punpcklwd mm6, mm3 ; interleave in5/in7 words for pmaddwd
|
||||
punpckhwd mm7, mm3
|
||||
movq mm2, mm6
|
||||
movq mm3, mm7
|
||||
pmaddwd mm6, [GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
|
||||
pmaddwd mm7, [GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
|
||||
pmaddwd mm2, [GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
|
||||
pmaddwd mm3, [GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
|
||||
|
||||
paddd mm6, mm4 ; mm6=tmp2L
|
||||
paddd mm7, mm5 ; mm7=tmp2H
|
||||
paddd mm2, mm0 ; mm2=tmp0L
|
||||
paddd mm3, mm1 ; mm3=tmp0H
|
||||
|
||||
movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
|
||||
movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
pxor mm1, mm1
|
||||
pxor mm2, mm2
|
||||
punpcklwd mm1, mm4 ; mm1=tmp0L
|
||||
punpckhwd mm2, mm4 ; mm2=tmp0H
|
||||
psrad mm1, (16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
|
||||
psrad mm2, (16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
|
||||
|
||||
movq mm3, mm5 ; mm5=in2=z2
|
||||
punpcklwd mm5, mm0 ; mm0=in6=z3
|
||||
punpckhwd mm3, mm0
|
||||
pmaddwd mm5, [GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
|
||||
pmaddwd mm3, [GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
|
||||
|
||||
movq mm4, mm1
|
||||
movq mm0, mm2
|
||||
paddd mm1, mm5 ; mm1=tmp10L
|
||||
paddd mm2, mm3 ; mm2=tmp10H
|
||||
psubd mm4, mm5 ; mm4=tmp12L
|
||||
psubd mm0, mm3 ; mm0=tmp12H
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movq mm5, mm1
|
||||
movq mm3, mm2
|
||||
paddd mm1, mm6 ; mm1=data0L
|
||||
paddd mm2, mm7 ; mm2=data0H
|
||||
psubd mm5, mm6 ; mm5=data3L
|
||||
psubd mm3, mm7 ; mm3=data3H
|
||||
|
||||
movq mm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm6=[PD_DESCALE_P1_4]
|
||||
|
||||
paddd mm1, mm6
|
||||
paddd mm2, mm6
|
||||
psrad mm1, DESCALE_P1_4
|
||||
psrad mm2, DESCALE_P1_4
|
||||
paddd mm5, mm6
|
||||
paddd mm3, mm6
|
||||
psrad mm5, DESCALE_P1_4
|
||||
psrad mm3, DESCALE_P1_4
|
||||
|
||||
packssdw mm1, mm2 ; mm1=data0=(00 01 02 03)
|
||||
packssdw mm5, mm3 ; mm5=data3=(30 31 32 33)
|
||||
|
||||
movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
|
||||
movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
|
||||
|
||||
movq mm2, mm4
|
||||
movq mm3, mm0
|
||||
paddd mm4, mm7 ; mm4=data1L
|
||||
paddd mm0, mm6 ; mm0=data1H
|
||||
psubd mm2, mm7 ; mm2=data2L
|
||||
psubd mm3, mm6 ; mm3=data2H
|
||||
|
||||
movq mm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm7=[PD_DESCALE_P1_4]
|
||||
|
||||
paddd mm4, mm7
|
||||
paddd mm0, mm7
|
||||
psrad mm4, DESCALE_P1_4
|
||||
psrad mm0, DESCALE_P1_4
|
||||
paddd mm2, mm7
|
||||
paddd mm3, mm7
|
||||
psrad mm2, DESCALE_P1_4
|
||||
psrad mm3, DESCALE_P1_4
|
||||
|
||||
packssdw mm4, mm0 ; mm4=data1=(10 11 12 13)
|
||||
packssdw mm2, mm3 ; mm2=data2=(20 21 22 23)
|
||||
|
||||
movq mm6, mm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm1, mm4 ; mm1=(00 10 01 11)
|
||||
punpckhwd mm6, mm4 ; mm6=(02 12 03 13)
|
||||
movq mm7, mm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd mm2, mm5 ; mm2=(20 30 21 31)
|
||||
punpckhwd mm7, mm5 ; mm7=(22 32 23 33)
|
||||
|
||||
movq mm0, mm1 ; transpose coefficients(phase 2)
|
||||
punpckldq mm1, mm2 ; mm1=(00 10 20 30)
|
||||
punpckhdq mm0, mm2 ; mm0=(01 11 21 31)
|
||||
movq mm3, mm6 ; transpose coefficients(phase 2)
|
||||
punpckldq mm6, mm7 ; mm6=(02 12 22 32)
|
||||
punpckhdq mm3, mm7 ; mm3=(03 13 23 33)
|
||||
|
||||
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
|
||||
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
|
||||
|
||||
.nextcolumn:
|
||||
add esi, byte 4*SIZEOF_JCOEF ; coef_block
|
||||
add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr
|
||||
add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
|
||||
dec ecx ; ctr
|
||||
jnz near .columnloop
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
; The workspace holds the transposed pass-1 results, so each mmword loaded
; below carries the four pass-1 outputs of one original column.  Workspace
; row 4 is not read.
mov eax, [original_ebp] ; eax may have been overwritten by the zero-column test
|
||||
lea esi, [workspace] ; JCOEF *wsptr
|
||||
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [output_col(eax)]
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
movq mm4, mm0
|
||||
movq mm5, mm0
|
||||
punpcklwd mm4, mm1
|
||||
punpckhwd mm5, mm1
|
||||
movq mm0, mm4
|
||||
movq mm1, mm5
|
||||
pmaddwd mm4, [GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
|
||||
pmaddwd mm5, [GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
|
||||
pmaddwd mm0, [GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
|
||||
pmaddwd mm1, [GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
|
||||
|
||||
movq mm6, mm2
|
||||
movq mm7, mm2
|
||||
punpcklwd mm6, mm3
|
||||
punpckhwd mm7, mm3
|
||||
movq mm2, mm6
|
||||
movq mm3, mm7
|
||||
pmaddwd mm6, [GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
|
||||
pmaddwd mm7, [GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
|
||||
pmaddwd mm2, [GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
|
||||
pmaddwd mm3, [GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
|
||||
|
||||
paddd mm6, mm4 ; mm6=tmp2L
|
||||
paddd mm7, mm5 ; mm7=tmp2H
|
||||
paddd mm2, mm0 ; mm2=tmp0L
|
||||
paddd mm3, mm1 ; mm3=tmp0H
|
||||
|
||||
movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
|
||||
movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
pxor mm1, mm1
|
||||
pxor mm2, mm2
|
||||
punpcklwd mm1, mm4 ; mm1=tmp0L
|
||||
punpckhwd mm2, mm4 ; mm2=tmp0H
|
||||
psrad mm1, (16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
|
||||
psrad mm2, (16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
|
||||
|
||||
movq mm3, mm5 ; mm5=in2=z2
|
||||
punpcklwd mm5, mm0 ; mm0=in6=z3
|
||||
punpckhwd mm3, mm0
|
||||
pmaddwd mm5, [GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
|
||||
pmaddwd mm3, [GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
|
||||
|
||||
movq mm4, mm1
|
||||
movq mm0, mm2
|
||||
paddd mm1, mm5 ; mm1=tmp10L
|
||||
paddd mm2, mm3 ; mm2=tmp10H
|
||||
psubd mm4, mm5 ; mm4=tmp12L
|
||||
psubd mm0, mm3 ; mm0=tmp12H
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movq mm5, mm1
|
||||
movq mm3, mm2
|
||||
paddd mm1, mm6 ; mm1=data0L
|
||||
paddd mm2, mm7 ; mm2=data0H
|
||||
psubd mm5, mm6 ; mm5=data3L
|
||||
psubd mm3, mm7 ; mm3=data3H
|
||||
|
||||
movq mm6, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm6=[PD_DESCALE_P2_4]
|
||||
|
||||
paddd mm1, mm6
|
||||
paddd mm2, mm6
|
||||
psrad mm1, DESCALE_P2_4
|
||||
psrad mm2, DESCALE_P2_4
|
||||
paddd mm5, mm6
|
||||
paddd mm3, mm6
|
||||
psrad mm5, DESCALE_P2_4
|
||||
psrad mm3, DESCALE_P2_4
|
||||
|
||||
packssdw mm1, mm2 ; mm1=data0=(00 10 20 30)
|
||||
packssdw mm5, mm3 ; mm5=data3=(03 13 23 33)
|
||||
|
||||
movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
|
||||
movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
|
||||
|
||||
movq mm2, mm4
|
||||
movq mm3, mm0
|
||||
paddd mm4, mm7 ; mm4=data1L
|
||||
paddd mm0, mm6 ; mm0=data1H
|
||||
psubd mm2, mm7 ; mm2=data2L
|
||||
psubd mm3, mm6 ; mm3=data2H
|
||||
|
||||
movq mm7, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm7=[PD_DESCALE_P2_4]
|
||||
|
||||
paddd mm4, mm7
|
||||
paddd mm0, mm7
|
||||
psrad mm4, DESCALE_P2_4
|
||||
psrad mm0, DESCALE_P2_4
|
||||
paddd mm2, mm7
|
||||
paddd mm3, mm7
|
||||
psrad mm2, DESCALE_P2_4
|
||||
psrad mm3, DESCALE_P2_4
|
||||
|
||||
packssdw mm4, mm0 ; mm4=data1=(01 11 21 31)
|
||||
packssdw mm2, mm3 ; mm2=data2=(02 12 22 32)
|
||||
|
||||
movq mm6, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP]
|
||||
|
||||
packsswb mm1, mm2 ; mm1=(00 10 20 30 02 12 22 32)
|
||||
packsswb mm4, mm5 ; mm4=(01 11 21 31 03 13 23 33)
|
||||
paddb mm1, mm6
|
||||
paddb mm4, mm6
|
||||
|
||||
movq mm7, mm1 ; transpose coefficients(phase 1)
|
||||
punpcklbw mm1, mm4 ; mm1=(00 01 10 11 20 21 30 31)
|
||||
punpckhbw mm7, mm4 ; mm7=(02 03 12 13 22 23 32 33)
|
||||
|
||||
movq mm0, mm1 ; transpose coefficients(phase 2)
|
||||
punpcklwd mm1, mm7 ; mm1=(00 01 02 03 10 11 12 13)
|
||||
punpckhwd mm0, mm7 ; mm0=(20 21 22 23 30 31 32 33)
|
||||
|
||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
|
||||
movd dword [edx+eax*SIZEOF_JSAMPLE], mm1 ; output row 0 = (00 01 02 03)
|
||||
movd dword [esi+eax*SIZEOF_JSAMPLE], mm0 ; output row 2 = (20 21 22 23)
|
||||
|
||||
psrlq mm1, 4*BYTE_BIT ; bring (10 11 12 13) down to the low dword
|
||||
psrlq mm0, 4*BYTE_BIT ; bring (30 31 32 33) down to the low dword
|
||||
|
||||
mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
|
||||
movd dword [edx+eax*SIZEOF_JSAMPLE], mm1 ; output row 1
|
||||
movd dword [esi+eax*SIZEOF_JSAMPLE], mm0 ; output row 3
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
pop edi
|
||||
|
||||
pop esi
|
||||
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
poppic ebx
|
||||
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients,
|
||||
; producing a reduced-size 2x2 output block.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_2x2_mmx(void *dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; Stack-argument offsets relative to ebp.  This routine uses a plain frame
; (push ebp / mov ebp, esp), so [ebp+4] is the return address and the first
; argument is at ebp+8.  No stack workspace is used: pass-1 results stay in
; MMX registers and are consumed directly by pass 2.
%define dct_table(b) (b) + 8 ; void *dct_table
|
||||
%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
|
||||
%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
|
||||
%define output_col(b) (b) + 20 ; JDIMENSION output_col
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_idct_2x2_mmx)
|
||||
|
||||
EXTN(jsimd_idct_2x2_mmx):
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
push ebx ; ebx holds the GOT base below and is reused as scratch at the end
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
|
||||
mov edx, POINTER [dct_table(ebp)] ; quantptr
|
||||
mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
|
||||
|
||||
; | input: | result: |
|
||||
; | 00 01 ** 03 ** 05 ** 07 | |
|
||||
; | 10 11 ** 13 ** 15 ** 17 | |
|
||||
; | ** ** ** ** ** ** ** ** | |
|
||||
; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
|
||||
; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
|
||||
; | 50 51 ** 53 ** 55 ** 57 | |
|
||||
; | ** ** ** ** ** ** ** ** | |
|
||||
; | 70 71 ** 73 ** 75 ** 77 | |
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
|
||||
; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
|
||||
|
||||
pcmpeqd mm7, mm7 ; mm7 = all ones
|
||||
pslld mm7, WORD_BIT ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
|
||||
|
||||
movq mm4, mm0 ; mm4=(10 11 ** 13)
|
||||
movq mm5, mm2 ; mm5=(50 51 ** 53)
|
||||
punpcklwd mm4, mm1 ; mm4=(10 30 11 31)
|
||||
punpcklwd mm5, mm3 ; mm5=(50 70 51 71)
|
||||
pmaddwd mm4, [GOTOFF(ebx,PW_F362_MF127)]
|
||||
pmaddwd mm5, [GOTOFF(ebx,PW_F085_MF072)]
|
||||
|
||||
; Build (row1,row3) and (row5,row7) word pairs for the odd columns only:
; the shift moves the odd-column word of one row into the low half of each
; dword, the mask keeps the odd-column word of the other row in the high half.
psrld mm0, WORD_BIT ; mm0=(11 -- 13 --)
|
||||
pand mm1, mm7 ; mm1=(-- 31 -- 33)
|
||||
psrld mm2, WORD_BIT ; mm2=(51 -- 53 --)
|
||||
pand mm3, mm7 ; mm3=(-- 71 -- 73)
|
||||
por mm0, mm1 ; mm0=(11 31 13 33)
|
||||
por mm2, mm3 ; mm2=(51 71 53 73)
|
||||
pmaddwd mm0, [GOTOFF(ebx,PW_F362_MF127)]
|
||||
pmaddwd mm2, [GOTOFF(ebx,PW_F085_MF072)]
|
||||
|
||||
paddd mm4, mm5 ; mm4=tmp0[col0 col1]
|
||||
|
||||
movq mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
|
||||
movq mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movq mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
|
||||
movq mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
|
||||
; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
|
||||
|
||||
psrld mm6, WORD_BIT ; mm6=(15 -- 17 --)
|
||||
pand mm1, mm7 ; mm1=(-- 35 -- 37)
|
||||
psrld mm3, WORD_BIT ; mm3=(55 -- 57 --)
|
||||
pand mm5, mm7 ; mm5=(-- 75 -- 77)
|
||||
por mm6, mm1 ; mm6=(15 35 17 37)
|
||||
por mm3, mm5 ; mm3=(55 75 57 77)
|
||||
pmaddwd mm6, [GOTOFF(ebx,PW_F362_MF127)]
|
||||
pmaddwd mm3, [GOTOFF(ebx,PW_F085_MF072)]
|
||||
|
||||
paddd mm0, mm2 ; mm0=tmp0[col1 col3]
|
||||
paddd mm6, mm3 ; mm6=tmp0[col5 col7]
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movq mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
|
||||
pmullw mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
|
||||
|
||||
; Each wanted value is placed in the high word of its dword; the arithmetic
; right shift by (WORD_BIT-CONST_BITS-2) then leaves it sign-extended and
; scaled by 2^(CONST_BITS+2).
movq mm2, mm1 ; mm2=(00 01 ** 03)
|
||||
pslld mm1, WORD_BIT ; mm1=(-- 00 -- **)
|
||||
psrad mm1, (WORD_BIT-CONST_BITS-2) ; mm1=tmp10[col0 ****]
|
||||
|
||||
pand mm2, mm7 ; mm2=(-- 01 -- 03)
|
||||
pand mm5, mm7 ; mm5=(-- 05 -- 07)
|
||||
psrad mm2, (WORD_BIT-CONST_BITS-2) ; mm2=tmp10[col1 col3]
|
||||
psrad mm5, (WORD_BIT-CONST_BITS-2) ; mm5=tmp10[col5 col7]
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movq mm3, mm1
|
||||
paddd mm1, mm4 ; mm1=data0[col0 ****]=(A0 **)
|
||||
psubd mm3, mm4 ; mm3=data1[col0 ****]=(B0 **)
|
||||
punpckldq mm1, mm3 ; mm1=(A0 B0)
|
||||
|
||||
movq mm7, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; mm7=[PD_DESCALE_P1_2]
|
||||
|
||||
movq mm4, mm2
|
||||
movq mm3, mm5
|
||||
paddd mm2, mm0 ; mm2=data0[col1 col3]=(A1 A3)
|
||||
paddd mm5, mm6 ; mm5=data0[col5 col7]=(A5 A7)
|
||||
psubd mm4, mm0 ; mm4=data1[col1 col3]=(B1 B3)
|
||||
psubd mm3, mm6 ; mm3=data1[col5 col7]=(B5 B7)
|
||||
|
||||
paddd mm1, mm7
|
||||
psrad mm1, DESCALE_P1_2
|
||||
|
||||
paddd mm2, mm7
|
||||
paddd mm5, mm7
|
||||
psrad mm2, DESCALE_P1_2
|
||||
psrad mm5, DESCALE_P1_2
|
||||
paddd mm4, mm7
|
||||
paddd mm3, mm7
|
||||
psrad mm4, DESCALE_P1_2
|
||||
psrad mm3, DESCALE_P1_2
|
||||
|
||||
; ---- Pass 2: process rows, store into output array.
|
||||
|
||||
mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [output_col(ebp)]
|
||||
|
||||
; | input:| result:|
|
||||
; | A0 B0 | |
|
||||
; | A1 B1 | C0 C1 |
|
||||
; | A3 B3 | D0 D1 |
|
||||
; | A5 B5 | |
|
||||
; | A7 B7 | |
|
||||
|
||||
; -- Odd part
|
||||
|
||||
packssdw mm2, mm4 ; mm2=(A1 A3 B1 B3)
|
||||
packssdw mm5, mm3 ; mm5=(A5 A7 B5 B7)
|
||||
pmaddwd mm2, [GOTOFF(ebx,PW_F362_MF127)]
|
||||
pmaddwd mm5, [GOTOFF(ebx,PW_F085_MF072)]
|
||||
|
||||
paddd mm2, mm5 ; mm2=tmp0[row0 row1]
|
||||
|
||||
; -- Even part
|
||||
|
||||
pslld mm1, (CONST_BITS+2) ; mm1=tmp10[row0 row1]
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movq mm0, [GOTOFF(ebx,PD_DESCALE_P2_2)] ; mm0=[PD_DESCALE_P2_2]
|
||||
|
||||
movq mm6, mm1
|
||||
paddd mm1, mm2 ; mm1=data0[row0 row1]=(C0 C1)
|
||||
psubd mm6, mm2 ; mm6=data1[row0 row1]=(D0 D1)
|
||||
|
||||
paddd mm1, mm0
|
||||
paddd mm6, mm0
|
||||
psrad mm1, DESCALE_P2_2
|
||||
psrad mm6, DESCALE_P2_2
|
||||
|
||||
movq mm7, mm1 ; transpose coefficients
|
||||
punpckldq mm1, mm6 ; mm1=(C0 D0)
|
||||
punpckhdq mm7, mm6 ; mm7=(C1 D1)
|
||||
|
||||
packssdw mm1, mm7 ; mm1=(C0 D0 C1 D1)
|
||||
packsswb mm1, mm1 ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
|
||||
paddb mm1, [GOTOFF(ebx,PB_CENTERJSAMP)]
|
||||
|
||||
; The paddb above is the last GOTOFF(ebx,...) reference, so ebx can now be
; overwritten and used as a general-purpose register.
movd ecx, mm1
|
||||
movd ebx, mm1 ; ebx=(C0 D0 C1 D1)
|
||||
shr ecx, 2*BYTE_BIT ; ecx=(C1 D1 -- --)
|
||||
|
||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
mov word [edx+eax*SIZEOF_JSAMPLE], bx ; two samples (C0 D0) into output row 0
|
||||
mov word [esi+eax*SIZEOF_JSAMPLE], cx ; two samples (C1 D1) into output row 1
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
pop edi
|
||||
|
||||
pop esi
|
||||
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
pop ebp
|
||||
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
592
TMessagesProj/jni/mozjpeg/simd/i386/jidctred-sse2.asm
Normal file
592
TMessagesProj/jni/mozjpeg/simd/i386/jidctred-sse2.asm
Normal file
|
|
@ -0,0 +1,592 @@
|
|||
;
|
||||
; jidctred.asm - reduced-size IDCT (SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains inverse-DCT routines that produce reduced-size
|
||||
; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
|
||||
; The following code is based directly on the IJG's original jidctred.c;
|
||||
; see the jidctred.c for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point setup for the reduced-size IDCT routines in this file.
; CONST_BITS is the number of fractional bits carried by the F_x_xxx
; multipliers below; PASS1_BITS is the extra scaling kept between pass 1
; and pass 2.
%define CONST_BITS 13
|
||||
%define PASS1_BITS 2
|
||||
|
||||
; Right-shift counts used when descaling: DESCALE_P<pass>_<size> is applied
; after pass 1 or pass 2 of the 4x4 or 2x2 routine respectively.
%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1)
|
||||
%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1)
|
||||
%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2)
|
||||
%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2)
|
||||
|
||||
; With the default CONST_BITS the multipliers are given directly as
; precomputed integers: FIX(x) is x * 2^CONST_BITS rounded to an integer.
%if CONST_BITS == 13
|
||||
F_0_211 equ 1730 ; FIX(0.211164243)
|
||||
F_0_509 equ 4176 ; FIX(0.509795579)
|
||||
F_0_601 equ 4926 ; FIX(0.601344887)
|
||||
F_0_720 equ 5906 ; FIX(0.720959822)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_850 equ 6967 ; FIX(0.850430095)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_061 equ 8697 ; FIX(1.061594337)
|
||||
F_1_272 equ 10426 ; FIX(1.272758580)
|
||||
F_1_451 equ 11893 ; FIX(1.451774981)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_2_172 equ 17799 ; FIX(2.172734803)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_624 equ 29692 ; FIX(3.624509785)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
; Instead, each coefficient is written as an integer pre-scaled by 2^30 and
; DESCALE() rounds it down to CONST_BITS fractional bits.
|
||||
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
|
||||
F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243)
|
||||
F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579)
|
||||
F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887)
|
||||
F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822)
|
||||
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095)
|
||||
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337)
|
||||
F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580)
|
||||
F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981)
|
||||
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
|
||||
F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803)
|
||||
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table shared by jsimd_idct_4x4_sse2 and jsimd_idct_2x2_sse2.
; Naming used below:
;   PW_Fxxx_MFyyy  - pairs of 16-bit multipliers (F = +constant,
;                    MF = -constant), repeated to fill one xmmword and used
;                    as the memory operand of pmaddwd.
;   PD_DESCALE_*   - the rounding term 1 << (n - 1) that is added before an
;                    arithmetic right shift by the matching DESCALE_* count.
;   PB_CENTERJSAMP - CENTERJSAMPLE in every byte; added with paddb to the
;                    packed signed results just before they are stored.
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_idct_red_sse2)
|
||||
|
||||
EXTN(jconst_idct_red_sse2):
|
||||
|
||||
PW_F184_MF076 times 4 dw F_1_847, -F_0_765
|
||||
PW_F256_F089 times 4 dw F_2_562, F_0_899
|
||||
PW_F106_MF217 times 4 dw F_1_061, -F_2_172
|
||||
PW_MF060_MF050 times 4 dw -F_0_601, -F_0_509
|
||||
PW_F145_MF021 times 4 dw F_1_451, -F_0_211
|
||||
PW_F362_MF127 times 4 dw F_3_624, -F_1_272
|
||||
PW_F085_MF072 times 4 dw F_0_850, -F_0_720
|
||||
PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4 - 1)
|
||||
PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4 - 1)
|
||||
PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2 - 1)
|
||||
PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1)
|
||||
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients,
|
||||
; producing a reduced-size 4x4 output block.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
%define dct_table(b) (b) + 8 ; void *dct_table
|
||||
%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
|
||||
%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
|
||||
%define output_col(b) (b) + 20 ; JDIMENSION output_col
|
||||
|
||||
%define original_ebp ebp + 0
|
||||
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
|
||||
|
||||
; jsimd_idct_4x4_sse2 -- dequantize one block of coefficients and write a
; reduced-size 4x4 block of samples.
;
; In:    stack arguments dct_table, coef_block, output_buf, output_col
;        (see the %define lines above).
; Out:   four bytes stored to each of output_buf[0..3] at offset output_col;
;        no register return value.
; Saves: ebx (through pushpic/poppic), esi, edi, ebp.
; Uses:  eax, edx, xmm0-xmm7 as scratch, plus two xmmword scratch slots
;        wk(0)/wk(1) on a 16-byte-aligned stack frame.
EXTN(jsimd_idct_4x4_sse2):
|
||||
push ebp
|
||||
mov eax, esp ; eax = original ebp
|
||||
; Build an aligned frame: esp is rounded down to a 16-byte boundary, the
; original frame pointer value is stored at [ebp] (= original_ebp), and
; WK_NUM xmmwords are reserved below ebp for wk(0)..wk(WK_NUM-1).
sub esp, byte 4
|
||||
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [esp], eax
|
||||
mov ebp, esp ; ebp = aligned ebp
|
||||
lea esp, [wk(0)]
|
||||
pushpic ebx
|
||||
; push ecx ; unused
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
|
||||
; The reload below is disabled: the arguments are addressed through eax,
; which was set to the original frame pointer in the prologue.
; NOTE(review): this relies on pushpic/get_GOT not clobbering eax - confirm
; against their definitions in jsimdext.inc.
; mov eax, [original_ebp]
|
||||
mov edx, POINTER [dct_table(eax)] ; quantptr
|
||||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
|
||||
%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
|
||||
; Shortcut test: first a cheap check of the leading dword of rows 1 and 2,
; then a full OR of rows 1, 2, 3, 5, 6 and 7.  Row 4 is not examined; it is
; never loaded anywhere in this routine.
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz short .columnDCT
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
por xmm0, xmm1
|
||||
; Two saturating packs squeeze the eight OR-ed words into the low dword;
; saturation keeps any non-zero word non-zero, so eax != 0 means some
; tested AC coefficient is non-zero.
packsswb xmm0, xmm0
|
||||
packsswb xmm0, xmm0
|
||||
movd eax, xmm0
|
||||
test eax, eax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
; Only the dequantized DC row is needed.  It is scaled by PASS1_BITS and
; each value is replicated four times so that the registers match the
; [colN colN+1] layout the general path leaves at .column_end.
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
psllw xmm0, PASS1_BITS
|
||||
|
||||
movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
|
||||
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||
punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
|
||||
|
||||
pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
|
||||
pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
|
||||
pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
|
||||
pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
|
||||
|
||||
jmp near .column_end
|
||||
alignx 16, 7
|
||||
%endif
|
||||
.columnDCT:
|
||||
|
||||
; -- Odd part
; Rows 1, 3, 5 and 7 are dequantized, interleaved pairwise and multiplied
; with pmaddwd.  The L/H suffixes denote the low and high four columns.
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
movdqa xmm4, xmm0
|
||||
movdqa xmm5, xmm0
|
||||
punpcklwd xmm4, xmm1
|
||||
punpckhwd xmm5, xmm1
|
||||
movdqa xmm0, xmm4
|
||||
movdqa xmm1, xmm5
|
||||
pmaddwd xmm4, [GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L)
|
||||
pmaddwd xmm5, [GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H)
|
||||
pmaddwd xmm0, [GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L)
|
||||
pmaddwd xmm1, [GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H)
|
||||
|
||||
movdqa xmm6, xmm2
|
||||
movdqa xmm7, xmm2
|
||||
punpcklwd xmm6, xmm3
|
||||
punpckhwd xmm7, xmm3
|
||||
movdqa xmm2, xmm6
|
||||
movdqa xmm3, xmm7
|
||||
pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L)
|
||||
pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H)
|
||||
pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L)
|
||||
pmaddwd xmm3, [GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H)
|
||||
|
||||
paddd xmm6, xmm4 ; xmm6=tmp2L
|
||||
paddd xmm7, xmm5 ; xmm7=tmp2H
|
||||
paddd xmm2, xmm0 ; xmm2=tmp0L
|
||||
paddd xmm3, xmm1 ; xmm3=tmp0H
|
||||
|
||||
; tmp0 is spilled to the aligned scratch slots so that xmm2/xmm3 can be
; reused by the even part; it is reloaded in the final output stage.
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
|
||||
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
|
||||
|
||||
; -- Even part
|
||||
|
||||
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; Each in0 word is placed in the upper half of a zeroed dword and then
; shifted right arithmetically, which sign-extends it and leaves it scaled
; by 2^(CONST_BITS+1) in one step.
pxor xmm1, xmm1
|
||||
pxor xmm2, xmm2
|
||||
punpcklwd xmm1, xmm4 ; xmm1=tmp0L
|
||||
punpckhwd xmm2, xmm4 ; xmm2=tmp0H
|
||||
psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
|
||||
psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
|
||||
|
||||
movdqa xmm3, xmm5 ; xmm5=in2=z2
|
||||
punpcklwd xmm5, xmm0 ; xmm0=in6=z3
|
||||
punpckhwd xmm3, xmm0
|
||||
pmaddwd xmm5, [GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L
|
||||
pmaddwd xmm3, [GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H
|
||||
|
||||
movdqa xmm4, xmm1
|
||||
movdqa xmm0, xmm2
|
||||
paddd xmm1, xmm5 ; xmm1=tmp10L
|
||||
paddd xmm2, xmm3 ; xmm2=tmp10H
|
||||
psubd xmm4, xmm5 ; xmm4=tmp12L
|
||||
psubd xmm0, xmm3 ; xmm0=tmp12H
|
||||
|
||||
; -- Final output stage
; Sums and differences are rounded (PD_DESCALE_P1_4), shifted right by
; DESCALE_P1_4 and packed back to signed words.
|
||||
|
||||
movdqa xmm5, xmm1
|
||||
movdqa xmm3, xmm2
|
||||
paddd xmm1, xmm6 ; xmm1=data0L
|
||||
paddd xmm2, xmm7 ; xmm2=data0H
|
||||
psubd xmm5, xmm6 ; xmm5=data3L
|
||||
psubd xmm3, xmm7 ; xmm3=data3H
|
||||
|
||||
movdqa xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4]
|
||||
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm6
|
||||
psrad xmm1, DESCALE_P1_4
|
||||
psrad xmm2, DESCALE_P1_4
|
||||
paddd xmm5, xmm6
|
||||
paddd xmm3, xmm6
|
||||
psrad xmm5, DESCALE_P1_4
|
||||
psrad xmm3, DESCALE_P1_4
|
||||
|
||||
packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
|
||||
packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
|
||||
|
||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
|
||||
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
|
||||
|
||||
movdqa xmm2, xmm4
|
||||
movdqa xmm3, xmm0
|
||||
paddd xmm4, xmm7 ; xmm4=data1L
|
||||
paddd xmm0, xmm6 ; xmm0=data1H
|
||||
psubd xmm2, xmm7 ; xmm2=data2L
|
||||
psubd xmm3, xmm6 ; xmm3=data2H
|
||||
|
||||
movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4]
|
||||
|
||||
paddd xmm4, xmm7
|
||||
paddd xmm0, xmm7
|
||||
psrad xmm4, DESCALE_P1_4
|
||||
psrad xmm0, DESCALE_P1_4
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm7
|
||||
psrad xmm2, DESCALE_P1_4
|
||||
psrad xmm3, DESCALE_P1_4
|
||||
|
||||
packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
|
||||
packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
|
||||
|
||||
movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
|
||||
punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
|
||||
movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
|
||||
punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
|
||||
|
||||
movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
|
||||
punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
|
||||
movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
|
||||
punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
|
||||
; Both pass-1 paths arrive here with
;   xmm1=[col0 col1]  xmm0=[col2 col3]  xmm6=[col4 col5]  xmm3=[col6 col7]
.column_end:
|
||||
|
||||
; -- Prefetch the next coefficient block
|
||||
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
|
||||
|
||||
; ---- Pass 2: process rows, store into output array.
|
||||
|
||||
; eax was overwritten by the zero test above, so the original frame pointer
; is reloaded from the aligned frame before the arguments are read.  After
; the third instruction eax holds output_col, not the frame pointer.
mov eax, [original_ebp]
|
||||
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [output_col(eax)]
|
||||
|
||||
; -- Even part
|
||||
|
||||
pxor xmm4, xmm4
|
||||
punpcklwd xmm4, xmm1 ; xmm4=tmp0
|
||||
psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
|
||||
|
||||
; -- Odd part
; The high halves pair up col1 with col3 and col5 with col7.
|
||||
|
||||
punpckhwd xmm1, xmm0
|
||||
punpckhwd xmm6, xmm3
|
||||
movdqa xmm5, xmm1
|
||||
movdqa xmm2, xmm6
|
||||
pmaddwd xmm1, [GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2)
|
||||
pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2)
|
||||
pmaddwd xmm5, [GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0)
|
||||
pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0)
|
||||
|
||||
paddd xmm6, xmm1 ; xmm6=tmp2
|
||||
paddd xmm2, xmm5 ; xmm2=tmp0
|
||||
|
||||
; -- Even part
; The low halves pair up col2 with col6.  col4 (the low half of the old
; xmm6) is not used by pass 2.
|
||||
|
||||
punpcklwd xmm0, xmm3
|
||||
pmaddwd xmm0, [GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2
|
||||
|
||||
movdqa xmm7, xmm4
|
||||
paddd xmm4, xmm0 ; xmm4=tmp10
|
||||
psubd xmm7, xmm0 ; xmm7=tmp12
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4]
|
||||
|
||||
movdqa xmm5, xmm4
|
||||
movdqa xmm3, xmm7
|
||||
paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
|
||||
paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
|
||||
psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
|
||||
psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
|
||||
|
||||
paddd xmm4, xmm1
|
||||
paddd xmm7, xmm1
|
||||
psrad xmm4, DESCALE_P2_4
|
||||
psrad xmm7, DESCALE_P2_4
|
||||
paddd xmm5, xmm1
|
||||
paddd xmm3, xmm1
|
||||
psrad xmm5, DESCALE_P2_4
|
||||
psrad xmm3, DESCALE_P2_4
|
||||
|
||||
packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
|
||||
packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
|
||||
|
||||
movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
|
||||
punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
|
||||
|
||||
movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
|
||||
punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
|
||||
|
||||
packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
|
||||
paddb xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)]
|
||||
|
||||
; Rotate the four output rows so that each one sits in the low dword of
; its own register, ready for a 4-byte movd store.
pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
|
||||
pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
|
||||
pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
|
||||
|
||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
|
||||
movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
|
||||
mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
|
||||
movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
|
||||
movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; unused
|
||||
poppic ebx
|
||||
; Unwind the aligned frame: [ebp] holds the stack pointer value saved by
; the prologue, so popping it into esp restores the unaligned stack.
mov esp, ebp ; esp <- aligned ebp
|
||||
pop esp ; esp <- original ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients,
|
||||
; producing a reduced-size 2x2 output block.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
%define dct_table(b) (b) + 8 ; void *dct_table
|
||||
%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
|
||||
%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
|
||||
%define output_col(b) (b) + 20 ; JDIMENSION output_col
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)
|
||||
|
||||
; jsimd_idct_2x2_sse2 -- dequantize one block of coefficients and write a
; reduced-size 2x2 block of samples.
;
; In:    stack arguments dct_table, coef_block, output_buf, output_col
;        (see the %define lines above), addressed through ebp.
; Out:   two bytes stored to each of output_buf[0..1] at offset output_col;
;        no register return value.
; Saves: ebx, esi, edi, ebp.
; Uses:  eax, ecx, edx and xmm0-xmm7 as scratch.  No stack scratch area is
;        used, so a plain (unaligned) frame is sufficient.
EXTN(jsimd_idct_2x2_sse2):
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
get_GOT ebx ; get GOT address
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
|
||||
mov edx, POINTER [dct_table(ebp)] ; quantptr
|
||||
mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
|
||||
|
||||
; In the diagram, ** marks coefficients whose values do not contribute to
; the result: only rows 0, 1, 3, 5, 7 are loaded and only columns
; 0, 1, 3, 5, 7 of them are carried forward.
; | input: | result: |
|
||||
; | 00 01 ** 03 ** 05 ** 07 | |
|
||||
; | 10 11 ** 13 ** 15 ** 17 | |
|
||||
; | ** ** ** ** ** ** ** ** | |
|
||||
; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
|
||||
; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
|
||||
; | 50 51 ** 53 ** 55 ** 57 | |
|
||||
; | ** ** ** ** ** ** ** ** | |
|
||||
; | 70 71 ** 73 ** 75 ** 77 | |
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
|
||||
; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
|
||||
|
||||
; Build a mask that keeps the upper word of every dword, i.e. the
; odd-numbered columns.
pcmpeqd xmm7, xmm7
|
||||
pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
|
||||
|
||||
movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
|
||||
movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
|
||||
punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
|
||||
punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
|
||||
pmaddwd xmm4, [GOTOFF(ebx,PW_F362_MF127)]
|
||||
pmaddwd xmm5, [GOTOFF(ebx,PW_F085_MF072)]
|
||||
|
||||
; Pair row 1 with row 3 and row 5 with row 7 for the odd columns: the odd
; column of the first row is shifted down into the low word and the odd
; column of the second row is kept in the high word.
psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
|
||||
pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
|
||||
psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
|
||||
pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
|
||||
por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
|
||||
por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
|
||||
pmaddwd xmm0, [GOTOFF(ebx,PW_F362_MF127)]
|
||||
pmaddwd xmm2, [GOTOFF(ebx,PW_F085_MF072)]
|
||||
|
||||
; Only the col0 lane of xmm4 is consumed later (through punpckldq below).
paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
|
||||
paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
|
||||
|
||||
; -- Even part
|
||||
|
||||
movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; xmm6=(00 01 ** 03 ** 05 ** 07)
|
||||
|
||||
; Moving a word into the upper half of its dword and shifting right
; arithmetically by (WORD_BIT-CONST_BITS-2) sign-extends it and leaves it
; scaled by 2^(CONST_BITS+2).
movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
|
||||
pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
|
||||
pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
|
||||
psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
|
||||
psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movdqa xmm3, xmm6
|
||||
movdqa xmm5, xmm1
|
||||
paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
|
||||
paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
|
||||
psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
|
||||
psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
|
||||
|
||||
movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2]
|
||||
|
||||
punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
|
||||
|
||||
movdqa xmm7, xmm1
|
||||
punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
|
||||
punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
|
||||
|
||||
paddd xmm6, xmm2
|
||||
psrad xmm6, DESCALE_P1_2
|
||||
|
||||
paddd xmm1, xmm2
|
||||
paddd xmm7, xmm2
|
||||
psrad xmm1, DESCALE_P1_2
|
||||
psrad xmm7, DESCALE_P1_2
|
||||
|
||||
; -- Prefetch the next coefficient block
|
||||
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
|
||||
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
|
||||
|
||||
; ---- Pass 2: process rows, store into output array.
|
||||
|
||||
mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [output_col(ebp)]
|
||||
|
||||
; | input:| result:|
|
||||
; | A0 B0 | |
|
||||
; | A1 B1 | C0 C1 |
|
||||
; | A3 B3 | D0 D1 |
|
||||
; | A5 B5 | |
|
||||
; | A7 B7 | |
|
||||
|
||||
; -- Odd part
|
||||
|
||||
packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
|
||||
packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
|
||||
pmaddwd xmm1, [GOTOFF(ebx,PW_F362_MF127)]
|
||||
pmaddwd xmm7, [GOTOFF(ebx,PW_F085_MF072)]
|
||||
|
||||
paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
|
||||
|
||||
; -- Even part
|
||||
|
||||
pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movdqa xmm4, xmm6
|
||||
paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
|
||||
psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
|
||||
|
||||
punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
|
||||
|
||||
paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)]
|
||||
psrad xmm6, DESCALE_P2_2
|
||||
|
||||
packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
|
||||
packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
|
||||
paddb xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)]
|
||||
|
||||
; ebx is reused as a data register from here on.  That is safe because the
; paddb above is the last GOTOFF(ebx,...) reference in this routine, and
; the caller's ebx is restored by the pop ebx in the epilogue.
pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
|
||||
pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
|
||||
|
||||
; Each output row receives two adjacent samples with one 16-bit store.
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
mov word [edx+eax*SIZEOF_JSAMPLE], bx
|
||||
mov word [esi+eax*SIZEOF_JSAMPLE], cx
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
230
TMessagesProj/jni/mozjpeg/simd/i386/jquant-3dn.asm
Normal file
230
TMessagesProj/jni/mozjpeg/simd/i386/jquant-3dn.asm
Normal file
|
|
@ -0,0 +1,230 @@
|
|||
;
|
||||
; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Load data into workspace, applying unsigned->signed conversion
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_convsamp_float_3dnow(JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
; FAST_FLOAT *workspace);
|
||||
;
|
||||
|
||||
%define sample_data ebp + 8 ; JSAMPARRAY sample_data
|
||||
%define start_col ebp + 12 ; JDIMENSION start_col
|
||||
%define workspace ebp + 16 ; FAST_FLOAT *workspace
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_convsamp_float_3dnow)
|
||||
|
||||
; jsimd_convsamp_float_3dnow -- load samples, convert them from unsigned to
; signed by subtracting the centre value, and store them as floats.
;
; In:    stack arguments sample_data, start_col, workspace.
; Out:   floats written to workspace; no register return value.
; Saves: ebx, esi, edi, ebp.
; Uses:  eax, ecx, edx and mm0-mm7 as scratch.  MMX/3DNow! state is cleared
;        with femms before returning.
;
; Each loop iteration handles two sample rows of 8 bytes each and runs
; DCTSIZE/2 times.
EXTN(jsimd_convsamp_float_3dnow):
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
push ebx
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
; Build the 0x80 byte constant without a memory load: all-ones words are
; shifted left to 0xFF80 and then packed with signed saturation to 0x80.
pcmpeqw mm7, mm7
|
||||
psllw mm7, 7
|
||||
packsswb mm7, mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
|
||||
|
||||
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [start_col]
|
||||
mov edi, POINTER [workspace] ; (FAST_FLOAT *)
|
||||
mov ecx, DCTSIZE/2
|
||||
alignx 16, 7
|
||||
.convloop:
|
||||
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
|
||||
movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
|
||||
movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
|
||||
|
||||
psubb mm0, mm7 ; mm0=(01234567)
|
||||
psubb mm1, mm7 ; mm1=(89ABCDEF)
|
||||
|
||||
; The unpacks below deliberately use destination registers whose previous
; contents are stale (shown as * in the comments).  Each sample ends up in
; the top byte of its dword, and the later psrad by (DWORD_BIT-BYTE_BIT)
; sign-extends it while shifting the stale bytes out.
punpcklbw mm2, mm0 ; mm2=(*0*1*2*3)
|
||||
punpckhbw mm0, mm0 ; mm0=(*4*5*6*7)
|
||||
punpcklbw mm3, mm1 ; mm3=(*8*9*A*B)
|
||||
punpckhbw mm1, mm1 ; mm1=(*C*D*E*F)
|
||||
|
||||
punpcklwd mm4, mm2 ; mm4=(***0***1)
|
||||
punpckhwd mm2, mm2 ; mm2=(***2***3)
|
||||
punpcklwd mm5, mm0 ; mm5=(***4***5)
|
||||
punpckhwd mm0, mm0 ; mm0=(***6***7)
|
||||
|
||||
psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(01)
|
||||
psrad mm2, (DWORD_BIT-BYTE_BIT) ; mm2=(23)
|
||||
pi2fd mm4, mm4
|
||||
pi2fd mm2, mm2
|
||||
psrad mm5, (DWORD_BIT-BYTE_BIT) ; mm5=(45)
|
||||
psrad mm0, (DWORD_BIT-BYTE_BIT) ; mm0=(67)
|
||||
pi2fd mm5, mm5
|
||||
pi2fd mm0, mm0
|
||||
|
||||
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
|
||||
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
|
||||
movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
|
||||
movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
|
||||
|
||||
; Second row of the pair: same widening and conversion as above.
punpcklwd mm6, mm3 ; mm6=(***8***9)
|
||||
punpckhwd mm3, mm3 ; mm3=(***A***B)
|
||||
punpcklwd mm4, mm1 ; mm4=(***C***D)
|
||||
punpckhwd mm1, mm1 ; mm1=(***E***F)
|
||||
|
||||
psrad mm6, (DWORD_BIT-BYTE_BIT) ; mm6=(89)
|
||||
psrad mm3, (DWORD_BIT-BYTE_BIT) ; mm3=(AB)
|
||||
pi2fd mm6, mm6
|
||||
pi2fd mm3, mm3
|
||||
psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(CD)
|
||||
psrad mm1, (DWORD_BIT-BYTE_BIT) ; mm1=(EF)
|
||||
pi2fd mm4, mm4
|
||||
pi2fd mm1, mm1
|
||||
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
|
||||
movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
|
||||
movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
|
||||
movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
|
||||
|
||||
; Advance by two input rows and two rows of DCTSIZE floats.
add esi, byte 2*SIZEOF_JSAMPROW
|
||||
add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
|
||||
dec ecx
|
||||
jnz near .convloop
|
||||
|
||||
femms ; empty MMX/3DNow! state
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Quantize/descale the coefficients, and store into coef_block
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_quantize_float_3dnow(JCOEFPTR coef_block, FAST_FLOAT *divisors,
|
||||
; FAST_FLOAT *workspace);
|
||||
;
|
||||
|
||||
%define coef_block ebp + 8 ; JCOEFPTR coef_block
|
||||
%define divisors ebp + 12 ; FAST_FLOAT *divisors
|
||||
%define workspace ebp + 16 ; FAST_FLOAT *workspace
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_quantize_float_3dnow)
|
||||
|
||||
; jsimd_quantize_float_3dnow -- multiply the float workspace by the divisor
; table, round to integers and store the results as 16-bit coefficients.
;
; In:    stack arguments coef_block, divisors, workspace.
; Out:   words written to coef_block; no register return value.
; Saves: esi, edi, ebp.
; Uses:  eax (loop counter), edx and mm0-mm7 as scratch.  MMX/3DNow! state
;        is cleared with femms before returning.
;
; Each loop iteration converts 16 values and runs DCTSIZE2/16 times.
; NOTE(review): the workspace is multiplied (pfmul) by the divisors table,
; so that table presumably holds reciprocals - confirm against the code
; that fills it.
EXTN(jsimd_quantize_float_3dnow):
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
; push ebx ; unused
|
||||
; push ecx ; unused
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
; Rounding by magic number: after this constant is added to a product, the
; code below takes the low word of each float's bit pattern as the integer
; result (see the "NN **" lane comments).
mov eax, 0x4B400000 ; (float)0x00C00000 (rndint_magic)
|
||||
movd mm7, eax
|
||||
punpckldq mm7, mm7 ; mm7={12582912.0F 12582912.0F}
|
||||
|
||||
mov esi, POINTER [workspace]
|
||||
mov edx, POINTER [divisors]
|
||||
mov edi, JCOEFPTR [coef_block]
|
||||
mov eax, DCTSIZE2/16
|
||||
alignx 16, 7
|
||||
.quantloop:
|
||||
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
|
||||
pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
|
||||
pfmul mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
|
||||
movq mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
|
||||
movq mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
|
||||
pfmul mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
|
||||
pfmul mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
pfadd mm0, mm7 ; mm0=(00 ** 01 **)
|
||||
pfadd mm1, mm7 ; mm1=(02 ** 03 **)
|
||||
pfadd mm2, mm7 ; mm2=(04 ** 05 **)
|
||||
pfadd mm3, mm7 ; mm3=(06 ** 07 **)
|
||||
|
||||
; Gather the low word of every dword into consecutive words.
movq mm4, mm0
|
||||
punpcklwd mm0, mm1 ; mm0=(00 02 ** **)
|
||||
punpckhwd mm4, mm1 ; mm4=(01 03 ** **)
|
||||
movq mm5, mm2
|
||||
punpcklwd mm2, mm3 ; mm2=(04 06 ** **)
|
||||
punpckhwd mm5, mm3 ; mm5=(05 07 ** **)
|
||||
|
||||
punpcklwd mm0, mm4 ; mm0=(00 01 02 03)
|
||||
punpcklwd mm2, mm5 ; mm2=(04 05 06 07)
|
||||
|
||||
movq mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movq mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
|
||||
pfmul mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
|
||||
pfmul mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
|
||||
movq mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
|
||||
movq mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
|
||||
pfmul mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
|
||||
pfmul mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
pfadd mm6, mm7 ; mm6=(10 ** 11 **)
|
||||
pfadd mm1, mm7 ; mm1=(12 ** 13 **)
|
||||
pfadd mm3, mm7 ; mm3=(14 ** 15 **)
|
||||
pfadd mm4, mm7 ; mm4=(16 ** 17 **)
|
||||
|
||||
movq mm5, mm6
|
||||
punpcklwd mm6, mm1 ; mm6=(10 12 ** **)
|
||||
punpckhwd mm5, mm1 ; mm5=(11 13 ** **)
|
||||
movq mm1, mm3
|
||||
punpcklwd mm3, mm4 ; mm3=(14 16 ** **)
|
||||
punpckhwd mm1, mm4 ; mm1=(15 17 ** **)
|
||||
|
||||
punpcklwd mm6, mm5 ; mm6=(10 11 12 13)
|
||||
punpcklwd mm3, mm1 ; mm3=(14 15 16 17)
|
||||
|
||||
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
|
||||
movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
|
||||
|
||||
; Advance all three pointers by the 16 elements just processed.
add esi, byte 16*SIZEOF_FAST_FLOAT
|
||||
add edx, byte 16*SIZEOF_FAST_FLOAT
|
||||
add edi, byte 16*SIZEOF_JCOEF
|
||||
dec eax
|
||||
jnz near .quantloop
|
||||
|
||||
femms ; empty MMX/3DNow! state
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; unused
|
||||
; pop ebx ; unused
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
276
TMessagesProj/jni/mozjpeg/simd/i386/jquant-mmx.asm
Normal file
276
TMessagesProj/jni/mozjpeg/simd/i386/jquant-mmx.asm
Normal file
|
|
@ -0,0 +1,276 @@
|
|||
;
|
||||
; jquant.asm - sample data conversion and quantization (MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Load data into workspace, applying unsigned->signed conversion
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_convsamp_mmx(JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
; DCTELEM *workspace);
|
||||
;
|
||||
|
||||
%define sample_data ebp + 8 ; JSAMPARRAY sample_data
|
||||
%define start_col ebp + 12 ; JDIMENSION start_col
|
||||
%define workspace ebp + 16 ; DCTELEM *workspace
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_convsamp_mmx)
|
||||
|
||||
; Per .convloop pass: four row pointers are fetched from sample_data, eight
; samples are read from each row at column start_col, widened from bytes to
; words, offset by 0xFF80 (-128 as a 16-bit value) and written to workspace.
; The loop runs DCTSIZE/4 times.
;
; Register roles:
;   esi = row-pointer cursor      eax = start_col
;   edi = output cursor           ecx = passes remaining
;   ebx/edx = current row pointers
;   mm6 = zero                    mm7 = 0xFF80 in every word
EXTN(jsimd_convsamp_mmx):
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
push ebx ; saved because ebx is used as a row pointer below
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
pxor mm6, mm6 ; mm6=(all 0's)
|
||||
pcmpeqw mm7, mm7 ; mm7=(all 1's)
|
||||
psllw mm7, 7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
|
||||
|
||||
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [start_col]
|
||||
mov edi, POINTER [workspace] ; (DCTELEM *)
|
||||
mov ecx, DCTSIZE/4 ; four rows are handled per pass
|
||||
alignx 16, 7
|
||||
.convloop:
|
||||
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
|
||||
movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567)
|
||||
movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF)
|
||||
|
||||
mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
|
||||
movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN)
|
||||
movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV)
|
||||
|
||||
; Rows 0 and 1: interleaving with the zero register widens bytes to words.
movq mm4, mm0
|
||||
punpcklbw mm0, mm6 ; mm0=(0123)
|
||||
punpckhbw mm4, mm6 ; mm4=(4567)
|
||||
movq mm5, mm1
|
||||
punpcklbw mm1, mm6 ; mm1=(89AB)
|
||||
punpckhbw mm5, mm6 ; mm5=(CDEF)
|
||||
|
||||
paddw mm0, mm7 ; add 0xFF80 to every word (i.e. subtract 128)
|
||||
paddw mm4, mm7
|
||||
paddw mm1, mm7
|
||||
paddw mm5, mm7
|
||||
|
||||
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
|
||||
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
|
||||
movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
|
||||
|
||||
; Rows 2 and 3: same widening and offset, reusing mm0/mm4 as temporaries.
movq mm0, mm2
|
||||
punpcklbw mm2, mm6 ; mm2=(GHIJ)
|
||||
punpckhbw mm0, mm6 ; mm0=(KLMN)
|
||||
movq mm4, mm3
|
||||
punpcklbw mm3, mm6 ; mm3=(OPQR)
|
||||
punpckhbw mm4, mm6 ; mm4=(STUV)
|
||||
|
||||
paddw mm2, mm7
|
||||
paddw mm0, mm7
|
||||
paddw mm3, mm7
|
||||
paddw mm4, mm7
|
||||
|
||||
movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
|
||||
movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
|
||||
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
|
||||
movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
|
||||
|
||||
add esi, byte 4*SIZEOF_JSAMPROW ; next four row pointers
|
||||
add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM ; next four output rows
|
||||
dec ecx
|
||||
jnz short .convloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Quantize/descale the coefficients, and store into coef_block
|
||||
;
|
||||
; This implementation is based on an algorithm described in
|
||||
; "How to optimize for the Pentium family of microprocessors"
|
||||
; (http://www.agner.org/assem/).
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_quantize_mmx(JCOEFPTR coef_block, DCTELEM *divisors,
|
||||
; DCTELEM *workspace);
|
||||
;
|
||||
|
||||
%define RECIPROCAL(m, n, b) \
|
||||
MMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
|
||||
%define CORRECTION(m, n, b) \
|
||||
MMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
|
||||
%define SCALE(m, n, b) \
|
||||
MMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
|
||||
%define SHIFT(m, n, b) \
|
||||
MMBLOCK(DCTSIZE * 3 + (m), (n), (b), SIZEOF_DCTELEM)
|
||||
|
||||
%define coef_block ebp + 8 ; JCOEFPTR coef_block
|
||||
%define divisors ebp + 12 ; DCTELEM *divisors
|
||||
%define workspace ebp + 16 ; DCTELEM *workspace
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_quantize_mmx)
|
||||
|
||||
; Each .quantloop2 pass handles 8 coefficients (two MMX registers of four
; words).  al counts DCTSIZE2/8/2 inner passes and ah repeats that twice,
; so DCTSIZE2 coefficients are processed in total.
;
; Per coefficient: take the absolute value, add the CORRECTION entry,
; multiply (high half) by the RECIPROCAL entry, multiply (high half) by the
; SCALE entry, then restore the original sign and store.
;
; Register roles:
;   esi = workspace cursor     edx = divisors cursor (RECIPROCAL /
;   edi = coef_block cursor          CORRECTION / SCALE are relative to it)
;   mm2/mm3 = per-word sign masks of the input (0 or -1)
EXTN(jsimd_quantize_mmx):
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
; push ebx ; unused
|
||||
; push ecx ; unused
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
mov esi, POINTER [workspace]
|
||||
mov edx, POINTER [divisors]
|
||||
mov edi, JCOEFPTR [coef_block]
|
||||
mov ah, 2 ; outer loop count
|
||||
alignx 16, 7
|
||||
.quantloop1:
|
||||
mov al, DCTSIZE2/8/2 ; inner loop count
|
||||
alignx 16, 7
|
||||
.quantloop2:
|
||||
movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
|
||||
movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
|
||||
|
||||
movq mm0, mm2
|
||||
movq mm1, mm3
|
||||
|
||||
psraw mm2, (WORD_BIT-1) ; -1 if value < 0, 0 otherwise
|
||||
psraw mm3, (WORD_BIT-1)
|
||||
|
||||
pxor mm0, mm2 ; val = abs(val), computed as (val ^ sign) - sign
|
||||
pxor mm1, mm3
|
||||
psubw mm0, mm2
|
||||
psubw mm1, mm3
|
||||
|
||||
;
|
||||
; MMX is an annoyingly crappy instruction set. It has two
|
||||
; misfeatures that are causing problems here:
|
||||
;
|
||||
; - All multiplications are signed.
|
||||
;
|
||||
; - The second operand for the shifts is not treated as packed.
|
||||
;
|
||||
;
|
||||
; We work around the first problem by implementing this algorithm:
|
||||
;
|
||||
; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
|
||||
; {
|
||||
; enum { SHORT_BIT = 16 };
|
||||
; signed short sx = (signed short)x;
|
||||
; signed short sy = (signed short)y;
|
||||
; signed long sz;
|
||||
;
|
||||
; sz = (long)sx * (long)sy; /* signed multiply */
|
||||
;
|
||||
; if (sx < 0) sz += (long)sy << SHORT_BIT;
|
||||
; if (sy < 0) sz += (long)sx << SHORT_BIT;
|
||||
;
|
||||
; return (unsigned long)sz;
|
||||
; }
|
||||
;
|
||||
; (note that a negative sx adds _sy_ and vice versa)
|
||||
;
|
||||
; For the second problem, we replace the shift by a multiplication.
|
||||
; Unfortunately that means we have to deal with the signed issue again.
|
||||
;
|
||||
|
||||
paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
|
||||
paddw mm1, MMWORD [CORRECTION(0,1,edx)]
|
||||
|
||||
movq mm4, mm0 ; store current value for later
|
||||
movq mm5, mm1
|
||||
pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
|
||||
pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)]
|
||||
paddw mm0, mm4 ; reciprocal is always negative (MSB=1),
|
||||
paddw mm1, mm5 ; so we always need to add the initial value
|
||||
; (input value is never negative as we
|
||||
; took its absolute value at the start of this routine)
|
||||
|
||||
; here it gets a bit tricky as both scale
|
||||
; and mm0/mm1 can be negative
|
||||
movq mm6, MMWORD [SCALE(0,0,edx)] ; scale
|
||||
movq mm7, MMWORD [SCALE(0,1,edx)]
|
||||
movq mm4, mm0
|
||||
movq mm5, mm1
|
||||
pmulhw mm0, mm6
|
||||
pmulhw mm1, mm7
|
||||
|
||||
psraw mm6, (WORD_BIT-1) ; determine if scale is negative
|
||||
psraw mm7, (WORD_BIT-1)
|
||||
|
||||
pand mm6, mm4 ; and add input if it is
|
||||
pand mm7, mm5
|
||||
paddw mm0, mm6
|
||||
paddw mm1, mm7
|
||||
|
||||
psraw mm4, (WORD_BIT-1) ; then check if negative input
|
||||
psraw mm5, (WORD_BIT-1)
|
||||
|
||||
pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is
|
||||
pand mm5, MMWORD [SCALE(0,1,edx)]
|
||||
paddw mm0, mm4
|
||||
paddw mm1, mm5
|
||||
|
||||
pxor mm0, mm2 ; restore the input's sign: (val ^ sign) - sign
|
||||
pxor mm1, mm3
|
||||
psubw mm0, mm2
|
||||
psubw mm1, mm3
|
||||
|
||||
; NOTE(review): the stores index coef_block with SIZEOF_DCTELEM while the
; cursor below advances by SIZEOF_JCOEF; presumably both sizes are equal --
; confirm against jsimdext.inc.
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
|
||||
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
|
||||
|
||||
add esi, byte 8*SIZEOF_DCTELEM
|
||||
add edx, byte 8*SIZEOF_DCTELEM
|
||||
add edi, byte 8*SIZEOF_JCOEF
|
||||
dec al
|
||||
jnz near .quantloop2
|
||||
dec ah
|
||||
jnz near .quantloop1 ; to avoid branch misprediction
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; unused
|
||||
; pop ebx ; unused
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
208
TMessagesProj/jni/mozjpeg/simd/i386/jquant-sse.asm
Normal file
208
TMessagesProj/jni/mozjpeg/simd/i386/jquant-sse.asm
Normal file
|
|
@ -0,0 +1,208 @@
|
|||
;
|
||||
; jquant.asm - sample data conversion and quantization (SSE & MMX)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Load data into workspace, applying unsigned->signed conversion
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
; FAST_FLOAT *workspace);
|
||||
;
|
||||
|
||||
%define sample_data ebp + 8 ; JSAMPARRAY sample_data
|
||||
%define start_col ebp + 12 ; JDIMENSION start_col
|
||||
%define workspace ebp + 16 ; FAST_FLOAT *workspace
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_convsamp_float_sse)
|
||||
|
||||
; Each .convloop pass converts two rows of eight samples: 0x80 is
; subtracted from every byte, the bytes are sign-extended to 32-bit
; integers, converted to single-precision floats and stored to workspace.
; The loop runs DCTSIZE/2 times.
;
; The unpack steps place each sample in the top byte of a dword.  The '*'
; positions in the lane diagrams are don't-care bytes (stale or duplicated
; data); the arithmetic right shift by DWORD_BIT-BYTE_BIT discards them
; while sign-extending the sample.
;
; Register roles:
;   esi = row-pointer cursor      eax = start_col
;   edi = output cursor           ecx = passes remaining
;   ebx/edx = current row pointers
;   mm7 = 0x80 in every byte
EXTN(jsimd_convsamp_float_sse):
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
push ebx ; saved because ebx is used as a row pointer below
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
pcmpeqw mm7, mm7 ; mm7=(all 1's)
|
||||
psllw mm7, 7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
|
||||
packsswb mm7, mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
|
||||
|
||||
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [start_col]
|
||||
mov edi, POINTER [workspace] ; (FAST_FLOAT *)
|
||||
mov ecx, DCTSIZE/2 ; two rows are handled per pass
|
||||
alignx 16, 7
|
||||
.convloop:
|
||||
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
|
||||
movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
|
||||
movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
|
||||
|
||||
psubb mm0, mm7 ; mm0=(01234567)
|
||||
psubb mm1, mm7 ; mm1=(89ABCDEF)
|
||||
|
||||
; Move every sample into the high byte of a word ...
punpcklbw mm2, mm0 ; mm2=(*0*1*2*3)
|
||||
punpckhbw mm0, mm0 ; mm0=(*4*5*6*7)
|
||||
punpcklbw mm3, mm1 ; mm3=(*8*9*A*B)
|
||||
punpckhbw mm1, mm1 ; mm1=(*C*D*E*F)
|
||||
|
||||
; ... then into the high byte of a dword (first row).
punpcklwd mm4, mm2 ; mm4=(***0***1)
|
||||
punpckhwd mm2, mm2 ; mm2=(***2***3)
|
||||
punpcklwd mm5, mm0 ; mm5=(***4***5)
|
||||
punpckhwd mm0, mm0 ; mm0=(***6***7)
|
||||
|
||||
psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(01)
|
||||
psrad mm2, (DWORD_BIT-BYTE_BIT) ; mm2=(23)
|
||||
cvtpi2ps xmm0, mm4 ; xmm0=(01**)
|
||||
cvtpi2ps xmm1, mm2 ; xmm1=(23**)
|
||||
psrad mm5, (DWORD_BIT-BYTE_BIT) ; mm5=(45)
|
||||
psrad mm0, (DWORD_BIT-BYTE_BIT) ; mm0=(67)
|
||||
cvtpi2ps xmm2, mm5 ; xmm2=(45**)
|
||||
cvtpi2ps xmm3, mm0 ; xmm3=(67**)
|
||||
|
||||
; Same dword expansion for the second row; mm4 is reused as a temporary.
punpcklwd mm6, mm3 ; mm6=(***8***9)
|
||||
punpckhwd mm3, mm3 ; mm3=(***A***B)
|
||||
punpcklwd mm4, mm1 ; mm4=(***C***D)
|
||||
punpckhwd mm1, mm1 ; mm1=(***E***F)
|
||||
|
||||
psrad mm6, (DWORD_BIT-BYTE_BIT) ; mm6=(89)
|
||||
psrad mm3, (DWORD_BIT-BYTE_BIT) ; mm3=(AB)
|
||||
cvtpi2ps xmm4, mm6 ; xmm4=(89**)
|
||||
cvtpi2ps xmm5, mm3 ; xmm5=(AB**)
|
||||
psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(CD)
|
||||
psrad mm1, (DWORD_BIT-BYTE_BIT) ; mm1=(EF)
|
||||
cvtpi2ps xmm6, mm4 ; xmm6=(CD**)
|
||||
cvtpi2ps xmm7, mm1 ; xmm7=(EF**)
|
||||
|
||||
; Merge the float pairs into full four-float vectors.
movlhps xmm0, xmm1 ; xmm0=(0123)
|
||||
movlhps xmm2, xmm3 ; xmm2=(4567)
|
||||
movlhps xmm4, xmm5 ; xmm4=(89AB)
|
||||
movlhps xmm6, xmm7 ; xmm6=(CDEF)
|
||||
|
||||
; movaps is an aligned store -- assumes workspace is 16-byte aligned
; (TODO confirm with the caller).
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
|
||||
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
|
||||
movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
|
||||
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
|
||||
|
||||
add esi, byte 2*SIZEOF_JSAMPROW ; next two row pointers
|
||||
add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; next two output rows
|
||||
dec ecx
|
||||
jnz near .convloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Quantize/descale the coefficients, and store into coef_block
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors,
|
||||
; FAST_FLOAT *workspace);
|
||||
;
|
||||
|
||||
%define coef_block ebp + 8 ; JCOEFPTR coef_block
|
||||
%define divisors ebp + 12 ; FAST_FLOAT *divisors
|
||||
%define workspace ebp + 16 ; FAST_FLOAT *workspace
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_quantize_float_sse)
|
||||
|
||||
; Each .quantloop pass multiplies 16 workspace floats by the matching
; entries of divisors, converts the products to 32-bit integers two at a
; time (cvtps2pi), packs them to 16-bit words with signed saturation
; (packssdw) and stores them to coef_block.  The loop runs DCTSIZE2/16
; times.
; NOTE(review): since the values are multiplied, divisors presumably holds
; reciprocals of the quantization steps -- confirm against the code that
; builds the table.
;
; Register roles:
;   esi = workspace cursor     edx = divisors cursor
;   edi = coef_block cursor    eax = passes remaining
EXTN(jsimd_quantize_float_sse):
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
; push ebx ; unused
|
||||
; push ecx ; unused
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
mov esi, POINTER [workspace]
|
||||
mov edx, POINTER [divisors]
|
||||
mov edi, JCOEFPTR [coef_block]
|
||||
mov eax, DCTSIZE2/16 ; 16 coefficients are handled per pass
|
||||
alignx 16, 7
|
||||
.quantloop:
|
||||
movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
|
||||
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
|
||||
mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
|
||||
mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
|
||||
mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
; cvtps2pi only converts the two low floats, so the two high floats of
; each vector are first copied down into a second register.
movhlps xmm4, xmm0
|
||||
movhlps xmm5, xmm1
|
||||
|
||||
cvtps2pi mm0, xmm0
|
||||
cvtps2pi mm1, xmm1
|
||||
cvtps2pi mm4, xmm4
|
||||
cvtps2pi mm5, xmm5
|
||||
|
||||
movhlps xmm6, xmm2
|
||||
movhlps xmm7, xmm3
|
||||
|
||||
cvtps2pi mm2, xmm2
|
||||
cvtps2pi mm3, xmm3
|
||||
cvtps2pi mm6, xmm6
|
||||
cvtps2pi mm7, xmm7
|
||||
|
||||
; Recombine low/high integer pairs into four 16-bit coefficients each.
packssdw mm0, mm4
|
||||
packssdw mm1, mm5
|
||||
packssdw mm2, mm6
|
||||
packssdw mm3, mm7
|
||||
|
||||
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
|
||||
movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
|
||||
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
|
||||
movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
|
||||
|
||||
add esi, byte 16*SIZEOF_FAST_FLOAT
|
||||
add edx, byte 16*SIZEOF_FAST_FLOAT
|
||||
add edi, byte 16*SIZEOF_JCOEF
|
||||
dec eax
|
||||
jnz short .quantloop
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; unused
|
||||
; pop ebx ; unused
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
168
TMessagesProj/jni/mozjpeg/simd/i386/jquantf-sse2.asm
Normal file
168
TMessagesProj/jni/mozjpeg/simd/i386/jquantf-sse2.asm
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
;
|
||||
; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Load data into workspace, applying unsigned->signed conversion
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
; FAST_FLOAT *workspace);
|
||||
;
|
||||
|
||||
%define sample_data ebp + 8 ; JSAMPARRAY sample_data
|
||||
%define start_col ebp + 12 ; JDIMENSION start_col
|
||||
%define workspace ebp + 16 ; FAST_FLOAT *workspace
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
|
||||
|
||||
; Each .convloop pass converts two rows of eight samples: 0x80 is
; subtracted from every byte, the bytes are sign-extended to 32-bit
; integers, converted to single-precision floats (cvtdq2ps) and stored to
; workspace.  The loop runs DCTSIZE/2 times.
;
; The unpack steps place each sample in the top byte of a dword.  The '*'
; positions in the lane diagrams are don't-care bytes (stale or duplicated
; data); the arithmetic right shift by DWORD_BIT-BYTE_BIT discards them
; while sign-extending the sample.
;
; Only XMM registers are used, so no emms is issued before returning.
;
; Register roles:
;   esi = row-pointer cursor      eax = start_col
;   edi = output cursor           ecx = passes remaining
;   ebx/edx = current row pointers
;   xmm7 = 0x80 in every byte
EXTN(jsimd_convsamp_float_sse2):
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
push ebx ; saved because ebx is used as a row pointer below
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
pcmpeqw xmm7, xmm7 ; xmm7=(all 1's)
|
||||
psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
||||
packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
|
||||
|
||||
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [start_col]
|
||||
mov edi, POINTER [workspace] ; (FAST_FLOAT *)
|
||||
mov ecx, DCTSIZE/2 ; two rows are handled per pass
|
||||
alignx 16, 7
|
||||
.convloop:
|
||||
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
|
||||
movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
|
||||
movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
|
||||
|
||||
psubb xmm0, xmm7 ; xmm0=(01234567)
|
||||
psubb xmm1, xmm7 ; xmm1=(89ABCDEF)
|
||||
|
||||
; Move every sample into the high byte of a word ...
punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
|
||||
punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
|
||||
|
||||
; ... then into the high byte of a dword.
punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3)
|
||||
punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
|
||||
punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
|
||||
punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
|
||||
|
||||
psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
|
||||
psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
|
||||
cvtdq2ps xmm2, xmm2 ; xmm2=(0123)
|
||||
cvtdq2ps xmm0, xmm0 ; xmm0=(4567)
|
||||
psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
|
||||
psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
|
||||
cvtdq2ps xmm3, xmm3 ; xmm3=(89AB)
|
||||
cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF)
|
||||
|
||||
; movaps is an aligned store -- assumes workspace is 16-byte aligned
; (TODO confirm with the caller).
movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
|
||||
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
|
||||
movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
|
||||
|
||||
add esi, byte 2*SIZEOF_JSAMPROW ; next two row pointers
|
||||
add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; next two output rows
|
||||
dec ecx
|
||||
jnz short .convloop
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Quantize/descale the coefficients, and store into coef_block
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
|
||||
; FAST_FLOAT *workspace);
|
||||
;
|
||||
|
||||
%define coef_block ebp + 8 ; JCOEFPTR coef_block
|
||||
%define divisors ebp + 12 ; FAST_FLOAT *divisors
|
||||
%define workspace ebp + 16 ; FAST_FLOAT *workspace
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
|
||||
|
||||
; Each .quantloop pass multiplies 16 workspace floats by the matching
; entries of divisors, converts the products to 32-bit integers
; (cvtps2dq), packs them to 16-bit words with signed saturation (packssdw)
; and stores them to coef_block.  The loop runs DCTSIZE2/16 times.
; NOTE(review): since the values are multiplied, divisors presumably holds
; reciprocals of the quantization steps -- confirm against the code that
; builds the table.
;
; Only XMM registers are used, so no emms is issued before returning.
;
; Register roles:
;   esi = workspace cursor     edx = divisors cursor
;   edi = coef_block cursor    eax = passes remaining
EXTN(jsimd_quantize_float_sse2):
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
; push ebx ; unused
|
||||
; push ecx ; unused
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
mov esi, POINTER [workspace]
|
||||
mov edx, POINTER [divisors]
|
||||
mov edi, JCOEFPTR [coef_block]
|
||||
mov eax, DCTSIZE2/16 ; 16 coefficients are handled per pass
|
||||
alignx 16, 7
|
||||
.quantloop:
|
||||
movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
|
||||
mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
|
||||
mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
|
||||
mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
|
||||
mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
; float -> int32, four lanes at a time
cvtps2dq xmm0, xmm0
|
||||
cvtps2dq xmm1, xmm1
|
||||
cvtps2dq xmm2, xmm2
|
||||
cvtps2dq xmm3, xmm3
|
||||
|
||||
; int32 -> int16 with signed saturation; each result holds eight words
packssdw xmm0, xmm1
|
||||
packssdw xmm2, xmm3
|
||||
|
||||
; movdqa is an aligned store -- assumes coef_block is 16-byte aligned
; (TODO confirm with the caller).
movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
|
||||
movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
|
||||
|
||||
add esi, byte 16*SIZEOF_FAST_FLOAT
|
||||
add edx, byte 16*SIZEOF_FAST_FLOAT
|
||||
add edi, byte 16*SIZEOF_JCOEF
|
||||
dec eax
|
||||
jnz short .quantloop
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; unused
|
||||
; pop ebx ; unused
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
188
TMessagesProj/jni/mozjpeg/simd/i386/jquanti-avx2.asm
Normal file
188
TMessagesProj/jni/mozjpeg/simd/i386/jquanti-avx2.asm
Normal file
|
|
@ -0,0 +1,188 @@
|
|||
;
|
||||
; jquanti.asm - sample data conversion and quantization (AVX2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, 2018, D. R. Commander.
|
||||
; Copyright (C) 2016, Matthieu Darbois.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
|
||||
; Load data into workspace, applying unsigned->signed conversion
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
; DCTELEM *workspace);
|
||||
;
|
||||
|
||||
%define sample_data ebp + 8 ; JSAMPARRAY sample_data
|
||||
%define start_col ebp + 12 ; JDIMENSION start_col
|
||||
%define workspace ebp + 16 ; DCTELEM *workspace
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_convsamp_avx2)
|
||||
|
||||
; Straight-line (no loop): eight samples are loaded from each of rows 0-7
; at column start_col.  Rows n and n+1 are combined into the low and high
; 128-bit lanes of one ymm register, widened from bytes to words by
; interleaving with zero, offset by 0xFF80 (-128 as a 16-bit value) and
; written to workspace with unaligned stores.
;
; Register roles:
;   esi = sample_data (row pointers)   eax = start_col
;   edi = workspace                    ebx/edx = current row pointers
EXTN(jsimd_convsamp_avx2):
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
push ebx ; saved because ebx is used as a row pointer below
|
||||
; push ecx ; need not be preserved
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
|
||||
mov eax, JDIMENSION [start_col]
|
||||
mov edi, POINTER [workspace] ; (DCTELEM *)
|
||||
|
||||
; Load 8 samples from each row into the low 64 bits of xmm0..xmm7.
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
|
||||
movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
|
||||
|
||||
mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
|
||||
movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
|
||||
|
||||
mov ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
movq xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
|
||||
movq xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
|
||||
|
||||
mov ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
movq xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
|
||||
movq xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
|
||||
|
||||
; Pair the rows: even row in the low lane, odd row in the high lane.
vinserti128 ymm0, ymm0, xmm1, 1
|
||||
vinserti128 ymm2, ymm2, xmm3, 1
|
||||
vinserti128 ymm4, ymm4, xmm5, 1
|
||||
vinserti128 ymm6, ymm6, xmm7, 1
|
||||
|
||||
; Widen bytes to words by interleaving with zero (per 128-bit lane).
vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
|
||||
vpunpcklbw ymm0, ymm0, ymm1
|
||||
vpunpcklbw ymm2, ymm2, ymm1
|
||||
vpunpcklbw ymm4, ymm4, ymm1
|
||||
vpunpcklbw ymm6, ymm6, ymm1
|
||||
|
||||
vpcmpeqw ymm7, ymm7, ymm7 ; ymm7=(all 1's)
|
||||
vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
||||
|
||||
vpaddw ymm0, ymm0, ymm7 ; add 0xFF80 to every word (i.e. subtract 128)
|
||||
vpaddw ymm2, ymm2, ymm7
|
||||
vpaddw ymm4, ymm4, ymm7
|
||||
vpaddw ymm6, ymm6, ymm7
|
||||
|
||||
vmovdqu YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
|
||||
vmovdqu YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
|
||||
vmovdqu YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
|
||||
vmovdqu YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6
|
||||
|
||||
vzeroupper ; clear the upper ymm halves before returning
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; need not be preserved
|
||||
pop ebx
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Quantize/descale the coefficients, and store into coef_block
|
||||
;
|
||||
; This implementation is based on an algorithm described in
|
||||
; "How to optimize for the Pentium family of microprocessors"
|
||||
; (http://www.agner.org/assem/).
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
|
||||
; DCTELEM *workspace);
|
||||
;
|
||||
|
||||
%define RECIPROCAL(m, n, b) \
|
||||
YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
|
||||
%define CORRECTION(m, n, b) \
|
||||
YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
|
||||
%define SCALE(m, n, b) \
|
||||
YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
|
||||
|
||||
%define coef_block ebp + 8 ; JCOEFPTR coef_block
|
||||
%define divisors ebp + 12 ; DCTELEM *divisors
|
||||
%define workspace ebp + 16 ; DCTELEM *workspace
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_quantize_avx2)
|
||||
|
||||
; Straight-line (no loop): four ymm registers of words are loaded from
; workspace.  For each word the absolute value is taken, the CORRECTION
; entry is added, the result is multiplied (unsigned, high half) by the
; RECIPROCAL entry and then by the SCALE entry, and finally the sign of the
; original coefficient is re-applied before storing to coef_block.
;
; Register roles:
;   esi = workspace     edx = divisors (RECIPROCAL / CORRECTION / SCALE
;   edi = coef_block          tables are addressed relative to it)
;   ymm4..ymm7 = original coefficients, kept for the final sign step
EXTN(jsimd_quantize_avx2):
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
; push ebx ; unused
|
||||
; push ecx ; unused
|
||||
; push edx ; need not be preserved
|
||||
push esi
|
||||
push edi
|
||||
|
||||
mov esi, POINTER [workspace]
|
||||
mov edx, POINTER [divisors]
|
||||
mov edi, JCOEFPTR [coef_block]
|
||||
|
||||
vmovdqu ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
|
||||
vmovdqu ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
|
||||
vmovdqu ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
|
||||
vmovdqu ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
|
||||
vpabsw ymm0, ymm4 ; work on absolute values
|
||||
vpabsw ymm1, ymm5
|
||||
vpabsw ymm2, ymm6
|
||||
vpabsw ymm3, ymm7
|
||||
|
||||
vpaddw ymm0, YMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
|
||||
vpaddw ymm1, YMMWORD [CORRECTION(2,0,edx)]
|
||||
vpaddw ymm2, YMMWORD [CORRECTION(4,0,edx)]
|
||||
vpaddw ymm3, YMMWORD [CORRECTION(6,0,edx)]
|
||||
vpmulhuw ymm0, YMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
|
||||
vpmulhuw ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
|
||||
vpmulhuw ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
|
||||
vpmulhuw ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
|
||||
vpmulhuw ymm0, YMMWORD [SCALE(0,0,edx)] ; scale
|
||||
vpmulhuw ymm1, YMMWORD [SCALE(2,0,edx)]
|
||||
vpmulhuw ymm2, YMMWORD [SCALE(4,0,edx)]
|
||||
vpmulhuw ymm3, YMMWORD [SCALE(6,0,edx)]
|
||||
|
||||
vpsignw ymm0, ymm0, ymm4 ; re-apply the sign of the original coefficient
|
||||
vpsignw ymm1, ymm1, ymm5
|
||||
vpsignw ymm2, ymm2, ymm6
|
||||
vpsignw ymm3, ymm3, ymm7
|
||||
|
||||
; NOTE(review): coef_block is a JCOEFPTR but is indexed with SIZEOF_DCTELEM
; here; presumably SIZEOF_JCOEF and SIZEOF_DCTELEM are equal -- confirm
; against jsimdext.inc.
vmovdqu [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
|
||||
vmovdqu [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
|
||||
vmovdqu [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
|
||||
vmovdqu [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3
|
||||
|
||||
vzeroupper ; clear the upper ymm halves before returning
|
||||
pop edi
|
||||
pop esi
|
||||
; pop edx ; need not be preserved
|
||||
; pop ecx ; unused
|
||||
; pop ebx ; unused
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
201
TMessagesProj/jni/mozjpeg/simd/i386/jquanti-sse2.asm
Normal file
201
TMessagesProj/jni/mozjpeg/simd/i386/jquanti-sse2.asm
Normal file
|
|
@ -0,0 +1,201 @@
|
|||
;
|
||||
; jquanti.asm - sample data conversion and quantization (SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
; Read samples from DCTSIZE image rows, widen each byte to a 16-bit word,
; re-centre it around zero (unsigned -> signed) and store it in the workspace
;
; GLOBAL(void)
; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
; Arguments are read from the stack relative to ebp (cdecl-style frame).
; Each loop pass converts four rows of eight samples; the loop runs
; DCTSIZE/4 times.  The stores use movdqa, so the workspace rows that are
; written must be 16-byte aligned.
;
; Register roles inside the loop:
;   esi = cursor into the row-pointer array (sample_data)
;   eax = start_col, used as the column offset into every row
;   edi = cursor into the workspace
;   ecx = remaining groups of four rows
;   ebx, edx = scratch row pointers
;   xmm6 = all zeros (high bytes for the byte -> word widening)
;   xmm7 = 0xFF80 in every word, i.e. -128 (the level shift)
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; scratch register, not saved
;   push        edx                     ; scratch register, not saved
    push        esi
    push        edi

    ; Fetch the three arguments and set up the pass counter.
    mov         esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]        ; (DCTELEM *)
    mov         ecx, DCTSIZE/4

    ; Build the two constants used by every pass.
    pcmpeqw     xmm7, xmm7              ; every word = 0xFFFF
    psllw       xmm7, 7                 ; every word = 0xFF80 (-128)
    pxor        xmm6, xmm6              ; all zeros
    alignx      16, 7
.rowgroup:
    ; ---- load eight samples from each of the next four rows ----
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; row 0
    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; row 1

    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; row 2
    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; row 3

    ; ---- zero-extend the eight bytes of each row to eight words ----
    punpcklbw   xmm0, xmm6
    punpcklbw   xmm1, xmm6
    punpcklbw   xmm2, xmm6
    punpcklbw   xmm3, xmm6

    ; ---- subtract 128 from every sample ----
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    ; ---- write the four converted rows ----
    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

    ; Step both cursors past the four rows just handled.
    add         esi, byte 4*SIZEOF_JSAMPROW
    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
    dec         ecx
    jnz         short .rowgroup

    pop         edi
    pop         esi
;   pop         edx                     ; scratch register, not saved
;   pop         ecx                     ; scratch register, not saved
    pop         ebx
    pop         ebp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Quantize/descale the DCT coefficients in the workspace and write the
; results to coef_block
;
; The method follows an algorithm described in
; "How to optimize for the Pentium family of microprocessors"
; (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;
; For every coefficient the routine computes, on 16-bit lanes:
;     q = mulhi_u16(mulhi_u16(|coef| + correction, reciprocal), scale)
; and then gives q the sign of the original coefficient.
;
; The divisors argument holds three consecutive tables, addressed by the
; macros below at row offsets DCTSIZE*0 (reciprocal), DCTSIZE*1 (correction)
; and DCTSIZE*2 (scale).
;
; Each loop pass handles 32 coefficients (four XMM registers); the loop runs
; DCTSIZE2/32 times.  All memory accesses use movdqa or aligned memory
; operands, so the three buffers must be 16-byte aligned.
;
; Register roles inside the loop:
;   esi = workspace cursor     edx = divisors cursor
;   edi = coef_block cursor    eax = remaining passes
;   xmm0-xmm3 = values being quantized
;   xmm4-xmm7 = per-word sign masks (0x0000 or 0xFFFF) of the inputs
;

%define RECIPROCAL(m, n, b) \
  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; DCTELEM *divisors
%define workspace   ebp + 16            ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; not used here
;   push        ecx                     ; not used here
;   push        edx                     ; scratch register, not saved
    push        esi
    push        edi

    mov         edi, JCOEFPTR [coef_block]
    mov         edx, POINTER [divisors]
    mov         esi, POINTER [workspace]
    mov         eax, DCTSIZE2/32
    alignx      16, 7
.next_pass:
    ; ---- step 1: load each row, split it into sign mask and magnitude ----
    ; (v ^ mask) - mask negates v where mask is all ones, leaves it otherwise.
    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm0, xmm4
    psraw       xmm4, (WORD_BIT-1)      ; xmm4 = sign mask of row 0
    pxor        xmm0, xmm4
    psubw       xmm0, xmm4              ; xmm0 = |row 0|

    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm1, xmm5
    psraw       xmm5, (WORD_BIT-1)      ; xmm5 = sign mask of row 1
    pxor        xmm1, xmm5
    psubw       xmm1, xmm5              ; xmm1 = |row 1|

    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm2, xmm6
    psraw       xmm6, (WORD_BIT-1)      ; xmm6 = sign mask of row 2
    pxor        xmm2, xmm6
    psubw       xmm2, xmm6              ; xmm2 = |row 2|

    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm3, xmm7
    psraw       xmm7, (WORD_BIT-1)      ; xmm7 = sign mask of row 3
    pxor        xmm3, xmm7
    psubw       xmm3, xmm7              ; xmm3 = |row 3|

    ; ---- step 2: divide by multiplication, then restore the sign ----
    paddw       xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; + correction/rounding
    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; * reciprocal (high word)
    pmulhuw     xmm0, XMMWORD [SCALE(0,0,edx)]       ; * scale (high word)
    pxor        xmm0, xmm4
    psubw       xmm0, xmm4              ; negate where the input was negative

    paddw       xmm1, XMMWORD [CORRECTION(1,0,edx)]
    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
    pmulhuw     xmm1, XMMWORD [SCALE(1,0,edx)]
    pxor        xmm1, xmm5
    psubw       xmm1, xmm5

    paddw       xmm2, XMMWORD [CORRECTION(2,0,edx)]
    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
    pmulhuw     xmm2, XMMWORD [SCALE(2,0,edx)]
    pxor        xmm2, xmm6
    psubw       xmm2, xmm6

    paddw       xmm3, XMMWORD [CORRECTION(3,0,edx)]
    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
    pmulhuw     xmm3, XMMWORD [SCALE(3,0,edx)]
    pxor        xmm3, xmm7
    psubw       xmm3, xmm7

    ; ---- step 3: write the four quantized rows ----
    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

    ; Move all three cursors on by the 32 elements just processed.
    add         esi, byte 32*SIZEOF_DCTELEM
    add         edx, byte 32*SIZEOF_DCTELEM
    add         edi, byte 32*SIZEOF_JCOEF
    dec         eax
    jnz         near .next_pass

    pop         edi
    pop         esi
;   pop         edx                     ; scratch register, not saved
;   pop         ecx                     ; not used here
;   pop         ebx                     ; not used here
    pop         ebp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
1253
TMessagesProj/jni/mozjpeg/simd/i386/jsimd.c
Normal file
1253
TMessagesProj/jni/mozjpeg/simd/i386/jsimd.c
Normal file
File diff suppressed because it is too large
Load diff
135
TMessagesProj/jni/mozjpeg/simd/i386/jsimdcpu.asm
Normal file
135
TMessagesProj/jni/mozjpeg/simd/i386/jsimdcpu.asm
Normal file
|
|
@ -0,0 +1,135 @@
|
|||
;
|
||||
; jsimdcpu.asm - SIMD instruction support check
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
;
; Check if the CPU supports SIMD instructions
;
; GLOBAL(unsigned int)
; jpeg_simd_cpu_support(void)
;
; Returns in eax a bit mask built from the JSIMD_* flags (JSIMD_MMX,
; JSIMD_SSE, JSIMD_SSE2, JSIMD_AVX2, JSIMD_3DNOW); 0 means that CPUID is
; unavailable or that none of the tested features was found.
;
; ebx and edi are saved and restored; ecx and edx are overwritten by
; CPUID/XGETBV and are not preserved.
;
; Detection order:
;   1. Toggle the ID bit (bit 21) of EFLAGS; if it does not stick, CPUID
;      does not exist and the function returns 0.
;   2. AVX2 is reported only when CPUID leaf 07H sets the AVX2 bit, leaf 01H
;      reports both OSXSAVE and AVX, and XCR0 shows that the O/S saves both
;      XMM and YMM state.
;   3. MMX, SSE and SSE2 come from the EDX feature flags of leaf 01H.
;   4. 3DNow! comes from the EDX flags of extended leaf 80000001H, when that
;      leaf exists.
;

    align       32
    GLOBAL_FUNCTION(jpeg_simd_cpu_support)

EXTN(jpeg_simd_cpu_support):
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
;   push        esi                     ; unused
    push        edi

    xor         edi, edi                ; edi = accumulated JSIMD_* flags

    ; Try to flip the ID bit in EFLAGS; CPUID exists only if the bit changes.
    pushfd
    pop         eax
    mov         edx, eax                ; edx = original EFLAGS
    xor         eax, 1<<21              ; flip ID bit in EFLAGS
    push        eax
    popfd
    pushfd
    pop         eax
    xor         eax, edx                ; eax = bits that actually changed
    jz          near .return            ; CPUID is not supported

    ; Check whether CPUID leaf 07H is supported
    ; (leaf 07H is used to check for AVX2 instruction support)
    xor         eax, eax
    cpuid                               ; eax = highest basic leaf
    test        eax, eax
    jz          near .return
    cmp         eax, 7
    jb          short .no_avx2          ; Maximum leaf < 07H (the leaf
                                        ; number is unsigned, hence jb)

    ; Check for AVX2 instruction support
    mov         eax, 7
    xor         ecx, ecx                ; sub-leaf 0
    cpuid
    mov         eax, ebx
    test        eax, 1<<5               ; bit5:AVX2
    jz          short .no_avx2

    ; Check for AVX2 O/S support
    mov         eax, 1
    xor         ecx, ecx
    cpuid
    test        ecx, 1<<27              ; bit27:OSXSAVE
    jz          short .no_avx2          ; O/S has not enabled XSAVE/XGETBV
    test        ecx, 1<<28              ; bit28:AVX
    jz          short .no_avx2          ; CPU does not support AVX

    xor         ecx, ecx                ; select XCR0
    xgetbv
    and         eax, 6                  ; keep bit1 (XMM) and bit2 (YMM)
    cmp         eax, 6                  ; O/S does not manage XMM/YMM state
                                        ; using XSAVE
    jnz         short .no_avx2

    or          edi, JSIMD_AVX2
.no_avx2:

    ; Check CPUID leaf 01H for MMX, SSE, and SSE2 support
    xor         eax, eax
    inc         eax
    cpuid
    mov         eax, edx                ; eax = Standard feature flags

    ; Check for MMX instruction support
    test        eax, 1<<23              ; bit23:MMX
    jz          short .no_mmx
    or          edi, byte JSIMD_MMX
.no_mmx:
    test        eax, 1<<25              ; bit25:SSE
    jz          short .no_sse
    or          edi, byte JSIMD_SSE
.no_sse:
    test        eax, 1<<26              ; bit26:SSE2
    jz          short .no_sse2
    or          edi, byte JSIMD_SSE2
.no_sse2:

    ; Check for 3DNow! instruction support
    mov         eax, 0x80000000
    cpuid                               ; eax = highest extended leaf
    cmp         eax, 0x80000000
    jbe         short .return           ; leaf 80000001H is not available

    mov         eax, 0x80000001
    cpuid
    mov         eax, edx                ; eax = Extended feature flags

    test        eax, 1<<31              ; bit31:3DNow!(vendor independent)
    jz          short .no_3dnow
    or          edi, byte JSIMD_3DNOW
.no_3dnow:

.return:
    mov         eax, edi                ; return the accumulated flags

    pop         edi
;   pop         esi                     ; unused
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
Loading…
Add table
Add a link
Reference in a new issue