Repo created
This commit is contained in:
parent
81b91f4139
commit
f8c34fa5ee
22732 changed files with 4815320 additions and 2 deletions
558
TMessagesProj/jni/mozjpeg/simd/x86_64/jccolext-avx2.asm
Normal file
558
TMessagesProj/jni/mozjpeg/simd/x86_64/jccolext-avx2.asm
Normal file
|
|
@ -0,0 +1,558 @@
|
|||
;
|
||||
; jccolext.asm - colorspace conversion (64-bit AVX2)
|
||||
;
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
|
||||
; JSAMPIMAGE output_buf, JDIMENSION output_row,
|
||||
; int num_rows);
|
||||
;
|
||||
|
||||
; r10d = JDIMENSION img_width
|
||||
; r11 = JSAMPARRAY input_buf
|
||||
; r12 = JSAMPIMAGE output_buf
|
||||
; r13d = JDIMENSION output_row
|
||||
; r14d = int num_rows
|
||||
|
||||
; Scratch-slot addressing: wk(i) is the i-th of WK_NUM YMM-sized slots laid
; out immediately below the aligned frame pointer, so wk(0) is the lowest
; slot and wk(WK_NUM) would be rbp itself.
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
|
||||
%define WK_NUM 8
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
|
||||
|
||||
; Entry point.  The prologue builds a SIZEOF_YMMWORD-aligned frame so that
; the wk() slots can be accessed with aligned moves (vmovdqa).  The stack
; pointer value from before the alignment is stored at [rbp] and is restored
; by the "pop rsp" in the epilogue.
EXTN(jsimd_rgb_ycc_convert_avx2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
|
||||
; presumably steps below the saved rbp so that the aligned slot written two
; instructions later cannot overlap it -- TODO confirm
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
|
||||
mov [rsp], rax ; remember the unaligned rsp at the aligned address
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)] ; reserve the WK_NUM scratch slots below rbp
|
||||
; collect_args is a macro defined outside this file; per the register list in
; the header comment it presumably leaves the five arguments in r10-r14.
; NOTE(review): confirm against its definition.
collect_args 5
|
||||
push rbx
|
||||
|
||||
mov ecx, r10d ; ecx = img_width (32-bit move clears the upper half of rcx)
|
||||
test rcx, rcx
|
||||
jz near .return ; zero width: skip straight to the epilogue
|
||||
|
||||
push rcx ; keep img_width while rcx is borrowed as the row index
|
||||
|
||||
mov rsi, r12 ; rsi = output_buf
|
||||
mov ecx, r13d ; ecx = output_row
|
||||
mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] ; rdi = output_buf[0]
|
||||
mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY] ; rbx = output_buf[1]
|
||||
mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY] ; rdx = output_buf[2]
|
||||
; advance each plane's row-pointer array to entry output_row
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
|
||||
lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
|
||||
lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
|
||||
|
||||
pop rcx ; rcx = img_width again
|
||||
|
||||
mov rsi, r11 ; rsi = input_buf
|
||||
mov eax, r14d ; eax = num_rows (rax is the row counter from here on)
|
||||
test rax, rax
|
||||
; NOTE(review): the 32-bit move above zero-extends, so in this 64-bit test
; only num_rows == 0 takes the branch; a negative int would look like a large
; positive count -- confirm callers never pass one.
jle near .return
|
||||
; One iteration per row: rsi/rdi/rbx/rdx walk the row-pointer arrays and are
; saved here because they are replaced by the row data pointers below.
.rowloop:
|
||||
push rdx
|
||||
push rbx
|
||||
push rdi
|
||||
push rsi
|
||||
push rcx ; col
|
||||
|
||||
mov rsi, JSAMPROW [rsi] ; inptr
|
||||
mov rdi, JSAMPROW [rdi] ; outptr0
|
||||
mov rbx, JSAMPROW [rbx] ; outptr1
|
||||
mov rdx, JSAMPROW [rdx] ; outptr2
|
||||
|
||||
; at least SIZEOF_YMMWORD pixels left: take the full-width load path,
; otherwise fall through to the partial-width loader
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jae near .columnloop
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
; Partial-width load, 3 bytes per pixel.  Reached when fewer than
; SIZEOF_YMMWORD pixels remain in the row.  The pixel count in rcx is turned
; into a byte count; each set bit of that count then loads a piece of that
; size from the end of the data still to be read, smallest piece first, so
; every load stays inside the bytes that belong to the row.
.column_ld1:
|
||||
push rax ; rax (row counter) and rdx (outptr2) are borrowed as scratch
|
||||
push rdx
|
||||
lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
|
||||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub rcx, byte SIZEOF_BYTE
|
||||
movzx rax, byte [rsi+rcx] ; rax = last byte of the remaining data
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub rcx, byte SIZEOF_WORD
|
||||
movzx rdx, word [rsi+rcx]
|
||||
shl rax, WORD_BIT ; make room below the byte collected so far
|
||||
or rax, rdx
|
||||
.column_ld4:
|
||||
vmovd xmmA, eax ; move the 0-3 bytes gathered in eax into the vector register
|
||||
pop rdx
|
||||
pop rax
|
||||
test cl, SIZEOF_DWORD
|
||||
jz short .column_ld8
|
||||
sub rcx, byte SIZEOF_DWORD
|
||||
vmovd xmmF, XMM_DWORD [rsi+rcx]
|
||||
vpslldq xmmA, xmmA, SIZEOF_DWORD ; shift earlier bytes up, new dword goes below
|
||||
vpor xmmA, xmmA, xmmF
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_MMWORD
|
||||
jz short .column_ld16
|
||||
sub rcx, byte SIZEOF_MMWORD
|
||||
vmovq xmmB, XMM_MMWORD [rsi+rcx]
|
||||
vpslldq xmmA, xmmA, SIZEOF_MMWORD
|
||||
vpor xmmA, xmmA, xmmB
|
||||
.column_ld16:
|
||||
test cl, SIZEOF_XMMWORD
|
||||
jz short .column_ld32
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
; NOTE(review): the size macro here is XMM_MMWORD although a full XMM
; register is loaded -- check how the macro is defined.
vmovdqu xmmB, XMM_MMWORD [rsi+rcx]
|
||||
vperm2i128 ymmA, ymmA, ymmA, 1 ; swap the 128-bit halves: earlier bytes move to the upper half
|
||||
vpor ymmA, ymmB ; the 16 bytes just loaded fill the lower half
|
||||
.column_ld32:
|
||||
test cl, SIZEOF_YMMWORD
|
||||
jz short .column_ld64
|
||||
sub rcx, byte SIZEOF_YMMWORD
|
||||
vmovdqa ymmF, ymmA ; pieces gathered so far become the second register
|
||||
vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
|
||||
.column_ld64:
|
||||
test cl, 2*SIZEOF_YMMWORD
|
||||
; mov leaves the flags from the test intact.  rcx is forced to one full
; vector so that the subtraction at the end of the column loop reaches zero
; and the row finishes after this pass.
mov rcx, SIZEOF_YMMWORD
|
||||
jz short .rgb_ycc_cnv
|
||||
vmovdqa ymmB, ymmA ; pieces gathered so far become the third register
|
||||
vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
|
||||
jmp short .rgb_ycc_cnv
|
||||
|
||||
.columnloop:
|
||||
vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
|
||||
|
||||
.rgb_ycc_cnv:
|
||||
; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
|
||||
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
|
||||
; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
|
||||
|
||||
vmovdqu ymmC, ymmA
|
||||
vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
|
||||
vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
|
||||
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
|
||||
vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
|
||||
; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
|
||||
|
||||
vmovdqa ymmG, ymmA
|
||||
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
|
||||
; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
|
||||
vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
|
||||
; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
|
||||
|
||||
vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
|
||||
; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
|
||||
vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
|
||||
; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
|
||||
|
||||
vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
|
||||
; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
|
||||
vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
|
||||
; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
|
||||
|
||||
vmovdqa ymmD, ymmA
|
||||
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
|
||||
; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
|
||||
vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
|
||||
; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
|
||||
|
||||
vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
|
||||
; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
|
||||
vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
|
||||
; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
|
||||
|
||||
vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
|
||||
; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
|
||||
vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
|
||||
; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
|
||||
|
||||
vmovdqa ymmE, ymmA
|
||||
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
|
||||
; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
|
||||
vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
|
||||
; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
|
||||
|
||||
vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
|
||||
; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
|
||||
vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
|
||||
; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
|
||||
|
||||
vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
|
||||
; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
|
||||
vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
|
||||
; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
|
||||
|
||||
vpxor ymmH, ymmH, ymmH
|
||||
|
||||
vmovdqa ymmC, ymmA
|
||||
vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
|
||||
vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
|
||||
|
||||
vmovdqa ymmB, ymmE
|
||||
vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
|
||||
vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
|
||||
|
||||
vmovdqa ymmF, ymmD
|
||||
vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
|
||||
vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
; Partial-width load, 4 bytes per pixel.  Reached when fewer than
; SIZEOF_YMMWORD pixels remain in the row.  rcx stays a pixel count here
; (addresses scale it by RGB_PIXELSIZE); each set bit of the count loads a
; group of that many pixels from the end of the data still to be read,
; smallest group first.
.column_ld1:
|
||||
test cl, SIZEOF_XMMWORD/16
|
||||
jz short .column_ld2
|
||||
sub rcx, byte SIZEOF_XMMWORD/16
|
||||
vmovd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_XMMWORD/8
|
||||
jz short .column_ld4
|
||||
sub rcx, byte SIZEOF_XMMWORD/8
|
||||
vmovq xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||
vpslldq xmmA, xmmA, SIZEOF_MMWORD ; shift earlier data up, new qword goes below
|
||||
vpor xmmA, xmmA, xmmF
|
||||
.column_ld4:
|
||||
test cl, SIZEOF_XMMWORD/4
|
||||
jz short .column_ld8
|
||||
sub rcx, byte SIZEOF_XMMWORD/4
|
||||
vmovdqa xmmF, xmmA
|
||||
vperm2i128 ymmF, ymmF, ymmF, 1 ; swap the 128-bit halves: earlier data moves to the upper half
|
||||
vmovdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||
vpor ymmA, ymmA, ymmF ; lower half = new load, upper half = earlier data
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_XMMWORD/2
|
||||
jz short .column_ld16
|
||||
sub rcx, byte SIZEOF_XMMWORD/2
|
||||
vmovdqa ymmF, ymmA ; data gathered so far becomes the second register
|
||||
vmovdqu ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||
.column_ld16:
|
||||
test cl, SIZEOF_XMMWORD
|
||||
; mov leaves the flags from the test intact.  rcx is forced to one full
; vector so that the subtraction at the end of the column loop reaches zero
; and the row finishes after this pass.
mov rcx, SIZEOF_YMMWORD
|
||||
jz short .rgb_ycc_cnv
|
||||
vmovdqa ymmE, ymmA ; data gathered so far becomes the third and fourth registers
|
||||
vmovdqa ymmH, ymmF
|
||||
vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
|
||||
jmp short .rgb_ycc_cnv
|
||||
|
||||
.columnloop:
|
||||
vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
|
||||
|
||||
.rgb_ycc_cnv:
|
||||
; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
|
||||
; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||
; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
|
||||
; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||
; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
|
||||
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
|
||||
; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
|
||||
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
|
||||
|
||||
vmovdqa ymmB, ymmA
|
||||
vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
|
||||
; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
|
||||
vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
|
||||
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
|
||||
|
||||
vmovdqa ymmB, ymmF
|
||||
vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
|
||||
; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
|
||||
vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
|
||||
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
|
||||
|
||||
vmovdqa ymmD, ymmA
|
||||
vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
|
||||
; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
|
||||
vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
|
||||
; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
|
||||
|
||||
vmovdqa ymmC, ymmF
|
||||
vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
|
||||
; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
|
||||
vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
|
||||
; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
|
||||
|
||||
vmovdqa ymmB, ymmA
|
||||
vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
|
||||
; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
|
||||
vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
|
||||
; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
|
||||
|
||||
vmovdqa ymmG, ymmD
|
||||
vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
|
||||
; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
|
||||
vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
|
||||
; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
|
||||
|
||||
vmovdqa ymmE, ymmA
|
||||
vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
|
||||
; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
|
||||
vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
|
||||
; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
|
||||
|
||||
vmovdqa ymmH, ymmB
|
||||
vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
|
||||
; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
|
||||
vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
|
||||
; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
|
||||
|
||||
vpxor ymmF, ymmF, ymmF
|
||||
|
||||
vmovdqa ymmC, ymmA
|
||||
vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
|
||||
vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
|
||||
|
||||
vmovdqa ymmD, ymmB
|
||||
vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
|
||||
vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
|
||||
|
||||
vmovdqa ymmG, ymmE
|
||||
vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
|
||||
vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
|
||||
|
||||
vpunpcklbw ymmF, ymmF, ymmH
|
||||
vpunpckhbw ymmH, ymmH, ymmH
|
||||
vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
|
||||
vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
|
||||
; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
|
||||
|
||||
; (Original)
|
||||
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||
;
|
||||
; (This implementation)
|
||||
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||
|
||||
; Fixed-point colour arithmetic.  Inputs are the even/odd word vectors listed
; in the comment above (RE/RO, GE/GO, BE/BO).  Odd and even pixels are
; processed separately and recombined just before each store.
;
; Save the inputs that are needed again after their registers are reused.
vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=RE
|
||||
vmovdqa YMMWORD [wk(1)], ymm1 ; wk(1)=RO
|
||||
vmovdqa YMMWORD [wk(2)], ymm4 ; wk(2)=BE
|
||||
vmovdqa YMMWORD [wk(3)], ymm5 ; wk(3)=BO
|
||||
|
||||
; --- Odd pixels: R/G terms of Y (kept in wk(4)/wk(5)) and of Cb ---
; R and G words are interleaved so each vpmaddwd yields one dword per pixel.
vmovdqa ymm6, ymm1
|
||||
vpunpcklwd ymm1, ymm1, ymm3
|
||||
vpunpckhwd ymm6, ymm6, ymm3
|
||||
vmovdqa ymm7, ymm1
|
||||
vmovdqa ymm4, ymm6
|
||||
vpmaddwd ymm1, ymm1, [rel PW_F0299_F0337] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
vpmaddwd ymm7, ymm7, [rel PW_MF016_MF033] ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
|
||||
vpmaddwd ymm4, ymm4, [rel PW_MF016_MF033] ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
|
||||
|
||||
vmovdqa YMMWORD [wk(4)], ymm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
vmovdqa YMMWORD [wk(5)], ymm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
; --- Odd pixels: B term of Cb ---
; Interleaving a zero word below each B word gives B<<16 per dword; the
; one-bit right shift then gives B<<15 (equals B*FIX(0.5) if SCALEBITS is
; 16 -- TODO confirm against the include file).
vpxor ymm1, ymm1, ymm1
|
||||
vpxor ymm6, ymm6, ymm6
|
||||
vpunpcklwd ymm1, ymm1, ymm5 ; ymm1=BOL
|
||||
vpunpckhwd ymm6, ymm6, ymm5 ; ymm6=BOH
|
||||
vpsrld ymm1, ymm1, 1 ; ymm1=BOL*FIX(0.500)
|
||||
vpsrld ymm6, ymm6, 1 ; ymm6=BOH*FIX(0.500)
|
||||
|
||||
vmovdqa ymm5, [rel PD_ONEHALFM1_CJ] ; ymm5=[PD_ONEHALFM1_CJ]
|
||||
|
||||
; sum the terms, add the constant, drop the fraction bits, narrow to words
vpaddd ymm7, ymm7, ymm1
|
||||
vpaddd ymm4, ymm4, ymm6
|
||||
vpaddd ymm7, ymm7, ymm5
|
||||
vpaddd ymm4, ymm4, ymm5
|
||||
vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CbOL
|
||||
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbOH
|
||||
vpackssdw ymm7, ymm7, ymm4 ; ymm7=CbO
|
||||
|
||||
vmovdqa ymm1, YMMWORD [wk(2)] ; ymm1=BE
|
||||
|
||||
; --- Even pixels: R/G terms of Y (kept in wk(6)/wk(7)) and of Cb ---
vmovdqa ymm6, ymm0
|
||||
vpunpcklwd ymm0, ymm0, ymm2
|
||||
vpunpckhwd ymm6, ymm6, ymm2
|
||||
vmovdqa ymm5, ymm0
|
||||
vmovdqa ymm4, ymm6
|
||||
vpmaddwd ymm0, ymm0, [rel PW_F0299_F0337] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
vpmaddwd ymm5, ymm5, [rel PW_MF016_MF033] ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
|
||||
vpmaddwd ymm4, ymm4, [rel PW_MF016_MF033] ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
|
||||
|
||||
vmovdqa YMMWORD [wk(6)], ymm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
vmovdqa YMMWORD [wk(7)], ymm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
; --- Even pixels: B term of Cb (same zero-interleave and shift as above) ---
vpxor ymm0, ymm0, ymm0
|
||||
vpxor ymm6, ymm6, ymm6
|
||||
vpunpcklwd ymm0, ymm0, ymm1 ; ymm0=BEL
|
||||
vpunpckhwd ymm6, ymm6, ymm1 ; ymm6=BEH
|
||||
vpsrld ymm0, ymm0, 1 ; ymm0=BEL*FIX(0.500)
|
||||
vpsrld ymm6, ymm6, 1 ; ymm6=BEH*FIX(0.500)
|
||||
|
||||
vmovdqa ymm1, [rel PD_ONEHALFM1_CJ] ; ymm1=[PD_ONEHALFM1_CJ]
|
||||
|
||||
vpaddd ymm5, ymm5, ymm0
|
||||
vpaddd ymm4, ymm4, ymm6
|
||||
vpaddd ymm5, ymm5, ymm1
|
||||
vpaddd ymm4, ymm4, ymm1
|
||||
vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CbEL
|
||||
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbEH
|
||||
vpackssdw ymm5, ymm5, ymm4 ; ymm5=CbE
|
||||
|
||||
; Recombine: the odd results move to the high byte of each word and the even
; results stay in the low byte, which puts the bytes back in pixel order
; (assumes each result fits in one byte -- TODO confirm).
; NOTE(review): a full YMM register is stored even on the partial-width
; pass; this presumably relies on padded output rows -- confirm.
vpsllw ymm7, ymm7, BYTE_BIT
|
||||
vpor ymm5, ymm5, ymm7 ; ymm5=Cb
|
||||
vmovdqu YMMWORD [rbx], ymm5 ; Save Cb
|
||||
|
||||
vmovdqa ymm0, YMMWORD [wk(3)] ; ymm0=BO
|
||||
vmovdqa ymm6, YMMWORD [wk(2)] ; ymm6=BE
|
||||
vmovdqa ymm1, YMMWORD [wk(1)] ; ymm1=RO
|
||||
|
||||
; --- Odd pixels: B/G terms of Y and of Cr ---
vmovdqa ymm4, ymm0
|
||||
vpunpcklwd ymm0, ymm0, ymm3
|
||||
vpunpckhwd ymm4, ymm4, ymm3
|
||||
vmovdqa ymm7, ymm0
|
||||
vmovdqa ymm5, ymm4
|
||||
vpmaddwd ymm0, ymm0, [rel PW_F0114_F0250] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
||||
vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
||||
vpmaddwd ymm7, ymm7, [rel PW_MF008_MF041] ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
|
||||
vpmaddwd ymm5, ymm5, [rel PW_MF008_MF041] ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
|
||||
|
||||
vmovdqa ymm3, [rel PD_ONEHALF] ; ymm3=[PD_ONEHALF]
|
||||
|
||||
; odd Y = saved R/G part (wk(4)/wk(5)) + B/G part + rounding constant
vpaddd ymm0, ymm0, YMMWORD [wk(4)]
|
||||
vpaddd ymm4, ymm4, YMMWORD [wk(5)]
|
||||
vpaddd ymm0, ymm0, ymm3
|
||||
vpaddd ymm4, ymm4, ymm3
|
||||
vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
|
||||
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
|
||||
vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
|
||||
|
||||
; --- Odd pixels: R term of Cr (same zero-interleave and shift as for B) ---
vpxor ymm3, ymm3, ymm3
|
||||
vpxor ymm4, ymm4, ymm4
|
||||
vpunpcklwd ymm3, ymm3, ymm1 ; ymm3=ROL
|
||||
vpunpckhwd ymm4, ymm4, ymm1 ; ymm4=ROH
|
||||
vpsrld ymm3, ymm3, 1 ; ymm3=ROL*FIX(0.500)
|
||||
vpsrld ymm4, ymm4, 1 ; ymm4=ROH*FIX(0.500)
|
||||
|
||||
vmovdqa ymm1, [rel PD_ONEHALFM1_CJ] ; ymm1=[PD_ONEHALFM1_CJ]
|
||||
|
||||
vpaddd ymm7, ymm7, ymm3
|
||||
vpaddd ymm5, ymm5, ymm4
|
||||
vpaddd ymm7, ymm7, ymm1
|
||||
vpaddd ymm5, ymm5, ymm1
|
||||
vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CrOL
|
||||
vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrOH
|
||||
vpackssdw ymm7, ymm7, ymm5 ; ymm7=CrO
|
||||
|
||||
vmovdqa ymm3, YMMWORD [wk(0)] ; ymm3=RE
|
||||
|
||||
; --- Even pixels: B/G terms of Y and of Cr ---
vmovdqa ymm4, ymm6
|
||||
vpunpcklwd ymm6, ymm6, ymm2
|
||||
vpunpckhwd ymm4, ymm4, ymm2
|
||||
vmovdqa ymm1, ymm6
|
||||
vmovdqa ymm5, ymm4
|
||||
vpmaddwd ymm6, ymm6, [rel PW_F0114_F0250] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
||||
vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
||||
vpmaddwd ymm1, ymm1, [rel PW_MF008_MF041] ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
|
||||
vpmaddwd ymm5, ymm5, [rel PW_MF008_MF041] ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
|
||||
|
||||
vmovdqa ymm2, [rel PD_ONEHALF] ; ymm2=[PD_ONEHALF]
|
||||
|
||||
; even Y = saved R/G part (wk(6)/wk(7)) + B/G part + rounding constant
vpaddd ymm6, ymm6, YMMWORD [wk(6)]
|
||||
vpaddd ymm4, ymm4, YMMWORD [wk(7)]
|
||||
vpaddd ymm6, ymm6, ymm2
|
||||
vpaddd ymm4, ymm4, ymm2
|
||||
vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
|
||||
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
|
||||
vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
|
||||
|
||||
; recombine odd (high byte) and even (low byte) Y and store
vpsllw ymm0, ymm0, BYTE_BIT
|
||||
vpor ymm6, ymm6, ymm0 ; ymm6=Y
|
||||
vmovdqu YMMWORD [rdi], ymm6 ; Save Y
|
||||
|
||||
; --- Even pixels: R term of Cr ---
vpxor ymm2, ymm2, ymm2
|
||||
vpxor ymm4, ymm4, ymm4
|
||||
vpunpcklwd ymm2, ymm2, ymm3 ; ymm2=REL
|
||||
vpunpckhwd ymm4, ymm4, ymm3 ; ymm4=REH
|
||||
vpsrld ymm2, ymm2, 1 ; ymm2=REL*FIX(0.500)
|
||||
vpsrld ymm4, ymm4, 1 ; ymm4=REH*FIX(0.500)
|
||||
|
||||
vmovdqa ymm0, [rel PD_ONEHALFM1_CJ] ; ymm0=[PD_ONEHALFM1_CJ]
|
||||
|
||||
vpaddd ymm1, ymm1, ymm2
|
||||
vpaddd ymm5, ymm5, ymm4
|
||||
vpaddd ymm1, ymm1, ymm0
|
||||
vpaddd ymm5, ymm5, ymm0
|
||||
vpsrld ymm1, ymm1, SCALEBITS ; ymm1=CrEL
|
||||
vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrEH
|
||||
vpackssdw ymm1, ymm1, ymm5 ; ymm1=CrE
|
||||
|
||||
; recombine odd (high byte) and even (low byte) Cr and store
vpsllw ymm7, ymm7, BYTE_BIT
|
||||
vpor ymm1, ymm1, ymm7 ; ymm1=Cr
|
||||
vmovdqu YMMWORD [rdx], ymm1 ; Save Cr
|
||||
|
||||
; Column-loop tail: one vector of pixels has been converted and stored.
sub rcx, byte SIZEOF_YMMWORD ; pixels still to do in this row
|
||||
add rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
|
||||
add rdi, byte SIZEOF_YMMWORD ; outptr0
|
||||
add rbx, byte SIZEOF_YMMWORD ; outptr1
|
||||
add rdx, byte SIZEOF_YMMWORD ; outptr2
|
||||
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jae near .columnloop ; another full vector is available
|
||||
test rcx, rcx
|
||||
jnz near .column_ld1 ; a partial vector remains: use the piecewise loader
|
||||
|
||||
; Row-loop tail: restore the per-row state saved at .rowloop (reverse order
; of the pushes) and step every row-pointer array to its next entry.
pop rcx ; col
|
||||
pop rsi
|
||||
pop rdi
|
||||
pop rbx
|
||||
pop rdx
|
||||
|
||||
add rsi, byte SIZEOF_JSAMPROW ; input_buf
|
||||
add rdi, byte SIZEOF_JSAMPROW
|
||||
add rbx, byte SIZEOF_JSAMPROW
|
||||
add rdx, byte SIZEOF_JSAMPROW
|
||||
dec rax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
; Epilogue: undo the prologue in reverse order.
.return:
|
||||
pop rbx
|
||||
vzeroupper ; clear the upper YMM halves before returning to the caller
|
||||
; uncollect_args is a macro defined outside this file; presumably the
; counterpart of collect_args in the prologue -- TODO confirm.
uncollect_args 5
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- value stored at [rbp] by the prologue (address of the saved rbp)
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
483
TMessagesProj/jni/mozjpeg/simd/x86_64/jccolext-sse2.asm
Normal file
483
TMessagesProj/jni/mozjpeg/simd/x86_64/jccolext-sse2.asm
Normal file
|
|
@ -0,0 +1,483 @@
|
|||
;
|
||||
; jccolext.asm - colorspace conversion (64-bit SSE2)
|
||||
;
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
|
||||
; JSAMPIMAGE output_buf, JDIMENSION output_row,
|
||||
; int num_rows);
|
||||
;
|
||||
|
||||
; r10d = JDIMENSION img_width
|
||||
; r11 = JSAMPARRAY input_buf
|
||||
; r12 = JSAMPIMAGE output_buf
|
||||
; r13d = JDIMENSION output_row
|
||||
; r14d = int num_rows
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 8
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
|
||||
|
||||
EXTN(jsimd_rgb_ycc_convert_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args 5
|
||||
push rbx
|
||||
|
||||
mov ecx, r10d
|
||||
test rcx, rcx
|
||||
jz near .return
|
||||
|
||||
push rcx
|
||||
|
||||
mov rsi, r12
|
||||
mov ecx, r13d
|
||||
mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
|
||||
mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
|
||||
mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
|
||||
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
|
||||
lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
|
||||
lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
|
||||
|
||||
pop rcx
|
||||
|
||||
mov rsi, r11
|
||||
mov eax, r14d
|
||||
test rax, rax
|
||||
jle near .return
|
||||
.rowloop:
|
||||
push rdx
|
||||
push rbx
|
||||
push rdi
|
||||
push rsi
|
||||
push rcx ; col
|
||||
|
||||
mov rsi, JSAMPROW [rsi] ; inptr
|
||||
mov rdi, JSAMPROW [rdi] ; outptr0
|
||||
mov rbx, JSAMPROW [rbx] ; outptr1
|
||||
mov rdx, JSAMPROW [rdx] ; outptr2
|
||||
|
||||
cmp rcx, byte SIZEOF_XMMWORD
|
||||
jae near .columnloop
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
.column_ld1:
|
||||
push rax
|
||||
push rdx
|
||||
lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
|
||||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub rcx, byte SIZEOF_BYTE
|
||||
movzx rax, byte [rsi+rcx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub rcx, byte SIZEOF_WORD
|
||||
movzx rdx, word [rsi+rcx]
|
||||
shl rax, WORD_BIT
|
||||
or rax, rdx
|
||||
.column_ld4:
|
||||
movd xmmA, eax
|
||||
pop rdx
|
||||
pop rax
|
||||
test cl, SIZEOF_DWORD
|
||||
jz short .column_ld8
|
||||
sub rcx, byte SIZEOF_DWORD
|
||||
movd xmmF, XMM_DWORD [rsi+rcx]
|
||||
pslldq xmmA, SIZEOF_DWORD
|
||||
por xmmA, xmmF
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_MMWORD
|
||||
jz short .column_ld16
|
||||
sub rcx, byte SIZEOF_MMWORD
|
||||
movq xmmB, XMM_MMWORD [rsi+rcx]
|
||||
pslldq xmmA, SIZEOF_MMWORD
|
||||
por xmmA, xmmB
|
||||
.column_ld16:
|
||||
test cl, SIZEOF_XMMWORD
|
||||
jz short .column_ld32
|
||||
movdqa xmmF, xmmA
|
||||
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
mov rcx, SIZEOF_XMMWORD
|
||||
jmp short .rgb_ycc_cnv
|
||||
.column_ld32:
|
||||
test cl, 2*SIZEOF_XMMWORD
|
||||
mov rcx, SIZEOF_XMMWORD
|
||||
jz short .rgb_ycc_cnv
|
||||
movdqa xmmB, xmmA
|
||||
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||
jmp short .rgb_ycc_cnv
|
||||
|
||||
.columnloop:
|
||||
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||
movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
|
||||
|
||||
.rgb_ycc_cnv:
|
||||
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
|
||||
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
||||
|
||||
movdqa xmmG, xmmA
|
||||
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
|
||||
psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
|
||||
|
||||
punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
|
||||
pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
|
||||
|
||||
punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
|
||||
punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
|
||||
|
||||
movdqa xmmD, xmmA
|
||||
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
|
||||
psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
|
||||
|
||||
punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
|
||||
pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
|
||||
|
||||
punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
|
||||
punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
|
||||
|
||||
movdqa xmmE, xmmA
|
||||
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
|
||||
psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
|
||||
|
||||
punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
||||
pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
|
||||
|
||||
punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
|
||||
punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
|
||||
|
||||
pxor xmmH, xmmH
|
||||
|
||||
movdqa xmmC, xmmA
|
||||
punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
||||
punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
||||
|
||||
movdqa xmmB, xmmE
|
||||
punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
||||
punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
||||
|
||||
movdqa xmmF, xmmD
|
||||
punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
||||
punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
.column_ld1:
|
||||
test cl, SIZEOF_XMMWORD/16
|
||||
jz short .column_ld2
|
||||
sub rcx, byte SIZEOF_XMMWORD/16
|
||||
movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_XMMWORD/8
|
||||
jz short .column_ld4
|
||||
sub rcx, byte SIZEOF_XMMWORD/8
|
||||
movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||
pslldq xmmA, SIZEOF_MMWORD
|
||||
por xmmA, xmmE
|
||||
.column_ld4:
|
||||
test cl, SIZEOF_XMMWORD/4
|
||||
jz short .column_ld8
|
||||
sub rcx, byte SIZEOF_XMMWORD/4
|
||||
movdqa xmmE, xmmA
|
||||
movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_XMMWORD/2
|
||||
mov rcx, SIZEOF_XMMWORD
|
||||
jz short .rgb_ycc_cnv
|
||||
movdqa xmmF, xmmA
|
||||
movdqa xmmH, xmmE
|
||||
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||
jmp short .rgb_ycc_cnv
|
||||
|
||||
.columnloop:
|
||||
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||
movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
|
||||
movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
|
||||
|
||||
.rgb_ycc_cnv:
|
||||
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
|
||||
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
||||
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||
|
||||
movdqa xmmD, xmmA
|
||||
punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
|
||||
punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
|
||||
|
||||
movdqa xmmC, xmmF
|
||||
punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
|
||||
punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
|
||||
|
||||
movdqa xmmB, xmmA
|
||||
punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
|
||||
punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
|
||||
|
||||
movdqa xmmG, xmmD
|
||||
punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
|
||||
punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
|
||||
|
||||
movdqa xmmE, xmmA
|
||||
punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
||||
punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
|
||||
|
||||
movdqa xmmH, xmmB
|
||||
punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
|
||||
punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
|
||||
|
||||
pxor xmmF, xmmF
|
||||
|
||||
movdqa xmmC, xmmA
|
||||
punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
||||
punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
||||
|
||||
movdqa xmmD, xmmB
|
||||
punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
||||
punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
||||
|
||||
movdqa xmmG, xmmE
|
||||
punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
||||
punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
|
||||
|
||||
punpcklbw xmmF, xmmH
|
||||
punpckhbw xmmH, xmmH
|
||||
psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
||||
psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
|
||||
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
|
||||
|
||||
; (Original)
|
||||
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||
;
|
||||
; (This implementation)
|
||||
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
|
||||
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
|
||||
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
|
||||
movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
|
||||
|
||||
movdqa xmm6, xmm1
|
||||
punpcklwd xmm1, xmm3
|
||||
punpckhwd xmm6, xmm3
|
||||
movdqa xmm7, xmm1
|
||||
movdqa xmm4, xmm6
|
||||
pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
pmaddwd xmm7, [rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
|
||||
pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
|
||||
|
||||
movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
pxor xmm1, xmm1
|
||||
pxor xmm6, xmm6
|
||||
punpcklwd xmm1, xmm5 ; xmm1=BOL
|
||||
punpckhwd xmm6, xmm5 ; xmm6=BOH
|
||||
psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
|
||||
psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
|
||||
|
||||
movdqa xmm5, [rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd xmm7, xmm1
|
||||
paddd xmm4, xmm6
|
||||
paddd xmm7, xmm5
|
||||
paddd xmm4, xmm5
|
||||
psrld xmm7, SCALEBITS ; xmm7=CbOL
|
||||
psrld xmm4, SCALEBITS ; xmm4=CbOH
|
||||
packssdw xmm7, xmm4 ; xmm7=CbO
|
||||
|
||||
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
|
||||
|
||||
movdqa xmm6, xmm0
|
||||
punpcklwd xmm0, xmm2
|
||||
punpckhwd xmm6, xmm2
|
||||
movdqa xmm5, xmm0
|
||||
movdqa xmm4, xmm6
|
||||
pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
pmaddwd xmm5, [rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
|
||||
pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
|
||||
|
||||
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
pxor xmm0, xmm0
|
||||
pxor xmm6, xmm6
|
||||
punpcklwd xmm0, xmm1 ; xmm0=BEL
|
||||
punpckhwd xmm6, xmm1 ; xmm6=BEH
|
||||
psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
|
||||
psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
|
||||
|
||||
movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd xmm5, xmm0
|
||||
paddd xmm4, xmm6
|
||||
paddd xmm5, xmm1
|
||||
paddd xmm4, xmm1
|
||||
psrld xmm5, SCALEBITS ; xmm5=CbEL
|
||||
psrld xmm4, SCALEBITS ; xmm4=CbEH
|
||||
packssdw xmm5, xmm4 ; xmm5=CbE
|
||||
|
||||
psllw xmm7, BYTE_BIT
|
||||
por xmm5, xmm7 ; xmm5=Cb
|
||||
movdqa XMMWORD [rbx], xmm5 ; Save Cb
|
||||
|
||||
movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
|
||||
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
|
||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
|
||||
|
||||
movdqa xmm4, xmm0
|
||||
punpcklwd xmm0, xmm3
|
||||
punpckhwd xmm4, xmm3
|
||||
movdqa xmm7, xmm0
|
||||
movdqa xmm5, xmm4
|
||||
pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
||||
pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
||||
pmaddwd xmm7, [rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
|
||||
pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
|
||||
|
||||
movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
|
||||
|
||||
paddd xmm0, XMMWORD [wk(4)]
|
||||
paddd xmm4, XMMWORD [wk(5)]
|
||||
paddd xmm0, xmm3
|
||||
paddd xmm4, xmm3
|
||||
psrld xmm0, SCALEBITS ; xmm0=YOL
|
||||
psrld xmm4, SCALEBITS ; xmm4=YOH
|
||||
packssdw xmm0, xmm4 ; xmm0=YO
|
||||
|
||||
pxor xmm3, xmm3
|
||||
pxor xmm4, xmm4
|
||||
punpcklwd xmm3, xmm1 ; xmm3=ROL
|
||||
punpckhwd xmm4, xmm1 ; xmm4=ROH
|
||||
psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
|
||||
psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
|
||||
|
||||
movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd xmm7, xmm3
|
||||
paddd xmm5, xmm4
|
||||
paddd xmm7, xmm1
|
||||
paddd xmm5, xmm1
|
||||
psrld xmm7, SCALEBITS ; xmm7=CrOL
|
||||
psrld xmm5, SCALEBITS ; xmm5=CrOH
|
||||
packssdw xmm7, xmm5 ; xmm7=CrO
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
|
||||
|
||||
movdqa xmm4, xmm6
|
||||
punpcklwd xmm6, xmm2
|
||||
punpckhwd xmm4, xmm2
|
||||
movdqa xmm1, xmm6
|
||||
movdqa xmm5, xmm4
|
||||
pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
||||
pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
||||
pmaddwd xmm1, [rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
|
||||
pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
|
||||
|
||||
movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
|
||||
|
||||
paddd xmm6, XMMWORD [wk(6)]
|
||||
paddd xmm4, XMMWORD [wk(7)]
|
||||
paddd xmm6, xmm2
|
||||
paddd xmm4, xmm2
|
||||
psrld xmm6, SCALEBITS ; xmm6=YEL
|
||||
psrld xmm4, SCALEBITS ; xmm4=YEH
|
||||
packssdw xmm6, xmm4 ; xmm6=YE
|
||||
|
||||
psllw xmm0, BYTE_BIT
|
||||
por xmm6, xmm0 ; xmm6=Y
|
||||
movdqa XMMWORD [rdi], xmm6 ; Save Y
|
||||
|
||||
pxor xmm2, xmm2
|
||||
pxor xmm4, xmm4
|
||||
punpcklwd xmm2, xmm3 ; xmm2=REL
|
||||
punpckhwd xmm4, xmm3 ; xmm4=REH
|
||||
psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
|
||||
psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
|
||||
|
||||
movdqa xmm0, [rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd xmm1, xmm2
|
||||
paddd xmm5, xmm4
|
||||
paddd xmm1, xmm0
|
||||
paddd xmm5, xmm0
|
||||
psrld xmm1, SCALEBITS ; xmm1=CrEL
|
||||
psrld xmm5, SCALEBITS ; xmm5=CrEH
|
||||
packssdw xmm1, xmm5 ; xmm1=CrE
|
||||
|
||||
psllw xmm7, BYTE_BIT
|
||||
por xmm1, xmm7 ; xmm1=Cr
|
||||
movdqa XMMWORD [rdx], xmm1 ; Save Cr
|
||||
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr0
|
||||
add rbx, byte SIZEOF_XMMWORD ; outptr1
|
||||
add rdx, byte SIZEOF_XMMWORD ; outptr2
|
||||
cmp rcx, byte SIZEOF_XMMWORD
|
||||
jae near .columnloop
|
||||
test rcx, rcx
|
||||
jnz near .column_ld1
|
||||
|
||||
pop rcx ; col
|
||||
pop rsi
|
||||
pop rdi
|
||||
pop rbx
|
||||
pop rdx
|
||||
|
||||
add rsi, byte SIZEOF_JSAMPROW ; input_buf
|
||||
add rdi, byte SIZEOF_JSAMPROW
|
||||
add rbx, byte SIZEOF_JSAMPROW
|
||||
add rdx, byte SIZEOF_JSAMPROW
|
||||
dec rax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
pop rbx
|
||||
uncollect_args 5
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
121
TMessagesProj/jni/mozjpeg/simd/x86_64/jccolor-avx2.asm
Normal file
121
TMessagesProj/jni/mozjpeg/simd/x86_64/jccolor-avx2.asm
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
;
|
||||
; jccolor-avx2.asm - colorspace conversion (64-bit AVX2)
|
||||
;
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point precision: the F_* coefficients below carry SCALEBITS
; fractional bits.
%define SCALEBITS 16
|
||||
; F_a_bcd = FIX(a.bcd..) = the coefficient times 2^SCALEBITS (65536), rounded.
|
||||
F_0_081 equ 5329 ; FIX(0.08131)
|
||||
F_0_114 equ 7471 ; FIX(0.11400)
|
||||
F_0_168 equ 11059 ; FIX(0.16874)
|
||||
F_0_250 equ 16384 ; FIX(0.25000)
|
||||
F_0_299 equ 19595 ; FIX(0.29900)
|
||||
F_0_331 equ 21709 ; FIX(0.33126)
|
||||
F_0_418 equ 27439 ; FIX(0.41869)
|
||||
F_0_587 equ 38470 ; FIX(0.58700)
|
||||
; F_0_587 (38470) is larger than 32767, the largest signed 16-bit value, which
; is presumably why the green weight is split into F_0_337 + F_0_250
; (22086 + 16384) before being stored as `dw` words in the tables below.
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant tables labelled jconst_rgb_ycc_convert_avx2.
; SEG_CONST, GLOBAL_DATA and EXTN are macros defined outside this file
; (presumably in jsimdext.inc -- confirm there for section/export details).
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
|
||||
|
||||
EXTN(jconst_rgb_ycc_convert_avx2):
|
||||
; Each PW_* row repeats a pair of 16-bit coefficients 8 times (16 words =
; 32 bytes); each PD_* row repeats one 32-bit value 8 times (32 bytes).
; PW_F0299_F0337 / PW_F0114_F0250 hold the positive luma weights;
; PW_MF016_MF033 / PW_MF008_MF041 hold negated weights (note the minus signs).
; PD_ONEHALF is 2^(SCALEBITS-1), i.e. 0.5 in this fixed-point format;
; PD_ONEHALFM1_CJ is (2^(SCALEBITS-1) - 1) + (CENTERJSAMPLE << SCALEBITS).
|
||||
PW_F0299_F0337 times 8 dw F_0_299, F_0_337
|
||||
PW_F0114_F0250 times 8 dw F_0_114, F_0_250
|
||||
PW_MF016_MF033 times 8 dw -F_0_168, -F_0_331
|
||||
PW_MF008_MF041 times 8 dw -F_0_081, -F_0_418
|
||||
PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \
|
||||
(CENTERJSAMPLE << SCALEBITS)
|
||||
PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Code section. The routine body lives in the template jccolext-avx2.asm,
; which is included seven times below. Before each re-include the RGB_*
; layout macros are redefined and the template's entry-point name
; (jsimd_rgb_ycc_convert_avx2) is remapped by %define to a per-layout name.
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
; Instance 1: RGB_* macros keep whatever values they have on entry (their
; definitions are not in this file); the entry point keeps its default name.
|
||||
%include "jccolext-avx2.asm"
|
||||
; Instance 2: EXT_RGB_* layout -> jsimd_extrgb_ycc_convert_avx2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGB_RED
|
||||
%define RGB_GREEN EXT_RGB_GREEN
|
||||
%define RGB_BLUE EXT_RGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgb_ycc_convert_avx2
|
||||
%include "jccolext-avx2.asm"
|
||||
; Instance 3: EXT_RGBX_* layout -> jsimd_extrgbx_ycc_convert_avx2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGBX_RED
|
||||
%define RGB_GREEN EXT_RGBX_GREEN
|
||||
%define RGB_BLUE EXT_RGBX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgbx_ycc_convert_avx2
|
||||
%include "jccolext-avx2.asm"
|
||||
; Instance 4: EXT_BGR_* layout -> jsimd_extbgr_ycc_convert_avx2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGR_RED
|
||||
%define RGB_GREEN EXT_BGR_GREEN
|
||||
%define RGB_BLUE EXT_BGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgr_ycc_convert_avx2
|
||||
%include "jccolext-avx2.asm"
|
||||
; Instance 5: EXT_BGRX_* layout -> jsimd_extbgrx_ycc_convert_avx2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGRX_RED
|
||||
%define RGB_GREEN EXT_BGRX_GREEN
|
||||
%define RGB_BLUE EXT_BGRX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgrx_ycc_convert_avx2
|
||||
%include "jccolext-avx2.asm"
|
||||
; Instance 6: EXT_XBGR_* layout -> jsimd_extxbgr_ycc_convert_avx2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XBGR_RED
|
||||
%define RGB_GREEN EXT_XBGR_GREEN
|
||||
%define RGB_BLUE EXT_XBGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_avx2 jsimd_extxbgr_ycc_convert_avx2
|
||||
%include "jccolext-avx2.asm"
|
||||
; Instance 7: EXT_XRGB_* layout -> jsimd_extxrgb_ycc_convert_avx2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XRGB_RED
|
||||
%define RGB_GREEN EXT_XRGB_GREEN
|
||||
%define RGB_BLUE EXT_XRGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_avx2 jsimd_extxrgb_ycc_convert_avx2
|
||||
%include "jccolext-avx2.asm"
|
||||
120
TMessagesProj/jni/mozjpeg/simd/x86_64/jccolor-sse2.asm
Normal file
120
TMessagesProj/jni/mozjpeg/simd/x86_64/jccolor-sse2.asm
Normal file
|
|
@ -0,0 +1,120 @@
|
|||
;
|
||||
; jccolor-sse2.asm - colorspace conversion (64-bit SSE2)
|
||||
;
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point precision: the F_* coefficients below carry SCALEBITS
; fractional bits.
%define SCALEBITS 16
|
||||
; F_a_bcd = FIX(a.bcd..) = the coefficient times 2^SCALEBITS (65536), rounded.
|
||||
F_0_081 equ 5329 ; FIX(0.08131)
|
||||
F_0_114 equ 7471 ; FIX(0.11400)
|
||||
F_0_168 equ 11059 ; FIX(0.16874)
|
||||
F_0_250 equ 16384 ; FIX(0.25000)
|
||||
F_0_299 equ 19595 ; FIX(0.29900)
|
||||
F_0_331 equ 21709 ; FIX(0.33126)
|
||||
F_0_418 equ 27439 ; FIX(0.41869)
|
||||
F_0_587 equ 38470 ; FIX(0.58700)
|
||||
; F_0_587 (38470) is larger than 32767, the largest signed 16-bit value, which
; is presumably why the green weight is split into F_0_337 + F_0_250
; (22086 + 16384) before being stored as `dw` words in the tables below.
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant tables labelled jconst_rgb_ycc_convert_sse2.
; SEG_CONST, GLOBAL_DATA and EXTN are macros defined outside this file
; (presumably in jsimdext.inc -- confirm there for section/export details).
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
|
||||
|
||||
EXTN(jconst_rgb_ycc_convert_sse2):
|
||||
; Each PW_* row repeats a pair of 16-bit coefficients 4 times (8 words =
; 16 bytes); each PD_* row repeats one 32-bit value 4 times (16 bytes).
; PW_F0299_F0337 / PW_F0114_F0250 hold the positive luma weights;
; PW_MF016_MF033 / PW_MF008_MF041 hold negated weights (note the minus signs).
; PD_ONEHALF is 2^(SCALEBITS-1), i.e. 0.5 in this fixed-point format;
; PD_ONEHALFM1_CJ is (2^(SCALEBITS-1) - 1) + (CENTERJSAMPLE << SCALEBITS).
|
||||
PW_F0299_F0337 times 4 dw F_0_299, F_0_337
|
||||
PW_F0114_F0250 times 4 dw F_0_114, F_0_250
|
||||
PW_MF016_MF033 times 4 dw -F_0_168, -F_0_331
|
||||
PW_MF008_MF041 times 4 dw -F_0_081, -F_0_418
|
||||
PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS - 1)) - 1 + \
|
||||
(CENTERJSAMPLE << SCALEBITS)
|
||||
PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1))
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Code section. The routine body lives in the template jccolext-sse2.asm,
; which is included seven times below. Before each re-include the RGB_*
; layout macros are redefined and the identifier jsimd_rgb_ycc_convert_sse2
; (presumably the template's entry-point name -- confirm in the template) is
; remapped by %define to a per-layout name.
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
; Instance 1: RGB_* macros keep whatever values they have on entry (their
; definitions are not in this file); no renaming is applied.
|
||||
%include "jccolext-sse2.asm"
|
||||
; Instance 2: EXT_RGB_* layout -> jsimd_extrgb_ycc_convert_sse2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGB_RED
|
||||
%define RGB_GREEN EXT_RGB_GREEN
|
||||
%define RGB_BLUE EXT_RGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
|
||||
%include "jccolext-sse2.asm"
|
||||
; Instance 3: EXT_RGBX_* layout -> jsimd_extrgbx_ycc_convert_sse2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGBX_RED
|
||||
%define RGB_GREEN EXT_RGBX_GREEN
|
||||
%define RGB_BLUE EXT_RGBX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
|
||||
%include "jccolext-sse2.asm"
|
||||
; Instance 4: EXT_BGR_* layout -> jsimd_extbgr_ycc_convert_sse2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGR_RED
|
||||
%define RGB_GREEN EXT_BGR_GREEN
|
||||
%define RGB_BLUE EXT_BGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
|
||||
%include "jccolext-sse2.asm"
|
||||
; Instance 5: EXT_BGRX_* layout -> jsimd_extbgrx_ycc_convert_sse2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGRX_RED
|
||||
%define RGB_GREEN EXT_BGRX_GREEN
|
||||
%define RGB_BLUE EXT_BGRX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
|
||||
%include "jccolext-sse2.asm"
|
||||
; Instance 6: EXT_XBGR_* layout -> jsimd_extxbgr_ycc_convert_sse2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XBGR_RED
|
||||
%define RGB_GREEN EXT_XBGR_GREEN
|
||||
%define RGB_BLUE EXT_XBGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
|
||||
%include "jccolext-sse2.asm"
|
||||
; Instance 7: EXT_XRGB_* layout -> jsimd_extxrgb_ycc_convert_sse2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XRGB_RED
|
||||
%define RGB_GREEN EXT_XRGB_GREEN
|
||||
%define RGB_BLUE EXT_XRGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
|
||||
%include "jccolext-sse2.asm"
|
||||
113
TMessagesProj/jni/mozjpeg/simd/x86_64/jcgray-avx2.asm
Normal file
113
TMessagesProj/jni/mozjpeg/simd/x86_64/jcgray-avx2.asm
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
;
|
||||
; jcgray-avx2.asm - grayscale colorspace conversion (64-bit AVX2)
|
||||
;
|
||||
; Copyright (C) 2011, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point precision: the F_* coefficients below carry SCALEBITS
; fractional bits.
%define SCALEBITS 16
|
||||
; F_a_bcd = FIX(a.bcd..) = the coefficient times 2^SCALEBITS (65536), rounded.
|
||||
F_0_114 equ 7471 ; FIX(0.11400)
|
||||
F_0_250 equ 16384 ; FIX(0.25000)
|
||||
F_0_299 equ 19595 ; FIX(0.29900)
|
||||
F_0_587 equ 38470 ; FIX(0.58700)
|
||||
; F_0_587 (38470) is larger than 32767, the largest signed 16-bit value, which
; is presumably why the green weight is split into F_0_337 + F_0_250
; (22086 + 16384) before being stored as `dw` words in the tables below.
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant tables labelled jconst_rgb_gray_convert_avx2.
; SEG_CONST, GLOBAL_DATA and EXTN are macros defined outside this file
; (presumably in jsimdext.inc -- confirm there for section/export details).
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
|
||||
|
||||
EXTN(jconst_rgb_gray_convert_avx2):
|
||||
; Each PW_* row repeats a pair of 16-bit luma weights 8 times (16 words =
; 32 bytes); PD_ONEHALF repeats 2^(SCALEBITS-1), i.e. 0.5 in this fixed-point
; format, 8 times as 32-bit values (32 bytes).
|
||||
PW_F0299_F0337 times 8 dw F_0_299, F_0_337
|
||||
PW_F0114_F0250 times 8 dw F_0_114, F_0_250
|
||||
PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Code section. The routine body lives in the template jcgryext-avx2.asm,
; which is included seven times below. Before each re-include the RGB_*
; layout macros are redefined and the template's entry-point name
; (jsimd_rgb_gray_convert_avx2) is remapped by %define to a per-layout name.
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
; Instance 1: RGB_* macros keep whatever values they have on entry (their
; definitions are not in this file); the entry point keeps its default name.
|
||||
%include "jcgryext-avx2.asm"
|
||||
; Instance 2: EXT_RGB_* layout -> jsimd_extrgb_gray_convert_avx2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGB_RED
|
||||
%define RGB_GREEN EXT_RGB_GREEN
|
||||
%define RGB_BLUE EXT_RGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
%define jsimd_rgb_gray_convert_avx2 jsimd_extrgb_gray_convert_avx2
|
||||
%include "jcgryext-avx2.asm"
|
||||
; Instance 3: EXT_RGBX_* layout -> jsimd_extrgbx_gray_convert_avx2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGBX_RED
|
||||
%define RGB_GREEN EXT_RGBX_GREEN
|
||||
%define RGB_BLUE EXT_RGBX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
%define jsimd_rgb_gray_convert_avx2 jsimd_extrgbx_gray_convert_avx2
|
||||
%include "jcgryext-avx2.asm"
|
||||
; Instance 4: EXT_BGR_* layout -> jsimd_extbgr_gray_convert_avx2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGR_RED
|
||||
%define RGB_GREEN EXT_BGR_GREEN
|
||||
%define RGB_BLUE EXT_BGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
%define jsimd_rgb_gray_convert_avx2 jsimd_extbgr_gray_convert_avx2
|
||||
%include "jcgryext-avx2.asm"
|
||||
; Instance 5: EXT_BGRX_* layout -> jsimd_extbgrx_gray_convert_avx2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGRX_RED
|
||||
%define RGB_GREEN EXT_BGRX_GREEN
|
||||
%define RGB_BLUE EXT_BGRX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
%define jsimd_rgb_gray_convert_avx2 jsimd_extbgrx_gray_convert_avx2
|
||||
%include "jcgryext-avx2.asm"
|
||||
; Instance 6: EXT_XBGR_* layout -> jsimd_extxbgr_gray_convert_avx2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XBGR_RED
|
||||
%define RGB_GREEN EXT_XBGR_GREEN
|
||||
%define RGB_BLUE EXT_XBGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
%define jsimd_rgb_gray_convert_avx2 jsimd_extxbgr_gray_convert_avx2
|
||||
%include "jcgryext-avx2.asm"
|
||||
; Instance 7: EXT_XRGB_* layout -> jsimd_extxrgb_gray_convert_avx2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XRGB_RED
|
||||
%define RGB_GREEN EXT_XRGB_GREEN
|
||||
%define RGB_BLUE EXT_XRGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
%define jsimd_rgb_gray_convert_avx2 jsimd_extxrgb_gray_convert_avx2
|
||||
%include "jcgryext-avx2.asm"
|
||||
112
TMessagesProj/jni/mozjpeg/simd/x86_64/jcgray-sse2.asm
Normal file
112
TMessagesProj/jni/mozjpeg/simd/x86_64/jcgray-sse2.asm
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
;
|
||||
; jcgray-sse2.asm - grayscale colorspace conversion (64-bit SSE2)
|
||||
;
|
||||
; Copyright (C) 2011, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point precision: the F_* coefficients below carry SCALEBITS
; fractional bits.
%define SCALEBITS 16
|
||||
; F_a_bcd = FIX(a.bcd..) = the coefficient times 2^SCALEBITS (65536), rounded.
|
||||
F_0_114 equ 7471 ; FIX(0.11400)
|
||||
F_0_250 equ 16384 ; FIX(0.25000)
|
||||
F_0_299 equ 19595 ; FIX(0.29900)
|
||||
F_0_587 equ 38470 ; FIX(0.58700)
|
||||
; F_0_587 (38470) is larger than 32767, the largest signed 16-bit value, which
; is presumably why the green weight is split into F_0_337 + F_0_250
; (22086 + 16384) before being stored as `dw` words in the tables below.
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant tables labelled jconst_rgb_gray_convert_sse2.
; SEG_CONST, GLOBAL_DATA and EXTN are macros defined outside this file
; (presumably in jsimdext.inc -- confirm there for section/export details).
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
|
||||
|
||||
EXTN(jconst_rgb_gray_convert_sse2):
|
||||
; Each PW_* row repeats a pair of 16-bit luma weights 4 times (8 words =
; 16 bytes); PD_ONEHALF repeats 2^(SCALEBITS-1), i.e. 0.5 in this fixed-point
; format, 4 times as 32-bit values (16 bytes).
|
||||
PW_F0299_F0337 times 4 dw F_0_299, F_0_337
|
||||
PW_F0114_F0250 times 4 dw F_0_114, F_0_250
|
||||
PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1))
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Code section. The routine body lives in the template jcgryext-sse2.asm,
; which is included seven times below. Before each re-include the RGB_*
; layout macros are redefined and the identifier jsimd_rgb_gray_convert_sse2
; (presumably the template's entry-point name -- confirm in the template) is
; remapped by %define to a per-layout name.
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
; Instance 1: RGB_* macros keep whatever values they have on entry (their
; definitions are not in this file); no renaming is applied.
|
||||
%include "jcgryext-sse2.asm"
|
||||
; Instance 2: EXT_RGB_* layout -> jsimd_extrgb_gray_convert_sse2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGB_RED
|
||||
%define RGB_GREEN EXT_RGB_GREEN
|
||||
%define RGB_BLUE EXT_RGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
|
||||
%include "jcgryext-sse2.asm"
|
||||
; Instance 3: EXT_RGBX_* layout -> jsimd_extrgbx_gray_convert_sse2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGBX_RED
|
||||
%define RGB_GREEN EXT_RGBX_GREEN
|
||||
%define RGB_BLUE EXT_RGBX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
|
||||
%include "jcgryext-sse2.asm"
|
||||
; Instance 4: EXT_BGR_* layout -> jsimd_extbgr_gray_convert_sse2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGR_RED
|
||||
%define RGB_GREEN EXT_BGR_GREEN
|
||||
%define RGB_BLUE EXT_BGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
|
||||
%include "jcgryext-sse2.asm"
|
||||
; Instance 5: EXT_BGRX_* layout -> jsimd_extbgrx_gray_convert_sse2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGRX_RED
|
||||
%define RGB_GREEN EXT_BGRX_GREEN
|
||||
%define RGB_BLUE EXT_BGRX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
|
||||
%include "jcgryext-sse2.asm"
|
||||
; Instance 6: EXT_XBGR_* layout -> jsimd_extxbgr_gray_convert_sse2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XBGR_RED
|
||||
%define RGB_GREEN EXT_XBGR_GREEN
|
||||
%define RGB_BLUE EXT_XBGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
|
||||
%include "jcgryext-sse2.asm"
|
||||
; Instance 7: EXT_XRGB_* layout -> jsimd_extxrgb_gray_convert_sse2.
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XRGB_RED
|
||||
%define RGB_GREEN EXT_XRGB_GREEN
|
||||
%define RGB_BLUE EXT_XRGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
|
||||
%include "jcgryext-sse2.asm"
|
||||
437
TMessagesProj/jni/mozjpeg/simd/x86_64/jcgryext-avx2.asm
Normal file
437
TMessagesProj/jni/mozjpeg/simd/x86_64/jcgryext-avx2.asm
Normal file
|
|
@ -0,0 +1,437 @@
|
|||
;
|
||||
; jcgryext-avx2.asm - grayscale colorspace conversion (64-bit AVX2)
|
||||
;
|
||||
; Copyright (C) 2011, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
|
||||
; JSAMPIMAGE output_buf, JDIMENSION output_row,
|
||||
; int num_rows);
|
||||
;
|
||||
|
||||
; r10d = JDIMENSION img_width
|
||||
; r11 = JSAMPARRAY input_buf
|
||||
; r12 = JSAMPIMAGE output_buf
|
||||
; r13d = JDIMENSION output_row
|
||||
; r14d = int num_rows
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
|
||||
|
||||
EXTN(jsimd_rgb_gray_convert_avx2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args 5
|
||||
push rbx
|
||||
|
||||
mov ecx, r10d
|
||||
test rcx, rcx
|
||||
jz near .return
|
||||
|
||||
push rcx
|
||||
|
||||
mov rsi, r12
|
||||
mov ecx, r13d
|
||||
mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
|
||||
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
|
||||
|
||||
pop rcx
|
||||
|
||||
mov rsi, r11
|
||||
mov eax, r14d
|
||||
test rax, rax
|
||||
jle near .return
|
||||
.rowloop:
|
||||
push rdi
|
||||
push rsi
|
||||
push rcx ; col
|
||||
|
||||
mov rsi, JSAMPROW [rsi] ; inptr
|
||||
mov rdi, JSAMPROW [rdi] ; outptr0
|
||||
|
||||
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jae near .columnloop
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
.column_ld1:
|
||||
push rax
|
||||
push rdx
|
||||
lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
|
||||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub rcx, byte SIZEOF_BYTE
|
||||
movzx rax, byte [rsi+rcx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub rcx, byte SIZEOF_WORD
|
||||
movzx rdx, word [rsi+rcx]
|
||||
shl rax, WORD_BIT
|
||||
or rax, rdx
|
||||
.column_ld4:
|
||||
vmovd xmmA, eax
|
||||
pop rdx
|
||||
pop rax
|
||||
test cl, SIZEOF_DWORD
|
||||
jz short .column_ld8
|
||||
sub rcx, byte SIZEOF_DWORD
|
||||
vmovd xmmF, XMM_DWORD [rsi+rcx]
|
||||
vpslldq xmmA, xmmA, SIZEOF_DWORD
|
||||
vpor xmmA, xmmA, xmmF
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_MMWORD
|
||||
jz short .column_ld16
|
||||
sub rcx, byte SIZEOF_MMWORD
|
||||
vmovq xmmB, XMM_MMWORD [rsi+rcx]
|
||||
vpslldq xmmA, xmmA, SIZEOF_MMWORD
|
||||
vpor xmmA, xmmA, xmmB
|
||||
.column_ld16:
|
||||
test cl, SIZEOF_XMMWORD
|
||||
jz short .column_ld32
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
vmovdqu xmmB, XMM_MMWORD [rsi+rcx]
|
||||
vperm2i128 ymmA, ymmA, ymmA, 1
|
||||
vpor ymmA, ymmB
|
||||
.column_ld32:
|
||||
test cl, SIZEOF_YMMWORD
|
||||
jz short .column_ld64
|
||||
sub rcx, byte SIZEOF_YMMWORD
|
||||
vmovdqa ymmF, ymmA
|
||||
vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
|
||||
.column_ld64:
|
||||
test cl, 2*SIZEOF_YMMWORD
|
||||
mov rcx, SIZEOF_YMMWORD
|
||||
jz short .rgb_gray_cnv
|
||||
vmovdqa ymmB, ymmA
|
||||
vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
|
||||
jmp short .rgb_gray_cnv
|
||||
|
||||
.columnloop:
|
||||
vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
|
||||
|
||||
.rgb_gray_cnv:
|
||||
; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
|
||||
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
|
||||
; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
|
||||
|
||||
vmovdqu ymmC, ymmA
|
||||
vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
|
||||
vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
|
||||
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
|
||||
vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
|
||||
; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
|
||||
|
||||
vmovdqa ymmG, ymmA
|
||||
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
|
||||
; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
|
||||
vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
|
||||
; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
|
||||
|
||||
vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
|
||||
; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
|
||||
vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
|
||||
; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
|
||||
|
||||
vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
|
||||
; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
|
||||
vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
|
||||
; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
|
||||
|
||||
vmovdqa ymmD, ymmA
|
||||
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
|
||||
; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
|
||||
vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
|
||||
; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
|
||||
|
||||
vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
|
||||
; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
|
||||
vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
|
||||
; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
|
||||
|
||||
vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
|
||||
; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
|
||||
vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
|
||||
; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
|
||||
|
||||
vmovdqa ymmE, ymmA
|
||||
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
|
||||
; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
|
||||
vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
|
||||
; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
|
||||
|
||||
vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
|
||||
; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
|
||||
vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
|
||||
; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
|
||||
|
||||
vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
|
||||
; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
|
||||
vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
|
||||
; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
|
||||
|
||||
vpxor ymmH, ymmH, ymmH
|
||||
|
||||
vmovdqa ymmC, ymmA
|
||||
vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
|
||||
vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
|
||||
|
||||
vmovdqa ymmB, ymmE
|
||||
vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
|
||||
vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
|
||||
|
||||
vmovdqa ymmF, ymmD
|
||||
vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
|
||||
vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
.column_ld1:
|
||||
test cl, SIZEOF_XMMWORD/16
|
||||
jz short .column_ld2
|
||||
sub rcx, byte SIZEOF_XMMWORD/16
|
||||
vmovd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_XMMWORD/8
|
||||
jz short .column_ld4
|
||||
sub rcx, byte SIZEOF_XMMWORD/8
|
||||
vmovq xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||
vpslldq xmmA, xmmA, SIZEOF_MMWORD
|
||||
vpor xmmA, xmmA, xmmF
|
||||
.column_ld4:
|
||||
test cl, SIZEOF_XMMWORD/4
|
||||
jz short .column_ld8
|
||||
sub rcx, byte SIZEOF_XMMWORD/4
|
||||
vmovdqa xmmF, xmmA
|
||||
vperm2i128 ymmF, ymmF, ymmF, 1
|
||||
vmovdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||
vpor ymmA, ymmA, ymmF
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_XMMWORD/2
|
||||
jz short .column_ld16
|
||||
sub rcx, byte SIZEOF_XMMWORD/2
|
||||
vmovdqa ymmF, ymmA
|
||||
vmovdqu ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||
.column_ld16:
|
||||
test cl, SIZEOF_XMMWORD
|
||||
mov rcx, SIZEOF_YMMWORD
|
||||
jz short .rgb_gray_cnv
|
||||
vmovdqa ymmE, ymmA
|
||||
vmovdqa ymmH, ymmF
|
||||
vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
|
||||
jmp short .rgb_gray_cnv
|
||||
|
||||
.columnloop:
|
||||
vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
|
||||
vmovdqu ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
|
||||
|
||||
.rgb_gray_cnv:
|
||||
; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
|
||||
; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||
; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
|
||||
; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||
; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
|
||||
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
|
||||
; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
|
||||
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
|
||||
|
||||
vmovdqa ymmB, ymmA
|
||||
vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
|
||||
; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
|
||||
vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
|
||||
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
|
||||
|
||||
vmovdqa ymmB, ymmF
|
||||
vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
|
||||
; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
|
||||
vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
|
||||
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
|
||||
|
||||
vmovdqa ymmD, ymmA
|
||||
vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
|
||||
; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
|
||||
vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
|
||||
; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
|
||||
|
||||
vmovdqa ymmC, ymmF
|
||||
vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
|
||||
; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
|
||||
vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
|
||||
; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
|
||||
|
||||
vmovdqa ymmB, ymmA
|
||||
vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
|
||||
; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
|
||||
vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
|
||||
; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
|
||||
|
||||
vmovdqa ymmG, ymmD
|
||||
vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
|
||||
; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
|
||||
vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
|
||||
; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
|
||||
|
||||
vmovdqa ymmE, ymmA
|
||||
vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
|
||||
; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
|
||||
vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
|
||||
; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
|
||||
|
||||
vmovdqa ymmH, ymmB
|
||||
vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
|
||||
; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
|
||||
vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
|
||||
; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
|
||||
|
||||
vpxor ymmF, ymmF, ymmF
|
||||
|
||||
vmovdqa ymmC, ymmA
|
||||
vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
|
||||
vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
|
||||
|
||||
vmovdqa ymmD, ymmB
|
||||
vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
|
||||
vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
|
||||
|
||||
vmovdqa ymmG, ymmE
|
||||
vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
|
||||
vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
|
||||
|
||||
vpunpcklbw ymmF, ymmF, ymmH
|
||||
vpunpckhbw ymmH, ymmH, ymmH
|
||||
vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
|
||||
vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
|
||||
; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
|
||||
|
||||
; (Original)
|
||||
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
;
|
||||
; (This implementation)
|
||||
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||
|
||||
vmovdqa ymm6, ymm1
|
||||
vpunpcklwd ymm1, ymm1, ymm3
|
||||
vpunpckhwd ymm6, ymm6, ymm3
|
||||
vpmaddwd ymm1, ymm1, [rel PW_F0299_F0337] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
vmovdqa ymm7, ymm6 ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
vmovdqa ymm6, ymm0
|
||||
vpunpcklwd ymm0, ymm0, ymm2
|
||||
vpunpckhwd ymm6, ymm6, ymm2
|
||||
vpmaddwd ymm0, ymm0, [rel PW_F0299_F0337] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
vmovdqa YMMWORD [wk(1)], ymm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
vmovdqa ymm0, ymm5 ; ymm0=BO
|
||||
vmovdqa ymm6, ymm4 ; ymm6=BE
|
||||
|
||||
vmovdqa ymm4, ymm0
|
||||
vpunpcklwd ymm0, ymm0, ymm3
|
||||
vpunpckhwd ymm4, ymm4, ymm3
|
||||
vpmaddwd ymm0, ymm0, [rel PW_F0114_F0250] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
||||
vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
||||
|
||||
vmovdqa ymm3, [rel PD_ONEHALF] ; ymm3=[PD_ONEHALF]
|
||||
|
||||
vpaddd ymm0, ymm0, ymm1
|
||||
vpaddd ymm4, ymm4, ymm7
|
||||
vpaddd ymm0, ymm0, ymm3
|
||||
vpaddd ymm4, ymm4, ymm3
|
||||
vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
|
||||
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
|
||||
vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
|
||||
|
||||
vmovdqa ymm4, ymm6
|
||||
vpunpcklwd ymm6, ymm6, ymm2
|
||||
vpunpckhwd ymm4, ymm4, ymm2
|
||||
vpmaddwd ymm6, ymm6, [rel PW_F0114_F0250] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
||||
vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
||||
|
||||
vmovdqa ymm2, [rel PD_ONEHALF] ; ymm2=[PD_ONEHALF]
|
||||
|
||||
vpaddd ymm6, ymm6, YMMWORD [wk(0)]
|
||||
vpaddd ymm4, ymm4, YMMWORD [wk(1)]
|
||||
vpaddd ymm6, ymm6, ymm2
|
||||
vpaddd ymm4, ymm4, ymm2
|
||||
vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
|
||||
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
|
||||
vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
|
||||
|
||||
vpsllw ymm0, ymm0, BYTE_BIT
|
||||
vpor ymm6, ymm6, ymm0 ; ymm6=Y
|
||||
vmovdqu YMMWORD [rdi], ymm6 ; Save Y
|
||||
|
||||
sub rcx, byte SIZEOF_YMMWORD
|
||||
add rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
|
||||
add rdi, byte SIZEOF_YMMWORD ; outptr0
|
||||
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jae near .columnloop
|
||||
test rcx, rcx
|
||||
jnz near .column_ld1
|
||||
|
||||
pop rcx ; col
|
||||
pop rsi
|
||||
pop rdi
|
||||
|
||||
add rsi, byte SIZEOF_JSAMPROW ; input_buf
|
||||
add rdi, byte SIZEOF_JSAMPROW
|
||||
dec rax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
pop rbx
|
||||
vzeroupper
|
||||
uncollect_args 5
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
362
TMessagesProj/jni/mozjpeg/simd/x86_64/jcgryext-sse2.asm
Normal file
362
TMessagesProj/jni/mozjpeg/simd/x86_64/jcgryext-sse2.asm
Normal file
|
|
@ -0,0 +1,362 @@
|
|||
;
|
||||
; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
|
||||
;
|
||||
; Copyright (C) 2011, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
|
||||
; JSAMPIMAGE output_buf, JDIMENSION output_row,
|
||||
; int num_rows);
|
||||
;
|
||||
|
||||
; r10d = JDIMENSION img_width
|
||||
; r11 = JSAMPARRAY input_buf
|
||||
; r12 = JSAMPIMAGE output_buf
|
||||
; r13d = JDIMENSION output_row
|
||||
; r14d = int num_rows
|
||||
|
||||
; Scratch workspace: WK_NUM xmmwords located immediately below the aligned
; frame pointer.  wk(0) is the lowest slot; the prologue points rsp at it.
%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  2

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)

; Register roles once the arguments have been collected (see the argument
; list in the header comment above):
;   rcx = img_width; inside a row, the number of columns still to convert
;         (the 3-byte tail loader temporarily rescales it to bytes)
;   rsi = cursor into input_buf (row pointers); inside a row, inptr
;   rdi = cursor into output_buf[0] starting at output_row; inside a row,
;         outptr0
;   rax = rows still to convert
;   wk(0)/wk(1) = spill slots for the even-column R/G partial sums
; NOTE(review): collect_args/uncollect_args are macros defined outside this
; file; presumably they move the five ABI arguments into r10-r14 and restore
; the registers they displaced -- confirm in jsimdext.inc.

EXTN(jsimd_rgb_gray_convert_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push (address of the saved rbp)
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; remember the unaligned rsp for the epilogue
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]                 ; reserve the wk[] workspace below rbp
    collect_args 5
    push        rbx

    mov         ecx, r10d                    ; rcx = img_width
    test        rcx, rcx
    jz          near .return                 ; nothing to do for a zero-width image

    push        rcx                          ; rcx is borrowed for output_row below

    mov         rsi, r12                     ; rsi = output_buf
    mov         ecx, r13d                    ; rcx = output_row
    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]  ; rdi = output_buf[0]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]             ; rdi = &output_buf[0][output_row]

    pop         rcx                          ; rcx = img_width again

    mov         rsi, r11                     ; rsi = input_buf
    mov         eax, r14d                    ; rax = num_rows (zero-extended)
    ; num_rows is a signed int.  The 32-bit mov above zero-extends it, so a
    ; 64-bit test could never observe a negative count and jle would only
    ; catch zero.  Test the 32-bit register so negative counts are rejected
    ; too instead of being treated as a huge row count.
    test        eax, eax
    jle         near .return
.rowloop:
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr0

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop        ; a full group of columns is available

    ; Fewer than SIZEOF_XMMWORD columns remain: assemble the partial input
    ; piecewise, testing one bit of the remaining length at a time and
    ; loading from the end of the row backwards.

%if RGB_PIXELSIZE == 3  ; ---------------

.column_ld1:
    push        rax                     ; rax (row counter) and rdx are borrowed as scratch
    push        rdx
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, byte [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, word [rsi+rcx]
    shl         rax, WORD_BIT           ; make room below the byte loaded earlier
    or          rax, rdx
.column_ld4:
    movd        xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    movd        xmmF, XMM_DWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_DWORD
    por         xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    movq        xmmB, XMM_MMWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    movdqa      xmmF, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    mov         rcx, SIZEOF_XMMWORD     ; the loop-tail subtraction below then leaves 0
    jmp         short .rgb_gray_cnv
.column_ld32:
    test        cl, 2*SIZEOF_XMMWORD
    mov         rcx, SIZEOF_XMMWORD     ; mov does not alter flags; jz still sees the test result
    jz          short .rgb_gray_cnv
    movdqa      xmmB, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_gray_cnv:
    ; Lane notation used below: first digit = byte index within the pixel,
    ; second digit (hex) = column.  The shuffles de-interleave the packed
    ; pixels into per-component vectors split by even/odd column.
    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    movdqa      xmmG, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

    movdqa      xmmD, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

    movdqa      xmmE, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

    pxor        xmmH, xmmH  ; zero vector used to widen bytes to words

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmB, xmmE
    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)

    movdqa      xmmF, xmmD
    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else  ; RGB_PIXELSIZE == 4 ; -----------

.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmE
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    movdqa      xmmE, xmmA
    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    mov         rcx, SIZEOF_XMMWORD     ; mov does not alter flags; jz still sees the test result
    jz          short .rgb_gray_cnv
    movdqa      xmmF, xmmA
    movdqa      xmmH, xmmE
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_gray_cnv:
    ; Lane notation used below: first digit = byte index within the pixel,
    ; second digit (hex) = column.  The shuffles de-interleave the packed
    ; pixels into per-component vectors split by even/odd column.
    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpcklbw   xmmA, xmmE  ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    punpckhbw   xmmD, xmmE  ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

    movdqa      xmmC, xmmF
    punpcklbw   xmmF, xmmH  ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    punpckhbw   xmmC, xmmH  ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

    movdqa      xmmB, xmmA
    punpcklwd   xmmA, xmmF  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    punpckhwd   xmmB, xmmF  ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

    movdqa      xmmG, xmmD
    punpcklwd   xmmD, xmmC  ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    punpckhwd   xmmG, xmmC  ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

    movdqa      xmmE, xmmA
    punpcklbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    punpckhbw   xmmE, xmmD  ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

    movdqa      xmmH, xmmB
    punpcklbw   xmmB, xmmG  ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmH, xmmG  ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

    pxor        xmmF, xmmF  ; zero vector used to widen bytes to words

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmF  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmF  ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmD, xmmB
    punpcklbw   xmmB, xmmF  ; xmmB=(01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmF  ; xmmD=(11 13 15 17 19 1B 1D 1F)

    movdqa      xmmG, xmmE
    punpcklbw   xmmE, xmmF  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmG, xmmF  ; xmmG=(30 32 34 36 38 3A 3C 3E)

    ; xmmF is the zero vector here, so the unpack places each source byte in
    ; the high half of a word; the shift right by BYTE_BIT then yields the
    ; zero-extended value without needing another zero register.
    punpcklbw   xmmF, xmmH
    punpckhbw   xmmH, xmmH
    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ;
    ; NOTE(review): the G coefficient is split across the two pmaddwd
    ; constant pairs, presumably so every coefficient fits pmaddwd's signed
    ; 16-bit multiplier -- confirm against the PW_* constant definitions.

    movdqa      xmm6, xmm1
    punpcklwd   xmm1, xmm3
    punpckhwd   xmm6, xmm3
    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)

    movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)

    movdqa      xmm6, xmm0
    punpcklwd   xmm0, xmm2
    punpckhwd   xmm6, xmm2
    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)

    ; Spill the even-column partial sums; xmm0/xmm6 are reused right away.
    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

    movdqa      xmm0, xmm5              ; xmm0=BO
    movdqa      xmm6, xmm4              ; xmm6=BE

    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm3
    punpckhwd   xmm4, xmm3
    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)

    movdqa      xmm3, [rel PD_ONEHALF]  ; xmm3=[PD_ONEHALF]

    paddd       xmm0, xmm1
    paddd       xmm4, xmm7
    paddd       xmm0, xmm3              ; add the rounding term before the shift
    paddd       xmm4, xmm3
    psrld       xmm0, SCALEBITS         ; xmm0=YOL
    psrld       xmm4, SCALEBITS         ; xmm4=YOH
    packssdw    xmm0, xmm4              ; xmm0=YO

    movdqa      xmm4, xmm6
    punpcklwd   xmm6, xmm2
    punpckhwd   xmm4, xmm2
    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)

    movdqa      xmm2, [rel PD_ONEHALF]  ; xmm2=[PD_ONEHALF]

    paddd       xmm6, XMMWORD [wk(0)]
    paddd       xmm4, XMMWORD [wk(1)]
    paddd       xmm6, xmm2              ; add the rounding term before the shift
    paddd       xmm4, xmm2
    psrld       xmm6, SCALEBITS         ; xmm6=YEL
    psrld       xmm4, SCALEBITS         ; xmm4=YEH
    packssdw    xmm6, xmm4              ; xmm6=YE

    ; Re-interleave: odd-column Y goes to the high byte of each word,
    ; even-column Y stays in the low byte.
    psllw       xmm0, BYTE_BIT
    por         xmm6, xmm0              ; xmm6=Y
    ; NOTE(review): movdqa requires [rdi] to be 16-byte aligned, and the
    ; partial-width tail path also stores a full XMMWORD, so output rows are
    ; presumably aligned and padded by the allocator -- confirm with caller.
    movdqa      XMMWORD [rdi], xmm6     ; Save Y

    sub         rcx, byte SIZEOF_XMMWORD
    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop        ; another full group remains
    test        rcx, rcx
    jnz         near .column_ld1        ; 1..SIZEOF_XMMWORD-1 columns remain: tail loader

    pop         rcx                     ; col
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 5
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- value saved in the prologue (address of the saved rbp)
    pop         rbp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
346
TMessagesProj/jni/mozjpeg/simd/x86_64/jchuff-sse2.asm
Normal file
346
TMessagesProj/jni/mozjpeg/simd/x86_64/jchuff-sse2.asm
Normal file
|
|
@ -0,0 +1,346 @@
|
|||
;
|
||||
; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
|
||||
;
|
||||
; Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Matthieu Darbois.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains an SSE2 implementation for Huffman coding of one block.
|
||||
; The following code is based directly on jchuff.c; see jchuff.c for more
|
||||
; details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
; Read-only data for the Huffman encoder.  The exported label below marks the
; start of the constants; the table body itself comes from the included file
; (the code later in this file addresses it as jpeg_nbits_table).
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_huff_encode_one_block)
|
||||
|
||||
EXTN(jconst_huff_encode_one_block):
|
||||
|
||||
%include "jpeg_nbits_table.inc"
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
|
||||
; These macros perform the same task as the emit_bits() function in the
|
||||
; original libjpeg code. In addition to reducing overhead by explicitly
|
||||
; inlining the code, additional performance is achieved by taking into
|
||||
; account the size of the bit buffer and waiting until it is almost full
|
||||
; before emptying it. This mostly benefits 64-bit platforms, since 6
|
||||
; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
|
||||
|
||||
%macro EMIT_BYTE 0
; Write the byte that sits put_bits-8 bits above the bottom of put_buffer to
; the output stream and advance the output pointer.  If the byte written is
; 0xFF, a zero byte is written after it (JPEG byte stuffing).
; Uses the put_bits / put_buffer / buffer %defines in effect at the point of
; expansion.  Clobbers rcx, rdx and the flags; put_buffer itself is unchanged.
|
||||
sub put_bits, 8 ; put_bits -= 8;
|
||||
mov rdx, put_buffer ; work on a copy so put_buffer keeps its remaining bits
|
||||
mov ecx, put_bits ; shift count must be in cl
|
||||
shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
|
||||
mov byte [buffer], dl ; *buffer++ = c;
|
||||
add buffer, 1
|
||||
cmp dl, 0xFF ; need to stuff a zero byte?
|
||||
jne %%.EMIT_BYTE_END
|
||||
mov byte [buffer], 0 ; *buffer++ = 0;
|
||||
add buffer, 1
|
||||
%%.EMIT_BYTE_END:
|
||||
%endmacro
|
||||
|
||||
%macro PUT_BITS 1
; Append a code to the bit buffer.
;   %1  = 64-bit register holding the code (OR-ed in as is, so any bits above
;         the code size must already be zero)
;   ecx = code size in bits (must be set by the caller; cl is the shift count)
; Does not flush: the caller is responsible for making room beforehand.
|
||||
add put_bits, ecx ; put_bits += size;
|
||||
shl put_buffer, cl ; put_buffer = (put_buffer << size);
|
||||
or put_buffer, %1 ; put_buffer |= code;
|
||||
%endmacro
|
||||
|
||||
%macro CHECKBUF31 0
; If the bit buffer holds 32 or more bits, write four bytes to the output
; (via EMIT_BYTE, so rcx, rdx and the flags are clobbered).
|
||||
cmp put_bits, 32 ; if (put_bits > 31) {
|
||||
jl %%.CHECKBUF31_END
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
%%.CHECKBUF31_END:
|
||||
%endmacro
|
||||
|
||||
%macro CHECKBUF47 0
; If the bit buffer holds 48 or more bits, write six bytes to the output
; (via EMIT_BYTE, so rcx, rdx and the flags are clobbered).
|
||||
cmp put_bits, 48 ; if (put_bits > 47) {
|
||||
jl %%.CHECKBUF47_END
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
EMIT_BYTE
|
||||
%%.CHECKBUF47_END:
|
||||
%endmacro
|
||||
|
||||
%macro EMIT_BITS 2
; Flush the bit buffer if it is nearly full, then append a code to it.
;   %1 = 64-bit register holding the code (bits above the size must be zero)
;   %2 = 32-bit operand holding the code size in bits
; The size is loaded into ecx only AFTER CHECKBUF47, because the flush
; clobbers rcx (and rdx).  %2 therefore must not be ecx or edx.
|
||||
CHECKBUF47
|
||||
mov ecx, %2
|
||||
PUT_BITS %1
|
||||
%endmacro
|
||||
|
||||
%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
; Gather 32 coefficients from the block at r12 and store, for each one, its
; absolute value in t1 and its "sign-adjusted" value in t2.
;   %1        = ko, the index into t1/t2 of the first of the 32 outputs
;   %2..%33   = jno0..jno31, the block indices to read, in output order
;   %34..%37  = four XMM registers; each receives 8 of the coefficients and is
;               left holding their absolute values on exit
; When ko == 32 the very last element is taken from ebx instead of memory.
; Clobbers xmm8-xmm11 (used as the per-register sign masks).
; NOTE(review): the stores use movdqa, so t1/t2 + ko*2 must be 16-byte
; aligned, although the intrinsic names in the comments say "storeu".
|
||||
pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128();
|
||||
pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128();
|
||||
pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128();
|
||||
pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128();
|
||||
pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
|
||||
pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
|
||||
pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
|
||||
pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
|
||||
pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
|
||||
pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
|
||||
pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
|
||||
pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
|
||||
pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
|
||||
pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
|
||||
pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
|
||||
pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
|
||||
pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
|
||||
pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
|
||||
pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
|
||||
pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
|
||||
pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
|
||||
pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
|
||||
pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
|
||||
pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
|
||||
pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
|
||||
pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
|
||||
pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
|
||||
pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
|
||||
pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
|
||||
pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
|
||||
pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
|
||||
pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
|
||||
pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
|
||||
pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
|
||||
pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
|
||||
%if %1 != 32
|
||||
pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
|
||||
%else
; Second half (ko == 32): the last slot is filled from ebx rather than from
; the block, so its value is whatever the caller left in bx.
|
||||
pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31];
|
||||
%endif
; neg = (0 > x) ? 0xFFFF : 0 for every 16-bit lane
|
||||
pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
|
||||
pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
|
||||
pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
|
||||
pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
; (x + neg) ^ neg == abs(x) in two's complement
|
||||
paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg);
|
||||
paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg);
|
||||
paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg);
|
||||
paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg);
|
||||
pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg);
|
||||
pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg);
|
||||
pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg);
|
||||
pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg);
; neg ^= abs(x): abs(x) for non-negative lanes, ~abs(x) for negative lanes
|
||||
pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1);
|
||||
pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1);
|
||||
pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1);
|
||||
pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1);
|
||||
movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
|
||||
movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
|
||||
movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
|
||||
movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
|
||||
movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
|
||||
movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
|
||||
movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
|
||||
movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
|
||||
%endmacro
|
||||
|
||||
;
|
||||
; Encode a single block's worth of coefficients.
|
||||
;
|
||||
; GLOBAL(JOCTET *)
|
||||
; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
|
||||
; JCOEFPTR block, int last_dc_val,
|
||||
; c_derived_tbl *dctbl, c_derived_tbl *actbl)
|
||||
;
|
||||
|
||||
; r10 = working_state *state
|
||||
; r11 = JOCTET *buffer
|
||||
; r12 = JCOEFPTR block
|
||||
; r13d = int last_dc_val
|
||||
; r14 = c_derived_tbl *dctbl
|
||||
; r15 = c_derived_tbl *actbl
|
||||
|
||||
%define t1 rbp - (DCTSIZE2 * SIZEOF_WORD)
|
||||
%define t2 t1 - (DCTSIZE2 * SIZEOF_WORD)
|
||||
%define put_buffer r8
|
||||
%define put_bits r9d
|
||||
%define buffer rax
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
|
||||
|
||||
EXTN(jsimd_huff_encode_one_block_sse2):
; Huffman-encode one block: first the DC difference, then the AC coefficients
; as (zero-run, size) symbols followed by the value bits, then an end-of-block
; code if the block ends in zeros.  Returns the updated output pointer in rax
; (rax is the "buffer" %define) and writes put_buffer / put_bits back into
; *state.
;
; Register roles after the prologue:
;   rax = buffer (output pointer)      r8  = put_buffer     r9d = put_bits
;   rbp = aligned frame base; t1 and t2 are the two DCTSIZE2-word arrays
;         directly below it (t1 = absolute values, t2 = sign-adjusted values)
;   r10..r15 = the six arguments as listed in the header comment; r10-r14 are
;         reused as scratch once their argument is no longer needed
; NOTE(review): push_xmm / collect_args / uncollect_args / pop_xmm are macros
; defined outside this file section; presumably they handle the ABI-specific
; argument shuffling and register saving -- confirm in jsimdext.inc.
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax ; save the pre-alignment stack pointer for the epilogue
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [t2] ; reserve room for t1 and t2 below rbp
|
||||
push_xmm 4
|
||||
collect_args 6
|
||||
push rbx
|
||||
|
||||
mov buffer, r11 ; r11 is now scratch
|
||||
|
||||
mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer;
|
||||
mov put_bits, dword [r10+24] ; put_bits = state->cur.put_bits;
|
||||
push r10 ; r10 is now scratch
|
||||
|
||||
; Encode the DC coefficient difference per section F.1.2.1
|
||||
movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val;
|
||||
sub edi, r13d ; r13 is not used anymore
|
||||
mov ebx, edi
|
||||
|
||||
; This is a well-known technique for obtaining the absolute value
|
||||
; without a branch. It is derived from an assembly language technique
|
||||
; presented in "How to Optimize for the Pentium Processors",
|
||||
; Copyright (c) 1996, 1997 by Agner Fog.
|
||||
mov esi, edi
|
||||
sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
|
||||
xor edi, esi ; temp ^= temp3;
|
||||
sub edi, esi ; temp -= temp3;
|
||||
|
||||
; For a negative input, want temp2 = bitwise complement of abs(input)
|
||||
; This code assumes we are on a two's complement machine
|
||||
add ebx, esi ; temp2 += temp3;
|
||||
|
||||
; Find the number of bits needed for the magnitude of the coefficient
|
||||
lea r11, [rel jpeg_nbits_table]
|
||||
movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp);
|
||||
; Emit the Huffman-coded symbol for the number of bits
|
||||
mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits];
|
||||
movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits];
|
||||
EMIT_BITS r11, esi ; EMIT_BITS(code, size)
|
||||
|
||||
; Mask off any extra bits in code
|
||||
mov esi, 1
|
||||
mov ecx, edi
|
||||
shl esi, cl
|
||||
dec esi
|
||||
and ebx, esi ; temp2 &= (((JLONG)1)<<nbits) - 1;
|
||||
|
||||
; Emit that number of bits of the value, if positive,
|
||||
; or the complement of its magnitude, if negative.
|
||||
EMIT_BITS rbx, edi ; EMIT_BITS(temp2, nbits)
|
||||
|
||||
; Prepare data
; ebx = 0 supplies the last (64th) slot of t1/t2 in the second kloop_prepare,
; so t1[0..62] hold the 63 AC coefficients in the order listed below and
; t1[63] is always zero.
|
||||
xor ebx, ebx
|
||||
kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
|
||||
18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
|
||||
27, 20, 13, 6, 7, 14, 21, 28, 35, \
|
||||
xmm0, xmm1, xmm2, xmm3
|
||||
kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
|
||||
30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
|
||||
53, 60, 61, 54, 47, 55, 62, 63, 63, \
|
||||
xmm4, xmm5, xmm6, xmm7
|
||||
|
||||
; Build a 64-bit map of the non-zero entries of t1: compare each absolute
; value with zero, narrow the word masks to bytes, extract one bit per entry,
; merge the four 16-bit pieces and invert.
pxor xmm8, xmm8
|
||||
pcmpeqw xmm0, xmm8 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
|
||||
pcmpeqw xmm1, xmm8 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
|
||||
pcmpeqw xmm2, xmm8 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
|
||||
pcmpeqw xmm3, xmm8 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
|
||||
pcmpeqw xmm4, xmm8 ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
|
||||
pcmpeqw xmm5, xmm8 ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
|
||||
pcmpeqw xmm6, xmm8 ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
|
||||
pcmpeqw xmm7, xmm8 ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
|
||||
packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
|
||||
packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
|
||||
packsswb xmm4, xmm5 ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
|
||||
packsswb xmm6, xmm7 ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
; The four pmovmskb results are 16-bit pieces; the shifts named in the
; comments below are applied by the shl/or sequence that follows.
|
||||
pmovmskb r11d, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
|
||||
pmovmskb r12d, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
|
||||
pmovmskb r13d, xmm4 ; index = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
|
||||
pmovmskb r14d, xmm6 ; index = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
|
||||
shl r12, 16
|
||||
shl r14, 16
|
||||
or r11, r12
|
||||
or r13, r14
|
||||
shl r13, 32
|
||||
or r11, r13
|
||||
not r11 ; index = ~index;
|
||||
|
||||
;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
|
||||
;jmp .EFN
|
||||
|
||||
mov r13d, INT [r15 + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
|
||||
movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
|
||||
lea rsi, [t1] ; rsi = &t1[k], k = 0
|
||||
.BLOOP:
; Loop invariant: bit 0 of r11 corresponds to the entry rsi points at.
|
||||
bsf r12, r11 ; r = __builtin_ctzl(index);
|
||||
jz .ELOOP ; no non-zero entries left
|
||||
mov rcx, r12
|
||||
lea rsi, [rsi+r12*2] ; k += r;
|
||||
shr r11, cl ; index >>= r;
|
||||
movzx rdi, word [rsi] ; temp = t1[k];
|
||||
lea rbx, [rel jpeg_nbits_table]
|
||||
movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp);
|
||||
.BRLOOP:
|
||||
cmp r12, 16 ; while (r > 15) {
|
||||
jl .ERLOOP
|
||||
EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0)
|
||||
sub r12, 16 ; r -= 16;
|
||||
jmp .BRLOOP
|
||||
.ERLOOP:
|
||||
; Emit Huffman symbol for run length / number of bits
; One flush check covers both PUT_BITS below (symbol code, then value bits).
|
||||
CHECKBUF31 ; uses rcx, rdx
|
||||
|
||||
shl r12, 4 ; temp3 = (r << 4) + nbits;
|
||||
add r12, rdi
|
||||
mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3];
|
||||
movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3];
|
||||
PUT_BITS rbx
|
||||
|
||||
;EMIT_CODE(code, size)
|
||||
|
||||
; t2 lies DCTSIZE2 words below t1, so the same rsi indexes it at a fixed offset
movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k];
|
||||
; Mask off any extra bits in code
|
||||
mov rcx, rdi
|
||||
mov rdx, 1
|
||||
shl rdx, cl
|
||||
dec rdx
|
||||
and rbx, rdx ; temp2 &= (((JLONG)1)<<nbits) - 1;
|
||||
PUT_BITS rbx ; PUT_BITS(temp2, nbits)
|
||||
|
||||
shr r11, 1 ; index >>= 1;
|
||||
add rsi, 2 ; ++k;
|
||||
jmp .BLOOP
|
||||
.ELOOP:
|
||||
; If the last coef(s) were zero, emit an end-of-block code
; rsi == &t1[DCTSIZE2-1] exactly when the last AC coefficient (t1[62]) was
; non-zero and has just been consumed; in every other case EOB is emitted.
|
||||
lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
|
||||
cmp rdi, rsi ; if (r > 0) {
|
||||
je .EFN
|
||||
mov ebx, INT [r15] ; code = actbl->ehufco[0];
|
||||
movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0];
|
||||
EMIT_BITS rbx, r12d
|
||||
.EFN:
|
||||
pop r10 ; r10 = state again (pushed after the initial loads)
|
||||
; Save put_buffer & put_bits
|
||||
mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer;
|
||||
mov dword [r10+24], put_bits ; state->cur.put_bits = put_bits;
|
||||
|
||||
pop rbx
|
||||
uncollect_args 6
|
||||
pop_xmm 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
637
TMessagesProj/jni/mozjpeg/simd/x86_64/jcphuff-sse2.asm
Normal file
637
TMessagesProj/jni/mozjpeg/simd/x86_64/jcphuff-sse2.asm
Normal file
|
|
@ -0,0 +1,637 @@
|
|||
;
|
||||
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
|
||||
; (64-bit SSE2)
|
||||
;
|
||||
; Copyright (C) 2016, 2018, Matthieu Darbois
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains an SSE2 implementation of data preparation for progressive
|
||||
; Huffman encoding. See jcphuff.c for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
|
||||
; jsimd_encode_mcu_AC_refine_prepare_sse2()
|
||||
|
||||
%macro LOAD16 0
; Gather 16 coefficients: X0 lane i = BLOCK[LUT[i]], X1 lane i = BLOCK[LUT[8+i]]
; for i = 0..7, where LUT is an array of ints and BLOCK an array of words.
; Also zeroes N0 and N1.  Clobbers T0 and T1.  All names are %defines supplied
; by the function that expands the macro.
|
||||
pxor N0, N0
|
||||
pxor N1, N1
|
||||
|
||||
mov T0d, INT [LUT + 0*SIZEOF_INT]
|
||||
mov T1d, INT [LUT + 8*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 0
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 0
|
||||
|
||||
mov T0d, INT [LUT + 1*SIZEOF_INT]
|
||||
mov T1d, INT [LUT + 9*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 1
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 1
|
||||
|
||||
mov T0d, INT [LUT + 2*SIZEOF_INT]
|
||||
mov T1d, INT [LUT + 10*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 2
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 2
|
||||
|
||||
mov T0d, INT [LUT + 3*SIZEOF_INT]
|
||||
mov T1d, INT [LUT + 11*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 3
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 3
|
||||
|
||||
mov T0d, INT [LUT + 4*SIZEOF_INT]
|
||||
mov T1d, INT [LUT + 12*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 4
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 4
|
||||
|
||||
mov T0d, INT [LUT + 5*SIZEOF_INT]
|
||||
mov T1d, INT [LUT + 13*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 5
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 5
|
||||
|
||||
mov T0d, INT [LUT + 6*SIZEOF_INT]
|
||||
mov T1d, INT [LUT + 14*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 6
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 6
|
||||
|
||||
mov T0d, INT [LUT + 7*SIZEOF_INT]
|
||||
mov T1d, INT [LUT + 15*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 7
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 7
|
||||
%endmacro
|
||||
|
||||
%macro LOAD15 0
; Gather 9 to 15 coefficients.  X0 receives all eight of BLOCK[LUT[0..7]].
; X1 is zeroed first, always receives BLOCK[LUT[8]] in lane 0, and receives
; BLOCK[LUT[8+i]] in lane i (i = 1..6) only while LENEND > i; the remaining
; lanes stay zero.  Also zeroes N0 and N1.  Clobbers T0, T1 and the flags.
|
||||
pxor N0, N0
|
||||
pxor N1, N1
|
||||
pxor X1, X1
|
||||
|
||||
mov T0d, INT [LUT + 0*SIZEOF_INT]
|
||||
mov T1d, INT [LUT + 8*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 0
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 0
|
||||
|
||||
mov T0d, INT [LUT + 1*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 1
|
||||
|
||||
mov T0d, INT [LUT + 2*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 2
|
||||
|
||||
mov T0d, INT [LUT + 3*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 3
|
||||
|
||||
mov T0d, INT [LUT + 4*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 4
|
||||
|
||||
mov T0d, INT [LUT + 5*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 5
|
||||
|
||||
mov T0d, INT [LUT + 6*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 6
|
||||
|
||||
mov T0d, INT [LUT + 7*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 7
|
||||
|
||||
cmp LENEND, 2 ; stop as soon as the tail is exhausted
|
||||
jl %%.ELOAD15
|
||||
mov T1d, INT [LUT + 9*SIZEOF_INT]
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 1
|
||||
|
||||
cmp LENEND, 3
|
||||
jl %%.ELOAD15
|
||||
mov T1d, INT [LUT + 10*SIZEOF_INT]
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 2
|
||||
|
||||
cmp LENEND, 4
|
||||
jl %%.ELOAD15
|
||||
mov T1d, INT [LUT + 11*SIZEOF_INT]
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 3
|
||||
|
||||
cmp LENEND, 5
|
||||
jl %%.ELOAD15
|
||||
mov T1d, INT [LUT + 12*SIZEOF_INT]
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 4
|
||||
|
||||
cmp LENEND, 6
|
||||
jl %%.ELOAD15
|
||||
mov T1d, INT [LUT + 13*SIZEOF_INT]
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 5
|
||||
|
||||
cmp LENEND, 7
|
||||
jl %%.ELOAD15
|
||||
mov T1d, INT [LUT + 14*SIZEOF_INT]
|
||||
pinsrw X1, word [BLOCK + T1 * 2], 6
|
||||
%%.ELOAD15:
|
||||
%endmacro
|
||||
|
||||
%macro LOAD8 0
; Gather exactly 8 coefficients: X0 lane i = BLOCK[LUT[i]] for i = 0..7.
; Also zeroes N0.  Clobbers T0.
|
||||
pxor N0, N0
|
||||
|
||||
mov T0d, INT [LUT + 0*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 0
|
||||
|
||||
mov T0d, INT [LUT + 1*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 1
|
||||
|
||||
mov T0d, INT [LUT + 2*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 2
|
||||
|
||||
mov T0d, INT [LUT + 3*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 3
|
||||
|
||||
mov T0d, INT [LUT + 4*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 4
|
||||
|
||||
mov T0d, INT [LUT + 5*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 5
|
||||
|
||||
mov T0d, INT [LUT + 6*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 6
|
||||
|
||||
mov T0d, INT [LUT + 7*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T0 * 2], 7
|
||||
%endmacro
|
||||
|
||||
%macro LOAD7 0
; Gather up to 7 coefficients into X0, which is zeroed first.  Lane 0 always
; receives BLOCK[LUT[0]]; lane i (i = 1..6) receives BLOCK[LUT[i]] only while
; LENEND > i, and the remaining lanes stay zero.  Also zeroes N0.
; Clobbers T1 and the flags.
; NOTE(review): lane 0 is loaded unconditionally, so LUT[0] is read even when
; LENEND is 0.
|
||||
pxor N0, N0
|
||||
pxor X0, X0
|
||||
|
||||
mov T1d, INT [LUT + 0*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T1 * 2], 0
|
||||
|
||||
cmp LENEND, 2
|
||||
jl %%.ELOAD7
|
||||
mov T1d, INT [LUT + 1*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T1 * 2], 1
|
||||
|
||||
cmp LENEND, 3
|
||||
jl %%.ELOAD7
|
||||
mov T1d, INT [LUT + 2*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T1 * 2], 2
|
||||
|
||||
cmp LENEND, 4
|
||||
jl %%.ELOAD7
|
||||
mov T1d, INT [LUT + 3*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T1 * 2], 3
|
||||
|
||||
cmp LENEND, 5
|
||||
jl %%.ELOAD7
|
||||
mov T1d, INT [LUT + 4*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T1 * 2], 4
|
||||
|
||||
cmp LENEND, 6
|
||||
jl %%.ELOAD7
|
||||
mov T1d, INT [LUT + 5*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T1 * 2], 5
|
||||
|
||||
cmp LENEND, 7
|
||||
jl %%.ELOAD7
|
||||
mov T1d, INT [LUT + 6*SIZEOF_INT]
|
||||
pinsrw X0, word [BLOCK + T1 * 2], 6
|
||||
%%.ELOAD7:
|
||||
%endmacro
|
||||
|
||||
%macro REDUCE0 0
; Build a 64-bit map of the non-zero words among the 64 words at VALUES and
; store it at [r15]: bit i is set when VALUES[i] != 0.
; Requires ZERO to hold all-zero bits and VALUES to be 16-byte aligned
; (movdqa loads).  Clobbers xmm0-xmm7, rax, rcx, rdx, rsi and the flags.
|
||||
movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
|
||||
movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
|
||||
movdqa xmm2, XMMWORD [VALUES + (16*2)]
|
||||
movdqa xmm3, XMMWORD [VALUES + (24*2)]
|
||||
movdqa xmm4, XMMWORD [VALUES + (32*2)]
|
||||
movdqa xmm5, XMMWORD [VALUES + (40*2)]
|
||||
movdqa xmm6, XMMWORD [VALUES + (48*2)]
|
||||
movdqa xmm7, XMMWORD [VALUES + (56*2)]
|
||||
|
||||
; word lanes become 0xFFFF where the value is zero
pcmpeqw xmm0, ZERO
|
||||
pcmpeqw xmm1, ZERO
|
||||
pcmpeqw xmm2, ZERO
|
||||
pcmpeqw xmm3, ZERO
|
||||
pcmpeqw xmm4, ZERO
|
||||
pcmpeqw xmm5, ZERO
|
||||
pcmpeqw xmm6, ZERO
|
||||
pcmpeqw xmm7, ZERO
|
||||
|
||||
; narrow the word masks to byte masks, 16 entries per register
packsswb xmm0, xmm1
|
||||
packsswb xmm2, xmm3
|
||||
packsswb xmm4, xmm5
|
||||
packsswb xmm6, xmm7
|
||||
|
||||
; one bit per entry, 16 bits per register
pmovmskb eax, xmm0
|
||||
pmovmskb ecx, xmm2
|
||||
pmovmskb edx, xmm4
|
||||
pmovmskb esi, xmm6
|
||||
|
||||
shl rcx, 16
|
||||
shl rdx, 32
|
||||
shl rsi, 48
|
||||
|
||||
or rax, rcx
|
||||
or rdx, rsi
|
||||
or rax, rdx
|
||||
|
||||
not rax ; "is zero" map -> "is non-zero" map
|
||||
|
||||
mov MMWORD [r15], rax
|
||||
%endmacro
|
||||
|
||||
;
|
||||
; Prepare data for jsimd_encode_mcu_AC_first().
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
|
||||
; const int *jpeg_natural_order_start,
|
||||
; int Sl, int Al, JCOEF *values,
|
||||
; size_t *zerobits)
|
||||
;
|
||||
; r10 = const JCOEF *block
|
||||
; r11 = const int *jpeg_natural_order_start
|
||||
; r12 = int Sl
|
||||
; r13 = int Al
|
||||
; r14 = JCOEF *values
|
||||
; r15 = size_t *zerobits
|
||||
|
||||
%define ZERO xmm9
|
||||
%define X0 xmm0
|
||||
%define X1 xmm1
|
||||
%define N0 xmm2
|
||||
%define N1 xmm3
|
||||
%define AL xmm4
|
||||
%define K eax
|
||||
%define LUT r11
|
||||
%define T0 rcx
|
||||
%define T0d ecx
|
||||
%define T1 rdx
|
||||
%define T1d edx
|
||||
%define BLOCK r10
|
||||
%define VALUES r14
|
||||
%define LEN r12d
|
||||
%define LENEND r13d
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
|
||||
|
||||
EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
; For the first LEN (= Sl) coefficients, taken from BLOCK in the order given
; by LUT, compute v = abs(coef) >> Al and store
;   VALUES[k]            = v
;   VALUES[k + DCTSIZE2] = v for non-negative coef, ~v for negative coef
; The rest of VALUES[0..DCTSIZE2-1] is zero-filled, and REDUCE0 then stores
; the map of non-zero VALUES[] entries at [r15] (zerobits).
; Coefficients are processed 16 at a time, then one tail step of 9-15, 8, or
; fewer than 8 elements.
; NOTE(review): collect_args / uncollect_args are defined outside this file
; section; presumably they place the six arguments in r10-r15 -- confirm.
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [rbp - 16] ; one aligned 16-byte slot for saving ZERO
|
||||
collect_args 6
|
||||
|
||||
movdqa XMMWORD [rbp - 16], ZERO ; preserve the caller's value of the ZERO register
|
||||
|
||||
movd AL, r13d ; capture Al before r13d is reused as LENEND
|
||||
pxor ZERO, ZERO
|
||||
mov K, LEN
|
||||
mov LENEND, LEN
|
||||
and K, -16
|
||||
and LENEND, 7 ; LENEND = LEN % 8
|
||||
shr K, 4 ; K = number of full groups of 16
|
||||
jz .ELOOP16
|
||||
.BLOOP16:
|
||||
LOAD16
|
||||
pcmpgtw N0, X0 ; N = 0xFFFF in lanes where X < 0
|
||||
pcmpgtw N1, X1
|
||||
paddw X0, N0 ; (X + N) ^ N == abs(X)
|
||||
paddw X1, N1
|
||||
pxor X0, N0
|
||||
pxor X1, N1
|
||||
psrlw X0, AL ; abs(X) >> Al
|
||||
psrlw X1, AL
|
||||
pxor N0, X0 ; N = value for non-negative lanes, ~value for negative lanes
|
||||
pxor N1, X1
|
||||
movdqa XMMWORD [VALUES + (0) * 2], X0
|
||||
movdqa XMMWORD [VALUES + (8) * 2], X1
|
||||
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
|
||||
movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
|
||||
add VALUES, 16*2
|
||||
add LUT, 16*SIZEOF_INT
|
||||
dec K
|
||||
jnz .BLOOP16
|
||||
test LEN, 15 ; LEN a multiple of 16: no tail, go straight to padding
|
||||
je .PADDING
|
||||
.ELOOP16:
|
||||
test LEN, 8
|
||||
jz .TRY7 ; fewer than 8 elements remain
|
||||
test LEN, 7
|
||||
jz .TRY8 ; exactly 8 elements remain
|
||||
|
||||
LOAD15
|
||||
pcmpgtw N0, X0
|
||||
pcmpgtw N1, X1
|
||||
paddw X0, N0
|
||||
paddw X1, N1
|
||||
pxor X0, N0
|
||||
pxor X1, N1
|
||||
psrlw X0, AL
|
||||
psrlw X1, AL
|
||||
pxor N0, X0
|
||||
pxor N1, X1
|
||||
movdqa XMMWORD [VALUES + (0) * 2], X0
|
||||
movdqa XMMWORD [VALUES + (8) * 2], X1
|
||||
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
|
||||
movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
|
||||
add VALUES, 16*2
|
||||
jmp .PADDING
|
||||
.TRY8:
|
||||
LOAD8
|
||||
pcmpgtw N0, X0
|
||||
paddw X0, N0
|
||||
pxor X0, N0
|
||||
psrlw X0, AL
|
||||
pxor N0, X0
|
||||
movdqa XMMWORD [VALUES + (0) * 2], X0
|
||||
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
|
||||
add VALUES, 8*2
|
||||
jmp .PADDING
|
||||
.TRY7:
|
||||
LOAD7
|
||||
pcmpgtw N0, X0
|
||||
paddw X0, N0
|
||||
pxor X0, N0
|
||||
psrlw X0, AL
|
||||
pxor N0, X0
|
||||
movdqa XMMWORD [VALUES + (0) * 2], X0
|
||||
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
|
||||
add VALUES, 8*2
|
||||
.PADDING:
; K = ceil(LEN / 8) - DCTSIZE2/8, i.e. minus the number of 8-word groups
; still to be zero-filled; the loop counts K up to zero.
|
||||
mov K, LEN
|
||||
add K, 7
|
||||
and K, -8
|
||||
shr K, 3
|
||||
sub K, DCTSIZE2/8
|
||||
jz .EPADDING
|
||||
align 16
|
||||
.ZEROLOOP:
|
||||
movdqa XMMWORD [VALUES + 0], ZERO
|
||||
add VALUES, 8*2
|
||||
inc K
|
||||
jnz .ZEROLOOP
|
||||
.EPADDING:
|
||||
sub VALUES, DCTSIZE2*2 ; rewind VALUES to the start of the array
|
||||
|
||||
REDUCE0
|
||||
|
||||
movdqa ZERO, XMMWORD [rbp - 16] ; restore the caller's value
|
||||
uncollect_args 6
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
%undef ZERO
|
||||
%undef X0
|
||||
%undef X1
|
||||
%undef N0
|
||||
%undef N1
|
||||
%undef AL
|
||||
%undef K
|
||||
%undef LUT
|
||||
%undef T0
|
||||
%undef T0d
|
||||
%undef T1
|
||||
%undef T1d
|
||||
%undef BLOCK
|
||||
%undef VALUES
|
||||
%undef LEN
|
||||
%undef LENEND
|
||||
|
||||
;
|
||||
; Prepare data for jsimd_encode_mcu_AC_refine().
|
||||
;
|
||||
; GLOBAL(int)
|
||||
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
|
||||
; const int *jpeg_natural_order_start,
|
||||
; int Sl, int Al, JCOEF *absvalues,
|
||||
; size_t *bits)
|
||||
;
|
||||
; r10 = const JCOEF *block
|
||||
; r11 = const int *jpeg_natural_order_start
|
||||
; r12 = int Sl
|
||||
; r13 = int Al
|
||||
; r14 = JCOEF *values
|
||||
; r15 = size_t *bits
|
||||
|
||||
%define ZERO xmm9
|
||||
%define ONE xmm5
|
||||
%define X0 xmm0
|
||||
%define X1 xmm1
|
||||
%define N0 xmm2
|
||||
%define N1 xmm3
|
||||
%define AL xmm4
|
||||
%define K eax
|
||||
%define KK r9d
|
||||
%define EOB r8d
|
||||
%define SIGN rdi
|
||||
%define LUT r11
|
||||
%define T0 rcx
|
||||
%define T0d ecx
|
||||
%define T1 rdx
|
||||
%define T1d edx
|
||||
%define BLOCK r10
|
||||
%define VALUES r14
|
||||
%define LEN r12d
|
||||
%define LENEND r13d
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
|
||||
|
||||
EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
; For the first LEN (= Sl) coefficients, taken from BLOCK in the order given
; by LUT, compute v = abs(coef) >> Al and store it in VALUES[k]; the rest of
; VALUES[0..DCTSIZE2-1] is zero-filled.  Two 64-bit maps are written to the
; array at r15:
;   [r15]                  bit k set when VALUES[k] != 0 (via REDUCE0)
;   [r15 + SIZEOF_MMWORD]  inverted sign map: bit k clear when coef k < 0
; Returns in eax the index of the last coefficient whose shifted magnitude
; equals 1 (0 if there is none).
;
; Coefficients are processed 16 at a time, then one tail step of 9-15, 8, or
; fewer than 8 elements.  SIGN is built by shifting right and inserting each
; new group at the top, so after all 64 positions have been accounted for the
; first group ends up in the low bits.
;
; When LEN is a non-zero multiple of 16 there is no tail, so the code must
; jump from the 16-wide loop straight to the padding step, exactly as
; jsimd_encode_mcu_AC_first_prepare_sse2 does.  Falling into .TRYR7 instead
; would process an extra 8-element group, over-shift SIGN by 8 bits and leave
; VALUES 16 bytes past the array start for REDUCE0.
; NOTE(review): collect_args / uncollect_args are defined outside this file
; section; presumably they place the six arguments in r10-r15 -- confirm.
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [rbp - 16] ; one aligned 16-byte slot for saving ZERO
|
||||
collect_args 6
|
||||
|
||||
movdqa XMMWORD [rbp - 16], ZERO ; preserve the caller's value of the ZERO register
|
||||
|
||||
xor SIGN, SIGN
|
||||
xor EOB, EOB
|
||||
xor KK, KK ; KK = index of the first element of the current group
|
||||
movd AL, r13d ; capture Al before r13d is reused as LENEND
|
||||
pxor ZERO, ZERO
|
||||
pcmpeqw ONE, ONE
|
||||
psrlw ONE, 15 ; every word lane of ONE = 1
|
||||
mov K, LEN
|
||||
mov LENEND, LEN
|
||||
and K, -16
|
||||
and LENEND, 7 ; LENEND = LEN % 8
|
||||
shr K, 4 ; K = number of full groups of 16
|
||||
jz .ELOOPR16
|
||||
.BLOOPR16:
|
||||
LOAD16
|
||||
pcmpgtw N0, X0 ; N = 0xFFFF in lanes where X < 0
|
||||
pcmpgtw N1, X1
|
||||
paddw X0, N0 ; (X + N) ^ N == abs(X)
|
||||
paddw X1, N1
|
||||
pxor X0, N0
|
||||
pxor X1, N1
|
||||
psrlw X0, AL ; abs(X) >> Al
|
||||
psrlw X1, AL
|
||||
movdqa XMMWORD [VALUES + (0) * 2], X0
|
||||
movdqa XMMWORD [VALUES + (8) * 2], X1
|
||||
pcmpeqw X0, ONE ; lanes whose shifted magnitude is exactly 1
|
||||
pcmpeqw X1, ONE
|
||||
packsswb N0, N1
|
||||
packsswb X0, X1
|
||||
pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
|
||||
pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
|
||||
shr SIGN, 16 ; make room for sizebits
|
||||
shl T0, 48
|
||||
or SIGN, T0
|
||||
bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
|
||||
jz .CONTINUER16 ; if (idx) {
|
||||
mov EOB, KK
|
||||
add EOB, T1d ; EOB = k + idx;
|
||||
.CONTINUER16:
|
||||
add VALUES, 16*2
|
||||
add LUT, 16*SIZEOF_INT
|
||||
add KK, 16
|
||||
dec K
|
||||
jnz .BLOOPR16
|
||||
test LEN, 15 ; LEN a multiple of 16: no tail, go straight to padding
|
||||
je .PADDINGR
|
||||
.ELOOPR16:
|
||||
test LEN, 8
|
||||
jz .TRYR7 ; fewer than 8 elements remain
|
||||
test LEN, 7
|
||||
jz .TRYR8 ; exactly 8 elements remain
|
||||
|
||||
LOAD15
|
||||
pcmpgtw N0, X0
|
||||
pcmpgtw N1, X1
|
||||
paddw X0, N0
|
||||
paddw X1, N1
|
||||
pxor X0, N0
|
||||
pxor X1, N1
|
||||
psrlw X0, AL
|
||||
psrlw X1, AL
|
||||
movdqa XMMWORD [VALUES + (0) * 2], X0
|
||||
movdqa XMMWORD [VALUES + (8) * 2], X1
|
||||
pcmpeqw X0, ONE
|
||||
pcmpeqw X1, ONE
|
||||
packsswb N0, N1
|
||||
packsswb X0, X1
|
||||
pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
|
||||
pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
|
||||
shr SIGN, 16 ; make room for sizebits
|
||||
shl T0, 48
|
||||
or SIGN, T0
|
||||
bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
|
||||
jz .CONTINUER15 ; if (idx) {
|
||||
mov EOB, KK
|
||||
add EOB, T1d ; EOB = k + idx;
|
||||
.CONTINUER15:
|
||||
add VALUES, 16*2
|
||||
jmp .PADDINGR
|
||||
.TRYR8:
|
||||
LOAD8
|
||||
|
||||
pcmpgtw N0, X0
|
||||
paddw X0, N0
|
||||
pxor X0, N0
|
||||
psrlw X0, AL
|
||||
movdqa XMMWORD [VALUES + (0) * 2], X0
|
||||
pcmpeqw X0, ONE
|
||||
packsswb N0, ZERO ; only 8 lanes are valid; the upper byte lanes are zero
|
||||
packsswb X0, ZERO
|
||||
pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
|
||||
pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
|
||||
shr SIGN, 8 ; make room for sizebits
|
||||
shl T0, 56
|
||||
or SIGN, T0
|
||||
bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
|
||||
jz .CONTINUER8 ; if (idx) {
|
||||
mov EOB, KK
|
||||
add EOB, T1d ; EOB = k + idx;
|
||||
.CONTINUER8:
|
||||
add VALUES, 8*2
|
||||
jmp .PADDINGR
|
||||
.TRYR7:
|
||||
LOAD7
|
||||
|
||||
pcmpgtw N0, X0
|
||||
paddw X0, N0
|
||||
pxor X0, N0
|
||||
psrlw X0, AL
|
||||
movdqa XMMWORD [VALUES + (0) * 2], X0
|
||||
pcmpeqw X0, ONE
|
||||
packsswb N0, ZERO
|
||||
packsswb X0, ZERO
|
||||
pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
|
||||
pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
|
||||
shr SIGN, 8 ; make room for sizebits
|
||||
shl T0, 56
|
||||
or SIGN, T0
|
||||
bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
|
||||
jz .CONTINUER7 ; if (idx) {
|
||||
mov EOB, KK
|
||||
add EOB, T1d ; EOB = k + idx;
|
||||
.CONTINUER7:
|
||||
add VALUES, 8*2
|
||||
.PADDINGR:
; K = ceil(LEN / 8) - DCTSIZE2/8, i.e. minus the number of 8-word groups
; still to be zero-filled; the loop counts K up to zero.  SIGN is shifted by
; 8 bits per padded group so that the first group lands in the low bits.
|
||||
mov K, LEN
|
||||
add K, 7
|
||||
and K, -8
|
||||
shr K, 3
|
||||
sub K, DCTSIZE2/8
|
||||
jz .EPADDINGR
|
||||
align 16
|
||||
.ZEROLOOPR:
|
||||
movdqa XMMWORD [VALUES + 0], ZERO
|
||||
shr SIGN, 8
|
||||
add VALUES, 8*2
|
||||
inc K
|
||||
jnz .ZEROLOOPR
|
||||
.EPADDINGR:
|
||||
not SIGN
|
||||
sub VALUES, DCTSIZE2*2 ; rewind VALUES to the start of the array
|
||||
mov MMWORD [r15+SIZEOF_MMWORD], SIGN
|
||||
|
||||
REDUCE0
|
||||
|
||||
mov eax, EOB ; return value
|
||||
movdqa ZERO, XMMWORD [rbp - 16] ; restore the caller's value
|
||||
uncollect_args 6
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
%undef ZERO
|
||||
%undef ONE
|
||||
%undef X0
|
||||
%undef X1
|
||||
%undef N0
|
||||
%undef N1
|
||||
%undef AL
|
||||
%undef K
|
||||
%undef KK
|
||||
%undef EOB
|
||||
%undef SIGN
|
||||
%undef LUT
|
||||
%undef T0
|
||||
%undef T0d
|
||||
%undef T1
|
||||
%undef T1d
|
||||
%undef BLOCK
|
||||
%undef VALUES
|
||||
%undef LEN
|
||||
%undef LENEND
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
366
TMessagesProj/jni/mozjpeg/simd/x86_64/jcsample-avx2.asm
Normal file
366
TMessagesProj/jni/mozjpeg/simd/x86_64/jcsample-avx2.asm
Normal file
|
|
@ -0,0 +1,366 @@
|
|||
;
|
||||
; jcsample.asm - downsampling (64-bit AVX2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Downsample pixel values of a single component.
|
||||
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
|
||||
; without smoothing.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
|
||||
; JDIMENSION v_samp_factor,
|
||||
; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
|
||||
; JSAMPARRAY output_data);
|
||||
;
|
||||
|
||||
; r10d = JDIMENSION image_width
|
||||
; r11 = int max_v_samp_factor
|
||||
; r12d = JDIMENSION v_samp_factor
|
||||
; r13d = JDIMENSION width_in_blocks
|
||||
; r14 = JSAMPARRAY input_data
|
||||
; r15 = JSAMPARRAY output_data
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
|
||||
|
||||
EXTN(jsimd_h2v1_downsample_avx2):
; Purpose: 2:1 horizontal / 1:1 vertical downsampling of one component.
; Each output sample is the biased average of two horizontally adjacent
; input samples: out = (even + odd + bias) >> 1, with bias alternating
; 0, 1, 0, 1, ... across output columns.
;
; Register roles (argument registers as listed in the comment block that
; precedes this function):
;   rcx = output_cols (width_in_blocks * 8), later the per-row column counter
;   rdx = image_width while padding the right edge
;   rax = row counter (max_v_samp_factor while padding, v_samp_factor after)
;   rsi = input_data cursor / inptr,  rdi = output_data cursor / outptr
;   ymm6 = 0x00FF word mask, ymm7 = bias words
;
; Phase 1 (expand_right_edge) replicates the last real sample of every
; input row up to output_cols * 2 bytes.  Phase 2 runs the SIMD kernel.
; NOTE(review): collect_args / uncollect_args are macros defined outside
; this view; their register save/restore behavior is not visible here.
|
||||
push rbp
|
||||
mov rax, rsp ; rax = rsp after the push (presumably read by collect_args -- confirm)
|
||||
mov rbp, rsp
|
||||
collect_args 6
|
||||
|
||||
mov ecx, r13d ; rcx = width_in_blocks (zero-extended)
|
||||
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
|
||||
jz near .return ; nothing to do for a zero-width component
|
||||
|
||||
mov edx, r10d ; rdx = image_width (zero-extended)
|
||||
|
||||
; -- expand_right_edge
|
||||
|
||||
push rcx ; keep output_cols for the downsample phase
|
||||
shl rcx, 1 ; output_cols * 2
|
||||
sub rcx, rdx ; rcx = number of padding bytes needed per row
|
||||
jle short .expand_end ; rows are already wide enough
|
||||
|
||||
; NOTE(review): max_v_samp_factor is documented as int, but all 64 bits
; of r11 are tested here -- confirm the upper half is well defined.
mov rax, r11 ; rax = max_v_samp_factor = rows to pad
|
||||
test rax, rax
|
||||
jle short .expand_end
|
||||
|
||||
cld ; make rep stosb write forward
|
||||
mov rsi, r14 ; input_data
|
||||
.expandloop:
|
||||
push rax ; al is used as the fill byte below
|
||||
push rcx ; rep stosb consumes rcx
|
||||
|
||||
mov rdi, JSAMPROW [rsi]
|
||||
add rdi, rdx ; rdi = first byte past the real samples
|
||||
mov al, JSAMPLE [rdi-1] ; al = last real sample of this row
|
||||
|
||||
rep stosb ; replicate it rcx times
|
||||
|
||||
pop rcx
|
||||
pop rax
|
||||
|
||||
add rsi, byte SIZEOF_JSAMPROW ; next row pointer
|
||||
dec rax
|
||||
jg short .expandloop
|
||||
|
||||
.expand_end:
|
||||
pop rcx ; output_cols
|
||||
|
||||
; -- h2v1_downsample
|
||||
|
||||
mov eax, r12d ; rowctr
|
||||
test eax, eax
|
||||
jle near .return
|
||||
|
||||
mov rdx, 0x00010000 ; bias pattern
|
||||
vmovd xmm7, edx
|
||||
vpshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
|
||||
vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7}
|
||||
vpcmpeqw ymm6, ymm6, ymm6 ; all ones
|
||||
vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
|
||||
|
||||
mov rsi, r14 ; input_data
|
||||
mov rdi, r15 ; output_data
|
||||
.rowloop:
|
||||
push rcx ; saved: the column loop below destroys rcx/rdi/rsi
|
||||
push rdi
|
||||
push rsi
|
||||
|
||||
mov rsi, JSAMPROW [rsi] ; inptr
|
||||
mov rdi, JSAMPROW [rdi] ; outptr
|
||||
|
||||
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jae short .columnloop ; at least 32 output columns left: full step
|
||||
|
||||
; Tail handling: fewer than 32 output columns remain.  Each case loads
; only the input bytes it needs, zeroes the rest, and forces rcx to
; SIZEOF_YMMWORD so the subtraction after the store ends the row.
.columnloop_r24:
|
||||
; rcx can possibly be 8, 16, 24
|
||||
cmp rcx, 24
|
||||
jne .columnloop_r16
|
||||
vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD] ; 16 more input bytes; VEX load zeroes the upper half of ymm1
|
||||
mov rcx, SIZEOF_YMMWORD
|
||||
jmp short .downsample
|
||||
|
||||
.columnloop_r16:
|
||||
cmp rcx, 16
|
||||
jne .columnloop_r8
|
||||
vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
|
||||
vpxor ymm1, ymm1, ymm1 ; second input vector unused
|
||||
mov rcx, SIZEOF_YMMWORD
|
||||
jmp short .downsample
|
||||
|
||||
.columnloop_r8:
|
||||
vmovdqu xmm0, XMMWORD[rsi+0*SIZEOF_YMMWORD] ; 16 input bytes; upper half of ymm0 zeroed
|
||||
vpxor ymm1, ymm1, ymm1
|
||||
mov rcx, SIZEOF_YMMWORD
|
||||
jmp short .downsample
|
||||
|
||||
.columnloop:
|
||||
vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; 64 input bytes -> 32 output bytes
|
||||
vmovdqu ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]
|
||||
|
||||
.downsample:
|
||||
vpsrlw ymm2, ymm0, BYTE_BIT ; ymm2 = odd-indexed samples as words
|
||||
vpand ymm0, ymm0, ymm6 ; ymm0 = even-indexed samples as words
|
||||
vpsrlw ymm3, ymm1, BYTE_BIT
|
||||
vpand ymm1, ymm1, ymm6
|
||||
|
||||
vpaddw ymm0, ymm0, ymm2 ; even + odd
|
||||
vpaddw ymm1, ymm1, ymm3
|
||||
vpaddw ymm0, ymm0, ymm7 ; + alternating bias
|
||||
vpaddw ymm1, ymm1, ymm7
|
||||
vpsrlw ymm0, ymm0, 1 ; / 2
|
||||
vpsrlw ymm1, ymm1, 1
|
||||
|
||||
vpackuswb ymm0, ymm0, ymm1 ; packs within each 128-bit lane
|
||||
vpermq ymm0, ymm0, 0xd8 ; qword order 0,2,1,3: restore linear column order
|
||||
|
||||
; The store is a full 32 bytes even on the tail paths.
; NOTE(review): presumably the output rows are padded for this -- confirm.
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
|
||||
|
||||
sub rcx, byte SIZEOF_YMMWORD ; outcol
|
||||
add rsi, byte 2*SIZEOF_YMMWORD ; inptr
|
||||
add rdi, byte 1*SIZEOF_YMMWORD ; outptr
|
||||
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jae short .columnloop
|
||||
test rcx, rcx
|
||||
jnz near .columnloop_r24 ; 8, 16 or 24 columns left
|
||||
|
||||
pop rsi
|
||||
pop rdi
|
||||
pop rcx
|
||||
|
||||
add rsi, byte SIZEOF_JSAMPROW ; input_data
|
||||
add rdi, byte SIZEOF_JSAMPROW ; output_data
|
||||
dec rax ; rowctr
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
vzeroupper ; clear upper YMM state before returning to non-AVX code
|
||||
uncollect_args 6
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Downsample pixel values of a single component.
|
||||
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
|
||||
; without smoothing.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
|
||||
; JDIMENSION v_samp_factor,
|
||||
; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
|
||||
; JSAMPARRAY output_data);
|
||||
;
|
||||
|
||||
; r10d = JDIMENSION image_width
|
||||
; r11 = int max_v_samp_factor
|
||||
; r12d = JDIMENSION v_samp_factor
|
||||
; r13d = JDIMENSION width_in_blocks
|
||||
; r14 = JSAMPARRAY input_data
|
||||
; r15 = JSAMPARRAY output_data
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
|
||||
|
||||
EXTN(jsimd_h2v2_downsample_avx2):
; Purpose: 2:1 horizontal / 2:1 vertical downsampling of one component.
; Each output sample is the biased average of a 2x2 block of input
; samples: out = (sum of 4 + bias) >> 2, with bias alternating
; 1, 2, 1, 2, ... across output columns.
;
; Register roles (argument registers as listed in the comment block that
; precedes this function):
;   rcx = output_cols (width_in_blocks * 8), later the per-row column counter
;   rax = row counter (max_v_samp_factor while padding, v_samp_factor after)
;   rdx = image_width while padding, then inptr0 (upper input row)
;   rsi = input_data cursor / inptr1 (lower input row)
;   rdi = output_data cursor / outptr
;   ymm6 = 0x00FF word mask, ymm7 = bias words
;
; NOTE(review): collect_args / uncollect_args are macros defined outside
; this view; their register save/restore behavior is not visible here.
|
||||
push rbp
|
||||
mov rax, rsp ; rax = rsp after the push (presumably read by collect_args -- confirm)
|
||||
mov rbp, rsp
|
||||
collect_args 6
|
||||
|
||||
mov ecx, r13d ; rcx = width_in_blocks (zero-extended)
|
||||
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
|
||||
jz near .return ; nothing to do for a zero-width component
|
||||
|
||||
mov edx, r10d ; rdx = image_width (zero-extended)
|
||||
|
||||
; -- expand_right_edge
|
||||
|
||||
push rcx ; keep output_cols for the downsample phase
|
||||
shl rcx, 1 ; output_cols * 2
|
||||
sub rcx, rdx ; rcx = number of padding bytes needed per row
|
||||
jle short .expand_end ; rows are already wide enough
|
||||
|
||||
; NOTE(review): max_v_samp_factor is documented as int, but all 64 bits
; of r11 are tested here -- confirm the upper half is well defined.
mov rax, r11 ; rax = max_v_samp_factor = rows to pad
|
||||
test rax, rax
|
||||
jle short .expand_end
|
||||
|
||||
cld ; make rep stosb write forward
|
||||
mov rsi, r14 ; input_data
|
||||
.expandloop:
|
||||
push rax ; al is used as the fill byte below
|
||||
push rcx ; rep stosb consumes rcx
|
||||
|
||||
mov rdi, JSAMPROW [rsi]
|
||||
add rdi, rdx ; rdi = first byte past the real samples
|
||||
mov al, JSAMPLE [rdi-1] ; al = last real sample of this row
|
||||
|
||||
rep stosb ; replicate it rcx times
|
||||
|
||||
pop rcx
|
||||
pop rax
|
||||
|
||||
add rsi, byte SIZEOF_JSAMPROW ; next row pointer
|
||||
dec rax
|
||||
jg short .expandloop
|
||||
|
||||
.expand_end:
|
||||
pop rcx ; output_cols
|
||||
|
||||
; -- h2v2_downsample
|
||||
|
||||
mov eax, r12d ; rowctr
|
||||
test rax, rax
|
||||
jle near .return
|
||||
|
||||
mov rdx, 0x00020001 ; bias pattern
|
||||
vmovd xmm7, edx
|
||||
vpcmpeqw ymm6, ymm6, ymm6 ; all ones
|
||||
vpshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
|
||||
vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7}
|
||||
vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
|
||||
|
||||
mov rsi, r14 ; input_data
|
||||
mov rdi, r15 ; output_data
|
||||
.rowloop:
|
||||
push rcx ; saved: the column loop below destroys rcx/rdi/rsi
|
||||
push rdi
|
||||
push rsi
|
||||
|
||||
mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
|
||||
mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
|
||||
mov rdi, JSAMPROW [rdi] ; outptr
|
||||
|
||||
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jae short .columnloop ; at least 32 output columns left: full step
|
||||
|
||||
; Tail handling: rcx is 8, 16 or 24 here.  Each case loads only the
; input bytes it needs from both rows, zeroes the rest, and forces rcx
; to SIZEOF_YMMWORD so the subtraction after the store ends the row.
.columnloop_r24:
|
||||
cmp rcx, 24
|
||||
jne .columnloop_r16
|
||||
vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD] ; 16 more bytes per row; VEX load zeroes the upper half
|
||||
vmovdqu xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
|
||||
mov rcx, SIZEOF_YMMWORD
|
||||
jmp short .downsample
|
||||
|
||||
.columnloop_r16:
|
||||
cmp rcx, 16
|
||||
jne .columnloop_r8
|
||||
vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
|
||||
vpxor ymm2, ymm2, ymm2 ; second pair of input vectors unused
|
||||
vpxor ymm3, ymm3, ymm3
|
||||
mov rcx, SIZEOF_YMMWORD
|
||||
jmp short .downsample
|
||||
|
||||
.columnloop_r8:
|
||||
vmovdqu xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] ; 16 input bytes per row; upper halves zeroed
|
||||
vmovdqu xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
vpxor ymm2, ymm2, ymm2
|
||||
vpxor ymm3, ymm3, ymm3
|
||||
mov rcx, SIZEOF_YMMWORD
|
||||
jmp short .downsample
|
||||
|
||||
.columnloop:
|
||||
vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD] ; 64 bytes from each input row
|
||||
vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
|
||||
vmovdqu ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
|
||||
vmovdqu ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]
|
||||
|
||||
.downsample:
|
||||
; First 32 input columns: horizontal pair sums for each row.
vpand ymm4, ymm0, ymm6 ; even-indexed samples of row 0
|
||||
vpsrlw ymm0, ymm0, BYTE_BIT ; odd-indexed samples of row 0
|
||||
vpand ymm5, ymm1, ymm6 ; even-indexed samples of row 1
|
||||
vpsrlw ymm1, ymm1, BYTE_BIT ; odd-indexed samples of row 1
|
||||
vpaddw ymm0, ymm0, ymm4
|
||||
vpaddw ymm1, ymm1, ymm5
|
||||
|
||||
; Next 32 input columns: same treatment.
vpand ymm4, ymm2, ymm6
|
||||
vpsrlw ymm2, ymm2, BYTE_BIT
|
||||
vpand ymm5, ymm3, ymm6
|
||||
vpsrlw ymm3, ymm3, BYTE_BIT
|
||||
vpaddw ymm2, ymm2, ymm4
|
||||
vpaddw ymm3, ymm3, ymm5
|
||||
|
||||
vpaddw ymm0, ymm0, ymm1 ; add the two rows: 2x2 block sums
|
||||
vpaddw ymm2, ymm2, ymm3
|
||||
vpaddw ymm0, ymm0, ymm7 ; + alternating bias
|
||||
vpaddw ymm2, ymm2, ymm7
|
||||
vpsrlw ymm0, ymm0, 2 ; / 4
|
||||
vpsrlw ymm2, ymm2, 2
|
||||
|
||||
vpackuswb ymm0, ymm0, ymm2 ; packs within each 128-bit lane
|
||||
vpermq ymm0, ymm0, 0xd8 ; qword order 0,2,1,3: restore linear column order
|
||||
|
||||
; The store is a full 32 bytes even on the tail paths.
; NOTE(review): presumably the output rows are padded for this -- confirm.
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
|
||||
|
||||
sub rcx, byte SIZEOF_YMMWORD ; outcol
|
||||
add rdx, byte 2*SIZEOF_YMMWORD ; inptr0
|
||||
add rsi, byte 2*SIZEOF_YMMWORD ; inptr1
|
||||
add rdi, byte 1*SIZEOF_YMMWORD ; outptr
|
||||
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jae near .columnloop
|
||||
test rcx, rcx
|
||||
jnz near .columnloop_r24 ; 8, 16 or 24 columns left
|
||||
|
||||
pop rsi
|
||||
pop rdi
|
||||
pop rcx
|
||||
|
||||
add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
|
||||
add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
|
||||
dec rax ; rowctr
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
vzeroupper ; clear upper YMM state before returning to non-AVX code
|
||||
uncollect_args 6
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
329
TMessagesProj/jni/mozjpeg/simd/x86_64/jcsample-sse2.asm
Normal file
329
TMessagesProj/jni/mozjpeg/simd/x86_64/jcsample-sse2.asm
Normal file
|
|
@ -0,0 +1,329 @@
|
|||
;
|
||||
; jcsample.asm - downsampling (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Downsample pixel values of a single component.
|
||||
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
|
||||
; without smoothing.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
|
||||
; JDIMENSION v_samp_factor,
|
||||
; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
|
||||
; JSAMPARRAY output_data);
|
||||
;
|
||||
|
||||
; r10d = JDIMENSION image_width
|
||||
; r11 = int max_v_samp_factor
|
||||
; r12d = JDIMENSION v_samp_factor
|
||||
; r13d = JDIMENSION width_in_blocks
|
||||
; r14 = JSAMPARRAY input_data
|
||||
; r15 = JSAMPARRAY output_data
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
|
||||
|
||||
EXTN(jsimd_h2v1_downsample_sse2):
; Purpose: 2:1 horizontal / 1:1 vertical downsampling of one component.
; Each output sample is the biased average of two horizontally adjacent
; input samples: out = (even + odd + bias) >> 1, with bias alternating
; 0, 1, 0, 1, ... across output columns.
;
; Register roles (argument registers as listed in the comment block that
; precedes this function):
;   rcx = output_cols (width_in_blocks * 8), later the per-row column counter
;   rdx = image_width while padding the right edge
;   rax = row counter (max_v_samp_factor while padding, v_samp_factor after)
;   rsi = input_data cursor / inptr,  rdi = output_data cursor / outptr
;   xmm6 = 0x00FF word mask, xmm7 = bias words
;
; All row accesses use movdqa, so every row pointer must be 16-byte
; aligned.
; NOTE(review): collect_args / uncollect_args are macros defined outside
; this view; their register save/restore behavior is not visible here.
|
||||
push rbp
|
||||
mov rax, rsp ; rax = rsp after the push (presumably read by collect_args -- confirm)
|
||||
mov rbp, rsp
|
||||
collect_args 6
|
||||
|
||||
mov ecx, r13d ; rcx = width_in_blocks (zero-extended)
|
||||
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
|
||||
jz near .return ; nothing to do for a zero-width component
|
||||
|
||||
mov edx, r10d ; rdx = image_width (zero-extended)
|
||||
|
||||
; -- expand_right_edge
|
||||
|
||||
push rcx ; keep output_cols for the downsample phase
|
||||
shl rcx, 1 ; output_cols * 2
|
||||
sub rcx, rdx ; rcx = number of padding bytes needed per row
|
||||
jle short .expand_end ; rows are already wide enough
|
||||
|
||||
; NOTE(review): max_v_samp_factor is documented as int, but all 64 bits
; of r11 are tested here -- confirm the upper half is well defined.
mov rax, r11 ; rax = max_v_samp_factor = rows to pad
|
||||
test rax, rax
|
||||
jle short .expand_end
|
||||
|
||||
cld ; make rep stosb write forward
|
||||
mov rsi, r14 ; input_data
|
||||
.expandloop:
|
||||
push rax ; al is used as the fill byte below
|
||||
push rcx ; rep stosb consumes rcx
|
||||
|
||||
mov rdi, JSAMPROW [rsi]
|
||||
add rdi, rdx ; rdi = first byte past the real samples
|
||||
mov al, JSAMPLE [rdi-1] ; al = last real sample of this row
|
||||
|
||||
rep stosb ; replicate it rcx times
|
||||
|
||||
pop rcx
|
||||
pop rax
|
||||
|
||||
add rsi, byte SIZEOF_JSAMPROW ; next row pointer
|
||||
dec rax
|
||||
jg short .expandloop
|
||||
|
||||
.expand_end:
|
||||
pop rcx ; output_cols
|
||||
|
||||
; -- h2v1_downsample
|
||||
|
||||
mov eax, r12d ; rowctr
|
||||
test eax, eax
|
||||
jle near .return
|
||||
|
||||
mov rdx, 0x00010000 ; bias pattern
|
||||
movd xmm7, edx
|
||||
pcmpeqw xmm6, xmm6 ; all ones
|
||||
pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
|
||||
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
|
||||
|
||||
mov rsi, r14 ; input_data
|
||||
mov rdi, r15 ; output_data
|
||||
.rowloop:
|
||||
push rcx ; saved: the column loop below destroys rcx/rdi/rsi
|
||||
push rdi
|
||||
push rsi
|
||||
|
||||
mov rsi, JSAMPROW [rsi] ; inptr
|
||||
mov rdi, JSAMPROW [rdi] ; outptr
|
||||
|
||||
cmp rcx, byte SIZEOF_XMMWORD
|
||||
jae short .columnloop ; at least 16 output columns left: full step
|
||||
|
||||
; Tail: fewer than 16 output columns remain.  Load one vector, zero the
; other, and force rcx to SIZEOF_XMMWORD so the subtraction after the
; store ends the row.
.columnloop_r8:
|
||||
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
pxor xmm1, xmm1 ; second input vector unused
|
||||
mov rcx, SIZEOF_XMMWORD
|
||||
jmp short .downsample
|
||||
|
||||
.columnloop:
|
||||
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; 32 input bytes -> 16 output bytes
|
||||
movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||
|
||||
.downsample:
|
||||
movdqa xmm2, xmm0 ; copies that will hold the odd-indexed samples
|
||||
movdqa xmm3, xmm1
|
||||
|
||||
pand xmm0, xmm6 ; even-indexed samples as words
|
||||
psrlw xmm2, BYTE_BIT ; odd-indexed samples as words
|
||||
pand xmm1, xmm6
|
||||
psrlw xmm3, BYTE_BIT
|
||||
|
||||
paddw xmm0, xmm2 ; even + odd
|
||||
paddw xmm1, xmm3
|
||||
paddw xmm0, xmm7 ; + alternating bias
|
||||
paddw xmm1, xmm7
|
||||
psrlw xmm0, 1 ; / 2
|
||||
psrlw xmm1, 1
|
||||
|
||||
packuswb xmm0, xmm1 ; words -> 16 output bytes
|
||||
|
||||
; The store is a full 16 bytes even on the tail path.
; NOTE(review): presumably the output rows are padded for this -- confirm.
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
|
||||
|
||||
sub rcx, byte SIZEOF_XMMWORD ; outcol
|
||||
add rsi, byte 2*SIZEOF_XMMWORD ; inptr
|
||||
add rdi, byte 1*SIZEOF_XMMWORD ; outptr
|
||||
cmp rcx, byte SIZEOF_XMMWORD
|
||||
jae short .columnloop
|
||||
test rcx, rcx
|
||||
jnz short .columnloop_r8 ; a partial vector of columns is left
|
||||
|
||||
pop rsi
|
||||
pop rdi
|
||||
pop rcx
|
||||
|
||||
add rsi, byte SIZEOF_JSAMPROW ; input_data
|
||||
add rdi, byte SIZEOF_JSAMPROW ; output_data
|
||||
dec rax ; rowctr
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
uncollect_args 6
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Downsample pixel values of a single component.
|
||||
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
|
||||
; without smoothing.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
|
||||
; JDIMENSION v_samp_factor,
|
||||
; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
|
||||
; JSAMPARRAY output_data);
|
||||
;
|
||||
|
||||
; r10d = JDIMENSION image_width
|
||||
; r11 = int max_v_samp_factor
|
||||
; r12d = JDIMENSION v_samp_factor
|
||||
; r13d = JDIMENSION width_in_blocks
|
||||
; r14 = JSAMPARRAY input_data
|
||||
; r15 = JSAMPARRAY output_data
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
|
||||
|
||||
EXTN(jsimd_h2v2_downsample_sse2):
; Purpose: 2:1 horizontal / 2:1 vertical downsampling of one component.
; Each output sample is the biased average of a 2x2 block of input
; samples: out = (sum of 4 + bias) >> 2, with bias alternating
; 1, 2, 1, 2, ... across output columns.
;
; Register roles (argument registers as listed in the comment block that
; precedes this function):
;   rcx = output_cols (width_in_blocks * 8), later the per-row column counter
;   rax = row counter (max_v_samp_factor while padding, v_samp_factor after)
;   rdx = image_width while padding, then inptr0 (upper input row)
;   rsi = input_data cursor / inptr1 (lower input row)
;   rdi = output_data cursor / outptr
;   xmm6 = 0x00FF word mask, xmm7 = bias words
;
; All row accesses use movdqa, so every row pointer must be 16-byte
; aligned.
; NOTE(review): collect_args / uncollect_args are macros defined outside
; this view; their register save/restore behavior is not visible here.
|
||||
push rbp
|
||||
mov rax, rsp ; rax = rsp after the push (presumably read by collect_args -- confirm)
|
||||
mov rbp, rsp
|
||||
collect_args 6
|
||||
|
||||
mov ecx, r13d ; rcx = width_in_blocks (zero-extended)
|
||||
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
|
||||
jz near .return ; nothing to do for a zero-width component
|
||||
|
||||
mov edx, r10d ; rdx = image_width (zero-extended)
|
||||
|
||||
; -- expand_right_edge
|
||||
|
||||
push rcx ; keep output_cols for the downsample phase
|
||||
shl rcx, 1 ; output_cols * 2
|
||||
sub rcx, rdx ; rcx = number of padding bytes needed per row
|
||||
jle short .expand_end ; rows are already wide enough
|
||||
|
||||
; NOTE(review): max_v_samp_factor is documented as int, but all 64 bits
; of r11 are tested here -- confirm the upper half is well defined.
mov rax, r11 ; rax = max_v_samp_factor = rows to pad
|
||||
test rax, rax
|
||||
jle short .expand_end
|
||||
|
||||
cld ; make rep stosb write forward
|
||||
mov rsi, r14 ; input_data
|
||||
.expandloop:
|
||||
push rax ; al is used as the fill byte below
|
||||
push rcx ; rep stosb consumes rcx
|
||||
|
||||
mov rdi, JSAMPROW [rsi]
|
||||
add rdi, rdx ; rdi = first byte past the real samples
|
||||
mov al, JSAMPLE [rdi-1] ; al = last real sample of this row
|
||||
|
||||
rep stosb ; replicate it rcx times
|
||||
|
||||
pop rcx
|
||||
pop rax
|
||||
|
||||
add rsi, byte SIZEOF_JSAMPROW ; next row pointer
|
||||
dec rax
|
||||
jg short .expandloop
|
||||
|
||||
.expand_end:
|
||||
pop rcx ; output_cols
|
||||
|
||||
; -- h2v2_downsample
|
||||
|
||||
mov eax, r12d ; rowctr
|
||||
test rax, rax
|
||||
jle near .return
|
||||
|
||||
mov rdx, 0x00020001 ; bias pattern
|
||||
movd xmm7, edx
|
||||
pcmpeqw xmm6, xmm6 ; all ones
|
||||
pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
|
||||
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
|
||||
|
||||
mov rsi, r14 ; input_data
|
||||
mov rdi, r15 ; output_data
|
||||
.rowloop:
|
||||
push rcx ; saved: the column loop below destroys rcx/rdi/rsi
|
||||
push rdi
|
||||
push rsi
|
||||
|
||||
mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
|
||||
mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
|
||||
mov rdi, JSAMPROW [rdi] ; outptr
|
||||
|
||||
cmp rcx, byte SIZEOF_XMMWORD
|
||||
jae short .columnloop ; at least 16 output columns left: full step
|
||||
|
||||
; Tail: fewer than 16 output columns remain.  Load one vector per row,
; zero the others, and force rcx to SIZEOF_XMMWORD so the subtraction
; after the store ends the row.
.columnloop_r8:
|
||||
movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
|
||||
movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
pxor xmm2, xmm2 ; second pair of input vectors unused
|
||||
pxor xmm3, xmm3
|
||||
mov rcx, SIZEOF_XMMWORD
|
||||
jmp short .downsample
|
||||
|
||||
.columnloop:
|
||||
movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] ; 32 bytes from each input row
|
||||
movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
|
||||
movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||
|
||||
.downsample:
|
||||
; First 16 input columns: horizontal pair sums for each row.
movdqa xmm4, xmm0
|
||||
movdqa xmm5, xmm1
|
||||
pand xmm0, xmm6 ; even-indexed samples of row 0
|
||||
psrlw xmm4, BYTE_BIT ; odd-indexed samples of row 0
|
||||
pand xmm1, xmm6 ; even-indexed samples of row 1
|
||||
psrlw xmm5, BYTE_BIT ; odd-indexed samples of row 1
|
||||
paddw xmm0, xmm4
|
||||
paddw xmm1, xmm5
|
||||
|
||||
; Next 16 input columns: same treatment.
movdqa xmm4, xmm2
|
||||
movdqa xmm5, xmm3
|
||||
pand xmm2, xmm6
|
||||
psrlw xmm4, BYTE_BIT
|
||||
pand xmm3, xmm6
|
||||
psrlw xmm5, BYTE_BIT
|
||||
paddw xmm2, xmm4
|
||||
paddw xmm3, xmm5
|
||||
|
||||
paddw xmm0, xmm1 ; add the two rows: 2x2 block sums
|
||||
paddw xmm2, xmm3
|
||||
paddw xmm0, xmm7 ; + alternating bias
|
||||
paddw xmm2, xmm7
|
||||
psrlw xmm0, 2 ; / 4
|
||||
psrlw xmm2, 2
|
||||
|
||||
packuswb xmm0, xmm2 ; words -> 16 output bytes
|
||||
|
||||
; The store is a full 16 bytes even on the tail path.
; NOTE(review): presumably the output rows are padded for this -- confirm.
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
|
||||
|
||||
sub rcx, byte SIZEOF_XMMWORD ; outcol
|
||||
add rdx, byte 2*SIZEOF_XMMWORD ; inptr0
|
||||
add rsi, byte 2*SIZEOF_XMMWORD ; inptr1
|
||||
add rdi, byte 1*SIZEOF_XMMWORD ; outptr
|
||||
cmp rcx, byte SIZEOF_XMMWORD
|
||||
jae near .columnloop
|
||||
test rcx, rcx
|
||||
jnz near .columnloop_r8 ; a partial vector of columns is left
|
||||
|
||||
pop rsi
|
||||
pop rdi
|
||||
pop rcx
|
||||
|
||||
add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
|
||||
add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
|
||||
dec rax ; rowctr
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
uncollect_args 6
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
495
TMessagesProj/jni/mozjpeg/simd/x86_64/jdcolext-avx2.asm
Normal file
495
TMessagesProj/jni/mozjpeg/simd/x86_64/jdcolext-avx2.asm
Normal file
|
|
@ -0,0 +1,495 @@
|
|||
;
|
||||
; jdcolext.asm - colorspace conversion (64-bit AVX2)
|
||||
;
|
||||
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
|
||||
; JDIMENSION input_row, JSAMPARRAY output_buf,
|
||||
; int num_rows)
|
||||
;
|
||||
|
||||
; r10d = JDIMENSION out_width
|
||||
; r11 = JSAMPIMAGE input_buf
|
||||
; r12d = JDIMENSION input_row
|
||||
; r13 = JSAMPARRAY output_buf
|
||||
; r14d = int num_rows
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
|
||||
|
||||
EXTN(jsimd_ycc_rgb_convert_avx2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args 5
|
||||
push rbx
|
||||
|
||||
mov ecx, r10d ; num_cols
|
||||
test rcx, rcx
|
||||
jz near .return
|
||||
|
||||
push rcx
|
||||
|
||||
mov rdi, r11
|
||||
mov ecx, r12d
|
||||
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
|
||||
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
|
||||
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
|
||||
lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
|
||||
lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
|
||||
lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
|
||||
|
||||
pop rcx
|
||||
|
||||
mov rdi, r13
|
||||
mov eax, r14d
|
||||
test rax, rax
|
||||
jle near .return
|
||||
.rowloop:
|
||||
push rax
|
||||
push rdi
|
||||
push rdx
|
||||
push rbx
|
||||
push rsi
|
||||
push rcx ; col
|
||||
|
||||
mov rsi, JSAMPROW [rsi] ; inptr0
|
||||
mov rbx, JSAMPROW [rbx] ; inptr1
|
||||
mov rdx, JSAMPROW [rdx] ; inptr2
|
||||
mov rdi, JSAMPROW [rdi] ; outptr
|
||||
.columnloop:
|
||||
|
||||
vmovdqu ymm5, YMMWORD [rbx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
|
||||
vmovdqu ymm1, YMMWORD [rdx] ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
|
||||
|
||||
vpcmpeqw ymm0, ymm0, ymm0
|
||||
vpcmpeqw ymm7, ymm7, ymm7
|
||||
vpsrlw ymm0, ymm0, BYTE_BIT ; ymm0={0xFF 0x00 0xFF 0x00 ..}
|
||||
vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
||||
|
||||
vpand ymm4, ymm0, ymm5 ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
|
||||
vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
|
||||
vpand ymm0, ymm0, ymm1 ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
|
||||
vpsrlw ymm1, ymm1, BYTE_BIT ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
|
||||
|
||||
vpaddw ymm2, ymm4, ymm7
|
||||
vpaddw ymm3, ymm5, ymm7
|
||||
vpaddw ymm6, ymm0, ymm7
|
||||
vpaddw ymm7, ymm1, ymm7
|
||||
|
||||
; (Original)
|
||||
; R = Y + 1.40200 * Cr
|
||||
; G = Y - 0.34414 * Cb - 0.71414 * Cr
|
||||
; B = Y + 1.77200 * Cb
|
||||
;
|
||||
; (This implementation)
|
||||
; R = Y + 0.40200 * Cr + Cr
|
||||
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
||||
; B = Y - 0.22800 * Cb + Cb + Cb
|
||||
|
||||
vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbE
|
||||
vpaddw ymm5, ymm3, ymm3 ; ymm5=2*CbO
|
||||
vpaddw ymm0, ymm6, ymm6 ; ymm0=2*CrE
|
||||
vpaddw ymm1, ymm7, ymm7 ; ymm1=2*CrO
|
||||
|
||||
vpmulhw ymm4, ymm4, [rel PW_MF0228] ; ymm4=(2*CbE * -FIX(0.22800))
|
||||
vpmulhw ymm5, ymm5, [rel PW_MF0228] ; ymm5=(2*CbO * -FIX(0.22800))
|
||||
vpmulhw ymm0, ymm0, [rel PW_F0402] ; ymm0=(2*CrE * FIX(0.40200))
|
||||
vpmulhw ymm1, ymm1, [rel PW_F0402] ; ymm1=(2*CrO * FIX(0.40200))
|
||||
|
||||
vpaddw ymm4, ymm4, [rel PW_ONE]
|
||||
vpaddw ymm5, ymm5, [rel PW_ONE]
|
||||
vpsraw ymm4, ymm4, 1 ; ymm4=(CbE * -FIX(0.22800))
|
||||
vpsraw ymm5, ymm5, 1 ; ymm5=(CbO * -FIX(0.22800))
|
||||
vpaddw ymm0, ymm0, [rel PW_ONE]
|
||||
vpaddw ymm1, ymm1, [rel PW_ONE]
|
||||
vpsraw ymm0, ymm0, 1 ; ymm0=(CrE * FIX(0.40200))
|
||||
vpsraw ymm1, ymm1, 1 ; ymm1=(CrO * FIX(0.40200))
|
||||
|
||||
vpaddw ymm4, ymm4, ymm2
|
||||
vpaddw ymm5, ymm5, ymm3
|
||||
vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
|
||||
vpaddw ymm5, ymm5, ymm3 ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
|
||||
vpaddw ymm0, ymm0, ymm6 ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
|
||||
vpaddw ymm1, ymm1, ymm7 ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
|
||||
|
||||
vmovdqa YMMWORD [wk(0)], ymm4 ; wk(0)=(B-Y)E
|
||||
vmovdqa YMMWORD [wk(1)], ymm5 ; wk(1)=(B-Y)O
|
||||
|
||||
vpunpckhwd ymm4, ymm2, ymm6
|
||||
vpunpcklwd ymm2, ymm2, ymm6
|
||||
vpmaddwd ymm2, ymm2, [rel PW_MF0344_F0285]
|
||||
vpmaddwd ymm4, ymm4, [rel PW_MF0344_F0285]
|
||||
vpunpckhwd ymm5, ymm3, ymm7
|
||||
vpunpcklwd ymm3, ymm3, ymm7
|
||||
vpmaddwd ymm3, ymm3, [rel PW_MF0344_F0285]
|
||||
vpmaddwd ymm5, ymm5, [rel PW_MF0344_F0285]
|
||||
|
||||
vpaddd ymm2, ymm2, [rel PD_ONEHALF]
|
||||
vpaddd ymm4, ymm4, [rel PD_ONEHALF]
|
||||
vpsrad ymm2, ymm2, SCALEBITS
|
||||
vpsrad ymm4, ymm4, SCALEBITS
|
||||
vpaddd ymm3, ymm3, [rel PD_ONEHALF]
|
||||
vpaddd ymm5, ymm5, [rel PD_ONEHALF]
|
||||
vpsrad ymm3, ymm3, SCALEBITS
|
||||
vpsrad ymm5, ymm5, SCALEBITS
|
||||
|
||||
vpackssdw ymm2, ymm2, ymm4 ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
|
||||
vpackssdw ymm3, ymm3, ymm5 ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
|
||||
vpsubw ymm2, ymm2, ymm6 ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
|
||||
vpsubw ymm3, ymm3, ymm7 ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
|
||||
|
||||
vmovdqu ymm5, YMMWORD [rsi] ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
|
||||
|
||||
vpcmpeqw ymm4, ymm4, ymm4
|
||||
vpsrlw ymm4, ymm4, BYTE_BIT ; ymm4={0xFF 0x00 0xFF 0x00 ..}
|
||||
vpand ymm4, ymm4, ymm5 ; ymm4=Y(02468ACEGIKMOQSU)=YE
|
||||
vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Y(13579BDFHJLNPRTV)=YO
|
||||
|
||||
vpaddw ymm0, ymm0, ymm4 ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
|
||||
vpaddw ymm1, ymm1, ymm5 ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
|
||||
vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
|
||||
vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
|
||||
|
||||
vpaddw ymm2, ymm2, ymm4 ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
|
||||
vpaddw ymm3, ymm3, ymm5 ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
|
||||
vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
|
||||
vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
|
||||
|
||||
vpaddw ymm4, ymm4, YMMWORD [wk(0)] ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
|
||||
vpaddw ymm5, ymm5, YMMWORD [wk(1)] ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
|
||||
vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
|
||||
vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
|
||||
; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
|
||||
; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
|
||||
; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
|
||||
; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
|
||||
; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
|
||||
; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
|
||||
; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
|
||||
|
||||
vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
|
||||
; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
|
||||
vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
|
||||
; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
|
||||
vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
|
||||
; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
|
||||
|
||||
vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
|
||||
; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
|
||||
vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
|
||||
; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
|
||||
vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
|
||||
; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
|
||||
|
||||
vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
|
||||
; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
|
||||
|
||||
vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
|
||||
; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
|
||||
vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
|
||||
; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
|
||||
vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
|
||||
; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
|
||||
|
||||
vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
|
||||
; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
|
||||
vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
|
||||
; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
|
||||
|
||||
vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
|
||||
; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
|
||||
vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
|
||||
vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
|
||||
; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
|
||||
vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
|
||||
; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
|
||||
|
||||
vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
|
||||
; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
|
||||
vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
|
||||
; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
|
||||
vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
|
||||
; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
|
||||
vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
|
||||
|
||||
vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
|
||||
vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
|
||||
; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
|
||||
vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
|
||||
|
||||
vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
|
||||
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
|
||||
vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
|
||||
|
||||
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jb short .column_st64
|
||||
|
||||
test rdi, SIZEOF_YMMWORD-1
|
||||
jnz short .out1
|
||||
; --(aligned)-------------------
|
||||
vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
|
||||
.out0:
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
|
||||
sub rcx, byte SIZEOF_YMMWORD
|
||||
jz near .nextrow
|
||||
|
||||
add rsi, byte SIZEOF_YMMWORD ; inptr0
|
||||
add rbx, byte SIZEOF_YMMWORD ; inptr1
|
||||
add rdx, byte SIZEOF_YMMWORD ; inptr2
|
||||
jmp near .columnloop
|
||||
|
||||
.column_st64:
|
||||
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp rcx, byte 2*SIZEOF_YMMWORD
|
||||
jb short .column_st32
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
add rdi, byte 2*SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmF
|
||||
sub rcx, byte 2*SIZEOF_YMMWORD
|
||||
jmp short .column_st31
|
||||
.column_st32:
|
||||
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jb short .column_st31
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
add rdi, byte SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmD
|
||||
sub rcx, byte SIZEOF_YMMWORD
|
||||
jmp short .column_st31
|
||||
.column_st31:
|
||||
cmp rcx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st15
|
||||
vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
vperm2i128 ymmA, ymmA, ymmA, 1
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
.column_st15:
|
||||
; Store the lower 8 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp rcx, byte SIZEOF_MMWORD
|
||||
jb short .column_st7
|
||||
vmovq XMM_MMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_MMWORD
|
||||
sub rcx, byte SIZEOF_MMWORD
|
||||
vpsrldq xmmA, xmmA, SIZEOF_MMWORD
|
||||
.column_st7:
|
||||
; Store the lower 4 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp rcx, byte SIZEOF_DWORD
|
||||
jb short .column_st3
|
||||
vmovd XMM_DWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_DWORD
|
||||
sub rcx, byte SIZEOF_DWORD
|
||||
vpsrldq xmmA, xmmA, SIZEOF_DWORD
|
||||
.column_st3:
|
||||
; Store the lower 2 bytes of rax to the output when it has enough
|
||||
; space.
|
||||
vmovd eax, xmmA
|
||||
cmp rcx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov word [rdi], ax
|
||||
add rdi, byte SIZEOF_WORD
|
||||
sub rcx, byte SIZEOF_WORD
|
||||
shr rax, 16
|
||||
.column_st1:
|
||||
; Store the lower 1 byte of rax to the output when it has enough
|
||||
; space.
|
||||
test rcx, rcx
|
||||
jz short .nextrow
|
||||
mov byte [rdi], al
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
%ifdef RGBX_FILLER_0XFF
|
||||
vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
|
||||
vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
|
||||
%else
|
||||
vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
|
||||
vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
|
||||
%endif
|
||||
; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
|
||||
; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
|
||||
; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
|
||||
; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
|
||||
; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
|
||||
; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
|
||||
; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
|
||||
; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
|
||||
|
||||
vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
|
||||
; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
|
||||
vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
|
||||
; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
|
||||
vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
|
||||
; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
|
||||
vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
|
||||
; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
|
||||
|
||||
vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
|
||||
; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
|
||||
vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
|
||||
; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
|
||||
vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
|
||||
; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
|
||||
vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
|
||||
; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
|
||||
|
||||
vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
|
||||
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
|
||||
vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
|
||||
; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
|
||||
vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
|
||||
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
|
||||
vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
|
||||
; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
|
||||
|
||||
vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
|
||||
; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||
vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
|
||||
; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||
vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
|
||||
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
|
||||
vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
|
||||
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
|
||||
|
||||
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jb short .column_st64
|
||||
|
||||
test rdi, SIZEOF_YMMWORD-1
|
||||
jnz short .out1
|
||||
; --(aligned)-------------------
|
||||
vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
|
||||
vmovntdq YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
|
||||
vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
|
||||
.out0:
|
||||
add rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
|
||||
sub rcx, byte SIZEOF_YMMWORD
|
||||
jz near .nextrow
|
||||
|
||||
add rsi, byte SIZEOF_YMMWORD ; inptr0
|
||||
add rbx, byte SIZEOF_YMMWORD ; inptr1
|
||||
add rdx, byte SIZEOF_YMMWORD ; inptr2
|
||||
jmp near .columnloop
|
||||
|
||||
.column_st64:
|
||||
cmp rcx, byte SIZEOF_YMMWORD/2
|
||||
jb short .column_st32
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
add rdi, byte 2*SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmC
|
||||
vmovdqa ymmD, ymmH
|
||||
sub rcx, byte SIZEOF_YMMWORD/2
|
||||
.column_st32:
|
||||
cmp rcx, byte SIZEOF_YMMWORD/4
|
||||
jb short .column_st16
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
add rdi, byte SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmD
|
||||
sub rcx, byte SIZEOF_YMMWORD/4
|
||||
.column_st16:
|
||||
cmp rcx, byte SIZEOF_YMMWORD/8
|
||||
jb short .column_st15
|
||||
vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
vperm2i128 ymmA, ymmA, ymmA, 1
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
sub rcx, byte SIZEOF_YMMWORD/8
|
||||
.column_st15:
|
||||
; Store two pixels (8 bytes) of ymmA to the output when it has enough
|
||||
; space.
|
||||
cmp rcx, byte SIZEOF_YMMWORD/16
|
||||
jb short .column_st7
|
||||
vmovq MMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_YMMWORD/16*4
|
||||
sub rcx, byte SIZEOF_YMMWORD/16
|
||||
vpsrldq xmmA, SIZEOF_YMMWORD/16*4
|
||||
.column_st7:
|
||||
; Store one pixel (4 bytes) of ymmA to the output when it has enough
|
||||
; space.
|
||||
test rcx, rcx
|
||||
jz short .nextrow
|
||||
vmovd XMM_DWORD [rdi], xmmA
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
.nextrow:
|
||||
pop rcx
|
||||
pop rsi
|
||||
pop rbx
|
||||
pop rdx
|
||||
pop rdi
|
||||
pop rax
|
||||
|
||||
add rsi, byte SIZEOF_JSAMPROW
|
||||
add rbx, byte SIZEOF_JSAMPROW
|
||||
add rdx, byte SIZEOF_JSAMPROW
|
||||
add rdi, byte SIZEOF_JSAMPROW ; output_buf
|
||||
dec rax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
sfence ; flush the write buffer
|
||||
|
||||
.return:
|
||||
pop rbx
|
||||
vzeroupper
|
||||
uncollect_args 5
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
438
TMessagesProj/jni/mozjpeg/simd/x86_64/jdcolext-sse2.asm
Normal file
438
TMessagesProj/jni/mozjpeg/simd/x86_64/jdcolext-sse2.asm
Normal file
|
|
@ -0,0 +1,438 @@
|
|||
;
|
||||
; jdcolext.asm - colorspace conversion (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
;                            JDIMENSION input_row, JSAMPARRAY output_buf,
;                            int num_rows)
;
; Converts num_rows rows of planar Y/Cb/Cr samples (input_buf[0], [1], [2],
; starting at row input_row) into packed pixels of RGB_PIXELSIZE bytes each,
; written to the rows of output_buf.  16 pixels are processed per column-loop
; iteration.
;
; Arguments after "collect_args 5" (macro defined outside this file):
;
; r10d = JDIMENSION out_width
; r11 = JSAMPIMAGE input_buf
; r12d = JDIMENSION input_row
; r13 = JSAMPARRAY output_buf
; r14d = int num_rows
;
; Notes:
;   * Returns immediately when out_width == 0 or num_rows <= 0.
;   * Input rows are read with movdqa in whole 16-byte groups, so every
;     input row pointer must be 16-byte aligned and readable up to the next
;     multiple of 16 columns.
;   * Output rows may have any alignment: full 16-pixel groups use
;     non-temporal stores when outptr is 16-byte aligned and movdqu
;     otherwise; the final partial group is written piecewise so that exactly
;     out_width * RGB_PIXELSIZE bytes are stored per row.
;   * Uses rax, rbx, rcx, rdx, rsi, rdi and xmm0-xmm7 as scratch.  rbx and
;     rbp are restored here; presumably collect_args/uncollect_args take care
;     of anything else the target ABI requires -- confirm in jsimdext.inc.
;   * xmmA..xmmH are register aliases not defined in this file (presumably
;     set up by jcolsamp.inc from the RGB_* macros -- confirm there).
;

%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2

    align       32
    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)

EXTN(jsimd_ycc_rgb_convert_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push
                                             ;       (address of the saved rbp)
    sub         rsp, byte 4                  ; with rsp 8-byte aligned, this
    and         rsp, byte (-SIZEOF_XMMWORD)  ; plus the round-down leaves >= 8
                                             ; bytes for the slot written next
                                             ; and aligns rsp to 128 bits
    mov         [rsp], rax                   ; remember the unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [wk(0)]                 ; reserve wk[WK_NUM] below rbp
    collect_args 5
    push        rbx

    mov         ecx, r10d               ; num_cols
    test        rcx, rcx
    jz          near .return

    push        rcx                     ; rcx is borrowed for input_row below

    mov         rdi, r11
    mov         ecx, r12d
    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]   ; &input_buf[0][input_row]
    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]   ; &input_buf[1][input_row]
    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]   ; &input_buf[2][input_row]

    pop         rcx                     ; rcx = num_cols again

    mov         rdi, r13
    mov         eax, r14d               ; zero-extends num_rows into rax
    ; Test the 32-bit value: after the zero-extending mov above, a 64-bit
    ; test could never report a negative int num_rows, and the row loop
    ; would then run ~2^32 times.
    test        eax, eax
    jle         near .return
.rowloop:
    ; Save the per-row state; .nextrow pops it in reverse order.
    push        rax
    push        rdi
    push        rdx
    push        rbx
    push        rsi
    push        rcx                     ; col

    mov         rsi, JSAMPROW [rsi]     ; inptr0
    mov         rbx, JSAMPROW [rbx]     ; inptr1
    mov         rdx, JSAMPROW [rdx]     ; inptr2
    mov         rdi, JSAMPROW [rdi]     ; outptr
.columnloop:

    movdqa      xmm5, XMMWORD [rbx]     ; xmm5=Cb(0123456789ABCDEF)
    movdqa      xmm1, XMMWORD [rdx]     ; xmm1=Cr(0123456789ABCDEF)

    pcmpeqw     xmm4, xmm4
    pcmpeqw     xmm7, xmm7
    psrlw       xmm4, BYTE_BIT
    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
    movdqa      xmm0, xmm4              ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}

    ; Split the chroma bytes into even/odd columns, widened to words.
    pand        xmm4, xmm5              ; xmm4=Cb(02468ACE)=CbE
    psrlw       xmm5, BYTE_BIT          ; xmm5=Cb(13579BDF)=CbO
    pand        xmm0, xmm1              ; xmm0=Cr(02468ACE)=CrE
    psrlw       xmm1, BYTE_BIT          ; xmm1=Cr(13579BDF)=CrO

    ; Adding 0xFF80 (= -128 as a signed word) centers the chroma around 0.
    paddw       xmm4, xmm7
    paddw       xmm5, xmm7
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7

    ; (Original)
    ; R = Y                + 1.40200 * Cr
    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
    ; B = Y + 1.77200 * Cb
    ;
    ; (This implementation)
    ; R = Y                + 0.40200 * Cr + Cr
    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    ; B = Y - 0.22800 * Cb + Cb + Cb

    movdqa      xmm2, xmm4              ; xmm2=CbE
    movdqa      xmm3, xmm5              ; xmm3=CbO
    paddw       xmm4, xmm4              ; xmm4=2*CbE
    paddw       xmm5, xmm5              ; xmm5=2*CbO
    movdqa      xmm6, xmm0              ; xmm6=CrE
    movdqa      xmm7, xmm1              ; xmm7=CrO
    paddw       xmm0, xmm0              ; xmm0=2*CrE
    paddw       xmm1, xmm1              ; xmm1=2*CrO

    ; pmulhw keeps the high word of the product; the operands were doubled
    ; above so that the "+1, >>1" below rounds the result to nearest.
    pmulhw      xmm4, [rel PW_MF0228]   ; xmm4=(2*CbE * -FIX(0.22800))
    pmulhw      xmm5, [rel PW_MF0228]   ; xmm5=(2*CbO * -FIX(0.22800))
    pmulhw      xmm0, [rel PW_F0402]    ; xmm0=(2*CrE * FIX(0.40200))
    pmulhw      xmm1, [rel PW_F0402]    ; xmm1=(2*CrO * FIX(0.40200))

    paddw       xmm4, [rel PW_ONE]
    paddw       xmm5, [rel PW_ONE]
    psraw       xmm4, 1                 ; xmm4=(CbE * -FIX(0.22800))
    psraw       xmm5, 1                 ; xmm5=(CbO * -FIX(0.22800))
    paddw       xmm0, [rel PW_ONE]
    paddw       xmm1, [rel PW_ONE]
    psraw       xmm0, 1                 ; xmm0=(CrE * FIX(0.40200))
    psraw       xmm1, 1                 ; xmm1=(CrO * FIX(0.40200))

    paddw       xmm4, xmm2
    paddw       xmm5, xmm3
    paddw       xmm4, xmm2              ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
    paddw       xmm5, xmm3              ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
    paddw       xmm0, xmm6              ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
    paddw       xmm1, xmm7              ; xmm1=(CrO * FIX(1.40200))=(R-Y)O

    ; Spill (B-Y): xmm4/xmm5 are needed as scratch for the green term.
    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O

    ; Green: interleave (Cb,Cr) word pairs so that one pmaddwd yields
    ; Cb * -FIX(0.344) + Cr * FIX(0.285) as a 32-bit value per pixel.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    punpcklwd   xmm2, xmm6
    punpckhwd   xmm4, xmm6
    pmaddwd     xmm2, [rel PW_MF0344_F0285]
    pmaddwd     xmm4, [rel PW_MF0344_F0285]
    punpcklwd   xmm3, xmm7
    punpckhwd   xmm5, xmm7
    pmaddwd     xmm3, [rel PW_MF0344_F0285]
    pmaddwd     xmm5, [rel PW_MF0344_F0285]

    ; Round (add one half) and drop the SCALEBITS fraction bits.
    paddd       xmm2, [rel PD_ONEHALF]
    paddd       xmm4, [rel PD_ONEHALF]
    psrad       xmm2, SCALEBITS
    psrad       xmm4, SCALEBITS
    paddd       xmm3, [rel PD_ONEHALF]
    paddd       xmm5, [rel PD_ONEHALF]
    psrad       xmm3, SCALEBITS
    psrad       xmm5, SCALEBITS

    packssdw    xmm2, xmm4      ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
    packssdw    xmm3, xmm5      ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
    psubw       xmm2, xmm6      ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
    psubw       xmm3, xmm7      ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O

    movdqa      xmm5, XMMWORD [rsi]     ; xmm5=Y(0123456789ABCDEF)

    pcmpeqw     xmm4, xmm4
    psrlw       xmm4, BYTE_BIT          ; xmm4={0xFF 0x00 0xFF 0x00 ..}
    pand        xmm4, xmm5              ; xmm4=Y(02468ACE)=YE
    psrlw       xmm5, BYTE_BIT          ; xmm5=Y(13579BDF)=YO

    ; Add luma to each difference; packuswb saturates the words to 0..255.
    paddw       xmm0, xmm4      ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
    paddw       xmm1, xmm5      ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
    packuswb    xmm0, xmm0      ; xmm0=R(02468ACE********)
    packuswb    xmm1, xmm1      ; xmm1=R(13579BDF********)

    paddw       xmm2, xmm4      ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
    paddw       xmm3, xmm5      ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
    packuswb    xmm2, xmm2      ; xmm2=G(02468ACE********)
    packuswb    xmm3, xmm3      ; xmm3=G(13579BDF********)

    paddw       xmm4, XMMWORD [wk(0)]  ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
    paddw       xmm5, XMMWORD [wk(1)]  ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
    packuswb    xmm4, xmm4             ; xmm4=B(02468ACE********)
    packuswb    xmm5, xmm5             ; xmm5=B(13579BDF********)

%if RGB_PIXELSIZE == 3  ; ---------------

    ; In the layout comments below "cp" means byte c (0..2) of pixel p
    ; (0..F) within the current 16-pixel group.

    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)

    punpcklbw   xmmA, xmmC      ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmB      ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
    punpcklbw   xmmD, xmmF      ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)

    movdqa      xmmG, xmmA
    movdqa      xmmH, xmmA
    punpcklwd   xmmA, xmmE      ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
    punpckhwd   xmmG, xmmE      ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)

    psrldq      xmmH, 2         ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
    psrldq      xmmE, 2         ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)

    movdqa      xmmC, xmmD
    movdqa      xmmB, xmmD
    punpcklwd   xmmD, xmmH      ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
    punpckhwd   xmmC, xmmH      ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)

    psrldq      xmmB, 2         ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)

    movdqa      xmmF, xmmE
    punpcklwd   xmmE, xmmB      ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
    punpckhwd   xmmF, xmmB      ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)

    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
    movdqa      xmmB, xmmE
    punpckldq   xmmA, xmmD      ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
    punpckldq   xmmE, xmmH      ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
    punpckhdq   xmmD, xmmB      ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)

    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
    movdqa      xmmB, xmmF
    punpckldq   xmmG, xmmC      ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
    punpckldq   xmmF, xmmH      ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
    punpckhdq   xmmC, xmmB      ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)

    punpcklqdq  xmmA, xmmE      ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    punpcklqdq  xmmD, xmmG      ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    punpcklqdq  xmmF, xmmC      ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    ; rcx = columns still to be written in this row (> 0 here).
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st32

    test        rdi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         rcx, byte SIZEOF_XMMWORD
    jz          near .nextrow

    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop

.column_st32:
    ; Fewer than 16 pixels remain.  From here on rcx counts output BYTES,
    ; and xmmA always holds the next bytes to be stored.
    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
    cmp         rcx, byte 2*SIZEOF_XMMWORD
    jb          short .column_st16
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmF
    sub         rcx, byte 2*SIZEOF_XMMWORD
    jmp         short .column_st15
.column_st16:
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st15
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    add         rdi, byte SIZEOF_XMMWORD    ; outptr
    movdqa      xmmA, xmmD
    sub         rcx, byte SIZEOF_XMMWORD
.column_st15:
    ; Store the lower 8 bytes of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_MMWORD
    jb          short .column_st7
    movq        XMM_MMWORD [rdi], xmmA
    add         rdi, byte SIZEOF_MMWORD
    sub         rcx, byte SIZEOF_MMWORD
    psrldq      xmmA, SIZEOF_MMWORD
.column_st7:
    ; Store the lower 4 bytes of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_DWORD
    jb          short .column_st3
    movd        XMM_DWORD [rdi], xmmA
    add         rdi, byte SIZEOF_DWORD
    sub         rcx, byte SIZEOF_DWORD
    psrldq      xmmA, SIZEOF_DWORD
.column_st3:
    ; Store the lower 2 bytes of rax to the output when it has enough
    ; space.  (rax is free here: the row counter was pushed at .rowloop.)
    movd        eax, xmmA
    cmp         rcx, byte SIZEOF_WORD
    jb          short .column_st1
    mov         word [rdi], ax
    add         rdi, byte SIZEOF_WORD
    sub         rcx, byte SIZEOF_WORD
    shr         rax, 16
.column_st1:
    ; Store the lower 1 byte of rax to the output when it has enough
    ; space.
    test        rcx, rcx
    jz          short .nextrow
    mov         byte [rdi], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; The fourth (filler) byte of every pixel is 0xFF or 0x00.
%ifdef RGBX_FILLER_0XFF
    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%else
    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%endif
    ; In the layout comments below "cp" means byte c (0..3) of pixel p
    ; (0..F) within the current 16-pixel group.

    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)

    punpcklbw   xmmA, xmmC      ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmG      ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
    punpcklbw   xmmB, xmmD      ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
    punpcklbw   xmmF, xmmH      ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)

    movdqa      xmmC, xmmA
    punpcklwd   xmmA, xmmE      ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
    punpckhwd   xmmC, xmmE      ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
    movdqa      xmmG, xmmB
    punpcklwd   xmmB, xmmF      ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
    punpckhwd   xmmG, xmmF      ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpckldq   xmmA, xmmB      ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    punpckhdq   xmmD, xmmB      ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    movdqa      xmmH, xmmC
    punpckldq   xmmC, xmmG      ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    punpckhdq   xmmH, xmmG      ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    ; rcx = columns still to be written in this row (> 0 here).
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st32

    test        rdi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         rcx, byte SIZEOF_XMMWORD
    jz          near .nextrow

    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop

.column_st32:
    ; Fewer than 16 pixels remain.  rcx keeps counting PIXELS (4 bytes
    ; each); xmmA (then xmmD) always holds the next pixels to be stored.
    cmp         rcx, byte SIZEOF_XMMWORD/2
    jb          short .column_st16
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmC
    movdqa      xmmD, xmmH
    sub         rcx, byte SIZEOF_XMMWORD/2
.column_st16:
    cmp         rcx, byte SIZEOF_XMMWORD/4
    jb          short .column_st15
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    add         rdi, byte SIZEOF_XMMWORD    ; outptr
    movdqa      xmmA, xmmD
    sub         rcx, byte SIZEOF_XMMWORD/4
.column_st15:
    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_XMMWORD/8
    jb          short .column_st7
    movq        MMWORD [rdi], xmmA
    add         rdi, byte SIZEOF_XMMWORD/8*4
    sub         rcx, byte SIZEOF_XMMWORD/8
    psrldq      xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
    ; space.
    test        rcx, rcx
    jz          short .nextrow
    movd        XMM_DWORD [rdi], xmmA

%endif  ; RGB_PIXELSIZE ; ---------------

.nextrow:
    ; Restore the per-row state saved at .rowloop (reverse push order).
    pop         rcx
    pop         rsi
    pop         rbx
    pop         rdx
    pop         rdi
    pop         rax

    ; Advance every row-pointer array to its next entry.
    add         rsi, byte SIZEOF_JSAMPROW
    add         rbx, byte SIZEOF_JSAMPROW
    add         rdx, byte SIZEOF_JSAMPROW
    add         rdi, byte SIZEOF_JSAMPROW  ; output_buf
    dec         rax                        ; num_rows
    jg          near .rowloop

    sfence                              ; flush the write buffer
                                        ; (orders the movntdq stores above)

.return:
    pop         rbx
    uncollect_args 5
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- unaligned rsp saved at entry
    pop         rbp                     ; rbp <- caller's rbp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
118
TMessagesProj/jni/mozjpeg/simd/x86_64/jdcolor-avx2.asm
Normal file
118
TMessagesProj/jni/mozjpeg/simd/x86_64/jdcolor-avx2.asm
Normal file
|
|
@ -0,0 +1,118 @@
|
|||
;
|
||||
; jdcolor.asm - colorspace conversion (64-bit AVX2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point scale: the F_* constants below equal the real coefficient
; multiplied by 2^SCALEBITS (= 65536) and rounded to the nearest integer.
%define SCALEBITS 16
|
||||
|
||||
; YCbCr -> RGB coefficients in that fixed-point format.
F_0_344 equ 22554 ; FIX(0.34414)
|
||||
F_0_714 equ 46802 ; FIX(0.71414)
|
||||
F_1_402 equ 91881 ; FIX(1.40200)
|
||||
F_1_772 equ 116130 ; FIX(1.77200)
|
||||
; Derived values that fit in a signed 16-bit word (the four above, apart
; from F_0_344, do not): 26345, 18734 and 14942 respectively.
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
|
||||
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
|
||||
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Read-only constant table.  Every entry below is 32 bytes long
; (16 words or 8 dwords).
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
|
||||
|
||||
EXTN(jconst_ycc_rgb_convert_avx2):
|
||||
|
||||
; PW_* = packed words, PD_* = packed dwords; "MF" marks a negated value.
PW_F0402 times 16 dw F_0_402
|
||||
PW_MF0228 times 16 dw -F_0_228
|
||||
; Alternating (-F_0_344, F_0_285) word pairs, 8 pairs in total.
PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285
|
||||
PW_ONE times 16 dw 1
|
||||
; One half in SCALEBITS fixed point (1 << 15), presumably the rounding
; term added before a shift right by SCALEBITS -- the consumer is
; jdcolext-avx2.asm, confirm there.
PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1)
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Code section.  jdcolext-avx2.asm is included seven times below; each
; inclusion assembles one more copy of the conversion routine.  Before every
; inclusion except the first, the RGB_* layout macros are redefined to one of
; the EXT_* pixel layouts and the generic name jsimd_ycc_rgb_convert_avx2 is
; %define'd to a layout-specific name, so each copy presumably gets its own
; symbol (the included file is not shown here -- confirm).
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
|
||||
; Variant 1: whatever RGB_* values are in effect at this point (not set in
; this file; presumably defaults from jsimdext.inc).
%include "jdcolext-avx2.asm"
|
||||
|
||||
; Variant 2: EXT_RGB layout -> jsimd_ycc_extrgb_convert_avx2
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGB_RED
|
||||
%define RGB_GREEN EXT_RGB_GREEN
|
||||
%define RGB_BLUE EXT_RGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2
|
||||
%include "jdcolext-avx2.asm"
|
||||
|
||||
; Variant 3: EXT_RGBX layout -> jsimd_ycc_extrgbx_convert_avx2
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGBX_RED
|
||||
%define RGB_GREEN EXT_RGBX_GREEN
|
||||
%define RGB_BLUE EXT_RGBX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2
|
||||
%include "jdcolext-avx2.asm"
|
||||
|
||||
; Variant 4: EXT_BGR layout -> jsimd_ycc_extbgr_convert_avx2
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGR_RED
|
||||
%define RGB_GREEN EXT_BGR_GREEN
|
||||
%define RGB_BLUE EXT_BGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2
|
||||
%include "jdcolext-avx2.asm"
|
||||
|
||||
; Variant 5: EXT_BGRX layout -> jsimd_ycc_extbgrx_convert_avx2
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGRX_RED
|
||||
%define RGB_GREEN EXT_BGRX_GREEN
|
||||
%define RGB_BLUE EXT_BGRX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2
|
||||
%include "jdcolext-avx2.asm"
|
||||
|
||||
; Variant 6: EXT_XBGR layout -> jsimd_ycc_extxbgr_convert_avx2
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XBGR_RED
|
||||
%define RGB_GREEN EXT_XBGR_GREEN
|
||||
%define RGB_BLUE EXT_XBGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2
|
||||
%include "jdcolext-avx2.asm"
|
||||
|
||||
; Variant 7: EXT_XRGB layout -> jsimd_ycc_extxrgb_convert_avx2
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XRGB_RED
|
||||
%define RGB_GREEN EXT_XRGB_GREEN
|
||||
%define RGB_BLUE EXT_XRGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2
|
||||
%include "jdcolext-avx2.asm"
|
||||
117
TMessagesProj/jni/mozjpeg/simd/x86_64/jdcolor-sse2.asm
Normal file
117
TMessagesProj/jni/mozjpeg/simd/x86_64/jdcolor-sse2.asm
Normal file
|
|
@ -0,0 +1,117 @@
|
|||
;
|
||||
; jdcolor.asm - colorspace conversion (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point precision: FIX(x) below is x * 2^SCALEBITS rounded to the
; nearest integer (2^16 = 65536).
%define SCALEBITS  16

; Each coefficient that is 32768 or larger is paired with a reduced form
; obtained by subtracting it from, or subtracting from it, FIX(1) or FIX(2);
; every value stored in the word table further down is below 32768.

F_0_344  equ  22554                 ; FIX(0.34414)

F_0_714  equ  46802                 ; FIX(0.71414)
F_0_285  equ  ( 65536 - F_0_714)    ; FIX(1) - FIX(0.71414) = 18734

F_1_402  equ  91881                 ; FIX(1.40200)
F_0_402  equ  (F_1_402 - 65536)     ; FIX(1.40200) - FIX(1) = 26345

F_1_772  equ  116130                ; FIX(1.77200)
F_0_228  equ  (131072 - F_1_772)    ; FIX(2) - FIX(1.77200) = 14942
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for the SSE2 YCbCr-to-RGB routines.  Every entry is 16 bytes
; long (8 words or 4 dwords), and the table is padded to a 32-byte boundary at
; both ends with alignz.
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)

EXTN(jconst_ycc_rgb_convert_sse2):

; 8 x  FIX(0.40200)
PW_F0402        times 8 dw  F_0_402
; 8 x -FIX(0.22800)
PW_MF0228       times 8 dw -F_0_228
; 4 x the word pair (-FIX(0.34414), FIX(0.28586))
PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
; 8 x the word 1
PW_ONE          times 8 dw  1
; 4 x the dword 2^(SCALEBITS-1), i.e. one half in SCALEBITS fixed point
PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)

    alignz      32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Code section.  "jdcolext-sse2.asm" is included seven times.  The first
; inclusion happens before anything is rebound in this file, so it uses the
; RGB_* definitions already in effect and the unaliased name
; jsimd_ycc_rgb_convert_sse2.  Each later group rebinds the RGB_* macros to
; one EXT_* layout and aliases the function name before including again.
; NOTE(review): the included file is not visible here; it is presumed to emit
; its routine under the aliased name -- confirm against jdcolext-sse2.asm.
    SECTION     SEG_TEXT
    BITS        64

%include "jdcolext-sse2.asm"

; ---- EXT_RGB ----
%undef  RGB_RED
%define RGB_RED         EXT_RGB_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_RGB_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_RGB_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_RGB_PIXELSIZE
%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extrgb_convert_sse2
%include "jdcolext-sse2.asm"

; ---- EXT_RGBX ----
%undef  RGB_RED
%define RGB_RED         EXT_RGBX_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_RGBX_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_RGBX_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_RGBX_PIXELSIZE
%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extrgbx_convert_sse2
%include "jdcolext-sse2.asm"

; ---- EXT_BGR ----
%undef  RGB_RED
%define RGB_RED         EXT_BGR_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_BGR_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_BGR_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_BGR_PIXELSIZE
%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extbgr_convert_sse2
%include "jdcolext-sse2.asm"

; ---- EXT_BGRX ----
%undef  RGB_RED
%define RGB_RED         EXT_BGRX_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_BGRX_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_BGRX_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_BGRX_PIXELSIZE
%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extbgrx_convert_sse2
%include "jdcolext-sse2.asm"

; ---- EXT_XBGR ----
%undef  RGB_RED
%define RGB_RED         EXT_XBGR_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_XBGR_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_XBGR_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_XBGR_PIXELSIZE
%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extxbgr_convert_sse2
%include "jdcolext-sse2.asm"

; ---- EXT_XRGB ----
%undef  RGB_RED
%define RGB_RED         EXT_XRGB_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_XRGB_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_XRGB_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_XRGB_PIXELSIZE
%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extxrgb_convert_sse2
%include "jdcolext-sse2.asm"
|
||||
136
TMessagesProj/jni/mozjpeg/simd/x86_64/jdmerge-avx2.asm
Normal file
136
TMessagesProj/jni/mozjpeg/simd/x86_64/jdmerge-avx2.asm
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
;
|
||||
; jdmerge.asm - merged upsampling/color conversion (64-bit AVX2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point precision: FIX(x) below is x * 2^SCALEBITS rounded to the
; nearest integer (2^16 = 65536).
%define SCALEBITS  16

; Each coefficient that is 32768 or larger is paired with a reduced form
; obtained by subtracting it from, or subtracting from it, FIX(1) or FIX(2);
; every value stored in the word table further down is below 32768.

F_0_344  equ  22554                 ; FIX(0.34414)

F_0_714  equ  46802                 ; FIX(0.71414)
F_0_285  equ  ( 65536 - F_0_714)    ; FIX(1) - FIX(0.71414) = 18734

F_1_402  equ  91881                 ; FIX(1.40200)
F_0_402  equ  (F_1_402 - 65536)     ; FIX(1.40200) - FIX(1) = 26345

F_1_772  equ  116130                ; FIX(1.77200)
F_0_228  equ  (131072 - F_1_772)    ; FIX(2) - FIX(1.77200) = 14942
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for the AVX2 merged upsample/convert routines.  Every entry
; is 32 bytes long (16 words or 8 dwords), and the table is padded to a
; 32-byte boundary at both ends with alignz.
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_merged_upsample_avx2)

EXTN(jconst_merged_upsample_avx2):

; 16 x  FIX(0.40200)
PW_F0402        times 16 dw  F_0_402
; 16 x -FIX(0.22800)
PW_MF0228       times 16 dw -F_0_228
; 8 x the word pair (-FIX(0.34414), FIX(0.28586))
PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
; 16 x the word 1
PW_ONE          times 16 dw  1
; 8 x the dword 2^(SCALEBITS-1), i.e. one half in SCALEBITS fixed point
PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)

    alignz      32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Code section.  "jdmrgext-avx2.asm" is included seven times.  The first
; inclusion happens before anything is rebound in this file, so it uses the
; RGB_* definitions already in effect and the unaliased names
; jsimd_h2v1_merged_upsample_avx2 / jsimd_h2v2_merged_upsample_avx2.  Each
; later group rebinds the RGB_* macros to one EXT_* layout and aliases both
; function names to layout-specific symbols before including again.
    SECTION     SEG_TEXT
    BITS        64

%include "jdmrgext-avx2.asm"

; ---- EXT_RGB ----
%undef  RGB_RED
%define RGB_RED         EXT_RGB_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_RGB_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_RGB_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_RGB_PIXELSIZE
%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extrgb_merged_upsample_avx2
%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extrgb_merged_upsample_avx2
%include "jdmrgext-avx2.asm"

; ---- EXT_RGBX ----
%undef  RGB_RED
%define RGB_RED         EXT_RGBX_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_RGBX_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_RGBX_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_RGBX_PIXELSIZE
%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extrgbx_merged_upsample_avx2
%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extrgbx_merged_upsample_avx2
%include "jdmrgext-avx2.asm"

; ---- EXT_BGR ----
%undef  RGB_RED
%define RGB_RED         EXT_BGR_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_BGR_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_BGR_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_BGR_PIXELSIZE
%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extbgr_merged_upsample_avx2
%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extbgr_merged_upsample_avx2
%include "jdmrgext-avx2.asm"

; ---- EXT_BGRX ----
%undef  RGB_RED
%define RGB_RED         EXT_BGRX_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_BGRX_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_BGRX_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_BGRX_PIXELSIZE
%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extbgrx_merged_upsample_avx2
%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extbgrx_merged_upsample_avx2
%include "jdmrgext-avx2.asm"

; ---- EXT_XBGR ----
%undef  RGB_RED
%define RGB_RED         EXT_XBGR_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_XBGR_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_XBGR_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_XBGR_PIXELSIZE
%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extxbgr_merged_upsample_avx2
%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extxbgr_merged_upsample_avx2
%include "jdmrgext-avx2.asm"

; ---- EXT_XRGB ----
%undef  RGB_RED
%define RGB_RED         EXT_XRGB_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_XRGB_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_XRGB_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_XRGB_PIXELSIZE
%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extxrgb_merged_upsample_avx2
%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extxrgb_merged_upsample_avx2
%include "jdmrgext-avx2.asm"
|
||||
135
TMessagesProj/jni/mozjpeg/simd/x86_64/jdmerge-sse2.asm
Normal file
135
TMessagesProj/jni/mozjpeg/simd/x86_64/jdmerge-sse2.asm
Normal file
|
|
@ -0,0 +1,135 @@
|
|||
;
|
||||
; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point precision: FIX(x) below is x * 2^SCALEBITS rounded to the
; nearest integer (2^16 = 65536).
%define SCALEBITS  16

; Each coefficient that is 32768 or larger is paired with a reduced form
; obtained by subtracting it from, or subtracting from it, FIX(1) or FIX(2);
; every value stored in the word table further down is below 32768.

F_0_344  equ  22554                 ; FIX(0.34414)

F_0_714  equ  46802                 ; FIX(0.71414)
F_0_285  equ  ( 65536 - F_0_714)    ; FIX(1) - FIX(0.71414) = 18734

F_1_402  equ  91881                 ; FIX(1.40200)
F_0_402  equ  (F_1_402 - 65536)     ; FIX(1.40200) - FIX(1) = 26345

F_1_772  equ  116130                ; FIX(1.77200)
F_0_228  equ  (131072 - F_1_772)    ; FIX(2) - FIX(1.77200) = 14942
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for the SSE2 merged upsample/convert routines.  Every entry
; is 16 bytes long (8 words or 4 dwords), and the table is padded to a
; 32-byte boundary at both ends with alignz.
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_merged_upsample_sse2)

EXTN(jconst_merged_upsample_sse2):

; 8 x  FIX(0.40200)
PW_F0402        times 8 dw  F_0_402
; 8 x -FIX(0.22800)
PW_MF0228       times 8 dw -F_0_228
; 4 x the word pair (-FIX(0.34414), FIX(0.28586))
PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
; 8 x the word 1
PW_ONE          times 8 dw  1
; 4 x the dword 2^(SCALEBITS-1), i.e. one half in SCALEBITS fixed point
PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)

    alignz      32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Code section.  "jdmrgext-sse2.asm" is included seven times.  The first
; inclusion happens before anything is rebound in this file, so it uses the
; RGB_* definitions already in effect and the unaliased names
; jsimd_h2v1_merged_upsample_sse2 / jsimd_h2v2_merged_upsample_sse2.  Each
; later group rebinds the RGB_* macros to one EXT_* layout and aliases both
; function names to layout-specific symbols before including again.
; NOTE(review): the included file is not visible here; it is presumed to emit
; its routines under the aliased names -- confirm against jdmrgext-sse2.asm.
    SECTION     SEG_TEXT
    BITS        64

%include "jdmrgext-sse2.asm"

; ---- EXT_RGB ----
%undef  RGB_RED
%define RGB_RED         EXT_RGB_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_RGB_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_RGB_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_RGB_PIXELSIZE
%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
%include "jdmrgext-sse2.asm"

; ---- EXT_RGBX ----
%undef  RGB_RED
%define RGB_RED         EXT_RGBX_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_RGBX_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_RGBX_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_RGBX_PIXELSIZE
%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
%include "jdmrgext-sse2.asm"

; ---- EXT_BGR ----
%undef  RGB_RED
%define RGB_RED         EXT_BGR_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_BGR_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_BGR_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_BGR_PIXELSIZE
%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
%include "jdmrgext-sse2.asm"

; ---- EXT_BGRX ----
%undef  RGB_RED
%define RGB_RED         EXT_BGRX_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_BGRX_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_BGRX_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_BGRX_PIXELSIZE
%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
%include "jdmrgext-sse2.asm"

; ---- EXT_XBGR ----
%undef  RGB_RED
%define RGB_RED         EXT_XBGR_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_XBGR_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_XBGR_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_XBGR_PIXELSIZE
%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
%include "jdmrgext-sse2.asm"

; ---- EXT_XRGB ----
%undef  RGB_RED
%define RGB_RED         EXT_XRGB_RED
%undef  RGB_GREEN
%define RGB_GREEN       EXT_XRGB_GREEN
%undef  RGB_BLUE
%define RGB_BLUE        EXT_XRGB_BLUE
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   EXT_XRGB_PIXELSIZE
%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
%include "jdmrgext-sse2.asm"
|
||||
593
TMessagesProj/jni/mozjpeg/simd/x86_64/jdmrgext-avx2.asm
Normal file
593
TMessagesProj/jni/mozjpeg/simd/x86_64/jdmrgext-avx2.asm
Normal file
|
|
@ -0,0 +1,593 @@
|
|||
;
|
||||
; jdmrgext.asm - merged upsampling/color conversion (64-bit AVX2)
|
||||
;
|
||||
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
|
||||
; JSAMPIMAGE input_buf,
|
||||
; JDIMENSION in_row_group_ctr,
|
||||
; JSAMPARRAY output_buf);
|
||||
;
|
||||
|
||||
; r10d = JDIMENSION output_width
|
||||
; r11 = JSAMPIMAGE input_buf
|
||||
; r12d = JDIMENSION in_row_group_ctr
|
||||
; r13 = JSAMPARRAY output_buf
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
|
||||
%define WK_NUM 3
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
|
||||
|
||||
EXTN(jsimd_h2v1_merged_upsample_avx2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
mov ecx, r10d ; col
|
||||
test rcx, rcx
|
||||
jz near .return
|
||||
|
||||
push rcx
|
||||
|
||||
mov rdi, r11
|
||||
mov ecx, r12d
|
||||
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
|
||||
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
|
||||
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
|
||||
mov rdi, r13
|
||||
mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
|
||||
mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
|
||||
mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
|
||||
mov rdi, JSAMPROW [rdi] ; outptr
|
||||
|
||||
pop rcx ; col
|
||||
|
||||
.columnloop:
|
||||
|
||||
vmovdqu ymm6, YMMWORD [rbx] ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
|
||||
vmovdqu ymm7, YMMWORD [rdx] ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
|
||||
|
||||
vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
|
||||
vpcmpeqw ymm3, ymm3, ymm3
|
||||
vpsllw ymm3, ymm3, 7 ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
||||
|
||||
vpermq ymm6, ymm6, 0xd8 ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
|
||||
vpermq ymm7, ymm7, 0xd8 ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
|
||||
vpunpcklbw ymm4, ymm6, ymm1 ; ymm4=Cb(0123456789ABCDEF)=CbL
|
||||
vpunpckhbw ymm6, ymm6, ymm1 ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
|
||||
vpunpcklbw ymm0, ymm7, ymm1 ; ymm0=Cr(0123456789ABCDEF)=CrL
|
||||
vpunpckhbw ymm7, ymm7, ymm1 ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
|
||||
|
||||
vpaddw ymm5, ymm6, ymm3
|
||||
vpaddw ymm2, ymm4, ymm3
|
||||
vpaddw ymm1, ymm7, ymm3
|
||||
vpaddw ymm3, ymm0, ymm3
|
||||
|
||||
; (Original)
|
||||
; R = Y + 1.40200 * Cr
|
||||
; G = Y - 0.34414 * Cb - 0.71414 * Cr
|
||||
; B = Y + 1.77200 * Cb
|
||||
;
|
||||
; (This implementation)
|
||||
; R = Y + 0.40200 * Cr + Cr
|
||||
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
||||
; B = Y - 0.22800 * Cb + Cb + Cb
|
||||
|
||||
vpaddw ymm6, ymm5, ymm5 ; ymm6=2*CbH
|
||||
vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbL
|
||||
vpaddw ymm7, ymm1, ymm1 ; ymm7=2*CrH
|
||||
vpaddw ymm0, ymm3, ymm3 ; ymm0=2*CrL
|
||||
|
||||
vpmulhw ymm6, ymm6, [rel PW_MF0228] ; ymm6=(2*CbH * -FIX(0.22800))
|
||||
vpmulhw ymm4, ymm4, [rel PW_MF0228] ; ymm4=(2*CbL * -FIX(0.22800))
|
||||
vpmulhw ymm7, ymm7, [rel PW_F0402] ; ymm7=(2*CrH * FIX(0.40200))
|
||||
vpmulhw ymm0, ymm0, [rel PW_F0402] ; ymm0=(2*CrL * FIX(0.40200))
|
||||
|
||||
vpaddw ymm6, ymm6, [rel PW_ONE]
|
||||
vpaddw ymm4, ymm4, [rel PW_ONE]
|
||||
vpsraw ymm6, ymm6, 1 ; ymm6=(CbH * -FIX(0.22800))
|
||||
vpsraw ymm4, ymm4, 1 ; ymm4=(CbL * -FIX(0.22800))
|
||||
vpaddw ymm7, ymm7, [rel PW_ONE]
|
||||
vpaddw ymm0, ymm0, [rel PW_ONE]
|
||||
vpsraw ymm7, ymm7, 1 ; ymm7=(CrH * FIX(0.40200))
|
||||
vpsraw ymm0, ymm0, 1 ; ymm0=(CrL * FIX(0.40200))
|
||||
|
||||
vpaddw ymm6, ymm6, ymm5
|
||||
vpaddw ymm4, ymm4, ymm2
|
||||
vpaddw ymm6, ymm6, ymm5 ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
|
||||
vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
|
||||
vpaddw ymm7, ymm7, ymm1 ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
|
||||
vpaddw ymm0, ymm0, ymm3 ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
|
||||
|
||||
vmovdqa YMMWORD [wk(0)], ymm6 ; wk(0)=(B-Y)H
|
||||
vmovdqa YMMWORD [wk(1)], ymm7 ; wk(1)=(R-Y)H
|
||||
|
||||
vpunpckhwd ymm6, ymm5, ymm1
|
||||
vpunpcklwd ymm5, ymm5, ymm1
|
||||
vpmaddwd ymm5, ymm5, [rel PW_MF0344_F0285]
|
||||
vpmaddwd ymm6, ymm6, [rel PW_MF0344_F0285]
|
||||
vpunpckhwd ymm7, ymm2, ymm3
|
||||
vpunpcklwd ymm2, ymm2, ymm3
|
||||
vpmaddwd ymm2, ymm2, [rel PW_MF0344_F0285]
|
||||
vpmaddwd ymm7, ymm7, [rel PW_MF0344_F0285]
|
||||
|
||||
vpaddd ymm5, ymm5, [rel PD_ONEHALF]
|
||||
vpaddd ymm6, ymm6, [rel PD_ONEHALF]
|
||||
vpsrad ymm5, ymm5, SCALEBITS
|
||||
vpsrad ymm6, ymm6, SCALEBITS
|
||||
vpaddd ymm2, ymm2, [rel PD_ONEHALF]
|
||||
vpaddd ymm7, ymm7, [rel PD_ONEHALF]
|
||||
vpsrad ymm2, ymm2, SCALEBITS
|
||||
vpsrad ymm7, ymm7, SCALEBITS
|
||||
|
||||
vpackssdw ymm5, ymm5, ymm6 ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
|
||||
vpackssdw ymm2, ymm2, ymm7 ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
|
||||
vpsubw ymm5, ymm5, ymm1 ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
|
||||
vpsubw ymm2, ymm2, ymm3 ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
|
||||
|
||||
vmovdqa YMMWORD [wk(2)], ymm5 ; wk(2)=(G-Y)H
|
||||
|
||||
mov al, 2 ; Yctr
|
||||
jmp short .Yloop_1st
|
||||
|
||||
.Yloop_2nd:
|
||||
vmovdqa ymm0, YMMWORD [wk(1)] ; ymm0=(R-Y)H
|
||||
vmovdqa ymm2, YMMWORD [wk(2)] ; ymm2=(G-Y)H
|
||||
vmovdqa ymm4, YMMWORD [wk(0)] ; ymm4=(B-Y)H
|
||||
|
||||
.Yloop_1st:
|
||||
vmovdqu ymm7, YMMWORD [rsi] ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
|
||||
|
||||
vpcmpeqw ymm6, ymm6, ymm6
|
||||
vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
|
||||
vpand ymm6, ymm6, ymm7 ; ymm6=Y(02468ACEGIKMOQSU)=YE
|
||||
vpsrlw ymm7, ymm7, BYTE_BIT ; ymm7=Y(13579BDFHJLNPRTV)=YO
|
||||
|
||||
vmovdqa ymm1, ymm0 ; ymm1=ymm0=(R-Y)(L/H)
|
||||
vmovdqa ymm3, ymm2 ; ymm3=ymm2=(G-Y)(L/H)
|
||||
vmovdqa ymm5, ymm4 ; ymm5=ymm4=(B-Y)(L/H)
|
||||
|
||||
vpaddw ymm0, ymm0, ymm6 ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
|
||||
vpaddw ymm1, ymm1, ymm7 ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
|
||||
vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
|
||||
vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
|
||||
|
||||
vpaddw ymm2, ymm2, ymm6 ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
|
||||
vpaddw ymm3, ymm3, ymm7 ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
|
||||
vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
|
||||
vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
|
||||
|
||||
vpaddw ymm4, ymm4, ymm6 ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
|
||||
vpaddw ymm5, ymm5, ymm7 ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
|
||||
vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
|
||||
vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
|
||||
; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
|
||||
; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
|
||||
; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
|
||||
; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
|
||||
; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
|
||||
; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
|
||||
; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
|
||||
|
||||
vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
|
||||
; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
|
||||
vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
|
||||
; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
|
||||
vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
|
||||
; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
|
||||
|
||||
vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
|
||||
; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
|
||||
vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
|
||||
; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
|
||||
vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
|
||||
; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
|
||||
|
||||
vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
|
||||
; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
|
||||
|
||||
vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
|
||||
; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
|
||||
vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
|
||||
; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
|
||||
vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
|
||||
; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
|
||||
|
||||
vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
|
||||
; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
|
||||
vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
|
||||
; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
|
||||
|
||||
vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
|
||||
; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
|
||||
vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
|
||||
vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
|
||||
; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
|
||||
vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
|
||||
; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
|
||||
|
||||
vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
|
||||
; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
|
||||
vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
|
||||
; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
|
||||
vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
|
||||
; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
|
||||
vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
|
||||
|
||||
vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
|
||||
vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
|
||||
; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
|
||||
vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
|
||||
|
||||
vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
|
||||
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
|
||||
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
|
||||
vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
|
||||
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
|
||||
|
||||
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jb short .column_st64
|
||||
|
||||
test rdi, SIZEOF_YMMWORD-1
|
||||
jnz short .out1
|
||||
; --(aligned)-------------------
|
||||
vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
|
||||
.out0:
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
|
||||
sub rcx, byte SIZEOF_YMMWORD
|
||||
jz near .endcolumn
|
||||
|
||||
add rsi, byte SIZEOF_YMMWORD ; inptr0
|
||||
dec al ; Yctr
|
||||
jnz near .Yloop_2nd
|
||||
|
||||
add rbx, byte SIZEOF_YMMWORD ; inptr1
|
||||
add rdx, byte SIZEOF_YMMWORD ; inptr2
|
||||
jmp near .columnloop
|
||||
|
||||
.column_st64:
|
||||
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp rcx, byte 2*SIZEOF_YMMWORD
|
||||
jb short .column_st32
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
add rdi, byte 2*SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmF
|
||||
sub rcx, byte 2*SIZEOF_YMMWORD
|
||||
jmp short .column_st31
|
||||
.column_st32:
|
||||
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jb short .column_st31
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
add rdi, byte SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmD
|
||||
sub rcx, byte SIZEOF_YMMWORD
|
||||
jmp short .column_st31
|
||||
.column_st31:
|
||||
cmp rcx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st15
|
||||
vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
vperm2i128 ymmA, ymmA, ymmA, 1
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
.column_st15:
|
||||
; Store the lower 8 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp rcx, byte SIZEOF_MMWORD
|
||||
jb short .column_st7
|
||||
vmovq XMM_MMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_MMWORD
|
||||
sub rcx, byte SIZEOF_MMWORD
|
||||
vpsrldq xmmA, xmmA, SIZEOF_MMWORD
|
||||
.column_st7:
|
||||
; Store the lower 4 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp rcx, byte SIZEOF_DWORD
|
||||
jb short .column_st3
|
||||
vmovd XMM_DWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_DWORD
|
||||
sub rcx, byte SIZEOF_DWORD
|
||||
vpsrldq xmmA, xmmA, SIZEOF_DWORD
|
||||
.column_st3:
|
||||
; Store the lower 2 bytes of rax to the output when it has enough
|
||||
; space.
|
||||
vmovd eax, xmmA
|
||||
cmp rcx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov word [rdi], ax
|
||||
add rdi, byte SIZEOF_WORD
|
||||
sub rcx, byte SIZEOF_WORD
|
||||
shr rax, 16
|
||||
.column_st1:
|
||||
; Store the lower 1 byte of rax to the output when it has enough
|
||||
; space.
|
||||
test rcx, rcx
|
||||
jz short .endcolumn
|
||||
mov byte [rdi], al
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
%ifdef RGBX_FILLER_0XFF
|
||||
vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
|
||||
vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
|
||||
%else
|
||||
vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
|
||||
vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
|
||||
%endif
|
||||
; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
|
||||
; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
|
||||
; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
|
||||
; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
|
||||
; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
|
||||
; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
|
||||
; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
|
||||
; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
|
||||
|
||||
vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
|
||||
; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
|
||||
vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
|
||||
; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
|
||||
vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
|
||||
; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
|
||||
vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
|
||||
; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
|
||||
|
||||
vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
|
||||
; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
|
||||
vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
|
||||
; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
|
||||
vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
|
||||
; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
|
||||
vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
|
||||
; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
|
||||
|
||||
vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
|
||||
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
|
||||
vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
|
||||
; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
|
||||
vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
|
||||
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
|
||||
vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
|
||||
; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
|
||||
|
||||
vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
|
||||
; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||
vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
|
||||
; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||
vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
|
||||
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
|
||||
vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
|
||||
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
|
||||
|
||||
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jb short .column_st64
|
||||
|
||||
test rdi, SIZEOF_YMMWORD-1
|
||||
jnz short .out1
|
||||
; --(aligned)-------------------
|
||||
vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
|
||||
vmovntdq YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
|
||||
vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
|
||||
.out0:
|
||||
add rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
|
||||
sub rcx, byte SIZEOF_YMMWORD
|
||||
jz near .endcolumn
|
||||
|
||||
add rsi, byte SIZEOF_YMMWORD ; inptr0
|
||||
dec al
|
||||
jnz near .Yloop_2nd
|
||||
|
||||
add rbx, byte SIZEOF_YMMWORD ; inptr1
|
||||
add rdx, byte SIZEOF_YMMWORD ; inptr2
|
||||
jmp near .columnloop
|
||||
|
||||
.column_st64:
|
||||
cmp rcx, byte SIZEOF_YMMWORD/2
|
||||
jb short .column_st32
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
add rdi, byte 2*SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmC
|
||||
vmovdqa ymmD, ymmH
|
||||
sub rcx, byte SIZEOF_YMMWORD/2
|
||||
.column_st32:
|
||||
cmp rcx, byte SIZEOF_YMMWORD/4
|
||||
jb short .column_st16
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
add rdi, byte SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmD
|
||||
sub rcx, byte SIZEOF_YMMWORD/4
|
||||
.column_st16:
|
||||
cmp rcx, byte SIZEOF_YMMWORD/8
|
||||
jb short .column_st15
|
||||
vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
vperm2i128 ymmA, ymmA, ymmA, 1
|
||||
sub rcx, byte SIZEOF_YMMWORD/8
|
||||
.column_st15:
|
||||
; Store two pixels (8 bytes) of ymmA to the output when it has enough
|
||||
; space.
|
||||
cmp rcx, byte SIZEOF_YMMWORD/16
|
||||
jb short .column_st7
|
||||
vmovq MMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_YMMWORD/16*4
|
||||
sub rcx, byte SIZEOF_YMMWORD/16
|
||||
vpsrldq xmmA, SIZEOF_YMMWORD/16*4
|
||||
.column_st7:
|
||||
; Store one pixel (4 bytes) of ymmA to the output when it has enough
|
||||
; space.
|
||||
test rcx, rcx
|
||||
jz short .endcolumn
|
||||
vmovd XMM_DWORD [rdi], xmmA
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
.endcolumn:
|
||||
sfence ; flush the write buffer
|
||||
|
||||
.return:
|
||||
pop rbx
|
||||
vzeroupper
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; This routine contains no SIMD code of its own.  It produces two output rows
; by calling jsimd_h2v1_merged_upsample_avx2 twice.  For each call it builds
; a three-entry plane table on the stack:
;
;   { &input_buf[0][in_row_group_ctr] (+1 entry for the 2nd call),
;     input_buf[1], input_buf[2] }
;
; and passes in_row_group_ctr through unchanged.  The output row-table
; pointer is advanced by one JSAMPROW between the two calls; the two chroma
; entries are identical for both calls.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; r10d = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf
; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)

EXTN(jsimd_h2v2_merged_upsample_avx2):
    push        rbp
    mov         rax, rsp                ; NOTE(review): overwritten below before any
                                        ; visible use; presumably required by the
                                        ; collect_args convention -- confirm in
                                        ; jsimdext.inc
    mov         rbp, rsp
    collect_args 4
    push        rbx

    ; Fetch the three plane row tables and bias the luma table pointer.
    mov         rdi, r11                                    ; rdi = input_buf
    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]   ; rsi = input_buf[0]
    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]   ; rbx = input_buf[1]
    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]   ; rdx = input_buf[2]
    mov         ecx, r12d                                   ; rcx = in_row_group_ctr
    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]              ; rsi = &input_buf[0][rcx]
    mov         rdi, r13                                    ; rdi = output_buf
    mov         eax, r10d                                   ; rax = output_width

    ; ---- first output row ------------------------------------------------

    push        rdx                     ; inptr2
    push        rbx                     ; inptr1
    push        rsi                     ; inptr00
    mov         rbx, rsp                ; rbx -> { inptr00, inptr1, inptr2 }

    push        rdi                     ; keep output_buf, the row-group counter
    push        rcx                     ;   and the width alive across the call
    push        rax

%ifdef WIN64
    mov         rdx, rbx                ; arg 2 = temporary plane table
    mov         r9, rdi                 ; arg 4 = output_buf
    mov         r8, rcx                 ; arg 3 = in_row_group_ctr (read rcx first)
    mov         rcx, rax                ; arg 1 = output_width
%else
    mov         rsi, rbx                ; arg 2 = temporary plane table
    mov         rdx, rcx                ; arg 3 = in_row_group_ctr (read rcx first)
    mov         rcx, rdi                ; arg 4 = output_buf (read rdi first)
    mov         rdi, rax                ; arg 1 = output_width
%endif

    call        EXTN(jsimd_h2v1_merged_upsample_avx2)

    pop         rax                     ; output_width
    pop         rcx                     ; in_row_group_ctr
    pop         rdi                     ; output_buf
    pop         rsi                     ; inptr00
    pop         rbx                     ; inptr1
    pop         rdx                     ; inptr2

    ; ---- second output row -----------------------------------------------

    add         rsi, byte SIZEOF_JSAMPROW   ; inptr01
    add         rdi, byte SIZEOF_JSAMPROW   ; outptr1

    push        rdx                     ; inptr2
    push        rbx                     ; inptr1
    push        rsi                     ; inptr01
    mov         rbx, rsp                ; rbx -> { inptr01, inptr1, inptr2 }

    push        rdi
    push        rcx
    push        rax

%ifdef WIN64
    mov         rdx, rbx                ; arg 2 = temporary plane table
    mov         r9, rdi                 ; arg 4 = output_buf + 1 row
    mov         r8, rcx                 ; arg 3 = in_row_group_ctr (read rcx first)
    mov         rcx, rax                ; arg 1 = output_width
%else
    mov         rsi, rbx                ; arg 2 = temporary plane table
    mov         rdx, rcx                ; arg 3 = in_row_group_ctr (read rcx first)
    mov         rcx, rdi                ; arg 4 = output_buf + 1 row (read rdi first)
    mov         rdi, rax                ; arg 1 = output_width
%endif

    call        EXTN(jsimd_h2v1_merged_upsample_avx2)

    pop         rax                     ; unwind the six slots pushed above
    pop         rcx
    pop         rdi
    pop         rsi
    pop         rbx
    pop         rdx

    pop         rbx                     ; restore the caller's rbx
    uncollect_args 4
    pop         rbp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
535
TMessagesProj/jni/mozjpeg/simd/x86_64/jdmrgext-sse2.asm
Normal file
535
TMessagesProj/jni/mozjpeg/simd/x86_64/jdmrgext-sse2.asm
Normal file
|
|
@ -0,0 +1,535 @@
|
|||
;
|
||||
; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
; One output row is produced.  Each pass of the outer loop loads 16 Cb and
; 16 Cr samples, derives the (R-Y), (G-Y) and (B-Y) terms once, and then
; applies them to two consecutive 16-sample blocks of Y (low chroma half
; first, high chroma half second), so every chroma sample is shared by two
; horizontally adjacent output pixels.  Output pixels are RGB_PIXELSIZE
; bytes each, interleaved in the xmmA..xmmH register order.
;
; All three input rows are read with movdqa and therefore must be 16-byte
; aligned.  The output row may have any alignment; exactly output_width
; pixels are written.
;
; GLOBAL(void)
; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; r10d = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf
; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf
;
; Register roles inside the loops:
;   rsi = inptr0 (Y)    rbx = inptr1 (Cb)    rdx = inptr2 (Cr)
;   rdi = outptr        rcx = pixels left    al  = Y blocks left (2, 1)
;   wk(0) = (B-Y)H      wk(1) = (R-Y)H       wk(2) = (G-Y)H

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  3

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)

EXTN(jsimd_h2v1_merged_upsample_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp right after saving rbp
    sub         rsp, byte 4                  ; step below the saved rbp, then
    and         rsp, byte (-SIZEOF_XMMWORD)  ;   round down to a 128-bit boundary
    mov         [rsp], rax                   ; remember the unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [wk(0)]                 ; reserve wk[WK_NUM] below rbp
    collect_args 4
    push        rbx

    mov         ecx, r10d               ; col = output_width
    test        rcx, rcx
    jz          near .epilogue          ; nothing to do for an empty row

    push        rcx                     ; rcx doubles as the row index below

    ; Resolve the row pointers, one plane at a time.
    mov         rdi, r11                                    ; rdi = input_buf
    mov         ecx, r12d                                   ; rcx = in_row_group_ctr
    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
    mov         rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]     ; inptr0
    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
    mov         rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]     ; inptr1
    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
    mov         rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]     ; inptr2
    mov         rdi, r13
    mov         rdi, JSAMPROW [rdi]                         ; outptr = output_buf[0]

    pop         rcx                     ; col

.chroma_loop:

    movdqa      xmm6, XMMWORD [rbx]     ; xmm6=Cb(0123456789ABCDEF)
    movdqa      xmm7, XMMWORD [rdx]     ; xmm7=Cr(0123456789ABCDEF)

    pxor        xmm1, xmm1              ; xmm1=(all 0's)
    pcmpeqw     xmm3, xmm3
    psllw       xmm3, 7                 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    ; Widen the chroma bytes to words.
    movdqa      xmm4, xmm6
    punpckhbw   xmm6, xmm1              ; xmm6=Cb(89ABCDEF)=CbH
    punpcklbw   xmm4, xmm1              ; xmm4=Cb(01234567)=CbL
    movdqa      xmm0, xmm7
    punpckhbw   xmm7, xmm1              ; xmm7=Cr(89ABCDEF)=CrH
    punpcklbw   xmm0, xmm1              ; xmm0=Cr(01234567)=CrL

    ; Adding 0xFF80 (-128 as a signed word) recenters the chroma on zero.
    paddw       xmm6, xmm3
    paddw       xmm4, xmm3
    paddw       xmm7, xmm3
    paddw       xmm0, xmm3

    ; Reference equations:
    ;
    ;   R = Y                + 1.40200 * Cr
    ;   G = Y - 0.34414 * Cb - 0.71414 * Cr
    ;   B = Y + 1.77200 * Cb
    ;
    ; Form evaluated here (every fractional multiplier kept below 0.5):
    ;
    ;   R = Y                + 0.40200 * Cr + Cr
    ;   G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    ;   B = Y - 0.22800 * Cb + Cb + Cb

    movdqa      xmm5, xmm6              ; xmm5=CbH
    movdqa      xmm2, xmm4              ; xmm2=CbL
    paddw       xmm6, xmm6              ; xmm6=2*CbH
    paddw       xmm4, xmm4              ; xmm4=2*CbL
    movdqa      xmm1, xmm7              ; xmm1=CrH
    movdqa      xmm3, xmm0              ; xmm3=CrL
    paddw       xmm7, xmm7              ; xmm7=2*CrH
    paddw       xmm0, xmm0              ; xmm0=2*CrL

    pmulhw      xmm6, [rel PW_MF0228]   ; xmm6=(2*CbH * -FIX(0.22800))
    pmulhw      xmm4, [rel PW_MF0228]   ; xmm4=(2*CbL * -FIX(0.22800))
    pmulhw      xmm7, [rel PW_F0402]    ; xmm7=(2*CrH * FIX(0.40200))
    pmulhw      xmm0, [rel PW_F0402]    ; xmm0=(2*CrL * FIX(0.40200))

    ; Add one and shift right by one: halve the doubled products with rounding.
    paddw       xmm6, [rel PW_ONE]
    paddw       xmm4, [rel PW_ONE]
    psraw       xmm6, 1                 ; xmm6=(CbH * -FIX(0.22800))
    psraw       xmm4, 1                 ; xmm4=(CbL * -FIX(0.22800))
    paddw       xmm7, [rel PW_ONE]
    paddw       xmm0, [rel PW_ONE]
    psraw       xmm7, 1                 ; xmm7=(CrH * FIX(0.40200))
    psraw       xmm0, 1                 ; xmm0=(CrL * FIX(0.40200))

    paddw       xmm6, xmm5
    paddw       xmm4, xmm2
    paddw       xmm6, xmm5              ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
    paddw       xmm4, xmm2              ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
    paddw       xmm7, xmm1              ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
    paddw       xmm0, xmm3              ; xmm0=(CrL * FIX(1.40200))=(R-Y)L

    ; The high-half terms are parked in the work area for the second Y block.
    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H

    ; Green: pair each Cb word with its Cr word and multiply-accumulate.
    movdqa      xmm6, xmm5
    movdqa      xmm7, xmm2
    punpcklwd   xmm5, xmm1
    punpckhwd   xmm6, xmm1
    pmaddwd     xmm5, [rel PW_MF0344_F0285]
    pmaddwd     xmm6, [rel PW_MF0344_F0285]
    punpcklwd   xmm2, xmm3
    punpckhwd   xmm7, xmm3
    pmaddwd     xmm2, [rel PW_MF0344_F0285]
    pmaddwd     xmm7, [rel PW_MF0344_F0285]

    paddd       xmm5, [rel PD_ONEHALF]
    paddd       xmm6, [rel PD_ONEHALF]
    psrad       xmm5, SCALEBITS
    psrad       xmm6, SCALEBITS
    paddd       xmm2, [rel PD_ONEHALF]
    paddd       xmm7, [rel PD_ONEHALF]
    psrad       xmm2, SCALEBITS
    psrad       xmm7, SCALEBITS

    packssdw    xmm5, xmm6              ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
    packssdw    xmm2, xmm7              ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
    psubw       xmm5, xmm1              ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
    psubw       xmm2, xmm3              ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L

    movdqa      XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H

    mov         al, 2                   ; Yctr: two Y blocks per chroma block
    jmp         short .luma_convert     ; low-half terms are already in xmm0/2/4

.luma_high:
    ; Second Y block: reload the high-half chroma terms saved above.
    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H

.luma_convert:
    movdqa      xmm7, XMMWORD [rsi]     ; xmm7=Y(0123456789ABCDEF)

    ; Separate even and odd luma samples into zero-extended words.
    pcmpeqw     xmm6, xmm6
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    pand        xmm6, xmm7              ; xmm6=Y(02468ACE)=YE
    psrlw       xmm7, BYTE_BIT          ; xmm7=Y(13579BDF)=YO

    movdqa      xmm1, xmm0              ; xmm1=xmm0=(R-Y)(L/H)
    movdqa      xmm3, xmm2              ; xmm3=xmm2=(G-Y)(L/H)
    movdqa      xmm5, xmm4              ; xmm5=xmm4=(B-Y)(L/H)

    paddw       xmm0, xmm6              ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
    paddw       xmm1, xmm7              ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)

    paddw       xmm2, xmm6              ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
    paddw       xmm3, xmm7              ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)

    paddw       xmm4, xmm6              ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
    paddw       xmm5, xmm7              ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)

%if RGB_PIXELSIZE == 3  ; ---------------

    ; Layout notation: the first digit is the byte position inside a pixel,
    ; the second is the pixel index (0..F) within this 16-pixel block.
    ;
    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)

    punpcklbw   xmmA, xmmC              ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmB              ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
    punpcklbw   xmmD, xmmF              ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)

    movdqa      xmmG, xmmA
    movdqa      xmmH, xmmA
    punpcklwd   xmmA, xmmE              ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
    punpckhwd   xmmG, xmmE              ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)

    psrldq      xmmH, 2                 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
    psrldq      xmmE, 2                 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)

    movdqa      xmmC, xmmD
    movdqa      xmmB, xmmD
    punpcklwd   xmmD, xmmH              ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
    punpckhwd   xmmC, xmmH              ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)

    psrldq      xmmB, 2                 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)

    movdqa      xmmF, xmmE
    punpcklwd   xmmE, xmmB              ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
    punpckhwd   xmmF, xmmB              ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)

    pshufd      xmmH, xmmA, 0x4E        ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
    movdqa      xmmB, xmmE
    punpckldq   xmmA, xmmD              ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
    punpckldq   xmmE, xmmH              ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
    punpckhdq   xmmD, xmmB              ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)

    pshufd      xmmH, xmmG, 0x4E        ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
    movdqa      xmmB, xmmF
    punpckldq   xmmG, xmmC              ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
    punpckldq   xmmF, xmmH              ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
    punpckhdq   xmmC, xmmB              ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)

    punpcklqdq  xmmA, xmmE              ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    punpcklqdq  xmmD, xmmG              ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    punpcklqdq  xmmF, xmmC              ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .tail             ; fewer than 16 pixels remain

    ; Full block: 16 pixels = 48 bytes in xmmA, xmmD, xmmF.
    test        rdi, SIZEOF_XMMWORD-1
    jnz         short .store_unaligned
    ; --(aligned)-------------------
    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
    jmp         short .store_done
.store_unaligned:                       ; --(unaligned)-----------------
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.store_done:
    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         rcx, byte SIZEOF_XMMWORD
    jz          near .row_done

    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
    dec         al                        ; Yctr
    jnz         near .luma_high

    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .chroma_loop

.tail:
    ; Partial block.  From here on rcx counts BYTES still to be written, and
    ; xmmA always holds the next bytes to go out.
    lea         rcx, [rcx+rcx*2]        ; imul ecx, RGB_PIXELSIZE
    cmp         rcx, byte 2*SIZEOF_XMMWORD
    jb          short .tail_lt32b
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmF
    sub         rcx, byte 2*SIZEOF_XMMWORD
    jmp         short .tail_lt16b       ; fewer than 16 bytes can remain now
.tail_lt32b:
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .tail_lt16b
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    add         rdi, byte SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmD
    sub         rcx, byte SIZEOF_XMMWORD
.tail_lt16b:
    ; Emit the low 8 bytes of xmmA if at least 8 bytes are still owed.
    cmp         rcx, byte SIZEOF_MMWORD
    jb          short .tail_lt8b
    movq        XMM_MMWORD [rdi], xmmA
    add         rdi, byte SIZEOF_MMWORD
    sub         rcx, byte SIZEOF_MMWORD
    psrldq      xmmA, SIZEOF_MMWORD
.tail_lt8b:
    ; Emit the low 4 bytes of xmmA if at least 4 bytes are still owed.
    cmp         rcx, byte SIZEOF_DWORD
    jb          short .tail_lt4b
    movd        XMM_DWORD [rdi], xmmA
    add         rdi, byte SIZEOF_DWORD
    sub         rcx, byte SIZEOF_DWORD
    psrldq      xmmA, SIZEOF_DWORD
.tail_lt4b:
    ; The last 0..3 bytes go out through eax (Yctr in al is dead by now).
    movd        eax, xmmA
    cmp         rcx, byte SIZEOF_WORD
    jb          short .tail_lt2b
    mov         word [rdi], ax
    add         rdi, byte SIZEOF_WORD
    sub         rcx, byte SIZEOF_WORD
    shr         rax, 16
.tail_lt2b:
    ; Emit the final single byte, if one is left.
    test        rcx, rcx
    jz          short .row_done
    mov         byte [rdi], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; The fourth component is a constant filler byte.
%ifdef RGBX_FILLER_0XFF
    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%else
    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%endif
    ; Layout notation: the first digit is the byte position inside a pixel,
    ; the second is the pixel index (0..F) within this 16-pixel block.
    ;
    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)

    punpcklbw   xmmA, xmmC              ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmG              ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
    punpcklbw   xmmB, xmmD              ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
    punpcklbw   xmmF, xmmH              ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)

    movdqa      xmmC, xmmA
    punpcklwd   xmmA, xmmE              ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
    punpckhwd   xmmC, xmmE              ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
    movdqa      xmmG, xmmB
    punpcklwd   xmmB, xmmF              ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
    punpckhwd   xmmG, xmmF              ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpckldq   xmmA, xmmB              ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    punpckhdq   xmmD, xmmB              ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    movdqa      xmmH, xmmC
    punpckldq   xmmC, xmmG              ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    punpckhdq   xmmH, xmmG              ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .tail             ; fewer than 16 pixels remain

    ; Full block: 16 pixels = 64 bytes in xmmA, xmmD, xmmC, xmmH.
    test        rdi, SIZEOF_XMMWORD-1
    jnz         short .store_unaligned
    ; --(aligned)-------------------
    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
    jmp         short .store_done
.store_unaligned:                       ; --(unaligned)-----------------
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.store_done:
    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         rcx, byte SIZEOF_XMMWORD
    jz          near .row_done

    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
    dec         al                        ; Yctr
    jnz         near .luma_high

    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .chroma_loop

.tail:
    ; Partial block.  rcx keeps counting PIXELS here (4 bytes each), and
    ; xmmA always holds the next pixels to go out.
    cmp         rcx, byte SIZEOF_XMMWORD/2
    jb          short .tail_lt8px
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmC
    movdqa      xmmD, xmmH
    sub         rcx, byte SIZEOF_XMMWORD/2
.tail_lt8px:
    cmp         rcx, byte SIZEOF_XMMWORD/4
    jb          short .tail_lt4px
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    add         rdi, byte SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmD
    sub         rcx, byte SIZEOF_XMMWORD/4
.tail_lt4px:
    ; Emit two pixels (8 bytes) of xmmA if at least two are still owed.
    cmp         rcx, byte SIZEOF_XMMWORD/8
    jb          short .tail_lt2px
    movq        XMM_MMWORD [rdi], xmmA
    add         rdi, byte SIZEOF_XMMWORD/8*4
    sub         rcx, byte SIZEOF_XMMWORD/8
    psrldq      xmmA, SIZEOF_XMMWORD/8*4
.tail_lt2px:
    ; Emit the final pixel (4 bytes), if one is left.
    test        rcx, rcx
    jz          short .row_done
    movd        XMM_DWORD [rdi], xmmA

%endif  ; RGB_PIXELSIZE ; ---------------

.row_done:
    sfence                              ; flush the write buffer

.epilogue:
    pop         rbx
    uncollect_args 4
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- value saved in the prologue
    pop         rbp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; This routine contains no SIMD code of its own.  It produces two output rows
; by calling jsimd_h2v1_merged_upsample_sse2 twice.  For each call it builds
; a three-entry plane table on the stack:
;
;   { &input_buf[0][in_row_group_ctr] (+1 entry for the 2nd call),
;     input_buf[1], input_buf[2] }
;
; and passes in_row_group_ctr through unchanged.  The h2v1 routine indexes
; every table entry by in_row_group_ctr again, so the luma rows it ends up
; reading are input_buf[0][2*in_row_group_ctr] and the row after it, while
; both calls read the same chroma rows input_buf[1..2][in_row_group_ctr].
; The output row-table pointer is advanced by one JSAMPROW between the calls.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; r10d = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf
; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)

EXTN(jsimd_h2v2_merged_upsample_sse2):
    push        rbp
    mov         rax, rsp                ; NOTE(review): overwritten below before any
                                        ; visible use; presumably required by the
                                        ; collect_args convention -- confirm in
                                        ; jsimdext.inc
    mov         rbp, rsp
    collect_args 4
    push        rbx

    ; Fetch the three plane row tables and bias the luma table pointer.
    mov         rdi, r11                                    ; rdi = input_buf
    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]   ; rsi = input_buf[0]
    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]   ; rbx = input_buf[1]
    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]   ; rdx = input_buf[2]
    mov         ecx, r12d                                   ; rcx = in_row_group_ctr
    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]              ; rsi = &input_buf[0][rcx]
    mov         rdi, r13                                    ; rdi = output_buf
    mov         eax, r10d                                   ; rax = output_width

    ; ---- first output row ------------------------------------------------

    push        rdx                     ; inptr2
    push        rbx                     ; inptr1
    push        rsi                     ; inptr00
    mov         rbx, rsp                ; rbx -> { inptr00, inptr1, inptr2 }

    push        rdi                     ; keep output_buf, the row-group counter
    push        rcx                     ;   and the width alive across the call
    push        rax

%ifdef WIN64
    mov         rdx, rbx                ; arg 2 = temporary plane table
    mov         r9, rdi                 ; arg 4 = output_buf
    mov         r8, rcx                 ; arg 3 = in_row_group_ctr (read rcx first)
    mov         rcx, rax                ; arg 1 = output_width
%else
    mov         rsi, rbx                ; arg 2 = temporary plane table
    mov         rdx, rcx                ; arg 3 = in_row_group_ctr (read rcx first)
    mov         rcx, rdi                ; arg 4 = output_buf (read rdi first)
    mov         rdi, rax                ; arg 1 = output_width
%endif

    call        EXTN(jsimd_h2v1_merged_upsample_sse2)

    pop         rax                     ; output_width
    pop         rcx                     ; in_row_group_ctr
    pop         rdi                     ; output_buf
    pop         rsi                     ; inptr00
    pop         rbx                     ; inptr1
    pop         rdx                     ; inptr2

    ; ---- second output row -----------------------------------------------

    add         rsi, byte SIZEOF_JSAMPROW   ; inptr01
    add         rdi, byte SIZEOF_JSAMPROW   ; outptr1

    push        rdx                     ; inptr2
    push        rbx                     ; inptr1
    push        rsi                     ; inptr01
    mov         rbx, rsp                ; rbx -> { inptr01, inptr1, inptr2 }

    push        rdi
    push        rcx
    push        rax

%ifdef WIN64
    mov         rdx, rbx                ; arg 2 = temporary plane table
    mov         r9, rdi                 ; arg 4 = output_buf + 1 row
    mov         r8, rcx                 ; arg 3 = in_row_group_ctr (read rcx first)
    mov         rcx, rax                ; arg 1 = output_width
%else
    mov         rsi, rbx                ; arg 2 = temporary plane table
    mov         rdx, rcx                ; arg 3 = in_row_group_ctr (read rcx first)
    mov         rcx, rdi                ; arg 4 = output_buf + 1 row (read rdi first)
    mov         rdi, rax                ; arg 1 = output_width
%endif

    call        EXTN(jsimd_h2v1_merged_upsample_sse2)

    pop         rax                     ; unwind the six slots pushed above
    pop         rcx
    pop         rdi
    pop         rsi
    pop         rbx
    pop         rdx

    pop         rbx                     ; restore the caller's rbx
    uncollect_args 4
    pop         rbp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
695
TMessagesProj/jni/mozjpeg/simd/x86_64/jdsample-avx2.asm
Normal file
695
TMessagesProj/jni/mozjpeg/simd/x86_64/jdsample-avx2.asm
Normal file
|
|
@ -0,0 +1,695 @@
|
|||
;
|
||||
; jdsample.asm - upsampling (64-bit AVX2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
; Constant table for the AVX2 fancy upsamplers.  Every entry is sixteen
; 16-bit words, i.e. exactly one YMMWORD, so each can be used directly as a
; 256-bit memory operand.
;
;   PW_THREE        weight applied to the nearer sample
;   PW_ONE, PW_TWO  terms added before the ">> 2" in the h2v1 routine
;   PW_SEVEN, PW_EIGHT
;                   terms added before the ">> 4" in the h2v2 routine

    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_avx2)

EXTN(jconst_fancy_upsample_avx2):

PW_ONE   times 16 dw 1
PW_TWO   times 16 dw 2
PW_THREE times 16 dw 3
PW_SEVEN times 16 dw 7
PW_EIGHT times 16 dw 8

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
|
||||
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; For input samples s[i], each 32-sample block yields 64 output samples:
;   out[2i]   = (3 * s[i] + s[i-1] + 1) >> 2
;   out[2i+1] = (3 * s[i] + s[i+1] + 2) >> 2
; At the row edges the missing neighbour is replaced by the edge sample
; itself (first sample via the ymm10 mask, last sample via the ymm9 mask).
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Notes:
;  * When downsampled_width is not a multiple of SIZEOF_YMMWORD, one dummy
;    sample is WRITTEN just past the end of each input row, and whole
;    32-byte blocks are loaded/stored, so rows are presumably allocated
;    with padding by the caller -- TODO confirm.
;  * NOTE(review): xmm8-xmm10 are written here and are callee-saved under
;    the Microsoft x64 ABI; confirm that "push_xmm 3" (jsimdext.inc)
;    preserves every such register this routine modifies.
;

; r10d = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)

EXTN(jsimd_h2v1_fancy_upsample_avx2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    push_xmm    3
    collect_args 4

    mov         eax, r11d               ; colctr (zero-extended)
    test        rax, rax
    jz          near .return

    ; max_v_samp_factor is a 32-bit int; the upper half of r10 is not
    ; guaranteed by the calling convention, so sign-extend explicitly.
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data

    ; Loop-invariant registers: ymm0 = zero, ymm10 = lowest-byte mask,
    ; ymm9 = highest-byte mask.
    vpxor       ymm0, ymm0, ymm0                 ; ymm0=(all 0's)
    vpcmpeqb    xmm9, xmm9, xmm9
    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff

    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-1)
    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ff) MSB is ff

.rowloop:
    push        rax                     ; colctr
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr

    ; If the width is not a whole number of blocks, duplicate the last real
    ; sample one position to the right so it acts as its own right-hand
    ; neighbour.
    test        rax, SIZEOF_YMMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
.skip:
    ; ymm7 carries the "sample -1" neighbour into .upsample; for the first
    ; block it is sample 0 itself.
    vpand       ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD]

    ; Round colctr up to a multiple of the block size.
    add         rax, byte SIZEOF_YMMWORD-1
    and         rax, byte -SIZEOF_YMMWORD
    cmp         rax, byte SIZEOF_YMMWORD
    ja          short .columnloop

.columnloop_last:
    ; Last block: the right-hand neighbour of sample 31 is sample 31.
    vpand       ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    jmp         short .upsample

.columnloop:
    ; Not the last block: the right-hand neighbour of sample 31 is the
    ; first sample of the next block, moved to the top byte of ymm6.
    vmovdqu     ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vperm2i128  ymm6, ymm0, ymm6, 0x20
    vpslldq     ymm6, ymm6, 15

.upsample:
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)

    vperm2i128  ymm2, ymm0, ymm1, 0x20
    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
    vperm2i128  ymm4, ymm0, ymm1, 0x03
    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)

    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)

    ; Save sample 31 as the left-hand neighbour for the next block.
    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)

    ; Widen bytes to words.  The unpack instructions work per 128-bit lane,
    ; so a lane permute afterwards restores sequential order.
    vpunpckhbw  ymm4, ymm1, ymm0        ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm1, ymm0        ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm5, ymm4, 0x20  ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm2, ymm0        ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
    vpunpcklbw  ymm6, ymm2, ymm0        ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
    vperm2i128  ymm2, ymm6, ymm5, 0x20  ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpunpckhbw  ymm6, ymm3, ymm0        ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
    vpunpcklbw  ymm8, ymm3, ymm0        ; ymm8=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
    vperm2i128  ymm3, ymm8, ymm6, 0x20  ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vperm2i128  ymm6, ymm8, ymm6, 0x31  ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    ; 3*centre, plus the per-phase constant on each neighbour vector.
    vpmullw     ymm1, ymm1, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]
    vpaddw      ymm2, ymm2, [rel PW_ONE]
    vpaddw      ymm5, ymm5, [rel PW_ONE]
    vpaddw      ymm3, ymm3, [rel PW_TWO]
    vpaddw      ymm6, ymm6, [rel PW_TWO]

    vpaddw      ymm2, ymm2, ymm1
    vpaddw      ymm5, ymm5, ymm4
    vpsrlw      ymm2, ymm2, 2           ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm5, ymm5, 2           ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm3, ymm3, ymm1
    vpaddw      ymm6, ymm6, ymm4
    vpsrlw      ymm3, ymm3, 2           ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm6, ymm6, 2           ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    ; Interleave: even outputs stay in the low byte of each word, odd
    ; outputs are shifted into the high byte.
    vpsllw      ymm3, ymm3, BYTE_BIT
    vpsllw      ymm6, ymm6, BYTE_BIT
    vpor        ymm2, ymm2, ymm3        ; ymm2=OutL=( 0  1  2 ... 29 30 31)
    vpor        ymm5, ymm5, ymm6        ; ymm5=OutH=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5

    sub         rax, byte SIZEOF_YMMWORD
    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
    cmp         rax, byte SIZEOF_YMMWORD
    ja          near .columnloop
    test        rax, rax                ; exactly one block left?
    jnz         near .columnloop_last

    pop         rsi
    pop         rdi
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    uncollect_args 4
    pop_xmm     3
    pop         rbp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; Each input row produces two output rows.  The work is done in two passes
; per 32-sample block:
;   1. vertical:   Int0 = 3 * row[0] + row[-1],  Int1 = 3 * row[0] + row[+1]
;                  (16-bit words, parked temporarily in the output rows)
;   2. horizontal: out[2i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;                  out[2i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
; wk(0)/wk(1) carry the left-hand neighbour word of the upper/lower row
; into the next block; wk(2)/wk(3) hold the right-hand neighbour word.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Notes:
;  * input_data[-1] and input_data[+1] are dereferenced for every row, and
;    when the width is not a multiple of SIZEOF_YMMWORD a dummy sample is
;    WRITTEN just past the end of all three input rows; rows are presumably
;    allocated with padding by the caller -- TODO confirm.
;  * NOTE(review): xmm8-xmm10 are written here and are callee-saved under
;    the Microsoft x64 ABI; confirm that "push_xmm 3" (jsimdext.inc)
;    preserves every such register this routine modifies.
;

; r10d = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
%define WK_NUM  4

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)

EXTN(jsimd_h2v2_fancy_upsample_avx2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp after saving rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    mov         [rsp], rax                   ; remember the unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [wk(0)]                 ; reserve wk[] below rbp
    push_xmm    3
    collect_args 4
    push        rbx

    mov         eax, r11d               ; colctr (zero-extended)
    test        rax, rax
    jz          near .return

    ; max_v_samp_factor is a 32-bit int; the upper half of r10 is not
    ; guaranteed by the calling convention, so sign-extend explicitly.
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data

    ; Loop-invariant registers (never written inside the loops):
    ; ymm8 = zero, ymm10 = lowest-word mask, ymm9 = highest-word mask.
    vpxor       ymm8, ymm8, ymm8                 ; ymm8=(all 0's)
    vpcmpeqb    xmm9, xmm9, xmm9
    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-2)
    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ffff) MSB is ffff

.rowloop:
    push        rax                     ; colctr
    push        rcx
    push        rdi
    push        rsi

    mov         rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1

    ; If the width is not a whole number of blocks, duplicate the last real
    ; sample of each of the three input rows one position to the right.
    test        rax, SIZEOF_YMMWORD-1
    jz          short .skip
    push        rdx                     ; dl is needed as scratch
    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl   ; insert a dummy sample
    pop         rdx
.skip:
    ; -- process the first column block

    vmovdqu     ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
    vmovdqu     ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
    vmovdqu     ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]

    ; Widen bytes to words; unpack is per 128-bit lane, so permute lanes
    ; afterwards to restore sequential order.
    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm3, ymm2, ymm8        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpmullw     ymm0, ymm0, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]

    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6

    ; Left edge: word 0 is its own left-hand neighbour.
    vpand       ymm1, ymm1, ymm10       ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vpand       ymm2, ymm2, ymm10       ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)

    vmovdqa     YMMWORD [wk(0)], ymm1
    vmovdqa     YMMWORD [wk(1)], ymm2

    ; Round colctr up to a multiple of the block size.
    add         rax, byte SIZEOF_YMMWORD-1
    and         rax, byte -SIZEOF_YMMWORD
    cmp         rax, byte SIZEOF_YMMWORD
    ja          short .columnloop

.columnloop_last:
    ; -- process the last column block
    ; Right edge: word 31 is its own right-hand neighbour.

    vpand       ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD]
    vpand       ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD]

    vmovdqa     YMMWORD [wk(2)], ymm1   ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
    vmovdqa     YMMWORD [wk(3)], ymm2   ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)

    jmp         near .upsample

.columnloop:
    ; -- process the next column block
    ; Run the vertical pass one block ahead so that the current block's
    ; right-hand neighbour word is available.

    vmovdqu     ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
    vmovdqu     ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
    vmovdqu     ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]

    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm7, ymm2, ymm8        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpmullw     ymm0, ymm0, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]

    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vmovdqu     YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
    vmovdqu     YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6

    ; Move word 0 of the next block to the top word position.
    vperm2i128  ymm1, ymm8, ymm1, 0x20
    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
    vperm2i128  ymm2, ymm8, ymm2, 0x20
    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)

    vmovdqa     YMMWORD [wk(2)], ymm1
    vmovdqa     YMMWORD [wk(3)], ymm2

.upsample:
    ; -- process the upper row

    vmovdqu     ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vmovdqu     ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vperm2i128  ymm0, ymm8, ymm7, 0x03
    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
    vperm2i128  ymm4, ymm8, ymm3, 0x20
    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)

    vperm2i128  ymm5, ymm8, ymm7, 0x03
    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm6, ymm8, ymm3, 0x20
    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vperm2i128  ymm2, ymm8, ymm3, 0x03
    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
    vperm2i128  ymm4, ymm8, ymm3, 0x03
    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm1, ymm8, ymm7, 0x20
    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)

    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    vmovdqa     YMMWORD [wk(0)], ymm4   ; left-hand neighbour for the next block

    vpmullw     ymm7, ymm7, [rel PW_THREE]
    vpmullw     ymm3, ymm3, [rel PW_THREE]
    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
    vpaddw      ymm5, ymm5, [rel PW_EIGHT]
    vpaddw      ymm0, ymm0, [rel PW_SEVEN]
    vpaddw      ymm2, ymm2, [rel PW_SEVEN]

    vpaddw      ymm1, ymm1, ymm7
    vpaddw      ymm5, ymm5, ymm3
    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm0, ymm0, ymm7
    vpaddw      ymm2, ymm2, ymm3
    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    vpsllw      ymm0, ymm0, BYTE_BIT
    vpsllw      ymm2, ymm2, BYTE_BIT
    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5

    ; -- process the lower row

    vmovdqu     ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vmovdqu     ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vperm2i128  ymm7, ymm8, ymm6, 0x03
    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
    vperm2i128  ymm3, ymm8, ymm4, 0x20
    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)

    vperm2i128  ymm0, ymm8, ymm6, 0x03
    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm2, ymm8, ymm4, 0x20
    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vperm2i128  ymm5, ymm8, ymm4, 0x03
    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
    vperm2i128  ymm3, ymm8, ymm4, 0x03
    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm1, ymm8, ymm6, 0x20
    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)

    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    vmovdqa     YMMWORD [wk(1)], ymm3   ; left-hand neighbour for the next block

    vpmullw     ymm6, ymm6, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]
    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
    vpaddw      ymm0, ymm0, [rel PW_EIGHT]
    vpaddw      ymm7, ymm7, [rel PW_SEVEN]
    vpaddw      ymm5, ymm5, [rel PW_SEVEN]

    vpaddw      ymm1, ymm1, ymm6
    vpaddw      ymm0, ymm0, ymm4
    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm7, ymm7, ymm6
    vpaddw      ymm5, ymm5, ymm4
    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    vpsllw      ymm7, ymm7, BYTE_BIT
    vpsllw      ymm5, ymm5, BYTE_BIT
    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0

    sub         rax, byte SIZEOF_YMMWORD
    add         rcx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
    add         rbx, byte 1*SIZEOF_YMMWORD  ; inptr0
    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
    add         rdx, byte 2*SIZEOF_YMMWORD  ; outptr0
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr1
    cmp         rax, byte SIZEOF_YMMWORD
    ja          near .columnloop
    test        rax, rax                ; exactly one block left?
    jnz         near .columnloop_last

    pop         rsi
    pop         rdi
    pop         rcx
    pop         rax

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    uncollect_args 4
    pop_xmm     3
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- unaligned rsp saved in prologue
    pop         rbp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; Every input sample is simply written twice in a row.  The column counter
; is output_width rounded up to a multiple of SIZEOF_YMMWORD, so whole
; vector blocks are loaded and stored; the row buffers are presumably
; padded accordingly by the caller -- TODO confirm.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;

; r10d = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)

EXTN(jsimd_h2v1_upsample_avx2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 4

    ; rdx = output_width rounded up to a multiple of SIZEOF_YMMWORD
    mov         edx, r11d
    add         rdx, byte (SIZEOF_YMMWORD-1)
    and         rdx, -SIZEOF_YMMWORD
    jz          near .return

    ; max_v_samp_factor is a 32-bit int; the upper half of r10 is not
    ; guaranteed by the calling convention, so sign-extend explicitly.
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          short .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data

.rowloop:
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr
    mov         rax, rdx                ; colctr (output samples remaining)

.columnloop:
    cmp         rax, byte SIZEOF_YMMWORD
    ja          near .above_16

    ; Final 32 output samples: only 16 input samples are needed, so use
    ; the 128-bit forms.
    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0
    vpunpcklbw  xmm0, xmm0, xmm0

    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    jmp         short .nextrow

.above_16:
    ; More than 16 input samples left: expand 32 input samples into 64
    ; output samples.
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]

    ; Pre-shuffle the quadwords so that the lane-wise unpacks below yield
    ; sequentially ordered output.
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0
    vpunpcklbw  ymm0, ymm0, ymm0

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1

    sub         rax, byte 2*SIZEOF_YMMWORD
    jz          short .nextrow

    add         rsi, byte SIZEOF_YMMWORD    ; inptr
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          short .rowloop

.return:
    vzeroupper
    uncollect_args 4
    pop         rbp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; Every input sample is written twice horizontally, and the resulting row
; is stored to two consecutive output rows.  The column counter is
; output_width rounded up to a multiple of SIZEOF_YMMWORD, so whole vector
; blocks are loaded and stored; the row buffers are presumably padded
; accordingly by the caller -- TODO confirm.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;

; r10d = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)

EXTN(jsimd_h2v2_upsample_avx2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 4
    push        rbx                     ; callee-saved, used for outptr0

    ; rdx = output_width rounded up to a multiple of SIZEOF_YMMWORD
    mov         edx, r11d
    add         rdx, byte (SIZEOF_YMMWORD-1)
    and         rdx, -SIZEOF_YMMWORD
    jz          near .return

    ; max_v_samp_factor is a 32-bit int; the upper half of r10 is not
    ; guaranteed by the calling convention, so sign-extend explicitly.
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data

.rowloop:
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]                    ; inptr
    mov         rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov         rax, rdx                               ; colctr (output samples remaining)

.columnloop:
    cmp         rax, byte SIZEOF_YMMWORD
    ja          short .above_16

    ; Final 32 output samples: only 16 input samples are needed, so use
    ; the 128-bit forms.
    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0
    vpunpcklbw  xmm0, xmm0, xmm0

    vmovdqu     XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    jmp         near .nextrow

.above_16:
    ; More than 16 input samples left: expand 32 input samples into 64
    ; output samples and store them to both output rows.
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]

    ; Pre-shuffle the quadwords so that the lane-wise unpacks below yield
    ; sequentially ordered output.
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0
    vpunpcklbw  ymm0, ymm0, ymm0

    vmovdqu     YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1

    sub         rax, byte 2*SIZEOF_YMMWORD
    jz          short .nextrow

    add         rsi, byte SIZEOF_YMMWORD  ; inptr
    add         rbx, 2*SIZEOF_YMMWORD     ; outptr0
    add         rdi, 2*SIZEOF_YMMWORD     ; outptr1
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr (two output rows per pass)
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    uncollect_args 4
    pop         rbp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
664
TMessagesProj/jni/mozjpeg/simd/x86_64/jdsample-sse2.asm
Normal file
664
TMessagesProj/jni/mozjpeg/simd/x86_64/jdsample-sse2.asm
Normal file
|
|
@ -0,0 +1,664 @@
|
|||
;
|
||||
; jdsample.asm - upsampling (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Word constants for the fancy (triangle-filter) upsamplers in this file.
; Each entry is 8 x 16-bit words (one XMMWORD) holding the same value in
; every lane, so it can be used directly as a memory operand.
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_fancy_upsample_sse2)
|
||||
|
||||
EXTN(jconst_fancy_upsample_sse2):
|
||||
|
||||
; h2v1 rounding term for even outputs (added before the >>2)
PW_ONE times 8 dw 1
|
||||
; h2v1 rounding term for odd outputs (added before the >>2)
PW_TWO times 8 dw 2
|
||||
; weight applied to the nearer sample (h2v1 and h2v2)
PW_THREE times 8 dw 3
|
||||
; h2v2 rounding term for odd outputs (added before the >>4)
PW_SEVEN times 8 dw 7
|
||||
; h2v2 rounding term for even outputs (added before the >>4)
PW_EIGHT times 8 dw 8
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
|
||||
;
|
||||
; The upsampling algorithm is linear interpolation between pixel centers,
|
||||
; also known as a "triangle filter". This is a good compromise between
|
||||
; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
|
||||
; of the way between input pixel centers.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
|
||||
; JDIMENSION downsampled_width,
|
||||
; JSAMPARRAY input_data,
|
||||
; JSAMPARRAY *output_data_ptr);
|
||||
;
|
||||
|
||||
; r10 = int max_v_samp_factor
|
||||
; r11d = JDIMENSION downsampled_width
|
||||
; r12 = JSAMPARRAY input_data
|
||||
; r13 = JSAMPARRAY *output_data_ptr
|
||||
|
||||
; jsimd_h2v1_fancy_upsample_sse2
;
; For every row, each block of 16 input samples produces 32 output samples:
;   out[2i]   = (3 * in[i] + in[i-1] + 1) >> 2
;   out[2i+1] = (3 * in[i] + in[i+1] + 2) >> 2
; At the left edge in[-1] is taken to be in[0]; at the right edge the
; neighbour beyond the last block is that block's own last sample.
;
; Register roles inside the row loop:
;   rax = colctr (rounded up to a multiple of SIZEOF_XMMWORD)
;   rcx = rowctr, rsi = inptr, rdi = outptr
;   xmm0 = zero, xmm7 = left neighbour (byte 0), xmm6 = right neighbour (byte 15)
;
; NOTE(review): all loads/stores use movdqa, so the row buffers are assumed to
; be 16-byte aligned and padded to a whole XMMWORD - confirm against callers.
align 32
|
||||
GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
|
||||
|
||||
EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
||||
push rbp
|
||||
; rax is set before collect_args; presumably the macro (defined in
; jsimdext.inc, not shown here) uses it to reach stack arguments - confirm.
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 4
|
||||
|
||||
; Writing eax zero-extends the 32-bit width into rax.
mov eax, r11d ; colctr
|
||||
test rax, rax
|
||||
jz near .return
|
||||
|
||||
mov rcx, r10 ; rowctr
|
||||
test rcx, rcx
|
||||
jz near .return
|
||||
|
||||
mov rsi, r12 ; input_data
|
||||
mov rdi, r13
|
||||
mov rdi, JSAMPARRAY [rdi] ; output_data
|
||||
.rowloop:
|
||||
push rax ; colctr
|
||||
push rdi
|
||||
push rsi
|
||||
|
||||
mov rsi, JSAMPROW [rsi] ; inptr
|
||||
mov rdi, JSAMPROW [rdi] ; outptr
|
||||
|
||||
; If the width is not a multiple of 16, replicate the last real sample one
; position past the end so it serves as the right-hand neighbour.
; Note that this writes into the input row.
test rax, SIZEOF_XMMWORD-1
|
||||
jz short .skip
|
||||
mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
|
||||
.skip:
|
||||
pxor xmm0, xmm0 ; xmm0=(all 0's)
|
||||
; xmm7 = mask selecting byte 0, then = sample 0 of the row: the left-edge
; neighbour "in[-1]" for the first block.
pcmpeqb xmm7, xmm7
|
||||
psrldq xmm7, (SIZEOF_XMMWORD-1)
|
||||
pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
|
||||
; Round colctr up to a whole number of XMMWORDs.
add rax, byte SIZEOF_XMMWORD-1
|
||||
and rax, byte -SIZEOF_XMMWORD
|
||||
cmp rax, byte SIZEOF_XMMWORD
|
||||
ja short .columnloop
|
||||
|
||||
.columnloop_last:
|
||||
; Last block: the right neighbour is this block's own byte 15.
pcmpeqb xmm6, xmm6
|
||||
pslldq xmm6, (SIZEOF_XMMWORD-1)
|
||||
pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
jmp short .upsample
|
||||
|
||||
.columnloop:
|
||||
; Not the last block: the right neighbour is byte 0 of the next block,
; moved into byte 15.
movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||
pslldq xmm6, (SIZEOF_XMMWORD-1)
|
||||
|
||||
.upsample:
|
||||
movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
movdqa xmm2, xmm1
|
||||
movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
|
||||
pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14)
|
||||
psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --)
|
||||
|
||||
por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
|
||||
por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
|
||||
|
||||
; Carry this block's last sample forward as the next block's left neighbour.
movdqa xmm7, xmm1
|
||||
psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
|
||||
|
||||
; Widen bytes to 16-bit words by interleaving with zero.
movdqa xmm4, xmm1
|
||||
punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
|
||||
punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
|
||||
movdqa xmm5, xmm2
|
||||
punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
|
||||
punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
|
||||
movdqa xmm6, xmm3
|
||||
punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
|
||||
punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
|
||||
|
||||
; 3*centre, plus the rounding terms on the two neighbour vectors.
pmullw xmm1, [rel PW_THREE]
|
||||
pmullw xmm4, [rel PW_THREE]
|
||||
paddw xmm2, [rel PW_ONE]
|
||||
paddw xmm5, [rel PW_ONE]
|
||||
paddw xmm3, [rel PW_TWO]
|
||||
paddw xmm6, [rel PW_TWO]
|
||||
|
||||
paddw xmm2, xmm1
|
||||
paddw xmm5, xmm4
|
||||
psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
|
||||
psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
|
||||
paddw xmm3, xmm1
|
||||
paddw xmm6, xmm4
|
||||
psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
|
||||
psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
|
||||
|
||||
; Interleave: odd results go to the high byte of each word, even to the low.
psllw xmm3, BYTE_BIT
|
||||
psllw xmm6, BYTE_BIT
|
||||
por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
|
||||
por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
|
||||
|
||||
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
|
||||
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
|
||||
|
||||
; Advance one input block / two output blocks. More than one block left ->
; .columnloop; exactly one left -> .columnloop_last; none -> row finished.
sub rax, byte SIZEOF_XMMWORD
|
||||
add rsi, byte 1*SIZEOF_XMMWORD ; inptr
|
||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
cmp rax, byte SIZEOF_XMMWORD
|
||||
ja near .columnloop
|
||||
test eax, eax
|
||||
jnz near .columnloop_last
|
||||
|
||||
pop rsi
|
||||
pop rdi
|
||||
pop rax
|
||||
|
||||
; One input row produces one output row.
add rsi, byte SIZEOF_JSAMPROW ; input_data
|
||||
add rdi, byte SIZEOF_JSAMPROW ; output_data
|
||||
dec rcx ; rowctr
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
uncollect_args 4
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
|
||||
; Again a triangle filter; see comments for h2v1 case, above.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
|
||||
; JDIMENSION downsampled_width,
|
||||
; JSAMPARRAY input_data,
|
||||
; JSAMPARRAY *output_data_ptr);
|
||||
;
|
||||
|
||||
; r10 = int max_v_samp_factor
|
||||
; r11d = JDIMENSION downsampled_width
|
||||
; r12 = JSAMPARRAY input_data
|
||||
; r13 = JSAMPARRAY *output_data_ptr
|
||||
|
||||
; jsimd_h2v2_fancy_upsample_sse2
;
; Two-stage triangle filter. For each input row, two output rows are made:
;   vertical stage (16-bit intermediates):
;     Int0 = 3 * row[0] + row[-1]      (upper output row)
;     Int1 = 3 * row[0] + row[+1]      (lower output row)
;   horizontal stage, applied to each Int row:
;     out[2i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;     out[2i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
; The intermediates are parked in the output rows themselves and then
; overwritten with the final bytes.
;
; Scratch slots: wk(0)/wk(1) hold the left-neighbour word for the upper/lower
; row, wk(2)/wk(3) hold the right-neighbour word for the upper/lower row.
;
; wk(i) addresses the i-th XMMWORD slot of a WK_NUM-entry scratch array that
; sits immediately below the aligned rbp.
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 4
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
|
||||
|
||||
EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
||||
; Prologue: build a 16-byte-aligned frame. The pre-alignment rsp is stored at
; [aligned rsp] so the epilogue can recover it with "pop rsp".
push rbp
|
||||
mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
mov eax, r11d ; colctr
|
||||
test rax, rax
|
||||
jz near .return
|
||||
|
||||
mov rcx, r10 ; rowctr
|
||||
test rcx, rcx
|
||||
jz near .return
|
||||
|
||||
mov rsi, r12 ; input_data
|
||||
mov rdi, r13
|
||||
mov rdi, JSAMPARRAY [rdi] ; output_data
|
||||
.rowloop:
|
||||
push rax ; colctr
|
||||
push rcx
|
||||
push rdi
|
||||
push rsi
|
||||
|
||||
; Reads input_data[-1] and input_data[+1]: the caller must provide valid
; row pointers on both sides of the current row.
mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
|
||||
mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
|
||||
mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
|
||||
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
|
||||
mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
|
||||
|
||||
; If the width is not a multiple of 16, replicate the last real sample of
; each of the three input rows one position past the end (writes into the
; input rows). rdx is saved because dl is used as the scratch byte.
test rax, SIZEOF_XMMWORD-1
|
||||
jz short .skip
|
||||
push rdx
|
||||
mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
|
||||
mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
|
||||
mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
|
||||
pop rdx
|
||||
.skip:
|
||||
; -- process the first column block
|
||||
|
||||
movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
|
||||
movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
|
||||
movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
|
||||
|
||||
pxor xmm3, xmm3 ; xmm3=(all 0's)
|
||||
movdqa xmm4, xmm0
|
||||
punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
|
||||
punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
|
||||
movdqa xmm5, xmm1
|
||||
punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
|
||||
punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
|
||||
movdqa xmm6, xmm2
|
||||
punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
|
||||
punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
|
||||
|
||||
pmullw xmm0, [rel PW_THREE]
|
||||
pmullw xmm4, [rel PW_THREE]
|
||||
|
||||
; xmm7 = mask selecting word 0
pcmpeqb xmm7, xmm7
|
||||
psrldq xmm7, (SIZEOF_XMMWORD-2)
|
||||
|
||||
paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
|
||||
paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
|
||||
paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
|
||||
paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
|
||||
|
||||
movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
|
||||
movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
|
||||
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
|
||||
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
|
||||
|
||||
; Left edge: the neighbour "Int[-1]" is Int[0] itself.
pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
|
||||
pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm1
|
||||
movdqa XMMWORD [wk(1)], xmm2
|
||||
|
||||
; Round colctr up to a whole number of XMMWORDs.
add rax, byte SIZEOF_XMMWORD-1
|
||||
and rax, byte -SIZEOF_XMMWORD
|
||||
cmp rax, byte SIZEOF_XMMWORD
|
||||
ja short .columnloop
|
||||
|
||||
.columnloop_last:
|
||||
; -- process the last column block
|
||||
|
||||
; Right edge: the neighbour "Int[16]" is this block's own Int[15], kept in
; word 7 of the parked high half.
pcmpeqb xmm1, xmm1
|
||||
pslldq xmm1, (SIZEOF_XMMWORD-2)
|
||||
movdqa xmm2, xmm1
|
||||
|
||||
pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
|
||||
pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
|
||||
|
||||
movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
|
||||
movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
|
||||
|
||||
jmp near .upsample
|
||||
|
||||
.columnloop:
|
||||
; -- process the next column block
|
||||
|
||||
; Compute the vertical stage for the NEXT block ahead of time, so its first
; word is available as the current block's right neighbour.
movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
|
||||
movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
|
||||
movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
|
||||
|
||||
pxor xmm3, xmm3 ; xmm3=(all 0's)
|
||||
movdqa xmm4, xmm0
|
||||
punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
|
||||
punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
|
||||
movdqa xmm5, xmm1
|
||||
punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
|
||||
punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
|
||||
movdqa xmm6, xmm2
|
||||
punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
|
||||
punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
|
||||
|
||||
pmullw xmm0, [rel PW_THREE]
|
||||
pmullw xmm4, [rel PW_THREE]
|
||||
|
||||
paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
|
||||
paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
|
||||
paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
|
||||
paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
|
||||
|
||||
movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
|
||||
movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
|
||||
movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
|
||||
movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
|
||||
|
||||
pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
|
||||
pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
|
||||
|
||||
movdqa XMMWORD [wk(2)], xmm1
|
||||
movdqa XMMWORD [wk(3)], xmm2
|
||||
|
||||
.upsample:
|
||||
; -- process the upper row
|
||||
|
||||
; Reload the parked intermediates for the current block.
movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
|
||||
movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
|
||||
|
||||
movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
|
||||
movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
|
||||
psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --)
|
||||
pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
|
||||
movdqa xmm5, xmm7
|
||||
movdqa xmm6, xmm3
|
||||
psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
|
||||
pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14)
|
||||
|
||||
por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
|
||||
por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
|
||||
|
||||
movdqa xmm1, xmm7
|
||||
movdqa xmm2, xmm3
|
||||
pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
|
||||
psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --)
|
||||
movdqa xmm4, xmm3
|
||||
psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
|
||||
|
||||
por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
|
||||
por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
|
||||
|
||||
; This block's last word becomes the next block's left neighbour.
movdqa XMMWORD [wk(0)], xmm4
|
||||
|
||||
pmullw xmm7, [rel PW_THREE]
|
||||
pmullw xmm3, [rel PW_THREE]
|
||||
paddw xmm1, [rel PW_EIGHT]
|
||||
paddw xmm5, [rel PW_EIGHT]
|
||||
paddw xmm0, [rel PW_SEVEN]
|
||||
paddw xmm2, [rel PW_SEVEN]
|
||||
|
||||
paddw xmm1, xmm7
|
||||
paddw xmm5, xmm3
|
||||
psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
|
||||
psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
|
||||
paddw xmm0, xmm7
|
||||
paddw xmm2, xmm3
|
||||
psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
|
||||
psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
|
||||
|
||||
; Interleave: odd results go to the high byte of each word, even to the low.
psllw xmm0, BYTE_BIT
|
||||
psllw xmm2, BYTE_BIT
|
||||
por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
|
||||
por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
|
||||
|
||||
; Final bytes overwrite the parked intermediates of the upper row.
movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
|
||||
movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
|
||||
|
||||
; -- process the lower row
|
||||
|
||||
movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
|
||||
movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
|
||||
|
||||
movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
|
||||
movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
|
||||
psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --)
|
||||
pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
|
||||
movdqa xmm0, xmm6
|
||||
movdqa xmm2, xmm4
|
||||
psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
|
||||
pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14)
|
||||
|
||||
por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
|
||||
por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
|
||||
|
||||
movdqa xmm1, xmm6
|
||||
movdqa xmm5, xmm4
|
||||
pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
|
||||
psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --)
|
||||
movdqa xmm3, xmm4
|
||||
psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
|
||||
|
||||
por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
|
||||
por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
|
||||
|
||||
; This block's last word becomes the next block's left neighbour.
movdqa XMMWORD [wk(1)], xmm3
|
||||
|
||||
pmullw xmm6, [rel PW_THREE]
|
||||
pmullw xmm4, [rel PW_THREE]
|
||||
paddw xmm1, [rel PW_EIGHT]
|
||||
paddw xmm0, [rel PW_EIGHT]
|
||||
paddw xmm7, [rel PW_SEVEN]
|
||||
paddw xmm5, [rel PW_SEVEN]
|
||||
|
||||
paddw xmm1, xmm6
|
||||
paddw xmm0, xmm4
|
||||
psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
|
||||
psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
|
||||
paddw xmm7, xmm6
|
||||
paddw xmm5, xmm4
|
||||
psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
|
||||
psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
|
||||
|
||||
psllw xmm7, BYTE_BIT
|
||||
psllw xmm5, BYTE_BIT
|
||||
por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
|
||||
por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
|
||||
|
||||
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
|
||||
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
|
||||
|
||||
; Advance one input block / two output blocks. More than one block left ->
; .columnloop; exactly one left -> .columnloop_last; none -> row finished.
sub rax, byte SIZEOF_XMMWORD
|
||||
add rcx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
|
||||
add rbx, byte 1*SIZEOF_XMMWORD ; inptr0
|
||||
add rsi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
|
||||
add rdx, byte 2*SIZEOF_XMMWORD ; outptr0
|
||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr1
|
||||
cmp rax, byte SIZEOF_XMMWORD
|
||||
ja near .columnloop
|
||||
test rax, rax
|
||||
jnz near .columnloop_last
|
||||
|
||||
pop rsi
|
||||
pop rdi
|
||||
pop rcx
|
||||
pop rax
|
||||
|
||||
; One input row yields two output rows, so rowctr counts down by 2.
add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
|
||||
add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
|
||||
sub rcx, byte 2 ; rowctr
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
pop rbx
|
||||
uncollect_args 4
|
||||
; Epilogue: [rbp] holds the rsp value saved by the prologue.
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
|
||||
; It's still a box filter.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
|
||||
; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
|
||||
;
|
||||
|
||||
; r10 = int max_v_samp_factor
|
||||
; r11d = JDIMENSION output_width
|
||||
; r12 = JSAMPARRAY input_data
|
||||
; r13 = JSAMPARRAY *output_data_ptr
|
||||
|
||||
; jsimd_h2v1_upsample_sse2
;
; Box-filter 2:1 horizontal upsampling: every input sample is written twice.
; The column loop is unrolled twice; each half turns 16 input bytes into 32
; output bytes.
;
; Register roles:
;   rdx = output width rounded up to a multiple of 2*SIZEOF_XMMWORD
;   rax = colctr (output bytes still to produce in this row)
;   rcx = rowctr, rsi = inptr, rdi = outptr
;
; NOTE(review): because the width is rounded up, stores may run past
; output_width; the row buffers are assumed to be padded and 16-byte aligned
; (movdqa is used throughout) - confirm against callers.
align 32
|
||||
GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
|
||||
|
||||
EXTN(jsimd_h2v1_upsample_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 4
|
||||
|
||||
; Round the output width up to a whole 32-byte group; zero width -> return.
mov edx, r11d
|
||||
add rdx, byte (2*SIZEOF_XMMWORD)-1
|
||||
and rdx, byte -(2*SIZEOF_XMMWORD)
|
||||
jz near .return
|
||||
|
||||
mov rcx, r10 ; rowctr
|
||||
test rcx, rcx
|
||||
jz short .return
|
||||
|
||||
mov rsi, r12 ; input_data
|
||||
mov rdi, r13
|
||||
mov rdi, JSAMPARRAY [rdi] ; output_data
|
||||
.rowloop:
|
||||
push rdi
|
||||
push rsi
|
||||
|
||||
mov rsi, JSAMPROW [rsi] ; inptr
|
||||
mov rdi, JSAMPROW [rdi] ; outptr
|
||||
mov rax, rdx ; colctr
|
||||
.columnloop:
|
||||
|
||||
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
|
||||
; Unpacking a register with itself duplicates every byte.
movdqa xmm1, xmm0
|
||||
punpcklbw xmm0, xmm0
|
||||
punpckhbw xmm1, xmm1
|
||||
|
||||
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
|
||||
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
|
||||
|
||||
sub rax, byte 2*SIZEOF_XMMWORD
|
||||
jz short .nextrow
|
||||
|
||||
; Second half of the unrolled iteration.
movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||
|
||||
movdqa xmm3, xmm2
|
||||
punpcklbw xmm2, xmm2
|
||||
punpckhbw xmm3, xmm3
|
||||
|
||||
movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
|
||||
movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
|
||||
|
||||
sub rax, byte 2*SIZEOF_XMMWORD
|
||||
jz short .nextrow
|
||||
|
||||
add rsi, byte 2*SIZEOF_XMMWORD ; inptr
|
||||
add rdi, byte 4*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .columnloop
|
||||
|
||||
.nextrow:
|
||||
pop rsi
|
||||
pop rdi
|
||||
|
||||
; One input row produces one output row.
add rsi, byte SIZEOF_JSAMPROW ; input_data
|
||||
add rdi, byte SIZEOF_JSAMPROW ; output_data
|
||||
dec rcx ; rowctr
|
||||
jg short .rowloop
|
||||
|
||||
.return:
|
||||
uncollect_args 4
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
|
||||
; It's still a box filter.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
|
||||
; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
|
||||
;
|
||||
|
||||
; r10 = int max_v_samp_factor
|
||||
; r11d = JDIMENSION output_width
|
||||
; r12 = JSAMPARRAY input_data
|
||||
; r13 = JSAMPARRAY *output_data_ptr
|
||||
|
||||
; jsimd_h2v2_upsample_sse2
;
; Box-filter 2:1 horizontal and 2:1 vertical upsampling: every input sample
; is written twice horizontally, and the resulting row is stored to two
; consecutive output rows. The column loop is unrolled twice.
;
; Register roles:
;   rdx = output width rounded up to a multiple of 2*SIZEOF_XMMWORD
;   rax = colctr (output bytes still to produce per row)
;   rcx = rowctr (counts output rows, decremented by 2 per input row)
;   rsi = inptr, rbx = outptr0, rdi = outptr1
;
; NOTE(review): because the width is rounded up, stores may run past
; output_width; the row buffers are assumed to be padded and 16-byte aligned
; (movdqa is used throughout) - confirm against callers.
align 32
|
||||
GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
|
||||
|
||||
EXTN(jsimd_h2v2_upsample_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 4
|
||||
; rbx is used below as outptr0; it is saved here and restored at .return.
push rbx
|
||||
|
||||
; Round the output width up to a whole 32-byte group; zero width -> return.
mov edx, r11d
|
||||
add rdx, byte (2*SIZEOF_XMMWORD)-1
|
||||
and rdx, byte -(2*SIZEOF_XMMWORD)
|
||||
jz near .return
|
||||
|
||||
mov rcx, r10 ; rowctr
|
||||
test rcx, rcx
|
||||
jz near .return
|
||||
|
||||
mov rsi, r12 ; input_data
|
||||
mov rdi, r13
|
||||
mov rdi, JSAMPARRAY [rdi] ; output_data
|
||||
.rowloop:
|
||||
push rdi
|
||||
push rsi
|
||||
|
||||
mov rsi, JSAMPROW [rsi] ; inptr
|
||||
mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
|
||||
mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
|
||||
mov rax, rdx ; colctr
|
||||
.columnloop:
|
||||
|
||||
movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
|
||||
; Unpacking a register with itself duplicates every byte.
movdqa xmm1, xmm0
|
||||
punpcklbw xmm0, xmm0
|
||||
punpckhbw xmm1, xmm1
|
||||
|
||||
; The same 32 bytes go to both output rows.
movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
|
||||
movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
|
||||
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
|
||||
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
|
||||
|
||||
sub rax, byte 2*SIZEOF_XMMWORD
|
||||
jz short .nextrow
|
||||
|
||||
; Second half of the unrolled iteration.
movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||
|
||||
movdqa xmm3, xmm2
|
||||
punpcklbw xmm2, xmm2
|
||||
punpckhbw xmm3, xmm3
|
||||
|
||||
movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
|
||||
movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
|
||||
movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
|
||||
movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
|
||||
|
||||
sub rax, byte 2*SIZEOF_XMMWORD
|
||||
jz short .nextrow
|
||||
|
||||
add rsi, byte 2*SIZEOF_XMMWORD ; inptr
|
||||
add rbx, byte 4*SIZEOF_XMMWORD ; outptr0
|
||||
add rdi, byte 4*SIZEOF_XMMWORD ; outptr1
|
||||
jmp short .columnloop
|
||||
|
||||
.nextrow:
|
||||
pop rsi
|
||||
pop rdi
|
||||
|
||||
; One input row yields two output rows, so rowctr counts down by 2.
add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
|
||||
add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
|
||||
sub rcx, byte 2 ; rowctr
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
pop rbx
|
||||
uncollect_args 4
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
355
TMessagesProj/jni/mozjpeg/simd/x86_64/jfdctflt-sse.asm
Normal file
355
TMessagesProj/jni/mozjpeg/simd/x86_64/jfdctflt-sse.asm
Normal file
|
|
@ -0,0 +1,355 @@
|
|||
;
|
||||
; jfdctflt.asm - floating-point FDCT (64-bit SSE)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a floating-point implementation of the forward DCT
|
||||
; (Discrete Cosine Transform). The following code is based directly on
|
||||
; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Two-operand helpers that combine 64-bit halves (pairs of floats) of two
; XMM registers using SHUFPS. Both overwrite %1 and leave %2 untouched.

; unpcklps2: %1 = (low half of %1, low half of %2).
; Immediate 0x44 selects elements 0,1 from %1 and 0,1 from %2.
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
|
||||
shufps %1, %2, 0x44
|
||||
%endmacro
|
||||
|
||||
; unpckhps2: %1 = (high half of %1, high half of %2).
; Immediate 0xEE selects elements 2,3 from %1 and 2,3 from %2.
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
|
||||
shufps %1, %2, 0xEE
|
||||
%endmacro
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Single-precision multiplier constants for the floating-point forward DCT.
; Each entry is 4 x 32-bit floats (one XMMWORD) holding the same value in
; every lane, so it can be used directly as a MULPS memory operand.
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_fdct_float_sse)
|
||||
|
||||
EXTN(jconst_fdct_float_sse):
|
||||
|
||||
; multiplier used to form z5 in the odd part
PD_0_382 times 4 dd 0.382683432365089771728460
|
||||
; multiplier used to form z1 (even part) and z3 (odd part)
PD_0_707 times 4 dd 0.707106781186547524400844
|
||||
; multiplier applied to tmp10 when forming z2
PD_0_541 times 4 dd 0.541196100146196984399723
|
||||
; multiplier applied to tmp12 when forming z4
PD_1_306 times 4 dd 1.306562964876376527856643
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Perform the forward DCT on one block of samples.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_fdct_float_sse(FAST_FLOAT *data)
|
||||
;
|
||||
|
||||
; r10 = FAST_FLOAT *data
|
||||
|
||||
; jsimd_fdct_float_sse
;
; In-place floating-point forward DCT on the block addressed by r10.
; The work is done in two passes of DCTSIZE/4 iterations each. Every
; iteration loads eight XMMWORDs, transposes them with unpcklps/unpckhps and
; the unpck*ps2 macros so that data0..data7 each occupy one register, runs
; the 1-D butterfly on four lanes at once, and stores the results back.
;   Pass 1 steps rdx by 4*DCTSIZE floats per iteration.
;   Pass 2 steps rdx by 4 floats per iteration.
; NOTE(review): XMMBLOCK is defined outside this chunk; its exact addressing
; is not visible here.
;
; wk(i) addresses the i-th XMMWORD slot of a WK_NUM-entry scratch array that
; sits immediately below the aligned rbp. The two slots first hold spilled
; transpose halves and are then reused for tmp6 / tmp7.
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_fdct_float_sse)
|
||||
|
||||
EXTN(jsimd_fdct_float_sse):
|
||||
; Prologue: build a 16-byte-aligned frame. The pre-alignment rsp is stored at
; [aligned rsp] so the epilogue can recover it with "pop rsp".
push rbp
|
||||
mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args 1
|
||||
|
||||
; ---- Pass 1: process rows.
|
||||
|
||||
mov rdx, r10 ; (FAST_FLOAT *)
|
||||
mov rcx, DCTSIZE/4
|
||||
.rowloop:
|
||||
|
||||
movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
|
||||
; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
|
||||
|
||||
movaps xmm4, xmm0 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
|
||||
unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
|
||||
movaps xmm5, xmm2 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
|
||||
unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
|
||||
|
||||
movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
|
||||
; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
|
||||
|
||||
; Spill two transpose halves: all eight XMM registers are in use.
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
|
||||
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
|
||||
|
||||
movaps xmm4, xmm6 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
|
||||
unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
|
||||
movaps xmm2, xmm1 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
|
||||
unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
|
||||
|
||||
movaps xmm7, xmm6 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
|
||||
unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
|
||||
movaps xmm3, xmm2 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
|
||||
unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
|
||||
|
||||
movaps xmm0, xmm7
|
||||
movaps xmm5, xmm6
|
||||
subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
|
||||
subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
|
||||
addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
|
||||
addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
|
||||
|
||||
; Reload the spilled halves, then reuse the slots for tmp6 / tmp7.
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
|
||||
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
|
||||
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
|
||||
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
||||
|
||||
movaps xmm7, xmm4 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
|
||||
unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
|
||||
movaps xmm6, xmm1 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
|
||||
unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
|
||||
|
||||
movaps xmm2, xmm7
|
||||
movaps xmm3, xmm4
|
||||
addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
|
||||
addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
|
||||
subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
|
||||
subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
|
||||
|
||||
; -- Even part
|
||||
|
||||
movaps xmm1, xmm5
|
||||
movaps xmm6, xmm0
|
||||
subps xmm5, xmm7 ; xmm5=tmp13
|
||||
subps xmm0, xmm4 ; xmm0=tmp12
|
||||
addps xmm1, xmm7 ; xmm1=tmp10
|
||||
addps xmm6, xmm4 ; xmm6=tmp11
|
||||
|
||||
addps xmm0, xmm5
|
||||
mulps xmm0, [rel PD_0_707] ; xmm0=z1
|
||||
|
||||
movaps xmm7, xmm1
|
||||
movaps xmm4, xmm5
|
||||
subps xmm1, xmm6 ; xmm1=data4
|
||||
subps xmm5, xmm0 ; xmm5=data6
|
||||
addps xmm7, xmm6 ; xmm7=data0
|
||||
addps xmm4, xmm0 ; xmm4=data2
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
|
||||
movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
|
||||
movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
|
||||
movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
|
||||
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
|
||||
|
||||
; On entry here xmm2=tmp4 and xmm3=tmp5.
addps xmm2, xmm3 ; xmm2=tmp10
|
||||
addps xmm3, xmm6 ; xmm3=tmp11
|
||||
addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
|
||||
|
||||
mulps xmm3, [rel PD_0_707] ; xmm3=z3
|
||||
|
||||
movaps xmm1, xmm2 ; xmm1=tmp10
|
||||
subps xmm2, xmm6
|
||||
mulps xmm2, [rel PD_0_382] ; xmm2=z5
|
||||
mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
|
||||
mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
|
||||
addps xmm1, xmm2 ; xmm1=z2
|
||||
addps xmm6, xmm2 ; xmm6=z4
|
||||
|
||||
movaps xmm5, xmm0
|
||||
subps xmm0, xmm3 ; xmm0=z13
|
||||
addps xmm5, xmm3 ; xmm5=z11
|
||||
|
||||
movaps xmm7, xmm0
|
||||
movaps xmm4, xmm5
|
||||
subps xmm0, xmm1 ; xmm0=data3
|
||||
subps xmm5, xmm6 ; xmm5=data7
|
||||
addps xmm7, xmm1 ; xmm7=data5
|
||||
addps xmm4, xmm6 ; xmm4=data1
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
|
||||
movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
|
||||
movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
|
||||
movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
|
||||
|
||||
add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
|
||||
dec rcx
|
||||
jnz near .rowloop
|
||||
|
||||
; ---- Pass 2: process columns.
|
||||
|
||||
mov rdx, r10 ; (FAST_FLOAT *)
|
||||
mov rcx, DCTSIZE/4
|
||||
.columnloop:
|
||||
|
||||
movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
|
||||
; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
|
||||
|
||||
movaps xmm4, xmm0 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
|
||||
unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
|
||||
movaps xmm5, xmm2 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
|
||||
unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
|
||||
|
||||
movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
|
||||
; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
|
||||
|
||||
; Spill two transpose halves: all eight XMM registers are in use.
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
|
||||
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
|
||||
|
||||
movaps xmm4, xmm6 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
|
||||
unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
|
||||
movaps xmm2, xmm1 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
|
||||
unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
|
||||
|
||||
movaps xmm7, xmm6 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
|
||||
unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
|
||||
movaps xmm3, xmm2 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
|
||||
unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
|
||||
|
||||
movaps xmm0, xmm7
|
||||
movaps xmm5, xmm6
|
||||
subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
|
||||
subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
|
||||
addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
|
||||
addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
|
||||
|
||||
; Reload the spilled halves, then reuse the slots for tmp6 / tmp7.
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
|
||||
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
|
||||
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
|
||||
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
||||
|
||||
movaps xmm7, xmm4 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
|
||||
unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
|
||||
movaps xmm6, xmm1 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
|
||||
unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
|
||||
|
||||
movaps xmm2, xmm7
|
||||
movaps xmm3, xmm4
|
||||
addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
|
||||
addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
|
||||
subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
|
||||
subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
|
||||
|
||||
; -- Even part
|
||||
|
||||
movaps xmm1, xmm5
|
||||
movaps xmm6, xmm0
|
||||
subps xmm5, xmm7 ; xmm5=tmp13
|
||||
subps xmm0, xmm4 ; xmm0=tmp12
|
||||
addps xmm1, xmm7 ; xmm1=tmp10
|
||||
addps xmm6, xmm4 ; xmm6=tmp11
|
||||
|
||||
addps xmm0, xmm5
|
||||
mulps xmm0, [rel PD_0_707] ; xmm0=z1
|
||||
|
||||
movaps xmm7, xmm1
|
||||
movaps xmm4, xmm5
|
||||
subps xmm1, xmm6 ; xmm1=data4
|
||||
subps xmm5, xmm0 ; xmm5=data6
|
||||
addps xmm7, xmm6 ; xmm7=data0
|
||||
addps xmm4, xmm0 ; xmm4=data2
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
|
||||
movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
|
||||
movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
|
||||
movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
|
||||
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
|
||||
|
||||
; On entry here xmm2=tmp4 and xmm3=tmp5.
addps xmm2, xmm3 ; xmm2=tmp10
|
||||
addps xmm3, xmm6 ; xmm3=tmp11
|
||||
addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
|
||||
|
||||
mulps xmm3, [rel PD_0_707] ; xmm3=z3
|
||||
|
||||
movaps xmm1, xmm2 ; xmm1=tmp10
|
||||
subps xmm2, xmm6
|
||||
mulps xmm2, [rel PD_0_382] ; xmm2=z5
|
||||
mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
|
||||
mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
|
||||
addps xmm1, xmm2 ; xmm1=z2
|
||||
addps xmm6, xmm2 ; xmm6=z4
|
||||
|
||||
movaps xmm5, xmm0
|
||||
subps xmm0, xmm3 ; xmm0=z13
|
||||
addps xmm5, xmm3 ; xmm5=z11
|
||||
|
||||
movaps xmm7, xmm0
|
||||
movaps xmm4, xmm5
|
||||
subps xmm0, xmm1 ; xmm0=data3
|
||||
subps xmm5, xmm6 ; xmm5=data7
|
||||
addps xmm7, xmm1 ; xmm7=data5
|
||||
addps xmm4, xmm6 ; xmm4=data1
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
|
||||
movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
|
||||
movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
|
||||
movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
|
||||
|
||||
add rdx, byte 4*SIZEOF_FAST_FLOAT
|
||||
dec rcx
|
||||
jnz near .columnloop
|
||||
|
||||
uncollect_args 1
|
||||
; Epilogue: [rbp] holds the rsp value saved by the prologue.
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
389
TMessagesProj/jni/mozjpeg/simd/x86_64/jfdctfst-sse2.asm
Normal file
389
TMessagesProj/jni/mozjpeg/simd/x86_64/jfdctfst-sse2.asm
Normal file
|
|
@ -0,0 +1,389 @@
|
|||
;
|
||||
; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a fast, not so accurate integer implementation of
|
||||
; the forward DCT (Discrete Cosine Transform). The following code is
|
||||
; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
|
||||
; for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define CONST_BITS 8 ; 14 is also OK.
|
||||
|
||||
%if CONST_BITS == 8
|
||||
F_0_382 equ 98 ; FIX(0.382683433)
|
||||
F_0_541 equ 139 ; FIX(0.541196100)
|
||||
F_0_707 equ 181 ; FIX(0.707106781)
|
||||
F_1_306 equ 334 ; FIX(1.306562965)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
|
||||
F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433)
|
||||
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781)
|
||||
F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
|
||||
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
|
||||
|
||||
%define PRE_MULTIPLY_SCALE_BITS 2
|
||||
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_fdct_ifast_sse2)
|
||||
|
||||
EXTN(jconst_fdct_ifast_sse2):
; Multiplier table for the fast integer FDCT.  Each entry is one xmmword:
; a fixed-point constant replicated into 8 signed words and pre-shifted
; left by CONST_SHIFT.  The routine shifts its operand left by
; PRE_MULTIPLY_SCALE_BITS and then uses pmulhw (high 16 bits of the signed
; product), so the net effect is (x * F) >> CONST_BITS, because
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16.
|
||||
|
||||
PW_F0707 times 8 dw F_0_707 << CONST_SHIFT ; 8 x FIX(0.707106781)
|
||||
PW_F0382 times 8 dw F_0_382 << CONST_SHIFT ; 8 x FIX(0.382683433)
|
||||
PW_F0541 times 8 dw F_0_541 << CONST_SHIFT ; 8 x FIX(0.541196100)
|
||||
PW_F1306 times 8 dw F_1_306 << CONST_SHIFT ; 8 x FIX(1.306562965)
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Perform the forward DCT on one block of samples.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_fdct_ifast_sse2(DCTELEM *data)
|
||||
;
|
||||
|
||||
; r10 = DCTELEM *data
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
|
||||
|
||||
EXTN(jsimd_fdct_ifast_sse2):
; Fast integer forward DCT of one 8x8 block of DCTELEMs, done in place.
;   In:      r10 = DCTELEM *data (set up by collect_args, copied to rdx)
;   Out:     the eight rows at XMMBLOCK(0..7,0,rdx) are overwritten
;   Scratch: xmm0-xmm7 and two xmmword stack slots wk(0), wk(1)
; All block accesses use movdqa, so data must be 16-byte aligned.
; Each pass transposes the 8x8 block in registers and then applies the
; 1-D butterfly; pass 1 results are never written back to memory, they
; flow into pass 2 through xmm registers and the wk() slots.
|
||||
push rbp
|
||||
mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax ; keep that address at the base of the aligned frame
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)] ; reserve WK_NUM xmmword scratch slots below rbp
|
||||
collect_args 1
|
||||
|
||||
; ---- Pass 1: process rows.
|
||||
|
||||
mov rdx, r10 ; (DCTELEM *)
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
|
||||
|
||||
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
|
||||
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
|
||||
|
||||
movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
|
||||
punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
|
||||
movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
|
||||
punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
|
||||
|
||||
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
|
||||
|
||||
; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
|
||||
; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
|
||||
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
|
||||
|
||||
movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
|
||||
punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
|
||||
movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
|
||||
punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
|
||||
punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
|
||||
movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
|
||||
punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
|
||||
|
||||
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
|
||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
|
||||
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
|
||||
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
|
||||
|
||||
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
|
||||
punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
|
||||
movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
|
||||
punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
|
||||
punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
|
||||
movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
|
||||
punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
|
||||
|
||||
movdqa xmm6, xmm1
|
||||
movdqa xmm3, xmm0
|
||||
psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
|
||||
psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
|
||||
paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
|
||||
paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
|
||||
|
||||
movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
|
||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
|
||||
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
|
||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
|
||||
|
||||
movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
|
||||
punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
|
||||
movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
|
||||
punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
|
||||
|
||||
movdqa xmm2, xmm1
|
||||
movdqa xmm5, xmm7
|
||||
paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
|
||||
paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
|
||||
psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
|
||||
psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
|
||||
|
||||
; -- Even part
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
movdqa xmm0, xmm6
|
||||
psubw xmm3, xmm1 ; xmm3=tmp13
|
||||
psubw xmm6, xmm7 ; xmm6=tmp12
|
||||
paddw xmm4, xmm1 ; xmm4=tmp10
|
||||
paddw xmm0, xmm7 ; xmm0=tmp11
|
||||
|
||||
paddw xmm6, xmm3
|
||||
psllw xmm6, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw xmm6, [rel PW_F0707] ; xmm6=z1
|
||||
|
||||
movdqa xmm1, xmm4
|
||||
movdqa xmm7, xmm3
|
||||
psubw xmm4, xmm0 ; xmm4=data4
|
||||
psubw xmm3, xmm6 ; xmm3=data6
|
||||
paddw xmm1, xmm0 ; xmm1=data0
|
||||
paddw xmm7, xmm6 ; xmm7=data2
|
||||
|
||||
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
|
||||
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
|
||||
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
|
||||
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
|
||||
|
||||
; -- Odd part
|
||||
|
||||
paddw xmm2, xmm5 ; xmm2=tmp10
|
||||
paddw xmm5, xmm0 ; xmm5=tmp11
|
||||
paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
|
||||
|
||||
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
|
||||
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
|
||||
|
||||
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw xmm5, [rel PW_F0707] ; xmm5=z3
|
||||
|
||||
movdqa xmm4, xmm2 ; xmm4=tmp10
|
||||
psubw xmm2, xmm0
|
||||
pmulhw xmm2, [rel PW_F0382] ; xmm2=z5
|
||||
pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
|
||||
pmulhw xmm0, [rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
|
||||
paddw xmm4, xmm2 ; xmm4=z2
|
||||
paddw xmm0, xmm2 ; xmm0=z4
|
||||
|
||||
movdqa xmm3, xmm6
|
||||
psubw xmm6, xmm5 ; xmm6=z13
|
||||
paddw xmm3, xmm5 ; xmm3=z11
|
||||
|
||||
movdqa xmm2, xmm6
|
||||
movdqa xmm5, xmm3
|
||||
psubw xmm6, xmm4 ; xmm6=data3
|
||||
psubw xmm3, xmm0 ; xmm3=data7
|
||||
paddw xmm2, xmm4 ; xmm2=data5
|
||||
paddw xmm5, xmm0 ; xmm5=data1
|
||||
|
||||
; ---- Pass 2: process columns.
; Inputs are the pass 1 results still held in xmm1/xmm5/xmm7/xmm6/xmm2/xmm3
; plus data4 and data6 parked in wk(0)/wk(1).
|
||||
|
||||
; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
|
||||
; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
|
||||
|
||||
movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
|
||||
punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
|
||||
movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
|
||||
punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
|
||||
|
||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
|
||||
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
|
||||
|
||||
; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
|
||||
; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
|
||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
|
||||
|
||||
movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
|
||||
punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
|
||||
movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
|
||||
punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
|
||||
|
||||
movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
|
||||
punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
|
||||
movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
|
||||
punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
|
||||
|
||||
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
|
||||
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
|
||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
|
||||
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
|
||||
|
||||
movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
|
||||
punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
|
||||
movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
|
||||
punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
|
||||
|
||||
movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
|
||||
punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
|
||||
movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
|
||||
punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
|
||||
|
||||
movdqa xmm5, xmm6
|
||||
movdqa xmm3, xmm1
|
||||
psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
|
||||
psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
|
||||
paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
|
||||
paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
|
||||
|
||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
|
||||
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
|
||||
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
|
||||
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
|
||||
|
||||
movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
|
||||
punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
|
||||
movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
|
||||
punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
|
||||
|
||||
movdqa xmm7, xmm6
|
||||
movdqa xmm0, xmm2
|
||||
paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
|
||||
paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
|
||||
psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
|
||||
psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
|
||||
|
||||
; -- Even part
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
movdqa xmm1, xmm5
|
||||
psubw xmm3, xmm6 ; xmm3=tmp13
|
||||
psubw xmm5, xmm2 ; xmm5=tmp12
|
||||
paddw xmm4, xmm6 ; xmm4=tmp10
|
||||
paddw xmm1, xmm2 ; xmm1=tmp11
|
||||
|
||||
paddw xmm5, xmm3
|
||||
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw xmm5, [rel PW_F0707] ; xmm5=z1
|
||||
|
||||
movdqa xmm6, xmm4
|
||||
movdqa xmm2, xmm3
|
||||
psubw xmm4, xmm1 ; xmm4=data4
|
||||
psubw xmm3, xmm5 ; xmm3=data6
|
||||
paddw xmm6, xmm1 ; xmm6=data0
|
||||
paddw xmm2, xmm5 ; xmm2=data2
|
||||
|
||||
movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
|
||||
movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
|
||||
movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
|
||||
movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
|
||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
|
||||
|
||||
paddw xmm7, xmm0 ; xmm7=tmp10
|
||||
paddw xmm0, xmm1 ; xmm0=tmp11
|
||||
paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
|
||||
|
||||
psllw xmm7, PRE_MULTIPLY_SCALE_BITS
|
||||
psllw xmm1, PRE_MULTIPLY_SCALE_BITS
|
||||
|
||||
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw xmm0, [rel PW_F0707] ; xmm0=z3
|
||||
|
||||
movdqa xmm4, xmm7 ; xmm4=tmp10
|
||||
psubw xmm7, xmm1
|
||||
pmulhw xmm7, [rel PW_F0382] ; xmm7=z5
|
||||
pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
|
||||
pmulhw xmm1, [rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
|
||||
paddw xmm4, xmm7 ; xmm4=z2
|
||||
paddw xmm1, xmm7 ; xmm1=z4
|
||||
|
||||
movdqa xmm3, xmm5
|
||||
psubw xmm5, xmm0 ; xmm5=z13
|
||||
paddw xmm3, xmm0 ; xmm3=z11
|
||||
|
||||
movdqa xmm6, xmm5
|
||||
movdqa xmm2, xmm3
|
||||
psubw xmm5, xmm4 ; xmm5=data3
|
||||
psubw xmm3, xmm1 ; xmm3=data7
|
||||
paddw xmm6, xmm4 ; xmm6=data5
|
||||
paddw xmm2, xmm1 ; xmm2=data1
|
||||
|
||||
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
|
||||
movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
|
||||
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
|
||||
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
|
||||
|
||||
uncollect_args 1
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- address of the saved rbp (stored at [rsp] by the prologue)
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
320
TMessagesProj/jni/mozjpeg/simd/x86_64/jfdctint-avx2.asm
Normal file
320
TMessagesProj/jni/mozjpeg/simd/x86_64/jfdctint-avx2.asm
Normal file
|
|
@ -0,0 +1,320 @@
|
|||
;
|
||||
; jfdctint.asm - accurate integer FDCT (64-bit AVX2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, 2018, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a slow-but-accurate integer implementation of the
|
||||
; forward DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
|
||||
; more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define CONST_BITS 13
|
||||
%define PASS1_BITS 2
|
||||
|
||||
%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
|
||||
%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
|
||||
|
||||
%if CONST_BITS == 13
|
||||
F_0_298 equ 2446 ; FIX(0.298631336)
|
||||
F_0_390 equ 3196 ; FIX(0.390180644)
|
||||
F_0_541 equ 4433 ; FIX(0.541196100)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_175 equ 9633 ; FIX(1.175875602)
|
||||
F_1_501 equ 12299 ; FIX(1.501321110)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_1_961 equ 16069 ; FIX(1.961570560)
|
||||
F_2_053 equ 16819 ; FIX(2.053119869)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_072 equ 25172 ; FIX(3.072711026)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
|
||||
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
|
||||
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
|
||||
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
|
||||
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
|
||||
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
|
||||
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
|
||||
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
|
||||
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; In-place 8x8x16-bit matrix transpose using AVX2 instructions
|
||||
; %1-%4: Input/output registers
|
||||
; %5-%8: Temp registers
|
||||
|
||||
%macro dotranspose 8
; Transposes an 8x8 matrix of 16-bit words held two rows per register
; (row n in the low 128-bit lane, row n+4 in the high lane).
; Results are written back to %1-%4; %5-%8 are overwritten as temporaries.
; On exit the lanes are arranged as listed under "phase 3" below, i.e.
; %1=data1_0, %2=data3_2, %3=data4_5, %4=data6_7, which is the operand
; layout the dodct macro documents for its inputs.
|
||||
; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
|
||||
; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
|
||||
; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
|
||||
; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
|
||||
|
||||
vpunpcklwd %5, %1, %2
|
||||
vpunpckhwd %6, %1, %2
|
||||
vpunpcklwd %7, %3, %4
|
||||
vpunpckhwd %8, %3, %4
|
||||
; transpose coefficients(phase 1)
|
||||
; %5=(00 10 01 11 02 12 03 13 40 50 41 51 42 52 43 53)
|
||||
; %6=(04 14 05 15 06 16 07 17 44 54 45 55 46 56 47 57)
|
||||
; %7=(20 30 21 31 22 32 23 33 60 70 61 71 62 72 63 73)
|
||||
; %8=(24 34 25 35 26 36 27 37 64 74 65 75 66 76 67 77)
|
||||
|
||||
vpunpckldq %1, %5, %7
|
||||
vpunpckhdq %2, %5, %7
|
||||
vpunpckldq %3, %6, %8
|
||||
vpunpckhdq %4, %6, %8
|
||||
; transpose coefficients(phase 2)
|
||||
; %1=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
|
||||
; %2=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
|
||||
; %3=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
|
||||
; %4=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
|
||||
|
||||
vpermq %1, %1, 0x8D ; 0x8D picks source qwords (1,3,0,2)
|
||||
vpermq %2, %2, 0x8D ; 0x8D picks source qwords (1,3,0,2)
|
||||
vpermq %3, %3, 0xD8 ; 0xD8 picks source qwords (0,2,1,3)
|
||||
vpermq %4, %4, 0xD8 ; 0xD8 picks source qwords (0,2,1,3)
|
||||
; transpose coefficients(phase 3)
|
||||
; %1=(01 11 21 31 41 51 61 71 00 10 20 30 40 50 60 70)
|
||||
; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
|
||||
; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
|
||||
; %4=(06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77)
|
||||
%endmacro
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; In-place 8x8x16-bit slow integer forward DCT using AVX2 instructions
|
||||
; %1-%4: Input/output registers
|
||||
; %5-%8: Temp registers
|
||||
; %9: Pass (1 or 2)
|
||||
|
||||
%macro dodct 9
; One 1-D pass of the accurate integer FDCT over eight vectors packed two
; per register.  "dataA_B" means A in the low 128-bit lane, B in the high.
;   In:  %1=data1_0, %2=data3_2, %3=data4_5, %4=data6_7
;   Out: %1=data0_4, %2=data3_1, %3=data2_6, %4=data7_5
;   %5-%8 are overwritten as temporaries.
;   %9 selects the scaling: pass 1 scales up by PASS1_BITS and descales
;   products by DESCALE_P1; pass 2 descales by PASS1_BITS / DESCALE_P2.
|
||||
vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7
|
||||
vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0
|
||||
vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2
|
||||
vpsubw %8, %2, %3 ; %8=data3_2-data4_5=tmp4_5
|
||||
|
||||
; -- Even part
|
||||
|
||||
vperm2i128 %6, %6, %6, 0x01 ; %6=tmp0_1
|
||||
vpaddw %1, %6, %7 ; %1=tmp0_1+tmp3_2=tmp10_11
|
||||
vpsubw %6, %6, %7 ; %6=tmp0_1-tmp3_2=tmp13_12
|
||||
|
||||
vperm2i128 %7, %1, %1, 0x01 ; %7=tmp11_10
|
||||
vpsignw %1, %1, [rel PW_1_NEG1] ; %1=tmp10_neg11
|
||||
vpaddw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11)
|
||||
%if %9 == 1
|
||||
vpsllw %1, %7, PASS1_BITS ; %1=data0_4
|
||||
%else
|
||||
vpaddw %7, %7, [rel PW_DESCALE_P2X] ; add rounding term before the shift
|
||||
vpsraw %1, %7, PASS1_BITS ; %1=data0_4
|
||||
%endif
|
||||
|
||||
; (Original)
|
||||
; z1 = (tmp12 + tmp13) * 0.541196100;
|
||||
; data2 = z1 + tmp13 * 0.765366865;
|
||||
; data6 = z1 + tmp12 * -1.847759065;
|
||||
;
|
||||
; (This implementation)
|
||||
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
||||
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
||||
|
||||
vperm2i128 %7, %6, %6, 0x01 ; %7=tmp12_13
|
||||
vpunpcklwd %2, %6, %7
|
||||
vpunpckhwd %6, %6, %7
|
||||
vpmaddwd %2, %2, [rel PW_F130_F054_MF130_F054] ; %2=data2_6L
|
||||
vpmaddwd %6, %6, [rel PW_F130_F054_MF130_F054] ; %6=data2_6H
|
||||
|
||||
vpaddd %2, %2, [rel PD_DESCALE_P %+ %9]
|
||||
vpaddd %6, %6, [rel PD_DESCALE_P %+ %9]
|
||||
vpsrad %2, %2, DESCALE_P %+ %9
|
||||
vpsrad %6, %6, DESCALE_P %+ %9
|
||||
|
||||
vpackssdw %3, %2, %6 ; %3=data2_6
|
||||
|
||||
; -- Odd part
|
||||
|
||||
vpaddw %7, %8, %5 ; %7=tmp4_5+tmp6_7=z3_4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
vperm2i128 %2, %7, %7, 0x01 ; %2=z4_3
|
||||
vpunpcklwd %6, %7, %2
|
||||
vpunpckhwd %7, %7, %2
|
||||
vpmaddwd %6, %6, [rel PW_MF078_F117_F078_F117] ; %6=z3_4L
|
||||
vpmaddwd %7, %7, [rel PW_MF078_F117_F078_F117] ; %7=z3_4H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
|
||||
; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
|
||||
; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
|
||||
; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
|
||||
; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
|
||||
; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
|
||||
; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
|
||||
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
||||
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
||||
|
||||
vperm2i128 %4, %5, %5, 0x01 ; %4=tmp7_6
|
||||
vpunpcklwd %2, %8, %4
|
||||
vpunpckhwd %4, %8, %4
|
||||
vpmaddwd %2, %2, [rel PW_MF060_MF089_MF050_MF256] ; %2=tmp4_5L
|
||||
vpmaddwd %4, %4, [rel PW_MF060_MF089_MF050_MF256] ; %4=tmp4_5H
|
||||
|
||||
vpaddd %2, %2, %6 ; %2=data7_5L
|
||||
vpaddd %4, %4, %7 ; %4=data7_5H
|
||||
|
||||
vpaddd %2, %2, [rel PD_DESCALE_P %+ %9]
|
||||
vpaddd %4, %4, [rel PD_DESCALE_P %+ %9]
|
||||
vpsrad %2, %2, DESCALE_P %+ %9
|
||||
vpsrad %4, %4, DESCALE_P %+ %9
|
||||
|
||||
vpackssdw %4, %2, %4 ; %4=data7_5
|
||||
|
||||
vperm2i128 %2, %8, %8, 0x01 ; %2=tmp5_4
|
||||
vpunpcklwd %8, %5, %2
|
||||
vpunpckhwd %5, %5, %2
|
||||
vpmaddwd %8, %8, [rel PW_F050_MF256_F060_MF089] ; %8=tmp6_7L
|
||||
vpmaddwd %5, %5, [rel PW_F050_MF256_F060_MF089] ; %5=tmp6_7H
|
||||
|
||||
vpaddd %8, %8, %6 ; %8=data3_1L
|
||||
vpaddd %5, %5, %7 ; %5=data3_1H
|
||||
|
||||
vpaddd %8, %8, [rel PD_DESCALE_P %+ %9]
|
||||
vpaddd %5, %5, [rel PD_DESCALE_P %+ %9]
|
||||
vpsrad %8, %8, DESCALE_P %+ %9
|
||||
vpsrad %5, %5, DESCALE_P %+ %9
|
||||
|
||||
vpackssdw %2, %8, %5 ; %2=data3_1
|
||||
%endmacro
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_fdct_islow_avx2)
|
||||
|
||||
EXTN(jconst_fdct_islow_avx2):
; Constant table for the AVX2 accurate integer FDCT.  Every entry is one
; 32-byte ymmword.  For the two-line PW_* entries, the first "times 4"
; line fills the low 128-bit lane and the second fills the high lane, so
; a single vpmaddwd applies a different coefficient pair to each lane.
|
||||
|
||||
PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
|
||||
times 4 dw (F_0_541 - F_1_847), F_0_541
|
||||
PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
|
||||
times 4 dw (F_1_175 - F_0_390), F_1_175
|
||||
PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899
|
||||
times 4 dw (F_2_053 - F_2_562), -F_2_562
|
||||
PW_F050_MF256_F060_MF089 times 4 dw (F_3_072 - F_2_562), -F_2_562
|
||||
times 4 dw (F_1_501 - F_0_899), -F_0_899
|
||||
PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1) ; rounding term added before >> DESCALE_P1
|
||||
PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1) ; rounding term added before >> DESCALE_P2
|
||||
PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1) ; rounding term added before >> PASS1_BITS
|
||||
PW_1_NEG1 times 8 dw 1 ; low lane: +1 (vpsignw keeps the word)
|
||||
times 8 dw -1 ; high lane: -1 (vpsignw negates the word)
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Perform the forward DCT on one block of samples.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_fdct_islow_avx2(DCTELEM *data)
|
||||
;
|
||||
|
||||
; r10 = DCTELEM *data
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
|
||||
|
||||
EXTN(jsimd_fdct_islow_avx2):
; Accurate integer forward DCT of one 8x8 block of DCTELEMs, done in place.
;   In:      r10 = DCTELEM *data (set up by collect_args)
;   Out:     the four ymmwords at YMMBLOCK(0/2/4/6,0,r10) are overwritten
;   Scratch: ymm0-ymm7; no stack scratch area is used
; The block is read and written with vmovdqu, so no 32-byte alignment of
; data is required by this routine.  Each ymm register carries two rows.
|
||||
push rbp
|
||||
mov rax, rsp ; NOTE(review): rax is not read in this routine; presumably kept for the collect_args convention - confirm in jsimdext.inc
|
||||
mov rbp, rsp
|
||||
collect_args 1
|
||||
|
||||
; ---- Pass 1: process rows.
|
||||
|
||||
vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)]
|
||||
vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)]
|
||||
vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)]
|
||||
vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)]
|
||||
; ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
||||
; ymm5=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
||||
; ymm6=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
|
||||
; ymm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
|
||||
|
||||
; Regroup so each register pairs row n (low lane) with row n+4 (high lane):
; selector 0x20 takes both low lanes, 0x31 takes both high lanes.
vperm2i128 ymm0, ymm4, ymm6, 0x20
|
||||
vperm2i128 ymm1, ymm4, ymm6, 0x31
|
||||
vperm2i128 ymm2, ymm5, ymm7, 0x20
|
||||
vperm2i128 ymm3, ymm5, ymm7, 0x31
|
||||
; ymm0=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
|
||||
; ymm1=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
|
||||
; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
|
||||
; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
|
||||
|
||||
dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
|
||||
|
||||
dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
|
||||
; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
|
||||
|
||||
; ---- Pass 2: process columns.
|
||||
|
||||
; Rebuild the n / n+4 pairing that dotranspose expects from the pass 1 outputs.
vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7
|
||||
vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5
|
||||
|
||||
dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
|
||||
|
||||
dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
|
||||
; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
|
||||
|
||||
; Reorder lanes back into consecutive row pairs for the stores below.
vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1
|
||||
vperm2i128 ymm5, ymm2, ymm1, 0x20 ; ymm5=data2_3
|
||||
vperm2i128 ymm6, ymm0, ymm4, 0x31 ; ymm6=data4_5
|
||||
vperm2i128 ymm7, ymm2, ymm4, 0x21 ; ymm7=data6_7
|
||||
|
||||
vmovdqu YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm3
|
||||
vmovdqu YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm5
|
||||
vmovdqu YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm6
|
||||
vmovdqu YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm7
|
||||
|
||||
vzeroupper ; clear upper ymm halves before returning to non-AVX code
|
||||
uncollect_args 1
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
619
TMessagesProj/jni/mozjpeg/simd/x86_64/jfdctint-sse2.asm
Normal file
619
TMessagesProj/jni/mozjpeg/simd/x86_64/jfdctint-sse2.asm
Normal file
|
|
@ -0,0 +1,619 @@
|
|||
;
|
||||
; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a slow-but-accurate integer implementation of the
|
||||
; forward DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
|
||||
; more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point parameters.
; CONST_BITS is the number of fractional bits carried by the F_x_yyy
; multiplier constants below.  PASS1_BITS is the extra scaling that pass 1
; leaves on its outputs (it shifts data0/data4 left by PASS1_BITS and
; descales the other outputs by only CONST_BITS - PASS1_BITS); pass 2
; removes that scaling again.
%define CONST_BITS 13
|
||||
%define PASS1_BITS 2
|
||||
|
||||
; Right-shift amounts applied to the 32-bit pmaddwd products in pass 1
; and pass 2 respectively.
%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
|
||||
%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
|
||||
|
||||
; F_x_yyy = FIX(x.yyy) = x.yyy * 2^CONST_BITS rounded to an integer.
; The common CONST_BITS == 13 case uses precomputed literals.
%if CONST_BITS == 13
|
||||
F_0_298 equ 2446 ; FIX(0.298631336)
|
||||
F_0_390 equ 3196 ; FIX(0.390180644)
|
||||
F_0_541 equ 4433 ; FIX(0.541196100)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_175 equ 9633 ; FIX(1.175875602)
|
||||
F_1_501 equ 12299 ; FIX(1.501321110)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_1_961 equ 16069 ; FIX(1.961570560)
|
||||
F_2_053 equ 16819 ; FIX(2.053119869)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_072 equ 25172 ; FIX(3.072711026)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
; Instead, each constant is given as an integer scaled by 2^30 and is
; reduced to CONST_BITS fractional bits with a rounding right shift.
|
||||
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
|
||||
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
|
||||
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
|
||||
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
|
||||
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
|
||||
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
|
||||
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
|
||||
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
|
||||
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for jsimd_fdct_islow_sse2.
;
; Each PW_* entry is one XMM word holding the 16-bit pair (a, b) repeated
; four times.  The routine interleaves two input vectors word-by-word
; (punpcklwd/punpckhwd) and applies pmaddwd against one of these entries,
; so every 32-bit result lane is  in1 * a + in2 * b.
; The PD_/PW_DESCALE_* entries are the rounding terms (half of the divisor)
; added before the corresponding arithmetic right shift.
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_fdct_islow_sse2)
|
||||
|
||||
EXTN(jconst_fdct_islow_sse2):
|
||||
|
||||
; Even part: data2 / data6 from (tmp13, tmp12).
PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
|
||||
PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847)
|
||||
; Odd part: z3 / z4 from (z3, z4).
PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
|
||||
PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390)
|
||||
; Odd part: tmp4 / tmp7 from (tmp4, tmp7).
PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899
|
||||
PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899)
|
||||
; Odd part: tmp5 / tmp6 from (tmp5, tmp6).
PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562
|
||||
PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562)
|
||||
; Rounding terms: 32-bit for psrad by DESCALE_P1 / DESCALE_P2, and 16-bit
; for the pass-2 psraw by PASS1_BITS.
PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1)
|
||||
PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1)
|
||||
PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1)
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Perform the forward DCT on one block of samples.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_fdct_islow_sse2(DCTELEM *data)
|
||||
;
|
||||
|
||||
; r10 = DCTELEM *data
|
||||
|
||||
; wk(i) addresses the i-th 16-byte scratch slot; the WK_NUM slots sit
; immediately below the aligned frame pointer set up in the prologue.
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 6
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
|
||||
|
||||
; Prologue: build a 16-byte-aligned frame so that wk() can be accessed
; with movdqa.  The unaligned stack position is stored at [rbp] so the
; epilogue can undo the alignment.
EXTN(jsimd_fdct_islow_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = stack address holding the caller's rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax ; remember the unaligned stack position
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)] ; reserve the WK_NUM scratch slots
|
||||
; collect_args is defined in jsimdext.inc (not shown here); per the
; register list above, it leaves the 'data' argument in r10.
collect_args 1
|
||||
|
||||
; ---- Pass 1: process rows.
;
; Load the eight rows of the block, transpose the 8x8 matrix of 16-bit
; values with three unpack phases (word, dword, qword granularity), and
; form the butterfly sums/differences tmp0..tmp7.  Only xmm0-xmm7 are
; used, so intermediate vectors are spilled to wk(0)..wk(3).
; In the lane comments, "rc" denotes the element at row r, column c.
|
||||
|
||||
mov rdx, r10 ; (DCTELEM *)
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
|
||||
|
||||
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
|
||||
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
|
||||
|
||||
movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
|
||||
punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
|
||||
movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
|
||||
punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
|
||||
|
||||
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
|
||||
|
||||
; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
|
||||
; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
|
||||
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
|
||||
|
||||
movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
|
||||
punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
|
||||
movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
|
||||
punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
|
||||
punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
|
||||
movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
|
||||
punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
|
||||
|
||||
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
|
||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
|
||||
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
|
||||
movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
|
||||
|
||||
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
|
||||
punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
|
||||
movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
|
||||
punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
|
||||
punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
|
||||
movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
|
||||
punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
|
||||
|
||||
; Butterflies on the outer pairs (data0/data7 and data1/data6).
movdqa xmm6, xmm1
|
||||
movdqa xmm3, xmm0
|
||||
psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
|
||||
psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
|
||||
paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
|
||||
paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
|
||||
|
||||
; wk(0)/wk(1) are free again, so they now hold tmp6/tmp7 for the odd part.
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
|
||||
movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
|
||||
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
|
||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
|
||||
|
||||
movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
|
||||
punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
|
||||
movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
|
||||
punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
|
||||
|
||||
; Butterflies on the inner pairs (data3/data4 and data2/data5).
movdqa xmm2, xmm1
|
||||
movdqa xmm5, xmm7
|
||||
paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
|
||||
paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
|
||||
psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
|
||||
psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
|
||||
|
||||
; -- Even part
;
; Inputs: xmm3=tmp0, xmm6=tmp1, xmm7=tmp2, xmm1=tmp3.
; Outputs are parked in the scratch area: wk(2)=data0, wk(3)=data4,
; wk(4)=data2, wk(5)=data6.  xmm2 (tmp4) and xmm5 (tmp5) are left
; untouched for the odd part.
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
movdqa xmm0, xmm6
|
||||
paddw xmm3, xmm1 ; xmm3=tmp10
|
||||
paddw xmm6, xmm7 ; xmm6=tmp11
|
||||
psubw xmm4, xmm1 ; xmm4=tmp13
|
||||
psubw xmm0, xmm7 ; xmm0=tmp12
|
||||
|
||||
movdqa xmm1, xmm3
|
||||
paddw xmm3, xmm6 ; xmm3=tmp10+tmp11
|
||||
psubw xmm1, xmm6 ; xmm1=tmp10-tmp11
|
||||
|
||||
; data0/data4 need no multiplication; they are only scaled up by
; 2^PASS1_BITS to match the scaling of the other pass-1 outputs.
psllw xmm3, PASS1_BITS ; xmm3=data0
|
||||
psllw xmm1, PASS1_BITS ; xmm1=data4
|
||||
|
||||
movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
|
||||
movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
|
||||
|
||||
; (Original)
|
||||
; z1 = (tmp12 + tmp13) * 0.541196100;
|
||||
; data2 = z1 + tmp13 * 0.765366865;
|
||||
; data6 = z1 + tmp12 * -1.847759065;
|
||||
;
|
||||
; (This implementation)
|
||||
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
||||
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
||||
|
||||
; Interleave tmp13/tmp12 word-wise so that each pmaddwd lane computes
; tmp13 * a + tmp12 * b for the constant pair (a, b).
movdqa xmm7, xmm4 ; xmm4=tmp13
|
||||
movdqa xmm6, xmm4
|
||||
punpcklwd xmm7, xmm0 ; xmm0=tmp12
|
||||
punpckhwd xmm6, xmm0
|
||||
movdqa xmm4, xmm7
|
||||
movdqa xmm0, xmm6
|
||||
pmaddwd xmm7, [rel PW_F130_F054] ; xmm7=data2L
|
||||
pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=data2H
|
||||
pmaddwd xmm4, [rel PW_F054_MF130] ; xmm4=data6L
|
||||
pmaddwd xmm0, [rel PW_F054_MF130] ; xmm0=data6H
|
||||
|
||||
; Round and descale the 32-bit products by DESCALE_P1 bits.
paddd xmm7, [rel PD_DESCALE_P1]
|
||||
paddd xmm6, [rel PD_DESCALE_P1]
|
||||
psrad xmm7, DESCALE_P1
|
||||
psrad xmm6, DESCALE_P1
|
||||
paddd xmm4, [rel PD_DESCALE_P1]
|
||||
paddd xmm0, [rel PD_DESCALE_P1]
|
||||
psrad xmm4, DESCALE_P1
|
||||
psrad xmm0, DESCALE_P1
|
||||
|
||||
; Narrow the low/high halves back to eight 16-bit lanes (signed saturation).
packssdw xmm7, xmm6 ; xmm7=data2
|
||||
packssdw xmm4, xmm0 ; xmm4=data6
|
||||
|
||||
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
|
||||
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
|
||||
|
||||
; -- Odd part
;
; Inputs: xmm2=tmp4, xmm5=tmp5, wk(0)=tmp6, wk(1)=tmp7.
; Outputs stay in registers for pass 2: xmm2=data1, xmm5=data3,
; xmm4=data5, xmm7=data7.  wk(0)/wk(1) are reused for z3L/z3H.
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
|
||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
|
||||
|
||||
movdqa xmm6, xmm2 ; xmm2=tmp4
|
||||
movdqa xmm0, xmm5 ; xmm5=tmp5
|
||||
paddw xmm6, xmm3 ; xmm6=z3
|
||||
paddw xmm0, xmm1 ; xmm0=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
movdqa xmm7, xmm6
|
||||
movdqa xmm4, xmm6
|
||||
punpcklwd xmm7, xmm0
|
||||
punpckhwd xmm4, xmm0
|
||||
movdqa xmm6, xmm7
|
||||
movdqa xmm0, xmm4
|
||||
pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3L
|
||||
pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3H
|
||||
pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4L
|
||||
pmaddwd xmm0, [rel PW_F117_F078] ; xmm0=z4H
|
||||
|
||||
; z3 is spilled; z4 stays in xmm6/xmm0 for the data1/data5 sums below.
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
|
||||
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
|
||||
; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
|
||||
; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
|
||||
; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
|
||||
; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
|
||||
; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
|
||||
; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
|
||||
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
||||
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
||||
|
||||
; data7 / data1 from the interleaved (tmp4, tmp7) pair.
movdqa xmm7, xmm2
|
||||
movdqa xmm4, xmm2
|
||||
punpcklwd xmm7, xmm1
|
||||
punpckhwd xmm4, xmm1
|
||||
movdqa xmm2, xmm7
|
||||
movdqa xmm1, xmm4
|
||||
pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp4L
|
||||
pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4H
|
||||
pmaddwd xmm2, [rel PW_MF089_F060] ; xmm2=tmp7L
|
||||
pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp7H
|
||||
|
||||
paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
|
||||
paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
|
||||
paddd xmm2, xmm6 ; xmm2=data1L
|
||||
paddd xmm1, xmm0 ; xmm1=data1H
|
||||
|
||||
paddd xmm7, [rel PD_DESCALE_P1]
|
||||
paddd xmm4, [rel PD_DESCALE_P1]
|
||||
psrad xmm7, DESCALE_P1
|
||||
psrad xmm4, DESCALE_P1
|
||||
paddd xmm2, [rel PD_DESCALE_P1]
|
||||
paddd xmm1, [rel PD_DESCALE_P1]
|
||||
psrad xmm2, DESCALE_P1
|
||||
psrad xmm1, DESCALE_P1
|
||||
|
||||
packssdw xmm7, xmm4 ; xmm7=data7
|
||||
packssdw xmm2, xmm1 ; xmm2=data1
|
||||
|
||||
; data5 / data3 from the interleaved (tmp5, tmp6) pair.
movdqa xmm4, xmm5
|
||||
movdqa xmm1, xmm5
|
||||
punpcklwd xmm4, xmm3
|
||||
punpckhwd xmm1, xmm3
|
||||
movdqa xmm5, xmm4
|
||||
movdqa xmm3, xmm1
|
||||
pmaddwd xmm4, [rel PW_MF050_MF256] ; xmm4=tmp5L
|
||||
pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5H
|
||||
pmaddwd xmm5, [rel PW_MF256_F050] ; xmm5=tmp6L
|
||||
pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6H
|
||||
|
||||
paddd xmm4, xmm6 ; xmm4=data5L
|
||||
paddd xmm1, xmm0 ; xmm1=data5H
|
||||
paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
|
||||
paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
|
||||
|
||||
paddd xmm4, [rel PD_DESCALE_P1]
|
||||
paddd xmm1, [rel PD_DESCALE_P1]
|
||||
psrad xmm4, DESCALE_P1
|
||||
psrad xmm1, DESCALE_P1
|
||||
paddd xmm5, [rel PD_DESCALE_P1]
|
||||
paddd xmm3, [rel PD_DESCALE_P1]
|
||||
psrad xmm5, DESCALE_P1
|
||||
psrad xmm3, DESCALE_P1
|
||||
|
||||
packssdw xmm4, xmm1 ; xmm4=data5
|
||||
packssdw xmm5, xmm3 ; xmm5=data3
|
||||
|
||||
; ---- Pass 2: process columns.
;
; Inputs are the pass-1 results: wk(2)=col0, wk(4)=col2, wk(3)=col4,
; wk(5)=col6 in memory and xmm2=col1, xmm5=col3, xmm4=col5, xmm7=col7 in
; registers.  The matrix is transposed back with the same three unpack
; phases as in pass 1, and the butterflies tmp0..tmp7 are formed again.
|
||||
|
||||
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
|
||||
movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
|
||||
|
||||
; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
|
||||
; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
|
||||
|
||||
movdqa xmm1, xmm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
|
||||
punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
|
||||
movdqa xmm3, xmm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
|
||||
punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
|
||||
|
||||
movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
|
||||
movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
|
||||
|
||||
; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
|
||||
; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
|
||||
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
|
||||
|
||||
movdqa xmm0, xmm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
|
||||
punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
|
||||
movdqa xmm3, xmm5 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
|
||||
punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
|
||||
|
||||
movdqa xmm4, xmm2 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
|
||||
punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
|
||||
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
|
||||
punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
|
||||
|
||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
|
||||
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
|
||||
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
|
||||
movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
|
||||
|
||||
movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
|
||||
punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
|
||||
movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
|
||||
punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
|
||||
|
||||
movdqa xmm5, xmm6 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
|
||||
punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
|
||||
movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
|
||||
punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
|
||||
|
||||
; Butterflies on the outer pairs (data0/data7 and data1/data6).
movdqa xmm2, xmm5
|
||||
movdqa xmm7, xmm6
|
||||
psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6
|
||||
psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7
|
||||
paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1
|
||||
paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0
|
||||
|
||||
; wk(0)/wk(1) are free again, so they now hold tmp6/tmp7 for the odd part.
movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
|
||||
movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
|
||||
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
|
||||
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
||||
|
||||
movdqa xmm5, xmm4 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
|
||||
punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
|
||||
movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
|
||||
punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
|
||||
|
||||
; Butterflies on the inner pairs (data3/data4 and data2/data5).
movdqa xmm0, xmm5
|
||||
movdqa xmm3, xmm4
|
||||
paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3
|
||||
paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2
|
||||
psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4
|
||||
psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5
|
||||
|
||||
; -- Even part
;
; Inputs: xmm7=tmp0, xmm2=tmp1, xmm4=tmp2, xmm5=tmp3.
; Results are written straight back into rows 0, 4, 2 and 6 of the
; block at rdx.  xmm0 (tmp4) and xmm3 (tmp5) are left untouched for the
; odd part.
|
||||
|
||||
movdqa xmm1, xmm7
|
||||
movdqa xmm6, xmm2
|
||||
paddw xmm7, xmm5 ; xmm7=tmp10
|
||||
paddw xmm2, xmm4 ; xmm2=tmp11
|
||||
psubw xmm1, xmm5 ; xmm1=tmp13
|
||||
psubw xmm6, xmm4 ; xmm6=tmp12
|
||||
|
||||
movdqa xmm5, xmm7
|
||||
paddw xmm7, xmm2 ; xmm7=tmp10+tmp11
|
||||
psubw xmm5, xmm2 ; xmm5=tmp10-tmp11
|
||||
|
||||
; Remove the 2^PASS1_BITS scaling left by pass 1: add the 16-bit rounding
; term, then shift right arithmetically.
paddw xmm7, [rel PW_DESCALE_P2X]
|
||||
paddw xmm5, [rel PW_DESCALE_P2X]
|
||||
psraw xmm7, PASS1_BITS ; xmm7=data0
|
||||
psraw xmm5, PASS1_BITS ; xmm5=data4
|
||||
|
||||
movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
|
||||
movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
|
||||
|
||||
; (Original)
|
||||
; z1 = (tmp12 + tmp13) * 0.541196100;
|
||||
; data2 = z1 + tmp13 * 0.765366865;
|
||||
; data6 = z1 + tmp12 * -1.847759065;
|
||||
;
|
||||
; (This implementation)
|
||||
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
|
||||
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
|
||||
|
||||
; Interleave tmp13/tmp12 word-wise so that each pmaddwd lane computes
; tmp13 * a + tmp12 * b for the constant pair (a, b).
movdqa xmm4, xmm1 ; xmm1=tmp13
|
||||
movdqa xmm2, xmm1
|
||||
punpcklwd xmm4, xmm6 ; xmm6=tmp12
|
||||
punpckhwd xmm2, xmm6
|
||||
movdqa xmm1, xmm4
|
||||
movdqa xmm6, xmm2
|
||||
pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=data2L
|
||||
pmaddwd xmm2, [rel PW_F130_F054] ; xmm2=data2H
|
||||
pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=data6L
|
||||
pmaddwd xmm6, [rel PW_F054_MF130] ; xmm6=data6H
|
||||
|
||||
; Round and descale the 32-bit products by DESCALE_P2 bits.
paddd xmm4, [rel PD_DESCALE_P2]
|
||||
paddd xmm2, [rel PD_DESCALE_P2]
|
||||
psrad xmm4, DESCALE_P2
|
||||
psrad xmm2, DESCALE_P2
|
||||
paddd xmm1, [rel PD_DESCALE_P2]
|
||||
paddd xmm6, [rel PD_DESCALE_P2]
|
||||
psrad xmm1, DESCALE_P2
|
||||
psrad xmm6, DESCALE_P2
|
||||
|
||||
packssdw xmm4, xmm2 ; xmm4=data2
|
||||
packssdw xmm1, xmm6 ; xmm1=data6
|
||||
|
||||
movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
|
||||
movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
|
||||
|
||||
; -- Odd part
;
; Inputs: xmm0=tmp4, xmm3=tmp5, wk(0)=tmp6, wk(1)=tmp7.
; Results are written straight back into rows 7, 1, 5 and 3 of the
; block at rdx.  wk(0)/wk(1) are reused for z3L/z3H.
|
||||
|
||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
|
||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
|
||||
|
||||
movdqa xmm2, xmm0 ; xmm0=tmp4
|
||||
movdqa xmm6, xmm3 ; xmm3=tmp5
|
||||
paddw xmm2, xmm7 ; xmm2=z3
|
||||
paddw xmm6, xmm5 ; xmm6=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
movdqa xmm4, xmm2
|
||||
movdqa xmm1, xmm2
|
||||
punpcklwd xmm4, xmm6
|
||||
punpckhwd xmm1, xmm6
|
||||
movdqa xmm2, xmm4
|
||||
movdqa xmm6, xmm1
|
||||
pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3L
|
||||
pmaddwd xmm1, [rel PW_MF078_F117] ; xmm1=z3H
|
||||
pmaddwd xmm2, [rel PW_F117_F078] ; xmm2=z4L
|
||||
pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4H
|
||||
|
||||
; z3 is spilled; z4 stays in xmm2/xmm6 for the data1/data5 sums below.
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
|
||||
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
|
||||
; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
|
||||
; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
|
||||
; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
|
||||
; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
|
||||
; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
|
||||
; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
|
||||
; data7 = tmp4 + z3; data5 = tmp5 + z4;
|
||||
; data3 = tmp6 + z3; data1 = tmp7 + z4;
|
||||
|
||||
; data7 / data1 from the interleaved (tmp4, tmp7) pair.
movdqa xmm4, xmm0
|
||||
movdqa xmm1, xmm0
|
||||
punpcklwd xmm4, xmm5
|
||||
punpckhwd xmm1, xmm5
|
||||
movdqa xmm0, xmm4
|
||||
movdqa xmm5, xmm1
|
||||
pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4L
|
||||
pmaddwd xmm1, [rel PW_MF060_MF089] ; xmm1=tmp4H
|
||||
pmaddwd xmm0, [rel PW_MF089_F060] ; xmm0=tmp7L
|
||||
pmaddwd xmm5, [rel PW_MF089_F060] ; xmm5=tmp7H
|
||||
|
||||
paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
|
||||
paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
|
||||
paddd xmm0, xmm2 ; xmm0=data1L
|
||||
paddd xmm5, xmm6 ; xmm5=data1H
|
||||
|
||||
paddd xmm4, [rel PD_DESCALE_P2]
|
||||
paddd xmm1, [rel PD_DESCALE_P2]
|
||||
psrad xmm4, DESCALE_P2
|
||||
psrad xmm1, DESCALE_P2
|
||||
paddd xmm0, [rel PD_DESCALE_P2]
|
||||
paddd xmm5, [rel PD_DESCALE_P2]
|
||||
psrad xmm0, DESCALE_P2
|
||||
psrad xmm5, DESCALE_P2
|
||||
|
||||
packssdw xmm4, xmm1 ; xmm4=data7
|
||||
packssdw xmm0, xmm5 ; xmm0=data1
|
||||
|
||||
movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
|
||||
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
|
||||
|
||||
; data5 / data3 from the interleaved (tmp5, tmp6) pair.
movdqa xmm1, xmm3
|
||||
movdqa xmm5, xmm3
|
||||
punpcklwd xmm1, xmm7
|
||||
punpckhwd xmm5, xmm7
|
||||
movdqa xmm3, xmm1
|
||||
movdqa xmm7, xmm5
|
||||
pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5L
|
||||
pmaddwd xmm5, [rel PW_MF050_MF256] ; xmm5=tmp5H
|
||||
pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6L
|
||||
pmaddwd xmm7, [rel PW_MF256_F050] ; xmm7=tmp6H
|
||||
|
||||
paddd xmm1, xmm2 ; xmm1=data5L
|
||||
paddd xmm5, xmm6 ; xmm5=data5H
|
||||
paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
|
||||
paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
|
||||
|
||||
paddd xmm1, [rel PD_DESCALE_P2]
|
||||
paddd xmm5, [rel PD_DESCALE_P2]
|
||||
psrad xmm1, DESCALE_P2
|
||||
psrad xmm5, DESCALE_P2
|
||||
paddd xmm3, [rel PD_DESCALE_P2]
|
||||
paddd xmm7, [rel PD_DESCALE_P2]
|
||||
psrad xmm3, DESCALE_P2
|
||||
psrad xmm7, DESCALE_P2
|
||||
|
||||
packssdw xmm1, xmm5 ; xmm1=data5
|
||||
packssdw xmm3, xmm7 ; xmm3=data3
|
||||
|
||||
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
|
||||
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
|
||||
|
||||
; Epilogue: undo the prologue in reverse order.  [rbp] holds the
; unaligned stack position that the prologue stored there.
uncollect_args 1
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- stack address holding the caller's rbp
|
||||
pop rbp ; restore the caller's rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
481
TMessagesProj/jni/mozjpeg/simd/x86_64/jidctflt-sse2.asm
Normal file
481
TMessagesProj/jni/mozjpeg/simd/x86_64/jidctflt-sse2.asm
Normal file
|
|
@ -0,0 +1,481 @@
|
|||
;
|
||||
; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a floating-point implementation of the inverse DCT
|
||||
; (Discrete Cosine Transform). The following code is based directly on
|
||||
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; unpcklps2 dst, src
; Combines the low two floats of each operand: dst = (dst[0] dst[1] src[0]
; src[1]).  shufps selector 0x44 picks elements 0,1 from the first operand
; and elements 0,1 from the second.
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
|
||||
shufps %1, %2, 0x44
|
||||
%endmacro
|
||||
|
||||
; unpckhps2 dst, src
; Combines the high two floats of each operand: dst = (dst[2] dst[3] src[2]
; src[3]).  shufps selector 0xEE picks elements 2,3 from the first operand
; and elements 2,3 from the second.
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
|
||||
shufps %1, %2, 0xEE
|
||||
%endmacro
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for jsimd_idct_float_sse2.
; Each PD_* entry is one single-precision value replicated across the four
; lanes of an XMM word, so it can be used directly as a packed operand
; (e.g. "mulps xmmN, [rel PD_1_414]").
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_idct_float_sse2)
|
||||
|
||||
EXTN(jconst_idct_float_sse2):
|
||||
|
||||
PD_1_414 times 4 dd 1.414213562373095048801689
|
||||
PD_1_847 times 4 dd 1.847759065022573512256366
|
||||
PD_1_082 times 4 dd 1.082392200292393968799446
|
||||
PD_M2_613 times 4 dd -2.613125929752753055713286
|
||||
; 100663296.0 = 0x06000000; NOTE(review): presumably added to force
; float-to-integer rounding in the output stage - confirm against the
; part of the routine that uses it.
PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
|
||||
; Sixteen bytes, each equal to CENTERJSAMPLE.
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; r10 = void *dct_table
|
||||
; r11 = JCOEFPTR coef_block
|
||||
; r12 = JSAMPARRAY output_buf
|
||||
; r13d = JDIMENSION output_col
|
||||
|
||||
%define original_rbp rbp + 0
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
|
||||
; FAST_FLOAT workspace[DCTSIZE2]
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_idct_float_sse2)
|
||||
|
||||
EXTN(jsimd_idct_float_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [workspace]
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
; ---- Pass 1: process columns from input, store into work array.
|
||||
|
||||
mov rdx, r10 ; quantptr
|
||||
mov rsi, r11 ; inptr
|
||||
lea rdi, [workspace] ; FAST_FLOAT *wsptr
|
||||
mov rcx, DCTSIZE/4 ; ctr
|
||||
.columnloop:
|
||||
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
|
||||
mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
jnz near .columnDCT
|
||||
|
||||
movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm1, xmm2
|
||||
por xmm3, xmm4
|
||||
por xmm5, xmm6
|
||||
por xmm1, xmm3
|
||||
por xmm5, xmm7
|
||||
por xmm1, xmm5
|
||||
packsswb xmm1, xmm1
|
||||
movd eax, xmm1
|
||||
test rax, rax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
||||
|
||||
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||
psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
|
||||
cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
|
||||
|
||||
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
movaps xmm1, xmm0
|
||||
movaps xmm2, xmm0
|
||||
movaps xmm3, xmm0
|
||||
|
||||
shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
|
||||
shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
|
||||
shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
|
||||
shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
|
||||
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
|
||||
movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
|
||||
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
|
||||
movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
|
||||
movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
|
||||
movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
jmp near .nextcolumn
|
||||
%endif
|
||||
.columnDCT:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||
|
||||
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||
punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
|
||||
psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
|
||||
psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
|
||||
cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
|
||||
cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
|
||||
|
||||
punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
|
||||
punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
|
||||
psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
|
||||
psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
|
||||
cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
|
||||
cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
|
||||
|
||||
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
movaps xmm4, xmm0
|
||||
movaps xmm5, xmm1
|
||||
subps xmm0, xmm2 ; xmm0=tmp11
|
||||
subps xmm1, xmm3
|
||||
addps xmm4, xmm2 ; xmm4=tmp10
|
||||
addps xmm5, xmm3 ; xmm5=tmp13
|
||||
|
||||
mulps xmm1, [rel PD_1_414]
|
||||
subps xmm1, xmm5 ; xmm1=tmp12
|
||||
|
||||
movaps xmm6, xmm4
|
||||
movaps xmm7, xmm0
|
||||
subps xmm4, xmm5 ; xmm4=tmp3
|
||||
subps xmm0, xmm1 ; xmm0=tmp2
|
||||
addps xmm6, xmm5 ; xmm6=tmp0
|
||||
addps xmm7, xmm1 ; xmm7=tmp1
|
||||
|
||||
movaps XMMWORD [wk(1)], xmm4 ; tmp3
|
||||
movaps XMMWORD [wk(0)], xmm0 ; tmp2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||
|
||||
punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
|
||||
punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
|
||||
psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
|
||||
psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
|
||||
cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
|
||||
cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
|
||||
|
||||
punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
|
||||
punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
|
||||
psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
|
||||
psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
|
||||
cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
|
||||
cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
|
||||
|
||||
mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
movaps xmm4, xmm2
|
||||
movaps xmm0, xmm5
|
||||
addps xmm2, xmm1 ; xmm2=z11
|
||||
addps xmm5, xmm3 ; xmm5=z13
|
||||
subps xmm4, xmm1 ; xmm4=z12
|
||||
subps xmm0, xmm3 ; xmm0=z10
|
||||
|
||||
movaps xmm1, xmm2
|
||||
subps xmm2, xmm5
|
||||
addps xmm1, xmm5 ; xmm1=tmp7
|
||||
|
||||
mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
|
||||
|
||||
movaps xmm3, xmm0
|
||||
addps xmm0, xmm4
|
||||
mulps xmm0, [rel PD_1_847] ; xmm0=z5
|
||||
mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
|
||||
mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
|
||||
addps xmm3, xmm0 ; xmm3=tmp12
|
||||
subps xmm4, xmm0 ; xmm4=tmp10
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
subps xmm3, xmm1 ; xmm3=tmp6
|
||||
movaps xmm5, xmm6
|
||||
movaps xmm0, xmm7
|
||||
addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
|
||||
addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
|
||||
subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
|
||||
subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
|
||||
subps xmm2, xmm3 ; xmm2=tmp5
|
||||
|
||||
movaps xmm1, xmm6 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
|
||||
unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
|
||||
movaps xmm3, xmm0 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
|
||||
unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
|
||||
|
||||
movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
|
||||
movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
|
||||
|
||||
movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
|
||||
movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
|
||||
|
||||
addps xmm4, xmm2 ; xmm4=tmp4
|
||||
movaps xmm0, xmm7
|
||||
movaps xmm3, xmm5
|
||||
addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
|
||||
addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
|
||||
subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
|
||||
subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
|
||||
|
||||
movaps xmm2, xmm7 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
|
||||
unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
|
||||
movaps xmm4, xmm5 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
|
||||
unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
|
||||
|
||||
movaps xmm3, xmm6 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
|
||||
unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
|
||||
movaps xmm0, xmm1 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
|
||||
unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
|
||||
|
||||
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
|
||||
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
|
||||
movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
|
||||
movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
|
||||
|
||||
movaps xmm6, xmm5 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
|
||||
unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
|
||||
movaps xmm3, xmm4 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
|
||||
unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
|
||||
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
|
||||
movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
|
||||
movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
|
||||
.nextcolumn:
|
||||
add rsi, byte 4*SIZEOF_JCOEF ; coef_block
|
||||
add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
|
||||
add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
|
||||
dec rcx ; ctr
|
||||
jnz near .columnloop
|
||||
|
||||
; -- Prefetch the next coefficient block
|
||||
|
||||
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
|
||||
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
|
||||
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
|
||||
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
mov rax, [original_rbp]
|
||||
lea rsi, [workspace] ; FAST_FLOAT *wsptr
|
||||
mov rdi, r12 ; (JSAMPROW *)
|
||||
mov eax, r13d
|
||||
mov rcx, DCTSIZE/4 ; ctr
|
||||
.rowloop:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
movaps xmm4, xmm0
|
||||
movaps xmm5, xmm1
|
||||
subps xmm0, xmm2 ; xmm0=tmp11
|
||||
subps xmm1, xmm3
|
||||
addps xmm4, xmm2 ; xmm4=tmp10
|
||||
addps xmm5, xmm3 ; xmm5=tmp13
|
||||
|
||||
mulps xmm1, [rel PD_1_414]
|
||||
subps xmm1, xmm5 ; xmm1=tmp12
|
||||
|
||||
movaps xmm6, xmm4
|
||||
movaps xmm7, xmm0
|
||||
subps xmm4, xmm5 ; xmm4=tmp3
|
||||
subps xmm0, xmm1 ; xmm0=tmp2
|
||||
addps xmm6, xmm5 ; xmm6=tmp0
|
||||
addps xmm7, xmm1 ; xmm7=tmp1
|
||||
|
||||
movaps XMMWORD [wk(1)], xmm4 ; tmp3
|
||||
movaps XMMWORD [wk(0)], xmm0 ; tmp2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
movaps xmm4, xmm2
|
||||
movaps xmm0, xmm5
|
||||
addps xmm2, xmm1 ; xmm2=z11
|
||||
addps xmm5, xmm3 ; xmm5=z13
|
||||
subps xmm4, xmm1 ; xmm4=z12
|
||||
subps xmm0, xmm3 ; xmm0=z10
|
||||
|
||||
movaps xmm1, xmm2
|
||||
subps xmm2, xmm5
|
||||
addps xmm1, xmm5 ; xmm1=tmp7
|
||||
|
||||
mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
|
||||
|
||||
movaps xmm3, xmm0
|
||||
addps xmm0, xmm4
|
||||
mulps xmm0, [rel PD_1_847] ; xmm0=z5
|
||||
mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
|
||||
mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
|
||||
addps xmm3, xmm0 ; xmm3=tmp12
|
||||
subps xmm4, xmm0 ; xmm4=tmp10
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
subps xmm3, xmm1 ; xmm3=tmp6
|
||||
movaps xmm5, xmm6
|
||||
movaps xmm0, xmm7
|
||||
addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
|
||||
addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
|
||||
subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
|
||||
subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
|
||||
subps xmm2, xmm3 ; xmm2=tmp5
|
||||
|
||||
movaps xmm1, [rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
|
||||
pcmpeqd xmm3, xmm3
|
||||
psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
|
||||
|
||||
addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
|
||||
addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
|
||||
addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
|
||||
addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
|
||||
|
||||
pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
|
||||
pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
|
||||
pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
|
||||
pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
|
||||
por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
|
||||
por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
|
||||
|
||||
movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
|
||||
movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
|
||||
|
||||
addps xmm4, xmm2 ; xmm4=tmp4
|
||||
movaps xmm7, xmm1
|
||||
movaps xmm5, xmm3
|
||||
addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
|
||||
addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
|
||||
subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
|
||||
subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
|
||||
|
||||
movaps xmm2, [rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
|
||||
pcmpeqd xmm4, xmm4
|
||||
psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
|
||||
|
||||
addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
|
||||
addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
|
||||
addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
|
||||
addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
|
||||
|
||||
pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
|
||||
pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
|
||||
pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
|
||||
pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
|
||||
por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
|
||||
por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
|
||||
|
||||
movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
|
||||
|
||||
packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
|
||||
packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
|
||||
paddb xmm6, xmm2
|
||||
paddb xmm1, xmm2
|
||||
|
||||
movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
|
||||
punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
||||
punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
||||
|
||||
movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
|
||||
punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
||||
punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
||||
|
||||
pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
||||
pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
||||
|
||||
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
|
||||
mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
|
||||
movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
|
||||
mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
|
||||
mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
|
||||
movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
|
||||
|
||||
add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
|
||||
add rdi, byte 4*SIZEOF_JSAMPROW
|
||||
dec rcx ; ctr
|
||||
jnz near .rowloop
|
||||
|
||||
pop rbx
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
490
TMessagesProj/jni/mozjpeg/simd/x86_64/jidctfst-sse2.asm
Normal file
490
TMessagesProj/jni/mozjpeg/simd/x86_64/jidctfst-sse2.asm
Normal file
|
|
@ -0,0 +1,490 @@
|
|||
;
|
||||
; jidctfst.asm - fast integer IDCT (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a fast, not so accurate integer implementation of
|
||||
; the inverse DCT (Discrete Cosine Transform). The following code is
|
||||
; based directly on the IJG's original jidctfst.c; see the jidctfst.c
|
||||
; for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point configuration for the fast integer IDCT.
; CONST_BITS is the number of fractional bits carried by the F_* multiplier
; constants below; PASS1_BITS is the extra scaling removed together with the
; factor of 8 when pass 2 descales by (PASS1_BITS+3).
%define CONST_BITS 8 ; 14 is also OK.
|
||||
%define PASS1_BITS 2
|
||||
|
||||
; Build-time guard: assembly stops if IFAST_SCALE_BITS (defined outside this
; file) differs from PASS1_BITS.
%if IFAST_SCALE_BITS != PASS1_BITS
|
||||
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
|
||||
%endif
|
||||
|
||||
; With CONST_BITS == 8 the constants are written out directly as
; round(value * 256).
%if CONST_BITS == 8
|
||||
F_1_082 equ 277 ; FIX(1.082392200)
|
||||
F_1_414 equ 362 ; FIX(1.414213562)
|
||||
F_1_847 equ 473 ; FIX(1.847759065)
|
||||
F_2_613 equ 669 ; FIX(2.613125930)
|
||||
F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
; DESCALE(x, n): right-shift x by n bits with round-to-nearest (adds half of
; 2^n before shifting).  The literals below are the constants pre-scaled by
; 2^30, so shifting by (30 - CONST_BITS) leaves CONST_BITS fractional bits.
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
|
||||
F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200)
|
||||
F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562)
|
||||
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
|
||||
F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930)
|
||||
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
; Scaling scheme for the pmulhw-based multiplies in this file:
; the code shifts an operand left by PRE_MULTIPLY_SCALE_BITS, then multiplies
; it with pmulhw (which keeps the high 16 bits of each signed 16x16 product)
; against a table constant that was shifted left by CONST_SHIFT.  The three
; shifts are chosen to total 16 bits so the high word is the descaled product.
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
|
||||
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
|
||||
|
||||
%define PRE_MULTIPLY_SCALE_BITS 2
|
||||
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_idct_ifast_sse2)
|
||||
|
||||
; Constant table referenced RIP-relatively ([rel PW_*], [rel PB_CENTERJSAMP]).
; Each PW_* entry is 8 identical 16-bit words (one full XMM register);
; PB_CENTERJSAMP is 16 identical bytes.
EXTN(jconst_idct_ifast_sse2):
|
||||
|
||||
; 1.414213562, pre-shifted by CONST_SHIFT
PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
|
||||
; 1.847759065, pre-shifted by CONST_SHIFT
PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
|
||||
; -(2.613125930 - 1): the remaining "-1 * z10" term is applied separately
; with psubw in the code to keep the multiplier within 16-bit range.
PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
|
||||
; 1.082392200, pre-shifted by CONST_SHIFT
PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
|
||||
; Per-byte bias added (paddb) after packsswb to re-centre output samples.
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Overview
;   Pass 1 multiplies each coefficient row by the matching row of the
;   dequantization table (pmullw), runs the 1-D IDCT butterfly down all
;   eight columns at once (one 16-bit lane per column), and transposes the
;   result so that pass 2 can reuse the same butterfly along the other axis.
;   If every AC coefficient (rows 1..7) is zero, pass 1 is replaced by a
;   broadcast of the dequantized DC row.
;   Pass 2 repeats the butterfly, descales by (PASS1_BITS+3) with psraw,
;   saturates to signed bytes (packsswb), adds CENTERJSAMPLE to every byte,
;   transposes back, and stores 8 bytes to each of the 8 output rows at
;   output_buf[row] + output_col.
;
; Register roles after collect_args (macro defined outside this file;
; presumably it copies the ABI argument registers into r10-r13 -- confirm
; against jsimdext.inc):
;
; r10 = void *dct_table
; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col
;
; Scratch registers written by this routine: rax, rdx, rsi, rdi, xmm0-xmm7.
; Stack: WK_NUM xmmwords of 16-byte-aligned scratch addressed through wk(i).

%define original_rbp  rbp + 0
%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        2

    align       32
    GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)

EXTN(jsimd_idct_ifast_sse2):
    ; Build a 16-byte-aligned frame.  The pre-alignment rsp (which points at
    ; the saved rbp) is stored at [rbp] so the epilogue can undo the
    ; alignment with "pop rsp".
    push        rbp
    mov         rax, rsp                ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                ; rbp = aligned rbp
    lea         rsp, [wk(0)]
    collect_args 4

    ; ---- Pass 1: process columns from input.

    mov         rdx, r10                ; quantptr
    mov         rsi, r11                ; inptr

%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
    ; Cheap early-out: if the dwords read from rows 1 and 2 are already
    ; non-zero there is no point in testing the remaining rows.
    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    jnz         near .columnDCT

    ; OR rows 1..7 together, then squeeze the 8 words down to 4 bytes.
    ; Signed saturation never turns a non-zero value into zero, so eax is
    ; non-zero iff some AC coefficient is non-zero.
    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    por         xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    por         xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    por         xmm1, xmm0
    packsswb    xmm1, xmm1
    packsswb    xmm1, xmm1
    movd        eax, xmm1
    test        rax, rax
    jnz         short .columnDCT

    ; -- AC terms all zero
    ;
    ; Each column's IDCT is then just its dequantized DC term repeated, so
    ; the already-transposed "colN" registers are produced by broadcasting.

    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]

    movdqa      xmm7, xmm0              ; xmm0=in0=(00 01 02 03 04 05 06 07)
    punpcklwd   xmm0, xmm0              ; xmm0=(00 00 01 01 02 02 03 03)
    punpckhwd   xmm7, xmm7              ; xmm7=(04 04 05 05 06 06 07 07)

    pshufd      xmm6, xmm0, 0x00        ; xmm6=col0=(00 00 00 00 00 00 00 00)
    pshufd      xmm2, xmm0, 0x55        ; xmm2=col1=(01 01 01 01 01 01 01 01)
    pshufd      xmm5, xmm0, 0xAA        ; xmm5=col2=(02 02 02 02 02 02 02 02)
    pshufd      xmm0, xmm0, 0xFF        ; xmm0=col3=(03 03 03 03 03 03 03 03)
    pshufd      xmm1, xmm7, 0x00        ; xmm1=col4=(04 04 04 04 04 04 04 04)
    pshufd      xmm4, xmm7, 0x55        ; xmm4=col5=(05 05 05 05 05 05 05 05)
    pshufd      xmm3, xmm7, 0xAA        ; xmm3=col6=(06 06 06 06 06 06 06 06)
    pshufd      xmm7, xmm7, 0xFF        ; xmm7=col7=(07 07 07 07 07 07 07 07)

    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=col1
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=col3
    jmp         near .column_end
%endif
.columnDCT:

    ; -- Even part

    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]

    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    psubw       xmm0, xmm2              ; xmm0=tmp11
    psubw       xmm1, xmm3
    paddw       xmm4, xmm2              ; xmm4=tmp10
    paddw       xmm5, xmm3              ; xmm5=tmp13

    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm1, [rel PW_F1414]
    psubw       xmm1, xmm5              ; xmm1=tmp12

    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm0
    psubw       xmm4, xmm5              ; xmm4=tmp3
    psubw       xmm0, xmm1              ; xmm0=tmp2
    paddw       xmm6, xmm5              ; xmm6=tmp0
    paddw       xmm7, xmm1              ; xmm7=tmp1

    ; All eight XMM registers are needed for the odd part; park tmp2/tmp3.
    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2

    ; -- Odd part

    movdqa      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    movdqa      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]

    movdqa      xmm4, xmm2
    movdqa      xmm0, xmm5
    psubw       xmm2, xmm1              ; xmm2=z12
    psubw       xmm5, xmm3              ; xmm5=z10
    paddw       xmm4, xmm1              ; xmm4=z11
    paddw       xmm0, xmm3              ; xmm0=z13

    movdqa      xmm1, xmm5              ; xmm1=z10(unscaled)
    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS

    movdqa      xmm3, xmm4
    psubw       xmm4, xmm0
    paddw       xmm3, xmm0              ; xmm3=tmp7

    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm4, [rel PW_F1414]    ; xmm4=tmp11

    ; To avoid overflow...
    ;
    ; (Original)
    ; tmp12 = -2.613125930 * z10 + z5;
    ;
    ; (This implementation)
    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
    ;       = -1.613125930 * z10 - z10 + z5;

    movdqa      xmm0, xmm5
    paddw       xmm5, xmm2
    pmulhw      xmm5, [rel PW_F1847]    ; xmm5=z5
    pmulhw      xmm0, [rel PW_MF1613]
    pmulhw      xmm2, [rel PW_F1082]
    psubw       xmm0, xmm1
    psubw       xmm2, xmm5              ; xmm2=tmp10
    paddw       xmm0, xmm5              ; xmm0=tmp12

    ; -- Final output stage

    psubw       xmm0, xmm3              ; xmm0=tmp6
    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm7
    paddw       xmm6, xmm3              ; xmm6=data0=(00 01 02 03 04 05 06 07)
    paddw       xmm7, xmm0              ; xmm7=data1=(10 11 12 13 14 15 16 17)
    psubw       xmm1, xmm3              ; xmm1=data7=(70 71 72 73 74 75 76 77)
    psubw       xmm5, xmm0              ; xmm5=data6=(60 61 62 63 64 65 66 67)
    psubw       xmm4, xmm0              ; xmm4=tmp5

    movdqa      xmm3, xmm6              ; transpose coefficients(phase 1)
    punpcklwd   xmm6, xmm7              ; xmm6=(00 10 01 11 02 12 03 13)
    punpckhwd   xmm3, xmm7              ; xmm3=(04 14 05 15 06 16 07 17)
    movdqa      xmm0, xmm5              ; transpose coefficients(phase 1)
    punpcklwd   xmm5, xmm1              ; xmm5=(60 70 61 71 62 72 63 73)
    punpckhwd   xmm0, xmm1              ; xmm0=(64 74 65 75 66 76 67 77)

    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3

    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)

    paddw       xmm2, xmm4              ; xmm2=tmp4
    movdqa      xmm5, xmm7
    movdqa      xmm0, xmm1
    paddw       xmm7, xmm4              ; xmm7=data2=(20 21 22 23 24 25 26 27)
    paddw       xmm1, xmm2              ; xmm1=data4=(40 41 42 43 44 45 46 47)
    psubw       xmm5, xmm4              ; xmm5=data5=(50 51 52 53 54 55 56 57)
    psubw       xmm0, xmm2              ; xmm0=data3=(30 31 32 33 34 35 36 37)

    movdqa      xmm4, xmm7              ; transpose coefficients(phase 1)
    punpcklwd   xmm7, xmm0              ; xmm7=(20 30 21 31 22 32 23 33)
    punpckhwd   xmm4, xmm0              ; xmm4=(24 34 25 35 26 36 27 37)
    movdqa      xmm2, xmm1              ; transpose coefficients(phase 1)
    punpcklwd   xmm1, xmm5              ; xmm1=(40 50 41 51 42 52 43 53)
    punpckhwd   xmm2, xmm5              ; xmm2=(44 54 45 55 46 56 47 57)

    movdqa      xmm0, xmm3              ; transpose coefficients(phase 2)
    punpckldq   xmm3, xmm4              ; xmm3=(04 14 24 34 05 15 25 35)
    punpckhdq   xmm0, xmm4              ; xmm0=(06 16 26 36 07 17 27 37)
    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
    punpckldq   xmm6, xmm7              ; xmm6=(00 10 20 30 01 11 21 31)
    punpckhdq   xmm5, xmm7              ; xmm5=(02 12 22 32 03 13 23 33)

    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)

    movdqa      XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)

    movdqa      xmm3, xmm1              ; transpose coefficients(phase 2)
    punpckldq   xmm1, xmm4              ; xmm1=(40 50 60 70 41 51 61 71)
    punpckhdq   xmm3, xmm4              ; xmm3=(42 52 62 72 43 53 63 73)
    movdqa      xmm0, xmm2              ; transpose coefficients(phase 2)
    punpckldq   xmm2, xmm7              ; xmm2=(44 54 64 74 45 55 65 75)
    punpckhdq   xmm0, xmm7              ; xmm0=(46 56 66 76 47 57 67 77)

    movdqa      xmm4, xmm6              ; transpose coefficients(phase 3)
    punpcklqdq  xmm6, xmm1              ; xmm6=col0=(00 10 20 30 40 50 60 70)
    punpckhqdq  xmm4, xmm1              ; xmm4=col1=(01 11 21 31 41 51 61 71)
    movdqa      xmm7, xmm5              ; transpose coefficients(phase 3)
    punpcklqdq  xmm5, xmm3              ; xmm5=col2=(02 12 22 32 42 52 62 72)
    punpckhqdq  xmm7, xmm3              ; xmm7=col3=(03 13 23 33 43 53 63 73)

    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)

    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=col1
    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=col3

    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
    punpcklqdq  xmm1, xmm2              ; xmm1=col4=(04 14 24 34 44 54 64 74)
    punpckhqdq  xmm4, xmm2              ; xmm4=col5=(05 15 25 35 45 55 65 75)
    movdqa      xmm7, xmm3              ; transpose coefficients(phase 3)
    punpcklqdq  xmm3, xmm0              ; xmm3=col6=(06 16 26 36 46 56 66 76)
    punpckhqdq  xmm7, xmm0              ; xmm7=col7=(07 17 27 37 47 57 67 77)
.column_end:

    ; Invariant on entry to pass 2 (both the shortcut and the full path):
    ;   xmm6=col0, wk(0)=col1, xmm5=col2, wk(1)=col3,
    ;   xmm1=col4, xmm4=col5,  xmm3=col6, xmm7=col7

    ; -- Prefetch the next coefficient block

    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.

    mov         rdi, r12                ; (JSAMPROW *)
    mov         eax, r13d               ; rax = output_col (zero-extended)

    ; -- Even part

    ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6

    movdqa      xmm2, xmm6
    movdqa      xmm0, xmm5
    psubw       xmm6, xmm1              ; xmm6=tmp11
    psubw       xmm5, xmm3
    paddw       xmm2, xmm1              ; xmm2=tmp10
    paddw       xmm0, xmm3              ; xmm0=tmp13

    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm5, [rel PW_F1414]
    psubw       xmm5, xmm0              ; xmm5=tmp12

    movdqa      xmm1, xmm2
    movdqa      xmm3, xmm6
    psubw       xmm2, xmm0              ; xmm2=tmp3
    psubw       xmm6, xmm5              ; xmm6=tmp2
    paddw       xmm1, xmm0              ; xmm1=tmp0
    paddw       xmm3, xmm5              ; xmm3=tmp1

    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=col1
    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=col3

    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2

    ; -- Odd part

    ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7

    movdqa      xmm2, xmm0
    movdqa      xmm6, xmm4
    psubw       xmm0, xmm7              ; xmm0=z12
    psubw       xmm4, xmm5              ; xmm4=z10
    paddw       xmm2, xmm7              ; xmm2=z11
    paddw       xmm6, xmm5              ; xmm6=z13

    movdqa      xmm7, xmm4              ; xmm7=z10(unscaled)
    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS

    movdqa      xmm5, xmm2
    psubw       xmm2, xmm6
    paddw       xmm5, xmm6              ; xmm5=tmp7

    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm2, [rel PW_F1414]    ; xmm2=tmp11

    ; To avoid overflow...
    ;
    ; (Original)
    ; tmp12 = -2.613125930 * z10 + z5;
    ;
    ; (This implementation)
    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
    ;       = -1.613125930 * z10 - z10 + z5;

    movdqa      xmm6, xmm4
    paddw       xmm4, xmm0
    pmulhw      xmm4, [rel PW_F1847]    ; xmm4=z5
    pmulhw      xmm6, [rel PW_MF1613]
    pmulhw      xmm0, [rel PW_F1082]
    psubw       xmm6, xmm7
    psubw       xmm0, xmm4              ; xmm0=tmp10
    paddw       xmm6, xmm4              ; xmm6=tmp12

    ; -- Final output stage

    psubw       xmm6, xmm5              ; xmm6=tmp6
    movdqa      xmm7, xmm1
    movdqa      xmm4, xmm3
    paddw       xmm1, xmm5              ; xmm1=data0=(00 10 20 30 40 50 60 70)
    paddw       xmm3, xmm6              ; xmm3=data1=(01 11 21 31 41 51 61 71)
    psraw       xmm1, (PASS1_BITS+3)    ; descale
    psraw       xmm3, (PASS1_BITS+3)    ; descale
    psubw       xmm7, xmm5              ; xmm7=data7=(07 17 27 37 47 57 67 77)
    psubw       xmm4, xmm6              ; xmm4=data6=(06 16 26 36 46 56 66 76)
    psraw       xmm7, (PASS1_BITS+3)    ; descale
    psraw       xmm4, (PASS1_BITS+3)    ; descale
    psubw       xmm2, xmm6              ; xmm2=tmp5

    packsswb    xmm1, xmm4        ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
    packsswb    xmm3, xmm7        ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)

    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3

    paddw       xmm0, xmm2              ; xmm0=tmp4
    movdqa      xmm4, xmm5
    movdqa      xmm7, xmm6
    paddw       xmm5, xmm2              ; xmm5=data2=(02 12 22 32 42 52 62 72)
    paddw       xmm6, xmm0              ; xmm6=data4=(04 14 24 34 44 54 64 74)
    psraw       xmm5, (PASS1_BITS+3)    ; descale
    psraw       xmm6, (PASS1_BITS+3)    ; descale
    psubw       xmm4, xmm2              ; xmm4=data5=(05 15 25 35 45 55 65 75)
    psubw       xmm7, xmm0              ; xmm7=data3=(03 13 23 33 43 53 63 73)
    psraw       xmm4, (PASS1_BITS+3)    ; descale
    psraw       xmm7, (PASS1_BITS+3)    ; descale

    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]

    packsswb    xmm5, xmm6        ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
    packsswb    xmm7, xmm4        ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)

    ; Shift the saturated signed bytes by CENTERJSAMPLE (byte-wise, wrapping).
    paddb       xmm1, xmm2
    paddb       xmm3, xmm2
    paddb       xmm5, xmm2
    paddb       xmm7, xmm2

    movdqa      xmm0, xmm1        ; transpose coefficients(phase 1)
    punpcklbw   xmm1, xmm3        ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
    punpckhbw   xmm0, xmm3        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
    movdqa      xmm6, xmm5        ; transpose coefficients(phase 1)
    punpcklbw   xmm5, xmm7        ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
    punpckhbw   xmm6, xmm7        ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)

    movdqa      xmm4, xmm1        ; transpose coefficients(phase 2)
    punpcklwd   xmm1, xmm5        ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    punpckhwd   xmm4, xmm5        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
    movdqa      xmm2, xmm6        ; transpose coefficients(phase 2)
    punpcklwd   xmm6, xmm0        ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
    punpckhwd   xmm2, xmm0        ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)

    movdqa      xmm3, xmm1        ; transpose coefficients(phase 3)
    punpckldq   xmm1, xmm6        ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    punpckhdq   xmm3, xmm6        ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
    movdqa      xmm7, xmm4        ; transpose coefficients(phase 3)
    punpckldq   xmm4, xmm2        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
    punpckhdq   xmm7, xmm2        ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)

    ; 0x4E swaps the two qwords, exposing the odd rows in the low halves so
    ; that every row can be written with an 8-byte movq.
    pshufd      xmm5, xmm1, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    pshufd      xmm0, xmm3, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
    pshufd      xmm6, xmm4, 0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
    pshufd      xmm2, xmm7, 0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)

    ; Store rows 0,2,4,6 (low qwords of the transposed registers) ...
    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
    mov         rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
    mov         rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
    mov         rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7

    ; ... then rows 1,3,5,7 (from the qword-swapped copies).
    mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
    mov         rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
    mov         rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
    mov         rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2

    uncollect_args 4
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
417
TMessagesProj/jni/mozjpeg/simd/x86_64/jidctint-avx2.asm
Normal file
417
TMessagesProj/jni/mozjpeg/simd/x86_64/jidctint-avx2.asm
Normal file
|
|
@ -0,0 +1,417 @@
|
|||
;
|
||||
; jidctint.asm - accurate integer IDCT (64-bit AVX2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, 2018, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a slow-but-accurate integer implementation of the
|
||||
; inverse DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jidctint.c; see jidctint.c for
|
||||
; more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define CONST_BITS 13 ; fractional bits of the F_* fixed-point constants (F = round(x * 2^CONST_BITS))
|
||||
%define PASS1_BITS 2 ; extra fractional bits left in the pass 1 output (see DESCALE_P1 / DESCALE_P2)
|
||||
|
||||
%define DESCALE_P1 (CONST_BITS - PASS1_BITS) ; right-shift count used at the end of pass 1
|
||||
%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) ; right-shift count used at the end of pass 2
|
||||
|
||||
%if CONST_BITS == 13
|
||||
F_0_298 equ 2446 ; FIX(0.298631336)
|
||||
F_0_390 equ 3196 ; FIX(0.390180644)
|
||||
F_0_541 equ 4433 ; FIX(0.541196100)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_175 equ 9633 ; FIX(1.175875602)
|
||||
F_1_501 equ 12299 ; FIX(1.501321110)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_1_961 equ 16069 ; FIX(1.961570560)
|
||||
F_2_053 equ 16819 ; FIX(2.053119869)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_072 equ 25172 ; FIX(3.072711026)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants, so each
; constant below is written pre-scaled by 2^30 and reduced to CONST_BITS
; fractional bits with DESCALE.
|
||||
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) ; rounding right shift: (x + 2^(n-1)) >> n
|
||||
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
|
||||
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
|
||||
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
|
||||
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
|
||||
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
|
||||
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
|
||||
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
|
||||
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions
; (each ymm register holds two 8-word rows, one per 128-bit lane)
|
||||
; %1-%4: Input/output registers
;        in:  data0_1, data3_2, data4_5, data7_6  (the layout dodct produces)
;        out: data0_4, data1_5, data2_6, data3_7
|
||||
; %5-%8: Temp registers (overwritten)
|
||||
|
||||
%macro dotranspose 8
|
||||
; On entry (these are the inputs %1-%4; the temps are written by vpermq below):
; %1=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71)
|
||||
; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
|
||||
; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
|
||||
; %4=(07 17 27 37 47 57 67 77 06 16 26 36 46 56 66 76)
|
||||
|
||||
vpermq %5, %1, 0xD8 ; qword order 0,2,1,3
|
||||
vpermq %6, %2, 0x72 ; qword order 2,0,3,1 (also swaps the two rows)
|
||||
vpermq %7, %3, 0xD8 ; qword order 0,2,1,3
|
||||
vpermq %8, %4, 0x72 ; qword order 2,0,3,1 (also swaps the two rows)
|
||||
; transpose coefficients(phase 1)
|
||||
; %5=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
|
||||
; %6=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
|
||||
; %7=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
|
||||
; %8=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
|
||||
|
||||
vpunpcklwd %1, %5, %6
|
||||
vpunpckhwd %2, %5, %6
|
||||
vpunpcklwd %3, %7, %8
|
||||
vpunpckhwd %4, %7, %8
|
||||
; transpose coefficients(phase 2)
|
||||
; %1=(00 02 10 12 20 22 30 32 40 42 50 52 60 62 70 72)
|
||||
; %2=(01 03 11 13 21 23 31 33 41 43 51 53 61 63 71 73)
|
||||
; %3=(04 06 14 16 24 26 34 36 44 46 54 56 64 66 74 76)
|
||||
; %4=(05 07 15 17 25 27 35 37 45 47 55 57 65 67 75 77)
|
||||
|
||||
vpunpcklwd %5, %1, %2
|
||||
vpunpcklwd %6, %3, %4
|
||||
vpunpckhwd %7, %1, %2
|
||||
vpunpckhwd %8, %3, %4
|
||||
; transpose coefficients(phase 3)
|
||||
; %5=(00 01 02 03 10 11 12 13 40 41 42 43 50 51 52 53)
|
||||
; %6=(04 05 06 07 14 15 16 17 44 45 46 47 54 55 56 57)
|
||||
; %7=(20 21 22 23 30 31 32 33 60 61 62 63 70 71 72 73)
|
||||
; %8=(24 25 26 27 34 35 36 37 64 65 66 67 74 75 76 77)
|
||||
|
||||
vpunpcklqdq %1, %5, %6
|
||||
vpunpckhqdq %2, %5, %6
|
||||
vpunpcklqdq %3, %7, %8
|
||||
vpunpckhqdq %4, %7, %8
|
||||
; transpose coefficients(phase 4)
|
||||
; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
|
||||
; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
|
||||
; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
|
||||
; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
|
||||
%endmacro
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; In-place 8x8x16-bit slow integer inverse DCT using AVX2 instructions
; (one 1-D pass over all eight vectors; two 8-word vectors per ymm register)
|
||||
; %1-%4: Input/output registers
;        in:  %1=in0_4,   %2=in3_1,   %3=in2_6,   %4=in7_5
;        out: %1=data0_1, %2=data3_2, %3=data4_5, %4=data7_6
|
||||
; %5-%12: Temp registers (overwritten)
|
||||
; %13: Pass (1 or 2); pasted onto PD_DESCALE_P / DESCALE_P to select the
;      rounding constant and shift count for that pass
|
||||
|
||||
%macro dodct 13
|
||||
; -- Even part
|
||||
|
||||
; (Original)
|
||||
; z1 = (z2 + z3) * 0.541196100;
|
||||
; tmp2 = z1 + z3 * -1.847759065;
|
||||
; tmp3 = z1 + z2 * 0.765366865;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
||||
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
||||
|
||||
vperm2i128 %6, %3, %3, 0x01 ; %6=in6_2
|
||||
vpunpcklwd %5, %3, %6 ; %5=in26_62L
|
||||
vpunpckhwd %6, %3, %6 ; %6=in26_62H
|
||||
vpmaddwd %5, %5, [rel PW_F130_F054_MF130_F054] ; %5=tmp3_2L
|
||||
vpmaddwd %6, %6, [rel PW_F130_F054_MF130_F054] ; %6=tmp3_2H
|
||||
|
||||
vperm2i128 %7, %1, %1, 0x01 ; %7=in4_0
|
||||
vpsignw %1, %1, [rel PW_1_NEG1] ; %1=in0_(-in4): negate the high lane only
|
||||
vpaddw %7, %7, %1 ; %7=(in0+in4)_(in0-in4)
|
||||
|
||||
vpxor %1, %1, %1 ; %1=0, supplies the low word of each dword below
|
||||
vpunpcklwd %8, %1, %7 ; %8=tmp0_1L
|
||||
vpunpckhwd %1, %1, %7 ; %1=tmp0_1H
|
||||
vpsrad %8, %8, (16-CONST_BITS) ; vpsrad %8,16 & vpslld %8,CONST_BITS
|
||||
vpsrad %1, %1, (16-CONST_BITS) ; vpsrad %1,16 & vpslld %1,CONST_BITS
|
||||
|
||||
vpsubd %11, %8, %5 ; %11=tmp0_1L-tmp3_2L=tmp13_12L
|
||||
vpaddd %9, %8, %5 ; %9=tmp0_1L+tmp3_2L=tmp10_11L
|
||||
vpsubd %12, %1, %6 ; %12=tmp0_1H-tmp3_2H=tmp13_12H
|
||||
vpaddd %10, %1, %6 ; %10=tmp0_1H+tmp3_2H=tmp10_11H
|
||||
|
||||
; -- Odd part
|
||||
|
||||
vpaddw %1, %4, %2 ; %1=in7_5+in3_1=z3_4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
vperm2i128 %8, %1, %1, 0x01 ; %8=z4_3
|
||||
vpunpcklwd %7, %1, %8 ; %7=z34_43L
|
||||
vpunpckhwd %8, %1, %8 ; %8=z34_43H
|
||||
vpmaddwd %7, %7, [rel PW_MF078_F117_F078_F117] ; %7=z3_4L
|
||||
vpmaddwd %8, %8, [rel PW_MF078_F117_F078_F117] ; %8=z3_4H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
|
||||
; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
|
||||
; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; tmp0 += z1 + z3; tmp1 += z2 + z4;
|
||||
; tmp2 += z2 + z3; tmp3 += z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
|
||||
; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
|
||||
; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
|
||||
; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
|
||||
; tmp0 += z3; tmp1 += z4;
|
||||
; tmp2 += z3; tmp3 += z4;
|
||||
|
||||
vperm2i128 %2, %2, %2, 0x01 ; %2=in1_3
|
||||
vpunpcklwd %3, %4, %2 ; %3=in71_53L
|
||||
vpunpckhwd %4, %4, %2 ; %4=in71_53H
|
||||
|
||||
vpmaddwd %5, %3, [rel PW_MF060_MF089_MF050_MF256] ; %5=tmp0_1L
|
||||
vpmaddwd %6, %4, [rel PW_MF060_MF089_MF050_MF256] ; %6=tmp0_1H
|
||||
vpaddd %5, %5, %7 ; %5=tmp0_1L+z3_4L=tmp0_1L
|
||||
vpaddd %6, %6, %8 ; %6=tmp0_1H+z3_4H=tmp0_1H
|
||||
|
||||
vpmaddwd %3, %3, [rel PW_MF089_F060_MF256_F050] ; %3=tmp3_2L
|
||||
vpmaddwd %4, %4, [rel PW_MF089_F060_MF256_F050] ; %4=tmp3_2H
|
||||
vperm2i128 %7, %7, %7, 0x01 ; %7=z4_3L
|
||||
vperm2i128 %8, %8, %8, 0x01 ; %8=z4_3H
|
||||
vpaddd %7, %3, %7 ; %7=tmp3_2L+z4_3L=tmp3_2L
|
||||
vpaddd %8, %4, %8 ; %8=tmp3_2H+z4_3H=tmp3_2H
|
||||
|
||||
; -- Final output stage
; (each result: add the pass's rounding constant, shift right by
; DESCALE_P<pass>, then pack the L/H dword halves back to signed words)
|
||||
|
||||
vpaddd %1, %9, %7 ; %1=tmp10_11L+tmp3_2L=data0_1L
|
||||
vpaddd %2, %10, %8 ; %2=tmp10_11H+tmp3_2H=data0_1H
|
||||
vpaddd %1, %1, [rel PD_DESCALE_P %+ %13]
|
||||
vpaddd %2, %2, [rel PD_DESCALE_P %+ %13]
|
||||
vpsrad %1, %1, DESCALE_P %+ %13
|
||||
vpsrad %2, %2, DESCALE_P %+ %13
|
||||
vpackssdw %1, %1, %2 ; %1=data0_1
|
||||
|
||||
vpsubd %3, %9, %7 ; %3=tmp10_11L-tmp3_2L=data7_6L
|
||||
vpsubd %4, %10, %8 ; %4=tmp10_11H-tmp3_2H=data7_6H
|
||||
vpaddd %3, %3, [rel PD_DESCALE_P %+ %13]
|
||||
vpaddd %4, %4, [rel PD_DESCALE_P %+ %13]
|
||||
vpsrad %3, %3, DESCALE_P %+ %13
|
||||
vpsrad %4, %4, DESCALE_P %+ %13
|
||||
vpackssdw %4, %3, %4 ; %4=data7_6
|
||||
|
||||
vpaddd %7, %11, %5 ; %7=tmp13_12L+tmp0_1L=data3_2L
|
||||
vpaddd %8, %12, %6 ; %8=tmp13_12H+tmp0_1H=data3_2H
|
||||
vpaddd %7, %7, [rel PD_DESCALE_P %+ %13]
|
||||
vpaddd %8, %8, [rel PD_DESCALE_P %+ %13]
|
||||
vpsrad %7, %7, DESCALE_P %+ %13
|
||||
vpsrad %8, %8, DESCALE_P %+ %13
|
||||
vpackssdw %2, %7, %8 ; %2=data3_2
|
||||
|
||||
vpsubd %7, %11, %5 ; %7=tmp13_12L-tmp0_1L=data4_5L
|
||||
vpsubd %8, %12, %6 ; %8=tmp13_12H-tmp0_1H=data4_5H
|
||||
vpaddd %7, %7, [rel PD_DESCALE_P %+ %13]
|
||||
vpaddd %8, %8, [rel PD_DESCALE_P %+ %13]
|
||||
vpsrad %7, %7, DESCALE_P %+ %13
|
||||
vpsrad %8, %8, DESCALE_P %+ %13
|
||||
vpackssdw %3, %7, %8 ; %3=data4_5
|
||||
%endmacro
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_idct_islow_avx2)
|
||||
|
||||
EXTN(jconst_idct_islow_avx2):
|
||||
|
||||
; Each PW_* table is one 32-byte vpmaddwd operand: the first line fills the
; low 128-bit lane and the second line the high lane, so a single multiply
; yields two different terms of the dodct macro (named in its comments).
PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 ; low lane -> tmp3
|
||||
times 4 dw (F_0_541 - F_1_847), F_0_541 ; high lane -> tmp2
|
||||
PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 ; low lane -> z3
|
||||
times 4 dw (F_1_175 - F_0_390), F_1_175 ; high lane -> z4
|
||||
PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899 ; low lane -> tmp0
|
||||
times 4 dw (F_2_053 - F_2_562), -F_2_562 ; high lane -> tmp1
|
||||
PW_MF089_F060_MF256_F050 times 4 dw -F_0_899, (F_1_501 - F_0_899) ; low lane -> tmp3
|
||||
times 4 dw -F_2_562, (F_3_072 - F_2_562) ; high lane -> tmp2
|
||||
PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1) ; rounding term added before the pass 1 shift
|
||||
PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1) ; rounding term added before the pass 2 shift
|
||||
PB_CENTERJSAMP times 32 db CENTERJSAMPLE ; added to the packed bytes before they are stored
|
||||
PW_1_NEG1 times 8 dw 1 ; vpsignw operand: low lane is kept as is ...
|
||||
times 8 dw -1 ; ... and the high lane is negated
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; r10 = void *dct_table
|
||||
; r11 = JCOEFPTR coef_block
|
||||
; r12 = JSAMPARRAY output_buf
|
||||
; r13d = JDIMENSION output_col
;
; NOTE(review): dodct is invoked below with ymm8-ymm11 as temporaries, while
; only "push_xmm 4" / "pop_xmm 4" bracket the body. Those macros come from an
; include that is not visible here -- confirm they preserve every callee-saved
; XMM register this routine writes when built for Win64.
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
|
||||
|
||||
EXTN(jsimd_idct_islow_avx2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
|
||||
mov rbp, rsp ; rbp = frame pointer (no stack realignment is done in this routine)
|
||||
push_xmm 4
|
||||
collect_args 4
|
||||
|
||||
; ---- Pass 1: process columns.
|
||||
|
||||
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
|
||||
mov eax, dword [DWBLOCK(1,0,r11,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,r11,SIZEOF_JCOEF)] ; eax = DWBLOCK(1,0) | DWBLOCK(2,0)
|
||||
jnz near .columnDCT ; cheap early exit to the full column pass
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,r11,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,r11,SIZEOF_JCOEF)]
|
||||
vpor xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,r11,SIZEOF_JCOEF)]
|
||||
vpor xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,r11,SIZEOF_JCOEF)]
|
||||
vpor xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,r11,SIZEOF_JCOEF)]
|
||||
vpor xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,r11,SIZEOF_JCOEF)]
|
||||
vpor xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,r11,SIZEOF_JCOEF)]
|
||||
vpor xmm1, xmm1, xmm0 ; xmm1 = OR of XMMBLOCK(1..7)
|
||||
vpacksswb xmm1, xmm1, xmm1 ; two saturating packs squeeze the eight words
|
||||
vpacksswb xmm1, xmm1, xmm1 ; into the low dword; non-zero stays non-zero
|
||||
movd eax, xmm1
|
||||
test rax, rax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
movdqa xmm5, XMMWORD [XMMBLOCK(0,0,r11,SIZEOF_JCOEF)]
|
||||
vpmullw xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
vpsllw xmm5, xmm5, PASS1_BITS
|
||||
|
||||
vpunpcklwd xmm4, xmm5, xmm5 ; xmm4=(00 00 01 01 02 02 03 03)
|
||||
vpunpckhwd xmm5, xmm5, xmm5 ; xmm5=(04 04 05 05 06 06 07 07)
|
||||
vinserti128 ymm4, ymm4, xmm5, 1 ; ymm4=(00 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07)
|
||||
|
||||
vpshufd ymm0, ymm4, 0x00 ; ymm0=col0_4=(00 00 00 00 00 00 00 00 04 04 04 04 04 04 04 04)
|
||||
vpshufd ymm1, ymm4, 0x55 ; ymm1=col1_5=(01 01 01 01 01 01 01 01 05 05 05 05 05 05 05 05)
|
||||
vpshufd ymm2, ymm4, 0xAA ; ymm2=col2_6=(02 02 02 02 02 02 02 02 06 06 06 06 06 06 06 06)
|
||||
vpshufd ymm3, ymm4, 0xFF ; ymm3=col3_7=(03 03 03 03 03 03 03 03 07 07 07 07 07 07 07 07)
|
||||
|
||||
jmp near .column_end
|
||||
%endif
|
||||
.columnDCT:
|
||||
|
||||
vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,r11,SIZEOF_JCOEF)] ; ymm4=in0_1
|
||||
vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,r11,SIZEOF_JCOEF)] ; ymm5=in2_3
|
||||
vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,r11,SIZEOF_JCOEF)] ; ymm6=in4_5
|
||||
vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,r11,SIZEOF_JCOEF)] ; ymm7=in6_7
|
||||
vpmullw ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
vpmullw ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
vpmullw ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
vpmullw ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
vperm2i128 ymm0, ymm4, ymm6, 0x20 ; ymm0=in0_4
|
||||
vperm2i128 ymm1, ymm5, ymm4, 0x31 ; ymm1=in3_1
|
||||
vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6
|
||||
vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5
|
||||
|
||||
dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 1
|
||||
; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
|
||||
|
||||
dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
|
||||
; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
|
||||
|
||||
.column_end:
|
||||
|
||||
; -- Prefetch the next coefficient block
|
||||
|
||||
prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 0*32]
|
||||
prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 1*32]
|
||||
prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 2*32]
|
||||
prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 3*32]
|
||||
|
||||
; ---- Pass 2: process rows.
|
||||
|
||||
vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm4=in7_5
|
||||
vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1
|
||||
|
||||
dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 2
|
||||
; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
|
||||
|
||||
dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
|
||||
; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
|
||||
|
||||
vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45
|
||||
vpacksswb ymm1, ymm2, ymm4 ; ymm1=data23_67
|
||||
vpaddb ymm0, ymm0, [rel PB_CENTERJSAMP]
|
||||
vpaddb ymm1, ymm1, [rel PB_CENTERJSAMP]
|
||||
|
||||
vextracti128 xmm6, ymm1, 1 ; xmm6=data67
|
||||
vextracti128 xmm4, ymm0, 1 ; xmm4=data45
|
||||
vextracti128 xmm2, ymm1, 0 ; xmm2=data23
|
||||
vextracti128 xmm0, ymm0, 0 ; xmm0=data01
|
||||
|
||||
vpshufd xmm1, xmm0, 0x4E ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
||||
vpshufd xmm3, xmm2, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
||||
vpshufd xmm5, xmm4, 0x4E ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
|
||||
vpshufd xmm7, xmm6, 0x4E ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
|
||||
|
||||
vzeroupper ; clear the upper ymm halves before the non-VEX movq stores below
|
||||
|
||||
mov eax, r13d ; rax = output_col (zero-extended), used as the store offset
|
||||
|
||||
mov rdx, JSAMPROW [r12+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov rsi, JSAMPROW [r12+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm0
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
|
||||
|
||||
mov rdx, JSAMPROW [r12+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov rsi, JSAMPROW [r12+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
|
||||
|
||||
mov rdx, JSAMPROW [r12+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov rsi, JSAMPROW [r12+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
|
||||
|
||||
mov rdx, JSAMPROW [r12+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
mov rsi, JSAMPROW [r12+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
|
||||
|
||||
uncollect_args 4
|
||||
pop_xmm 4
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
846
TMessagesProj/jni/mozjpeg/simd/x86_64/jidctint-sse2.asm
Normal file
846
TMessagesProj/jni/mozjpeg/simd/x86_64/jidctint-sse2.asm
Normal file
|
|
@ -0,0 +1,846 @@
|
|||
;
|
||||
; jidctint.asm - accurate integer IDCT (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a slow-but-accurate integer implementation of the
|
||||
; inverse DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jidctint.c; see jidctint.c for
|
||||
; more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define CONST_BITS 13 ; fractional bits of the F_* fixed-point constants (F = round(x * 2^CONST_BITS))
|
||||
%define PASS1_BITS 2 ; extra fractional bits left in the pass 1 output (see DESCALE_P1 / DESCALE_P2)
|
||||
|
||||
%define DESCALE_P1 (CONST_BITS - PASS1_BITS) ; right-shift count used at the end of pass 1
|
||||
%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) ; descale shift count for pass 2
|
||||
|
||||
%if CONST_BITS == 13
|
||||
F_0_298 equ 2446 ; FIX(0.298631336)
|
||||
F_0_390 equ 3196 ; FIX(0.390180644)
|
||||
F_0_541 equ 4433 ; FIX(0.541196100)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_175 equ 9633 ; FIX(1.175875602)
|
||||
F_1_501 equ 12299 ; FIX(1.501321110)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_1_961 equ 16069 ; FIX(1.961570560)
|
||||
F_2_053 equ 16819 ; FIX(2.053119869)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_072 equ 25172 ; FIX(3.072711026)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants, so each
; constant below is written pre-scaled by 2^30 and reduced to CONST_BITS
; fractional bits with DESCALE.
|
||||
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) ; rounding right shift: (x + 2^(n-1)) >> n
|
||||
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
|
||||
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
|
||||
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
|
||||
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
|
||||
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
|
||||
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
|
||||
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
|
||||
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_idct_islow_sse2)
|
||||
|
||||
EXTN(jconst_idct_islow_sse2):
|
||||
|
||||
; Each PW_* table is a 16-byte pmaddwd operand: one coefficient pair repeated
; four times. The term named on each line is the one the column pass computes
; with that table.
PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 ; -> tmp3 (even part)
|
||||
PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847) ; -> tmp2 (even part)
|
||||
PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 ; -> z3
|
||||
PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390) ; -> z4
|
||||
PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899 ; -> tmp0 (odd part)
|
||||
PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899) ; -> tmp3 (odd part)
|
||||
PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562 ; -> tmp1 (odd part)
|
||||
PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562) ; -> tmp2 (odd part)
|
||||
PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) ; rounding term added before the pass 1 shift
|
||||
PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) ; rounding term matching DESCALE_P2
|
||||
PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; CENTERJSAMPLE replicated into all 16 bytes
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; r10 = void *dct_table
|
||||
; r11 = JCOEFPTR coef_block
|
||||
; r12 = JSAMPARRAY output_buf
|
||||
; r13d = JDIMENSION output_col
|
||||
|
||||
%define original_rbp rbp + 0
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 12
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
|
||||
|
||||
EXTN(jsimd_idct_islow_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args 4
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
|
||||
mov rdx, r10 ; quantptr
|
||||
mov rsi, r11 ; inptr
|
||||
|
||||
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
|
||||
mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
jnz near .columnDCT
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm1, xmm0
|
||||
packsswb xmm1, xmm1
|
||||
packsswb xmm1, xmm1
|
||||
movd eax, xmm1
|
||||
test rax, rax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
psllw xmm5, PASS1_BITS
|
||||
|
||||
movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
|
||||
punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
|
||||
punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
|
||||
|
||||
pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
|
||||
pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
|
||||
pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
|
||||
pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
|
||||
pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
|
||||
pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
|
||||
pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
|
||||
pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
|
||||
|
||||
movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
|
||||
movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
|
||||
movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
|
||||
movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
|
||||
jmp near .column_end
|
||||
%endif
|
||||
.columnDCT:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; (Original)
|
||||
; z1 = (z2 + z3) * 0.541196100;
|
||||
; tmp2 = z1 + z3 * -1.847759065;
|
||||
; tmp3 = z1 + z2 * 0.765366865;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
||||
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
||||
|
||||
movdqa xmm4, xmm1 ; xmm1=in2=z2
|
||||
movdqa xmm5, xmm1
|
||||
punpcklwd xmm4, xmm3 ; xmm3=in6=z3
|
||||
punpckhwd xmm5, xmm3
|
||||
movdqa xmm1, xmm4
|
||||
movdqa xmm3, xmm5
|
||||
pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=tmp3L
|
||||
pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H
|
||||
pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L
|
||||
pmaddwd xmm3, [rel PW_F054_MF130] ; xmm3=tmp2H
|
||||
|
||||
movdqa xmm6, xmm0
|
||||
paddw xmm0, xmm2 ; xmm0=in0+in4
|
||||
psubw xmm6, xmm2 ; xmm6=in0-in4
|
||||
|
||||
pxor xmm7, xmm7
|
||||
pxor xmm2, xmm2
|
||||
punpcklwd xmm7, xmm0 ; xmm7=tmp0L
|
||||
punpckhwd xmm2, xmm0 ; xmm2=tmp0H
|
||||
psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
|
||||
psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
|
||||
|
||||
movdqa xmm0, xmm7
|
||||
paddd xmm7, xmm4 ; xmm7=tmp10L
|
||||
psubd xmm0, xmm4 ; xmm0=tmp13L
|
||||
movdqa xmm4, xmm2
|
||||
paddd xmm2, xmm5 ; xmm2=tmp10H
|
||||
psubd xmm4, xmm5 ; xmm4=tmp13H
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
|
||||
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
|
||||
movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
|
||||
movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
|
||||
|
||||
pxor xmm5, xmm5
|
||||
pxor xmm7, xmm7
|
||||
punpcklwd xmm5, xmm6 ; xmm5=tmp1L
|
||||
punpckhwd xmm7, xmm6 ; xmm7=tmp1H
|
||||
psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
|
||||
psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
|
||||
|
||||
movdqa xmm2, xmm5
|
||||
paddd xmm5, xmm1 ; xmm5=tmp11L
|
||||
psubd xmm2, xmm1 ; xmm2=tmp12L
|
||||
movdqa xmm0, xmm7
|
||||
paddd xmm7, xmm3 ; xmm7=tmp11H
|
||||
psubd xmm0, xmm3 ; xmm0=tmp12H
|
||||
|
||||
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
|
||||
movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
|
||||
movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
|
||||
movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
movdqa xmm5, xmm6
|
||||
movdqa xmm7, xmm4
|
||||
paddw xmm5, xmm3 ; xmm5=z3
|
||||
paddw xmm7, xmm1 ; xmm7=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
movdqa xmm2, xmm5
|
||||
movdqa xmm0, xmm5
|
||||
punpcklwd xmm2, xmm7
|
||||
punpckhwd xmm0, xmm7
|
||||
movdqa xmm5, xmm2
|
||||
movdqa xmm7, xmm0
|
||||
pmaddwd xmm2, [rel PW_MF078_F117] ; xmm2=z3L
|
||||
pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3H
|
||||
pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L
|
||||
pmaddwd xmm7, [rel PW_F117_F078] ; xmm7=z4H
|
||||
|
||||
movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
|
||||
movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
|
||||
; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
|
||||
; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; tmp0 += z1 + z3; tmp1 += z2 + z4;
|
||||
; tmp2 += z2 + z3; tmp3 += z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
|
||||
; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
|
||||
; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
|
||||
; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
|
||||
; tmp0 += z3; tmp1 += z4;
|
||||
; tmp2 += z3; tmp3 += z4;
|
||||
|
||||
movdqa xmm2, xmm3
|
||||
movdqa xmm0, xmm3
|
||||
punpcklwd xmm2, xmm4
|
||||
punpckhwd xmm0, xmm4
|
||||
movdqa xmm3, xmm2
|
||||
movdqa xmm4, xmm0
|
||||
pmaddwd xmm2, [rel PW_MF060_MF089] ; xmm2=tmp0L
|
||||
pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0H
|
||||
pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3L
|
||||
pmaddwd xmm4, [rel PW_MF089_F060] ; xmm4=tmp3H
|
||||
|
||||
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
|
||||
paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
|
||||
paddd xmm3, xmm5 ; xmm3=tmp3L
|
||||
paddd xmm4, xmm7 ; xmm4=tmp3H
|
||||
|
||||
movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
|
||||
movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
|
||||
|
||||
movdqa xmm2, xmm1
|
||||
movdqa xmm0, xmm1
|
||||
punpcklwd xmm2, xmm6
|
||||
punpckhwd xmm0, xmm6
|
||||
movdqa xmm1, xmm2
|
||||
movdqa xmm6, xmm0
|
||||
pmaddwd xmm2, [rel PW_MF050_MF256] ; xmm2=tmp1L
|
||||
pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1H
|
||||
pmaddwd xmm1, [rel PW_MF256_F050] ; xmm1=tmp2L
|
||||
pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H
|
||||
|
||||
paddd xmm2, xmm5 ; xmm2=tmp1L
|
||||
paddd xmm0, xmm7 ; xmm0=tmp1H
|
||||
paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
|
||||
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
|
||||
|
||||
movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
|
||||
movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
|
||||
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
|
||||
|
||||
movdqa xmm2, xmm5
|
||||
movdqa xmm0, xmm7
|
||||
paddd xmm5, xmm3 ; xmm5=data0L
|
||||
paddd xmm7, xmm4 ; xmm7=data0H
|
||||
psubd xmm2, xmm3 ; xmm2=data7L
|
||||
psubd xmm0, xmm4 ; xmm0=data7H
|
||||
|
||||
movdqa xmm3, [rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1]
|
||||
|
||||
paddd xmm5, xmm3
|
||||
paddd xmm7, xmm3
|
||||
psrad xmm5, DESCALE_P1
|
||||
psrad xmm7, DESCALE_P1
|
||||
paddd xmm2, xmm3
|
||||
paddd xmm0, xmm3
|
||||
psrad xmm2, DESCALE_P1
|
||||
psrad xmm0, DESCALE_P1
|
||||
|
||||
packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
|
||||
packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
|
||||
|
||||
movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
|
||||
movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
|
||||
|
||||
movdqa xmm7, xmm4
|
||||
movdqa xmm0, xmm3
|
||||
paddd xmm4, xmm1 ; xmm4=data1L
|
||||
paddd xmm3, xmm6 ; xmm3=data1H
|
||||
psubd xmm7, xmm1 ; xmm7=data6L
|
||||
psubd xmm0, xmm6 ; xmm0=data6H
|
||||
|
||||
movdqa xmm1, [rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1]
|
||||
|
||||
paddd xmm4, xmm1
|
||||
paddd xmm3, xmm1
|
||||
psrad xmm4, DESCALE_P1
|
||||
psrad xmm3, DESCALE_P1
|
||||
paddd xmm7, xmm1
|
||||
paddd xmm0, xmm1
|
||||
psrad xmm7, DESCALE_P1
|
||||
psrad xmm0, DESCALE_P1
|
||||
|
||||
packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
|
||||
packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
|
||||
|
||||
movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
|
||||
punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
|
||||
movdqa xmm1, xmm7 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
|
||||
punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
|
||||
movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
|
||||
movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
|
||||
movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
|
||||
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
|
||||
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
|
||||
movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa xmm5, xmm3
|
||||
movdqa xmm6, xmm0
|
||||
paddd xmm3, xmm4 ; xmm3=data2L
|
||||
paddd xmm0, xmm2 ; xmm0=data2H
|
||||
psubd xmm5, xmm4 ; xmm5=data5L
|
||||
psubd xmm6, xmm2 ; xmm6=data5H
|
||||
|
||||
movdqa xmm7, [rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1]
|
||||
|
||||
paddd xmm3, xmm7
|
||||
paddd xmm0, xmm7
|
||||
psrad xmm3, DESCALE_P1
|
||||
psrad xmm0, DESCALE_P1
|
||||
paddd xmm5, xmm7
|
||||
paddd xmm6, xmm7
|
||||
psrad xmm5, DESCALE_P1
|
||||
psrad xmm6, DESCALE_P1
|
||||
|
||||
packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
|
||||
packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
|
||||
|
||||
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
|
||||
movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
|
||||
movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
|
||||
movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
|
||||
|
||||
movdqa xmm0, xmm1
|
||||
movdqa xmm6, xmm4
|
||||
paddd xmm1, xmm2 ; xmm1=data3L
|
||||
paddd xmm4, xmm7 ; xmm4=data3H
|
||||
psubd xmm0, xmm2 ; xmm0=data4L
|
||||
psubd xmm6, xmm7 ; xmm6=data4H
|
||||
|
||||
movdqa xmm2, [rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1]
|
||||
|
||||
paddd xmm1, xmm2
|
||||
paddd xmm4, xmm2
|
||||
psrad xmm1, DESCALE_P1
|
||||
psrad xmm4, DESCALE_P1
|
||||
paddd xmm0, xmm2
|
||||
paddd xmm6, xmm2
|
||||
psrad xmm0, DESCALE_P1
|
||||
psrad xmm6, DESCALE_P1
|
||||
|
||||
packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
|
||||
packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
|
||||
|
||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
|
||||
movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
|
||||
|
||||
movdqa xmm4, xmm3 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
|
||||
punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
|
||||
movdqa xmm6, xmm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
|
||||
punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
|
||||
|
||||
movdqa xmm1, xmm7 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
|
||||
punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
|
||||
movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
|
||||
punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
|
||||
movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
|
||||
movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa xmm2, xmm0 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
|
||||
punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
|
||||
movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
|
||||
punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
|
||||
|
||||
movdqa xmm3, xmm7 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
|
||||
punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
|
||||
movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
|
||||
punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
|
||||
|
||||
movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
|
||||
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
|
||||
movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
|
||||
|
||||
movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
|
||||
punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
|
||||
movdqa xmm4, xmm2 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
|
||||
punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
|
||||
movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
|
||||
.column_end:
|
||||
|
||||
; -- Prefetch the next coefficient block
|
||||
|
||||
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
|
||||
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
|
||||
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
|
||||
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
mov rax, [original_rbp]
|
||||
mov rdi, r12 ; (JSAMPROW *)
|
||||
mov eax, r13d
|
||||
|
||||
; -- Even part
|
||||
|
||||
; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
|
||||
|
||||
; (Original)
|
||||
; z1 = (z2 + z3) * 0.541196100;
|
||||
; tmp2 = z1 + z3 * -1.847759065;
|
||||
; tmp3 = z1 + z2 * 0.765366865;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
||||
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
||||
|
||||
movdqa xmm6, xmm1 ; xmm1=in2=z2
|
||||
movdqa xmm5, xmm1
|
||||
punpcklwd xmm6, xmm2 ; xmm2=in6=z3
|
||||
punpckhwd xmm5, xmm2
|
||||
movdqa xmm1, xmm6
|
||||
movdqa xmm2, xmm5
|
||||
pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=tmp3L
|
||||
pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H
|
||||
pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L
|
||||
pmaddwd xmm2, [rel PW_F054_MF130] ; xmm2=tmp2H
|
||||
|
||||
movdqa xmm3, xmm7
|
||||
paddw xmm7, xmm0 ; xmm7=in0+in4
|
||||
psubw xmm3, xmm0 ; xmm3=in0-in4
|
||||
|
||||
pxor xmm4, xmm4
|
||||
pxor xmm0, xmm0
|
||||
punpcklwd xmm4, xmm7 ; xmm4=tmp0L
|
||||
punpckhwd xmm0, xmm7 ; xmm0=tmp0H
|
||||
psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
|
||||
psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
|
||||
|
||||
movdqa xmm7, xmm4
|
||||
paddd xmm4, xmm6 ; xmm4=tmp10L
|
||||
psubd xmm7, xmm6 ; xmm7=tmp13L
|
||||
movdqa xmm6, xmm0
|
||||
paddd xmm0, xmm5 ; xmm0=tmp10H
|
||||
psubd xmm6, xmm5 ; xmm6=tmp13H
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
|
||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
|
||||
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
|
||||
movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
|
||||
|
||||
pxor xmm5, xmm5
|
||||
pxor xmm4, xmm4
|
||||
punpcklwd xmm5, xmm3 ; xmm5=tmp1L
|
||||
punpckhwd xmm4, xmm3 ; xmm4=tmp1H
|
||||
psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
|
||||
psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
paddd xmm5, xmm1 ; xmm5=tmp11L
|
||||
psubd xmm0, xmm1 ; xmm0=tmp12L
|
||||
movdqa xmm7, xmm4
|
||||
paddd xmm4, xmm2 ; xmm4=tmp11H
|
||||
psubd xmm7, xmm2 ; xmm7=tmp12H
|
||||
|
||||
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
|
||||
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
|
||||
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
|
||||
movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
|
||||
movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
|
||||
movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
|
||||
movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
|
||||
|
||||
movdqa xmm5, xmm6
|
||||
movdqa xmm4, xmm3
|
||||
paddw xmm5, xmm1 ; xmm5=z3
|
||||
paddw xmm4, xmm2 ; xmm4=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
movdqa xmm7, xmm5
|
||||
punpcklwd xmm0, xmm4
|
||||
punpckhwd xmm7, xmm4
|
||||
movdqa xmm5, xmm0
|
||||
movdqa xmm4, xmm7
|
||||
pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3L
|
||||
pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3H
|
||||
pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L
|
||||
pmaddwd xmm4, [rel PW_F117_F078] ; xmm4=z4H
|
||||
|
||||
movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
|
||||
movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
|
||||
; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
|
||||
; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; tmp0 += z1 + z3; tmp1 += z2 + z4;
|
||||
; tmp2 += z2 + z3; tmp3 += z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
|
||||
; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
|
||||
; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
|
||||
; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
|
||||
; tmp0 += z3; tmp1 += z4;
|
||||
; tmp2 += z3; tmp3 += z4;
|
||||
|
||||
movdqa xmm0, xmm1
|
||||
movdqa xmm7, xmm1
|
||||
punpcklwd xmm0, xmm3
|
||||
punpckhwd xmm7, xmm3
|
||||
movdqa xmm1, xmm0
|
||||
movdqa xmm3, xmm7
|
||||
pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0L
|
||||
pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp0H
|
||||
pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp3L
|
||||
pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3H
|
||||
|
||||
paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
|
||||
paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
|
||||
paddd xmm1, xmm5 ; xmm1=tmp3L
|
||||
paddd xmm3, xmm4 ; xmm3=tmp3H
|
||||
|
||||
movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
|
||||
movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
|
||||
|
||||
movdqa xmm0, xmm2
|
||||
movdqa xmm7, xmm2
|
||||
punpcklwd xmm0, xmm6
|
||||
punpckhwd xmm7, xmm6
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm6, xmm7
|
||||
pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1L
|
||||
pmaddwd xmm7, [rel PW_MF050_MF256] ; xmm7=tmp1H
|
||||
pmaddwd xmm2, [rel PW_MF256_F050] ; xmm2=tmp2L
|
||||
pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H
|
||||
|
||||
paddd xmm0, xmm5 ; xmm0=tmp1L
|
||||
paddd xmm7, xmm4 ; xmm7=tmp1H
|
||||
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
|
||||
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
|
||||
|
||||
movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
|
||||
movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
|
||||
movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
movdqa xmm7, xmm4
|
||||
paddd xmm5, xmm1 ; xmm5=data0L
|
||||
paddd xmm4, xmm3 ; xmm4=data0H
|
||||
psubd xmm0, xmm1 ; xmm0=data7L
|
||||
psubd xmm7, xmm3 ; xmm7=data7H
|
||||
|
||||
movdqa xmm1, [rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2]
|
||||
|
||||
paddd xmm5, xmm1
|
||||
paddd xmm4, xmm1
|
||||
psrad xmm5, DESCALE_P2
|
||||
psrad xmm4, DESCALE_P2
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm7, xmm1
|
||||
psrad xmm0, DESCALE_P2
|
||||
psrad xmm7, DESCALE_P2
|
||||
|
||||
packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
|
||||
packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
|
||||
movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
movdqa xmm7, xmm1
|
||||
paddd xmm3, xmm2 ; xmm3=data1L
|
||||
paddd xmm1, xmm6 ; xmm1=data1H
|
||||
psubd xmm4, xmm2 ; xmm4=data6L
|
||||
psubd xmm7, xmm6 ; xmm7=data6H
|
||||
|
||||
movdqa xmm2, [rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2]
|
||||
|
||||
paddd xmm3, xmm2
|
||||
paddd xmm1, xmm2
|
||||
psrad xmm3, DESCALE_P2
|
||||
psrad xmm1, DESCALE_P2
|
||||
paddd xmm4, xmm2
|
||||
paddd xmm7, xmm2
|
||||
psrad xmm4, DESCALE_P2
|
||||
psrad xmm7, DESCALE_P2
|
||||
|
||||
packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
|
||||
packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
|
||||
|
||||
packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||
packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
|
||||
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
|
||||
movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
|
||||
movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa xmm4, xmm6
|
||||
movdqa xmm0, xmm2
|
||||
paddd xmm6, xmm1 ; xmm6=data2L
|
||||
paddd xmm2, xmm7 ; xmm2=data2H
|
||||
psubd xmm4, xmm1 ; xmm4=data5L
|
||||
psubd xmm0, xmm7 ; xmm0=data5H
|
||||
|
||||
movdqa xmm5, [rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2]
|
||||
|
||||
paddd xmm6, xmm5
|
||||
paddd xmm2, xmm5
|
||||
psrad xmm6, DESCALE_P2
|
||||
psrad xmm2, DESCALE_P2
|
||||
paddd xmm4, xmm5
|
||||
paddd xmm0, xmm5
|
||||
psrad xmm4, DESCALE_P2
|
||||
psrad xmm0, DESCALE_P2
|
||||
|
||||
packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
|
||||
packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
|
||||
movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
|
||||
movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
|
||||
movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
|
||||
|
||||
movdqa xmm2, xmm3
|
||||
movdqa xmm0, xmm1
|
||||
paddd xmm3, xmm7 ; xmm3=data3L
|
||||
paddd xmm1, xmm5 ; xmm1=data3H
|
||||
psubd xmm2, xmm7 ; xmm2=data4L
|
||||
psubd xmm0, xmm5 ; xmm0=data4H
|
||||
|
||||
movdqa xmm7, [rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2]
|
||||
|
||||
paddd xmm3, xmm7
|
||||
paddd xmm1, xmm7
|
||||
psrad xmm3, DESCALE_P2
|
||||
psrad xmm1, DESCALE_P2
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm0, xmm7
|
||||
psrad xmm2, DESCALE_P2
|
||||
psrad xmm0, DESCALE_P2
|
||||
|
||||
movdqa xmm5, [rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP]
|
||||
|
||||
packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
|
||||
packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
|
||||
|
||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||
|
||||
packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
|
||||
packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
|
||||
|
||||
paddb xmm7, xmm5
|
||||
paddb xmm1, xmm5
|
||||
paddb xmm6, xmm5
|
||||
paddb xmm3, xmm5
|
||||
|
||||
movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
|
||||
punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
|
||||
punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
|
||||
movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
|
||||
punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
|
||||
punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
|
||||
|
||||
movdqa xmm4, xmm7 ; transpose coefficients(phase 2)
|
||||
punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
||||
punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
|
||||
movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
|
||||
punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
||||
punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
|
||||
|
||||
movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
|
||||
punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
||||
punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
||||
movdqa xmm3, xmm4 ; transpose coefficients(phase 3)
|
||||
punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
|
||||
punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
|
||||
|
||||
pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
||||
pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
||||
pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
|
||||
pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
|
||||
|
||||
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
|
||||
mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
|
||||
mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
|
||||
mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
|
||||
|
||||
mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
|
||||
mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
|
||||
mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
|
||||
mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
|
||||
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
573
TMessagesProj/jni/mozjpeg/simd/x86_64/jidctred-sse2.asm
Normal file
573
TMessagesProj/jni/mozjpeg/simd/x86_64/jidctred-sse2.asm
Normal file
|
|
@ -0,0 +1,573 @@
|
|||
;
|
||||
; jidctred.asm - reduced-size IDCT (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains inverse-DCT routines that produce reduced-size
|
||||
; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
|
||||
; The following code is based directly on the IJG's original jidctred.c;
|
||||
; see jidctred.c for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point scaling parameters for the reduced-size IDCT routines.
;
; CONST_BITS is the number of fractional bits carried by the F_x_xxx
; multiplier constants defined below (each is the real coefficient scaled
; by 2^CONST_BITS and rounded).
%define CONST_BITS 13
|
||||
; PASS1_BITS is the left shift applied to the dequantized DC term in the
; 4x4 routine's "AC terms all zero" shortcut (psllw), and it enters the
; descale shift counts below.
%define PASS1_BITS 2
|
||||
|
||||
; DESCALE_Pn_m is the arithmetic right-shift count (psrad) applied at the
; end of pass n; the matching PD_DESCALE_Pn_m constant holds the rounding
; bias 1 << (DESCALE_Pn_m - 1) that is added first.
; With the values above: P1_4 = 12, P2_4 = 19, P1_2 = 13, P2_2 = 20.
; The _4 pair is used by jsimd_idct_4x4_sse2.
%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1)
|
||||
%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1)
|
||||
; NOTE(review): the _2 pair is presumably for the 2x2 routine later in this
; file -- confirm against that routine.
%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2)
|
||||
%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2)
|
||||
|
||||
; Fixed-point IDCT multipliers, FIX(x) = x scaled by 2^CONST_BITS and rounded.
; For the default CONST_BITS == 13 the values are written out literally
; (e.g. 1730 = round(0.211164243 * 8192)).
%if CONST_BITS == 13
|
||||
F_0_211 equ 1730 ; FIX(0.211164243)
|
||||
F_0_509 equ 4176 ; FIX(0.509795579)
|
||||
F_0_601 equ 4926 ; FIX(0.601344887)
|
||||
F_0_720 equ 5906 ; FIX(0.720959822)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_850 equ 6967 ; FIX(0.850430095)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_061 equ 8697 ; FIX(1.061594337)
|
||||
F_1_272 equ 10426 ; FIX(1.272758580)
|
||||
F_1_451 equ 11893 ; FIX(1.451774981)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_2_172 equ 17799 ; FIX(2.172734803)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_624 equ 29692 ; FIX(3.624509785)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
; Instead, each coefficient is supplied as an integer pre-scaled by 2^30 and
; reduced to CONST_BITS fractional bits with a rounding right shift:
; DESCALE(x, n) adds half of 2^n to x, then shifts right by n.
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
|
||||
F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243)
|
||||
F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579)
|
||||
F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887)
|
||||
F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822)
|
||||
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095)
|
||||
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337)
|
||||
F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580)
|
||||
F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981)
|
||||
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
|
||||
F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803)
|
||||
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for the reduced-size IDCT routines, exported as
; jconst_idct_red_sse2.
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
GLOBAL_DATA(jconst_idct_red_sse2)
|
||||
|
||||
EXTN(jconst_idct_red_sse2):
|
||||
|
||||
; PW_* rows: one pair of signed 16-bit multipliers repeated four times
; (8 words = one XMMWORD).  They are used as the memory operand of pmaddwd
; against registers holding interleaved word pairs (punpcklwd/punpckhwd),
; so each dword lane receives a*first + b*second.
; Naming: F = +FIX(value), MF = -FIX(value); digits abbreviate the value
; (e.g. PW_F184_MF076 = { F_1_847, -F_0_765 }).
PW_F184_MF076 times 4 dw F_1_847, -F_0_765
|
||||
PW_F256_F089 times 4 dw F_2_562, F_0_899
|
||||
PW_F106_MF217 times 4 dw F_1_061, -F_2_172
|
||||
PW_MF060_MF050 times 4 dw -F_0_601, -F_0_509
|
||||
PW_F145_MF021 times 4 dw F_1_451, -F_0_211
|
||||
PW_F362_MF127 times 4 dw F_3_624, -F_1_272
|
||||
PW_F085_MF072 times 4 dw F_0_850, -F_0_720
|
||||
; PD_DESCALE_* rows: four dwords each holding the rounding bias
; 1 << (shift - 1); added with paddd immediately before the psrad by the
; corresponding DESCALE_* count.
PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4 - 1)
|
||||
PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4 - 1)
|
||||
PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2 - 1)
|
||||
PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1)
|
||||
; PB_CENTERJSAMP: sixteen bytes of CENTERJSAMPLE (defined outside this
; section), added with paddb to the packed signed output bytes before they
; are stored.
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
|
||||
; Pad the end of the table to a 32-byte boundary.
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients,
|
||||
; producing a reduced-size 4x4 output block.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; r10 = void *dct_table
|
||||
; r11 = JCOEFPTR coef_block
|
||||
; r12 = JSAMPARRAY output_buf
|
||||
; r13d = JDIMENSION output_col
|
||||
|
||||
; Stack-frame accessors for jsimd_idct_4x4_sse2.
; After the prologue rbp is the 16-byte-aligned frame base, and the qword at
; [original_rbp] holds the stack pointer value captured right after
; "push rbp" (stored there by "mov [rsp], rax").
%define original_rbp rbp + 0
|
||||
; wk(i): address of the i-th XMMWORD scratch slot.  The slots sit directly
; below rbp; wk(0) is the lowest address and is what the prologue loads
; into rsp ("lea rsp, [wk(0)]").
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
; Two slots: pass 1 parks tmp0L/tmp0H in wk(0)/wk(1).
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
|
||||
|
||||
EXTN(jsimd_idct_4x4_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args 4
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
|
||||
mov rdx, r10 ; quantptr
|
||||
mov rsi, r11 ; inptr
|
||||
|
||||
%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
|
||||
mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
jnz short .columnDCT
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm0, xmm1
|
||||
packsswb xmm0, xmm0
|
||||
packsswb xmm0, xmm0
|
||||
movd eax, xmm0
|
||||
test rax, rax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
psllw xmm0, PASS1_BITS
|
||||
|
||||
movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
|
||||
punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||
punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
|
||||
|
||||
pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
|
||||
pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
|
||||
pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
|
||||
pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
|
||||
|
||||
jmp near .column_end
|
||||
%endif
|
||||
.columnDCT:
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
movdqa xmm4, xmm0
|
||||
movdqa xmm5, xmm0
|
||||
punpcklwd xmm4, xmm1
|
||||
punpckhwd xmm5, xmm1
|
||||
movdqa xmm0, xmm4
|
||||
movdqa xmm1, xmm5
|
||||
pmaddwd xmm4, [rel PW_F256_F089] ; xmm4=(tmp2L)
|
||||
pmaddwd xmm5, [rel PW_F256_F089] ; xmm5=(tmp2H)
|
||||
pmaddwd xmm0, [rel PW_F106_MF217] ; xmm0=(tmp0L)
|
||||
pmaddwd xmm1, [rel PW_F106_MF217] ; xmm1=(tmp0H)
|
||||
|
||||
movdqa xmm6, xmm2
|
||||
movdqa xmm7, xmm2
|
||||
punpcklwd xmm6, xmm3
|
||||
punpckhwd xmm7, xmm3
|
||||
movdqa xmm2, xmm6
|
||||
movdqa xmm3, xmm7
|
||||
pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2L)
|
||||
pmaddwd xmm7, [rel PW_MF060_MF050] ; xmm7=(tmp2H)
|
||||
pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0L)
|
||||
pmaddwd xmm3, [rel PW_F145_MF021] ; xmm3=(tmp0H)
|
||||
|
||||
paddd xmm6, xmm4 ; xmm6=tmp2L
|
||||
paddd xmm7, xmm5 ; xmm7=tmp2H
|
||||
paddd xmm2, xmm0 ; xmm2=tmp0L
|
||||
paddd xmm3, xmm1 ; xmm3=tmp0H
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
|
||||
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
|
||||
|
||||
; -- Even part
|
||||
|
||||
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
pxor xmm1, xmm1
|
||||
pxor xmm2, xmm2
|
||||
punpcklwd xmm1, xmm4 ; xmm1=tmp0L
|
||||
punpckhwd xmm2, xmm4 ; xmm2=tmp0H
|
||||
psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
|
||||
psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
|
||||
|
||||
movdqa xmm3, xmm5 ; xmm5=in2=z2
|
||||
punpcklwd xmm5, xmm0 ; xmm0=in6=z3
|
||||
punpckhwd xmm3, xmm0
|
||||
pmaddwd xmm5, [rel PW_F184_MF076] ; xmm5=tmp2L
|
||||
pmaddwd xmm3, [rel PW_F184_MF076] ; xmm3=tmp2H
|
||||
|
||||
movdqa xmm4, xmm1
|
||||
movdqa xmm0, xmm2
|
||||
paddd xmm1, xmm5 ; xmm1=tmp10L
|
||||
paddd xmm2, xmm3 ; xmm2=tmp10H
|
||||
psubd xmm4, xmm5 ; xmm4=tmp12L
|
||||
psubd xmm0, xmm3 ; xmm0=tmp12H
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movdqa xmm5, xmm1
|
||||
movdqa xmm3, xmm2
|
||||
paddd xmm1, xmm6 ; xmm1=data0L
|
||||
paddd xmm2, xmm7 ; xmm2=data0H
|
||||
psubd xmm5, xmm6 ; xmm5=data3L
|
||||
psubd xmm3, xmm7 ; xmm3=data3H
|
||||
|
||||
movdqa xmm6, [rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4]
|
||||
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm6
|
||||
psrad xmm1, DESCALE_P1_4
|
||||
psrad xmm2, DESCALE_P1_4
|
||||
paddd xmm5, xmm6
|
||||
paddd xmm3, xmm6
|
||||
psrad xmm5, DESCALE_P1_4
|
||||
psrad xmm3, DESCALE_P1_4
|
||||
|
||||
packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
|
||||
packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
|
||||
|
||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
|
||||
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
|
||||
|
||||
movdqa xmm2, xmm4
|
||||
movdqa xmm3, xmm0
|
||||
paddd xmm4, xmm7 ; xmm4=data1L
|
||||
paddd xmm0, xmm6 ; xmm0=data1H
|
||||
psubd xmm2, xmm7 ; xmm2=data2L
|
||||
psubd xmm3, xmm6 ; xmm3=data2H
|
||||
|
||||
movdqa xmm7, [rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4]
|
||||
|
||||
paddd xmm4, xmm7
|
||||
paddd xmm0, xmm7
|
||||
psrad xmm4, DESCALE_P1_4
|
||||
psrad xmm0, DESCALE_P1_4
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm7
|
||||
psrad xmm2, DESCALE_P1_4
|
||||
psrad xmm3, DESCALE_P1_4
|
||||
|
||||
packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
|
||||
packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
|
||||
|
||||
movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
|
||||
punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
|
||||
movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
|
||||
punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
|
||||
|
||||
movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
|
||||
punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
|
||||
movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
|
||||
punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
|
||||
.column_end:
|
||||
|
||||
; -- Prefetch the next coefficient block
|
||||
|
||||
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
|
||||
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
|
||||
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
|
||||
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
|
||||
|
||||
; ---- Pass 2: process rows, store into output array.
|
||||
|
||||
mov rax, [original_rbp]
|
||||
mov rdi, r12 ; (JSAMPROW *)
|
||||
mov eax, r13d
|
||||
|
||||
; -- Even part
|
||||
|
||||
pxor xmm4, xmm4
|
||||
punpcklwd xmm4, xmm1 ; xmm4=tmp0
|
||||
psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
|
||||
|
||||
; -- Odd part
|
||||
|
||||
punpckhwd xmm1, xmm0
|
||||
punpckhwd xmm6, xmm3
|
||||
movdqa xmm5, xmm1
|
||||
movdqa xmm2, xmm6
|
||||
pmaddwd xmm1, [rel PW_F256_F089] ; xmm1=(tmp2)
|
||||
pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2)
|
||||
pmaddwd xmm5, [rel PW_F106_MF217] ; xmm5=(tmp0)
|
||||
pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0)
|
||||
|
||||
paddd xmm6, xmm1 ; xmm6=tmp2
|
||||
paddd xmm2, xmm5 ; xmm2=tmp0
|
||||
|
||||
; -- Even part
|
||||
|
||||
punpcklwd xmm0, xmm3
|
||||
pmaddwd xmm0, [rel PW_F184_MF076] ; xmm0=tmp2
|
||||
|
||||
movdqa xmm7, xmm4
|
||||
paddd xmm4, xmm0 ; xmm4=tmp10
|
||||
psubd xmm7, xmm0 ; xmm7=tmp12
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movdqa xmm1, [rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4]
|
||||
|
||||
movdqa xmm5, xmm4
|
||||
movdqa xmm3, xmm7
|
||||
paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
|
||||
paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
|
||||
psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
|
||||
psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
|
||||
|
||||
paddd xmm4, xmm1
|
||||
paddd xmm7, xmm1
|
||||
psrad xmm4, DESCALE_P2_4
|
||||
psrad xmm7, DESCALE_P2_4
|
||||
paddd xmm5, xmm1
|
||||
paddd xmm3, xmm1
|
||||
psrad xmm5, DESCALE_P2_4
|
||||
psrad xmm3, DESCALE_P2_4
|
||||
|
||||
packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
|
||||
packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
|
||||
|
||||
movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
|
||||
punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
|
||||
|
||||
movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
|
||||
punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
|
||||
|
||||
packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
|
||||
paddb xmm4, [rel PB_CENTERJSAMP]
|
||||
|
||||
pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
|
||||
pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
|
||||
pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
|
||||
|
||||
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
|
||||
mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
|
||||
movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
|
||||
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
|
||||
mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
|
||||
mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
|
||||
movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
|
||||
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
|
||||
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Dequantize one block of DCT coefficients and run a reduced-size inverse
; DCT on it, writing a 2x2 block of samples into two output rows.
;
; GLOBAL(void)
; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
;                     JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Argument registers as used by the body (collect_args is invoked first):
;   r10  = void *dct_table
;   r11  = JCOEFPTR coef_block
;   r12  = JSAMPARRAY output_buf
;   r13d = JDIMENSION output_col
;
; Only coefficient rows 0, 1, 3, 5 and 7 are read.  rbx is used as scratch
; and is saved/restored around the body.
;

    align       32
    GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)

EXTN(jsimd_idct_2x2_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 4
    push        rbx

    ; ---- Pass 1: column pass over the input coefficients.

    mov         rdx, r10                ; rdx = quantization table
    mov         rsi, r11                ; rsi = coefficient block

    ; Lanes marked ** below are never used by this reduced transform.
    ;
    ; | input:                  | result:        |
    ; | 00 01 ** 03 ** 05 ** 07 |                |
    ; | 10 11 ** 13 ** 15 ** 17 |                |
    ; | ** ** ** ** ** ** ** ** |                |
    ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
    ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
    ; | 50 51 ** 53 ** 55 ** 57 |                |
    ; | ** ** ** ** ** ** ** ** |                |
    ; | 70 71 ** 73 ** 75 ** 77 |                |

    ; -- Odd part: load rows 1/3/5/7, then dequantize them

    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]

    ; xmm0 = row 1, xmm1 = row 3, xmm2 = row 5, xmm3 = row 7

    pcmpeqd     xmm7, xmm7
    pslld       xmm7, WORD_BIT          ; xmm7 = mask keeping the upper word
                                        ; of every dword

    ; Low half of each row pair, interleaved word-wise (columns 0..3).
    movdqa      xmm4, xmm0
    punpcklwd   xmm4, xmm1              ; xmm4=(10 30 11 31 ** ** 13 33)
    pmaddwd     xmm4, [rel PW_F362_MF127]
    movdqa      xmm5, xmm2
    punpcklwd   xmm5, xmm3              ; xmm5=(50 70 51 71 ** ** 53 73)
    pmaddwd     xmm5, [rel PW_F085_MF072]
    paddd       xmm4, xmm5              ; xmm4=tmp0[col0 col1 **** col3]

    ; Odd columns of each row pair, merged into (row-lo, row-hi) word pairs.
    psrld       xmm0, WORD_BIT          ; xmm0=(11 -- 13 -- 15 -- 17 --)
    pand        xmm1, xmm7              ; xmm1=(-- 31 -- 33 -- 35 -- 37)
    por         xmm0, xmm1              ; xmm0=(11 31 13 33 15 35 17 37)
    pmaddwd     xmm0, [rel PW_F362_MF127]
    psrld       xmm2, WORD_BIT          ; xmm2=(51 -- 53 -- 55 -- 57 --)
    pand        xmm3, xmm7              ; xmm3=(-- 71 -- 73 -- 75 -- 77)
    por         xmm2, xmm3              ; xmm2=(51 71 53 73 55 75 57 77)
    pmaddwd     xmm2, [rel PW_F085_MF072]
    paddd       xmm0, xmm2              ; xmm0=tmp0[col1 col3 col5 col7]

    ; -- Even part: row 0 only

    movdqa      xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]

    ; xmm6=(00 01 ** 03 ** 05 ** 07)

    movdqa      xmm1, xmm6
    pand        xmm1, xmm7              ; xmm1=(-- 01 -- 03 -- 05 -- 07)
    psrad       xmm1, (WORD_BIT-CONST_BITS-2)  ; xmm1=tmp10[col1 col3 col5 col7]
    pslld       xmm6, WORD_BIT          ; xmm6=(-- 00 -- ** -- ** -- **)
    psrad       xmm6, (WORD_BIT-CONST_BITS-2)  ; xmm6=tmp10[col0 **** **** ****]

    ; -- Final output stage: sum gives row A, difference gives row B

    movdqa      xmm3, xmm6
    paddd       xmm6, xmm4              ; xmm6=(A0 ** ** **)
    psubd       xmm3, xmm4              ; xmm3=(B0 ** ** **)
    movdqa      xmm5, xmm1
    paddd       xmm1, xmm0              ; xmm1=(A1 A3 A5 A7)
    psubd       xmm5, xmm0              ; xmm5=(B1 B3 B5 B7)

    movdqa      xmm2, [rel PD_DESCALE_P1_2]  ; rounding term for the descale

    punpckldq   xmm6, xmm3              ; xmm6=(A0 B0 ** **)
    paddd       xmm6, xmm2
    psrad       xmm6, DESCALE_P1_2

    movdqa      xmm7, xmm1
    punpcklqdq  xmm1, xmm5              ; xmm1=(A1 A3 B1 B3)
    punpckhqdq  xmm7, xmm5              ; xmm7=(A5 A7 B5 B7)
    paddd       xmm1, xmm2
    psrad       xmm1, DESCALE_P1_2
    paddd       xmm7, xmm2
    psrad       xmm7, DESCALE_P1_2

    ; -- Prefetch the memory that follows this coefficient block

    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: row pass, then store into the output array.

    mov         rdi, r12                ; rdi = output row pointer array
    mov         eax, r13d               ; rax = output column (zero-extended)

    ; | input:| result:|
    ; | A0 B0 |        |
    ; | A1 B1 | C0 C1  |
    ; | A3 B3 | D0 D1  |
    ; | A5 B5 |        |
    ; | A7 B7 |        |

    ; -- Odd part

    packssdw    xmm1, xmm1              ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
    pmaddwd     xmm1, [rel PW_F362_MF127]
    packssdw    xmm7, xmm7              ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
    pmaddwd     xmm7, [rel PW_F085_MF072]
    paddd       xmm1, xmm7              ; xmm1=tmp0[row0 row1 row0 row1]

    ; -- Even part

    pslld       xmm6, (CONST_BITS+2)    ; xmm6=tmp10[row0 row1 **** ****]

    ; -- Final output stage

    movdqa      xmm4, xmm6
    paddd       xmm6, xmm1              ; xmm6=(C0 C1 ** **)
    psubd       xmm4, xmm1              ; xmm4=(D0 D1 ** **)

    punpckldq   xmm6, xmm4              ; xmm6=(C0 D0 C1 D1)

    paddd       xmm6, [rel PD_DESCALE_P2_2]
    psrad       xmm6, DESCALE_P2_2

    ; Narrow dwords -> words -> bytes (signed saturation), then re-center.
    packssdw    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
    packsswb    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
    paddb       xmm6, [rel PB_CENTERJSAMP]

    pextrw      ebx, xmm6, 0x00         ; bx = bytes (C0 D0) for output row 0
    pextrw      ecx, xmm6, 0x01         ; cx = bytes (C1 D1) for output row 1

    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
    mov         rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
    mov         word [rdx+rax*SIZEOF_JSAMPLE], bx
    mov         word [rsi+rax*SIZEOF_JSAMPLE], cx

    pop         rbx
    uncollect_args 4
    pop         rbp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
154
TMessagesProj/jni/mozjpeg/simd/x86_64/jquantf-sse2.asm
Normal file
154
TMessagesProj/jni/mozjpeg/simd/x86_64/jquantf-sse2.asm
Normal file
|
|
@ -0,0 +1,154 @@
|
|||
;
|
||||
; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Read sample rows, re-center each sample from unsigned to signed, convert
; to single-precision float and store the result in the workspace.
;
; GLOBAL(void)
; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                           FAST_FLOAT *workspace);
;
; Argument registers as used by the body (collect_args is invoked first):
;   r10  = JSAMPARRAY sample_data
;   r11d = JDIMENSION start_col
;   r12  = FAST_FLOAT *workspace
;
; Each loop iteration converts two rows of eight samples; the loop runs
; DCTSIZE/2 times.  The workspace is written with aligned stores (movaps).
; rbx is used as scratch and is saved/restored around the body.
;

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)

EXTN(jsimd_convsamp_float_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 3
    push        rbx

    mov         rsi, r10                ; rsi = current pair of row pointers
    mov         eax, r11d               ; rax = start column (zero-extended)
    mov         rdi, r12                ; rdi = workspace write cursor
    mov         rcx, DCTSIZE/2          ; rcx = row pairs left to convert

    ; Build the byte constant 0x80 in every lane without touching memory:
    ; 0xFFFF -> 0xFF80 per word -> signed-saturating pack gives 0x80 bytes.
    pcmpeqw     xmm7, xmm7
    psllw       xmm7, 7
    packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)

.row_pair:
    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]

    ; -- First row: samples 0..7
    ; Each sample byte is widened so that it ends up in the top byte of a
    ; dword; the arithmetic shift then sign-extends it.  The other bits of
    ; the dword are don't-care, which is why the previous contents of xmm2
    ; may be interleaved in without being initialized.
    psubb       xmm0, xmm7              ; xmm0=(01234567)
    punpcklbw   xmm0, xmm0              ; xmm0=(*0*1*2*3*4*5*6*7)
    punpcklwd   xmm2, xmm0              ; xmm2=(***0***1***2***3)
    punpckhwd   xmm0, xmm0              ; xmm0=(***4***5***6***7)
    psrad       xmm2, (DWORD_BIT-BYTE_BIT)
    psrad       xmm0, (DWORD_BIT-BYTE_BIT)
    cvtdq2ps    xmm2, xmm2              ; xmm2=(0123) as floats
    cvtdq2ps    xmm0, xmm0              ; xmm0=(4567) as floats

    ; -- Second row: samples 8..F, same treatment (xmm3 plays xmm2's role)
    psubb       xmm1, xmm7              ; xmm1=(89ABCDEF)
    punpcklbw   xmm1, xmm1              ; xmm1=(*8*9*A*B*C*D*E*F)
    punpcklwd   xmm3, xmm1              ; xmm3=(***8***9***A***B)
    punpckhwd   xmm1, xmm1              ; xmm1=(***C***D***E***F)
    psrad       xmm3, (DWORD_BIT-BYTE_BIT)
    psrad       xmm1, (DWORD_BIT-BYTE_BIT)
    cvtdq2ps    xmm3, xmm3              ; xmm3=(89AB) as floats
    cvtdq2ps    xmm1, xmm1              ; xmm1=(CDEF) as floats

    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1

    add         rsi, byte 2*SIZEOF_JSAMPROW
    add         rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
    dec         rcx
    jnz         short .row_pair

    pop         rbx
    uncollect_args 3
    pop         rbp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Quantize the floating-point workspace: multiply every element by the
; matching entry of the divisors table, convert to integer and store the
; result as packed 16-bit coefficients in coef_block.
;
; GLOBAL(void)
; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                           FAST_FLOAT *workspace);
;
; Argument registers as used by the body (collect_args is invoked first):
;   r10 = JCOEFPTR coef_block
;   r11 = FAST_FLOAT *divisors
;   r12 = FAST_FLOAT *workspace
;
; Sixteen elements are handled per iteration, DCTSIZE2/16 iterations in all.
; All three buffers are accessed with aligned loads/stores.
;

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)

EXTN(jsimd_quantize_float_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 3

    mov         rdi, r10                ; rdi = coefficient write cursor
    mov         rdx, r11                ; rdx = divisors read cursor
    mov         rsi, r12                ; rsi = workspace read cursor
    mov         rax, DCTSIZE2/16        ; rax = groups of 16 elements left

.next_group:
    ; Fetch 16 workspace values ...
    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]

    ; ... and scale each by its table entry.
    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]

    ; float -> int32, rounded according to the current MXCSR rounding mode
    cvtps2dq    xmm0, xmm0
    cvtps2dq    xmm1, xmm1
    cvtps2dq    xmm2, xmm2
    cvtps2dq    xmm3, xmm3

    ; int32 -> int16 with signed saturation; two registers of 8 words each
    packssdw    xmm0, xmm1
    packssdw    xmm2, xmm3

    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2

    add         rsi, byte 16*SIZEOF_FAST_FLOAT
    add         rdx, byte 16*SIZEOF_FAST_FLOAT
    add         rdi, byte 16*SIZEOF_JCOEF
    dec         rax
    jnz         short .next_group

    uncollect_args 3
    pop         rbp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
162
TMessagesProj/jni/mozjpeg/simd/x86_64/jquanti-avx2.asm
Normal file
162
TMessagesProj/jni/mozjpeg/simd/x86_64/jquanti-avx2.asm
Normal file
|
|
@ -0,0 +1,162 @@
|
|||
;
|
||||
; jquanti.asm - sample data conversion and quantization (64-bit AVX2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, 2018, D. R. Commander.
|
||||
; Copyright (C) 2016, Matthieu Darbois.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Read eight rows of eight samples, widen each sample to a 16-bit word,
; re-center it from unsigned to signed and store the rows in the workspace.
;
; GLOBAL(void)
; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
; Argument registers as used by the body (collect_args is invoked first):
;   r10  = JSAMPARRAY sample_data
;   r11d = JDIMENSION start_col
;   r12  = DCTELEM *workspace
;
; Straight-line code, no loop.  The workspace is written with unaligned
; stores (vmovdqu), and vzeroupper is issued before returning.
;

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_avx2)

EXTN(jsimd_convsamp_avx2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 3

    mov         eax, r11d               ; rax = start column (zero-extended)

    ; Gather two sample rows per XMM register: even row in the low qword,
    ; odd row in the high qword.

    mov         rsi, JSAMPROW [r10+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdi, JSAMPROW [r10+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm0, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
    pinsrq      xmm0, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1

    mov         rsi, JSAMPROW [r10+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdi, JSAMPROW [r10+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm1, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
    pinsrq      xmm1, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1

    mov         rsi, JSAMPROW [r10+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdi, JSAMPROW [r10+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm2, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
    pinsrq      xmm2, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1

    mov         rsi, JSAMPROW [r10+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdi, JSAMPROW [r10+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm3, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
    pinsrq      xmm3, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1

    ; Adding 0xFF80 to a zero-extended byte is the same as subtracting 128
    ; in 16-bit arithmetic.
    vpcmpeqw    ymm7, ymm7, ymm7
    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    ; Zero-extend bytes to words, then re-center.
    vpmovzxbw   ymm0, xmm0              ; ymm0=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    vpaddw      ymm0, ymm0, ymm7
    vpmovzxbw   ymm1, xmm1              ; ymm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
    vpaddw      ymm1, ymm1, ymm7
    vpmovzxbw   ymm2, xmm2              ; ymm2=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
    vpaddw      ymm2, ymm2, ymm7
    vpmovzxbw   ymm3, xmm3              ; ymm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
    vpaddw      ymm3, ymm3, ymm7

    vmovdqu     YMMWORD [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)], ymm0
    vmovdqu     YMMWORD [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)], ymm1
    vmovdqu     YMMWORD [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)], ymm2
    vmovdqu     YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3

    vzeroupper
    uncollect_args 3
    pop         rbp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Quantize/descale the workspace coefficients and store them in coef_block.
;
; For every 16-bit element the code computes, on the absolute value:
;   ((|x| + correction) *hi reciprocal) *hi scale
; where "*hi" is the upper 16 bits of an unsigned 16x16 multiply, and then
; gives the result the sign of x (a zero input yields zero).
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;

; The divisors buffer holds three consecutive tables of DCTSIZE rows each:
; reciprocals first, then corrections, then scales.
%define RECIPROCAL(m, n, b) \
  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

; Argument registers as used by the body (collect_args is invoked first):
;   r10 = JCOEFPTR coef_block
;   r11 = DCTELEM *divisors
;   r12 = DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_avx2)

EXTN(jsimd_quantize_avx2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 3

    ; Each YMM register holds two rows of the workspace; ymm4-ymm7 keep the
    ; original signed values so the sign can be reapplied at the end.
    vmovdqu     ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)]
    vmovdqu     ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)]
    vmovdqu     ymm6, [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)]
    vmovdqu     ymm7, [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)]

    ; -- Rows 0-1
    vpabsw      ymm0, ymm4
    vpaddw      ymm0, YMMWORD [CORRECTION(0,0,r11)]  ; correction + roundfactor
    vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,r11)]  ; reciprocal
    vpmulhuw    ymm0, YMMWORD [SCALE(0,0,r11)]       ; scale
    vpsignw     ymm0, ymm0, ymm4

    ; -- Rows 2-3
    vpabsw      ymm1, ymm5
    vpaddw      ymm1, YMMWORD [CORRECTION(2,0,r11)]
    vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,r11)]
    vpmulhuw    ymm1, YMMWORD [SCALE(2,0,r11)]
    vpsignw     ymm1, ymm1, ymm5

    ; -- Rows 4-5
    vpabsw      ymm2, ymm6
    vpaddw      ymm2, YMMWORD [CORRECTION(4,0,r11)]
    vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,r11)]
    vpmulhuw    ymm2, YMMWORD [SCALE(4,0,r11)]
    vpsignw     ymm2, ymm2, ymm6

    ; -- Rows 6-7
    vpabsw      ymm3, ymm7
    vpaddw      ymm3, YMMWORD [CORRECTION(6,0,r11)]
    vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,r11)]
    vpmulhuw    ymm3, YMMWORD [SCALE(6,0,r11)]
    vpsignw     ymm3, ymm3, ymm7

    vmovdqu     [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0
    vmovdqu     [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1
    vmovdqu     [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2
    vmovdqu     [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3

    vzeroupper
    uncollect_args 3
    pop         rbp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
187
TMessagesProj/jni/mozjpeg/simd/x86_64/jquanti-sse2.asm
Normal file
187
TMessagesProj/jni/mozjpeg/simd/x86_64/jquanti-sse2.asm
Normal file
|
|
@ -0,0 +1,187 @@
|
|||
;
|
||||
; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Read sample rows, widen each sample to a 16-bit word, re-center it from
; unsigned to signed and store the rows in the workspace.
;
; GLOBAL(void)
; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
; Argument registers as used by the body (collect_args is invoked first):
;   r10  = JSAMPARRAY sample_data
;   r11d = JDIMENSION start_col
;   r12  = DCTELEM *workspace
;
; Each loop iteration converts four rows of eight samples; the loop runs
; DCTSIZE/4 times.  The workspace is written with aligned stores (movdqa).
; rbx is used as scratch and is saved/restored around the body.
;

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 3
    push        rbx

    mov         rsi, r10                ; rsi = current group of row pointers
    mov         eax, r11d               ; rax = start column (zero-extended)
    mov         rdi, r12                ; rdi = workspace write cursor
    mov         rcx, DCTSIZE/4          ; rcx = groups of four rows left

    pxor        xmm6, xmm6              ; xmm6=(all 0's), used to zero-extend
    pcmpeqw     xmm7, xmm7
    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

.row_group:
    ; Load eight samples from each of the next four rows.
    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
    mov         rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)
    mov         rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
    mov         rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)

    ; Zero-extend bytes to words, then add 0xFF80 (i.e. subtract 128 in
    ; 16-bit arithmetic) to re-center around zero.
    punpcklbw   xmm0, xmm6
    paddw       xmm0, xmm7
    punpcklbw   xmm1, xmm6
    paddw       xmm1, xmm7
    punpcklbw   xmm2, xmm6
    paddw       xmm2, xmm7
    punpcklbw   xmm3, xmm6
    paddw       xmm3, xmm7

    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

    add         rsi, byte 4*SIZEOF_JSAMPROW
    add         rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
    dec         rcx
    jnz         short .row_group

    pop         rbx
    uncollect_args 3
    pop         rbp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Quantize/descale the workspace coefficients and store them in coef_block.
;
; For every 16-bit element the code computes, on the absolute value:
;   ((|x| + correction) *hi reciprocal) *hi scale
; where "*hi" is the upper 16 bits of an unsigned 16x16 multiply, and then
; restores the sign of x.
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;

; The divisors buffer holds three consecutive tables of DCTSIZE rows each:
; reciprocals first, then corrections, then scales.
%define RECIPROCAL(m, n, b) \
  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

; Argument registers as used by the body (collect_args is invoked first):
;   r10 = JCOEFPTR coef_block
;   r11 = DCTELEM *divisors
;   r12 = DCTELEM *workspace
;
; Thirty-two elements (four rows) are handled per iteration, DCTSIZE2/32
; iterations in all.  All buffers are accessed with aligned loads/stores.

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 3

    mov         rdi, r10                ; rdi = coefficient write cursor
    mov         rdx, r11                ; rdx = divisors read cursor
    mov         rsi, r12                ; rsi = workspace read cursor
    mov         rax, DCTSIZE2/32        ; rax = groups of four rows left

.next_group:
    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]

    ; For each row: xmmN (N=0..3) carries the value being quantized while
    ; xmm(N+4) is turned into a per-word sign mask (0 or -1).  The pair
    ; "pxor mask / psubw mask" negates exactly the words whose mask is -1,
    ; so it yields |x| before the multiplies and reapplies the sign after.

    ; -- Row 0
    movdqa      xmm0, xmm4
    psraw       xmm4, (WORD_BIT-1)      ; xmm4 = sign mask
    pxor        xmm0, xmm4
    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
    paddw       xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
    pmulhuw     xmm0, XMMWORD [SCALE(0,0,rdx)]       ; scale
    pxor        xmm0, xmm4
    psubw       xmm0, xmm4              ; restore the original sign

    ; -- Row 1
    movdqa      xmm1, xmm5
    psraw       xmm5, (WORD_BIT-1)
    pxor        xmm1, xmm5
    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
    paddw       xmm1, XMMWORD [CORRECTION(1,0,rdx)]
    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
    pmulhuw     xmm1, XMMWORD [SCALE(1,0,rdx)]
    pxor        xmm1, xmm5
    psubw       xmm1, xmm5

    ; -- Row 2
    movdqa      xmm2, xmm6
    psraw       xmm6, (WORD_BIT-1)
    pxor        xmm2, xmm6
    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
    paddw       xmm2, XMMWORD [CORRECTION(2,0,rdx)]
    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
    pmulhuw     xmm2, XMMWORD [SCALE(2,0,rdx)]
    pxor        xmm2, xmm6
    psubw       xmm2, xmm6

    ; -- Row 3
    movdqa      xmm3, xmm7
    psraw       xmm7, (WORD_BIT-1)
    pxor        xmm3, xmm7
    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;
    paddw       xmm3, XMMWORD [CORRECTION(3,0,rdx)]
    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
    pmulhuw     xmm3, XMMWORD [SCALE(3,0,rdx)]
    pxor        xmm3, xmm7
    psubw       xmm3, xmm7

    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

    add         rsi, byte 32*SIZEOF_DCTELEM
    add         rdx, byte 32*SIZEOF_DCTELEM
    add         rdi, byte 32*SIZEOF_JCOEF
    dec         rax
    jnz         near .next_group

    uncollect_args 3
    pop         rbp
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
1076
TMessagesProj/jni/mozjpeg/simd/x86_64/jsimd.c
Normal file
1076
TMessagesProj/jni/mozjpeg/simd/x86_64/jsimd.c
Normal file
File diff suppressed because it is too large
Load diff
86
TMessagesProj/jni/mozjpeg/simd/x86_64/jsimdcpu.asm
Normal file
86
TMessagesProj/jni/mozjpeg/simd/x86_64/jsimdcpu.asm
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
;
|
||||
; jsimdcpu.asm - SIMD instruction support check
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Report which SIMD instruction sets may be used on the running CPU.
;
; GLOBAL(unsigned int)
; jpeg_simd_cpu_support(void)
;
; Returns (in eax) a bitwise OR of JSIMD_* flags.  JSIMD_SSE and JSIMD_SSE2
; are always set.  JSIMD_AVX2 is added only when all of the following hold:
;   - CPUID leaf 07H exists and reports AVX2,
;   - CPUID leaf 01H reports OSXSAVE and AVX,
;   - XCR0 shows that the O/S saves both XMM and YMM state.
;
; rbx (overwritten by CPUID) and rdi (flag accumulator) are saved and
; restored.  eax, ecx and edx are overwritten by CPUID/XGETBV.
;

%define CPUID7_EBX_AVX2     (1 << 5)    ; CPUID.(EAX=7,ECX=0):EBX bit 5
%define CPUID1_ECX_OSXSAVE  (1 << 27)   ; CPUID.1:ECX bit 27
%define CPUID1_ECX_AVX      (1 << 28)   ; CPUID.1:ECX bit 28
%define XCR0_XMM_YMM_STATE  6           ; XCR0 bit 1 (XMM) | bit 2 (YMM)

    align       32
    GLOBAL_FUNCTION(jpeg_simd_cpu_support)

EXTN(jpeg_simd_cpu_support):
    push        rbx
    push        rdi

    xor         edi, edi                ; edi = simd support flags

    ; Assume that all x86-64 processors support SSE & SSE2 instructions
    or          edi, JSIMD_SSE2
    or          edi, JSIMD_SSE

    ; Check whether CPUID leaf 07H is supported
    ; (leaf 07H is used to check for AVX2 instruction support)
    xor         eax, eax                ; leaf 0: eax <- highest basic leaf
    cpuid
    cmp         eax, 7
    jb          short .return           ; Maximum leaf < 07H

    ; Check for AVX2 instruction support
    mov         eax, 7
    xor         ecx, ecx                ; sub-leaf 0
    cpuid                               ; ebx = extended feature flags
    test        ebx, CPUID7_EBX_AVX2
    jz          short .return           ; CPU does not support AVX2

    ; Check for O/S support of the YMM registers that AVX2 needs
    mov         eax, 1
    xor         ecx, ecx
    cpuid
    test        ecx, CPUID1_ECX_OSXSAVE
    jz          short .return           ; O/S has not enabled XSAVE/XGETBV
    test        ecx, CPUID1_ECX_AVX
    jz          short .return           ; CPU does not support AVX

    xor         ecx, ecx                ; select XCR0
    xgetbv                              ; edx:eax = XCR0
    and         eax, XCR0_XMM_YMM_STATE
    cmp         eax, XCR0_XMM_YMM_STATE
    jne         short .return           ; O/S does not manage XMM/YMM state
                                        ; using XSAVE

    or          edi, JSIMD_AVX2

.return:
    mov         eax, edi                ; return value is an unsigned int

    pop         rdi
    pop         rbx
    ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
Loading…
Add table
Add a link
Reference in a new issue