Repo created

This commit is contained in:
parent 81b91f4139
commit f8c34fa5ee

22732 changed files with 4815320 additions and 2 deletions
269 TMessagesProj/jni/mozjpeg/simd/powerpc/jccolext-altivec.c Normal file
@@ -0,0 +1,269 @@
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2014, Jay Foad.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jccolor-altivec.c */


void jsimd_rgb_ycc_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
                                   JSAMPIMAGE output_buf,
                                   JDIMENSION output_row, int num_rows)
{
  JSAMPROW inptr, outptr0, outptr1, outptr2;
  int pitch = img_width * RGB_PIXELSIZE, num_cols;
#if __BIG_ENDIAN__
  int offset;
#endif
  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];

  __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
    rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
  __vector unsigned char rgb3 = { 0 };
#endif
#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
  __vector unsigned char rgb4 = { 0 };
#endif
  __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
  __vector unsigned short yl, yh, crl, crh, cbl, cbh;
  __vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3;

  /* Constants */
  __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
    pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) },
    pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) },
    pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) };
  __vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) };
  __vector int pd_onehalf = { __4X(ONE_HALF) },
    pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
  __vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
    shift_pack_index =
      {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
#else
    shift_pack_index =
      {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
#endif

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr0 = output_buf[0][output_row];
    outptr1 = output_buf[1][output_row];
    outptr2 = output_buf[2][output_row];
    output_row++;

    for (num_cols = pitch; num_cols > 0;
         num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
         outptr0 += 16, outptr1 += 16, outptr2 += 16) {

#if __BIG_ENDIAN__
      /* Load 16 pixels == 48 or 64 bytes */
      offset = (size_t)inptr & 15;
      if (offset) {
        __vector unsigned char unaligned_shift_index;
        int bytes = num_cols + offset;

        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
          /* Slow path to prevent buffer overread.  Since there is no way to
           * read a partial AltiVec register, overread would occur on the last
           * chunk of the last image row if the right edge is not on a 16-byte
           * boundary.  It could also occur on other rows if the bytes per row
           * is low enough.  Since we can't determine whether we're on the last
           * image row, we have to assume every row is the last.
           */
          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
          rgb0 = vec_ld(0, tmpbuf);
          rgb1 = vec_ld(16, tmpbuf);
          rgb2 = vec_ld(32, tmpbuf);
#if RGB_PIXELSIZE == 4
          rgb3 = vec_ld(48, tmpbuf);
#endif
        } else {
          /* Fast path */
          rgb0 = vec_ld(0, inptr);
          if (bytes > 16)
            rgb1 = vec_ld(16, inptr);
          if (bytes > 32)
            rgb2 = vec_ld(32, inptr);
          if (bytes > 48)
            rgb3 = vec_ld(48, inptr);
#if RGB_PIXELSIZE == 4
          if (bytes > 64)
            rgb4 = vec_ld(64, inptr);
#endif
          unaligned_shift_index = vec_lvsl(0, inptr);
          rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
          rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
          rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
#if RGB_PIXELSIZE == 4
          rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
#endif
        }
      } else {
#endif /* __BIG_ENDIAN__ */
      if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
        /* Slow path */
        memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
        rgb0 = VEC_LD(0, tmpbuf);
        rgb1 = VEC_LD(16, tmpbuf);
        rgb2 = VEC_LD(32, tmpbuf);
#if RGB_PIXELSIZE == 4
        rgb3 = VEC_LD(48, tmpbuf);
#endif
      } else {
        /* Fast path */
        rgb0 = VEC_LD(0, inptr);
        if (num_cols > 16)
          rgb1 = VEC_LD(16, inptr);
        if (num_cols > 32)
          rgb2 = VEC_LD(32, inptr);
#if RGB_PIXELSIZE == 4
        if (num_cols > 48)
          rgb3 = VEC_LD(48, inptr);
#endif
      }
#if __BIG_ENDIAN__
      }
#endif

#if RGB_PIXELSIZE == 3
      /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
       *
       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
       */
      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
      rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
      rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
      rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
#else
      /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
       *
       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
       */
      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
      rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
      rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
      rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
#endif

      /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
       * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
       * ...
       *
       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
       * support unsigned vectors.
       */
      rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
      bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
      rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
      bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
      rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
      bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
      rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
      bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);

      /* (Original)
       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
       *
       * (This implementation)
       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
       */

      /* Calculate Y values */

      y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
      y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
      y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
      y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
      y0 = vec_msums(bg0, pw_f0114_f0250, y0);
      y1 = vec_msums(bg1, pw_f0114_f0250, y1);
      y2 = vec_msums(bg2, pw_f0114_f0250, y2);
      y3 = vec_msums(bg3, pw_f0114_f0250, y3);
      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
       * each dword into a new 16-bit vector, which is the equivalent of
       * descaling the 32-bit results (right-shifting by 16 bits) and then
       * packing them.
       */
      yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
                    shift_pack_index);
      yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
                    shift_pack_index);
      y = vec_pack(yl, yh);
      vec_st(y, 0, outptr0);

      /* Calculate Cb values */
      cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj);
      cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj);
      cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj);
      cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj);
      cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000,
                                   (__vector unsigned int)cb0);
      cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000,
                                   (__vector unsigned int)cb1);
      cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000,
                                   (__vector unsigned int)cb2);
      cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000,
                                   (__vector unsigned int)cb3);
      cbl = vec_perm((__vector unsigned short)cb0,
                     (__vector unsigned short)cb1, shift_pack_index);
      cbh = vec_perm((__vector unsigned short)cb2,
                     (__vector unsigned short)cb3, shift_pack_index);
      cb = vec_pack(cbl, cbh);
      vec_st(cb, 0, outptr1);

      /* Calculate Cr values */
      cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj);
      cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj);
      cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj);
      cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj);
      cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000,
                                   (__vector unsigned int)cr0);
      cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000,
                                   (__vector unsigned int)cr1);
      cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000,
                                   (__vector unsigned int)cr2);
      cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000,
                                   (__vector unsigned int)cr3);
      crl = vec_perm((__vector unsigned short)cr0,
                     (__vector unsigned short)cr1, shift_pack_index);
      crh = vec_perm((__vector unsigned short)cr2,
                     (__vector unsigned short)cr3, shift_pack_index);
      cr = vec_pack(crl, crh);
      vec_st(cr, 0, outptr2);
    }
  }
}
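Note on the fixed-point math above: FIX(0.58700) = 38470 does not fit in a signed 16-bit word, which is why the implementation splits 0.587*G into 0.337*G + 0.250*G so that every multiplier handed to vec_msums() stays in range. A one-pixel scalar sketch of the same arithmetic (illustrative only, not part of libjpeg-turbo; assumes 8-bit samples and CENTERJSAMPLE == 128):

#include <stdint.h>

#define SCALEBITS  16
#define ONE_HALF   (1 << (SCALEBITS - 1))
#define FIX(x)     ((int32_t)((x) * (1L << SCALEBITS) + 0.5))

static void rgb_ycc_pixel(uint8_t r, uint8_t g, uint8_t b,
                          uint8_t *y, uint8_t *cb, uint8_t *cr)
{
  /* 0.587*G split into 0.337*G + 0.250*G so each coefficient fits in a
   * signed 16-bit multiplicand, as vec_msums() requires. */
  *y  = (uint8_t)((FIX(0.29900) * r + (FIX(0.58700) - FIX(0.25000)) * g +
                   FIX(0.11400) * b + FIX(0.25000) * g +
                   ONE_HALF) >> SCALEBITS);
  /* ONE_HALF - 1 plus the centering term matches pd_onehalfm1_cj. */
  *cb = (uint8_t)((-FIX(0.16874) * r - FIX(0.33126) * g + FIX(0.50000) * b +
                   ONE_HALF - 1 + (128 << SCALEBITS)) >> SCALEBITS);
  *cr = (uint8_t)((FIX(0.50000) * r - FIX(0.41869) * g - FIX(0.08131) * b +
                   ONE_HALF - 1 + (128 << SCALEBITS)) >> SCALEBITS);
}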
116 TMessagesProj/jni/mozjpeg/simd/powerpc/jccolor-altivec.c Normal file
@@ -0,0 +1,116 @@
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* RGB --> YCC CONVERSION */

#include "jsimd_altivec.h"


#define F_0_081  5329                 /* FIX(0.08131) */
#define F_0_114  7471                 /* FIX(0.11400) */
#define F_0_168  11059                /* FIX(0.16874) */
#define F_0_250  16384                /* FIX(0.25000) */
#define F_0_299  19595                /* FIX(0.29900) */
#define F_0_331  21709                /* FIX(0.33126) */
#define F_0_418  27439                /* FIX(0.41869) */
#define F_0_500  32768                /* FIX(0.50000) */
#define F_0_587  38470                /* FIX(0.58700) */
#define F_0_337  (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */

#define SCALEBITS  16
#define ONE_HALF  (1 << (SCALEBITS - 1))


#define RGBG_INDEX0 \
  {  0,  1,  3,  4,  6,  7,  9, 10,  2,  1,  5,  4,  8,  7, 11, 10 }
#define RGBG_INDEX1 \
  { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
#define RGBG_INDEX2 \
  {  8,  9, 11, 12, 14, 15, 17, 18, 10,  9, 13, 12, 16, 15, 19, 18 }
#define RGBG_INDEX3 \
  {  4,  5,  7,  8, 10, 11, 13, 14,  6,  5,  9,  8, 12, 11, 15, 14 }
#include "jccolext-altivec.c"
#undef RGB_PIXELSIZE

#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
#define jsimd_rgb_ycc_convert_altivec  jsimd_extrgb_ycc_convert_altivec
#include "jccolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX0
#undef RGBG_INDEX1
#undef RGBG_INDEX2
#undef RGBG_INDEX3
#undef jsimd_rgb_ycc_convert_altivec

#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
#define RGBG_INDEX \
  {  0,  1,  4,  5,  8,  9, 12, 13,  2,  1,  6,  5, 10,  9, 14, 13 }
#define jsimd_rgb_ycc_convert_altivec  jsimd_extrgbx_ycc_convert_altivec
#include "jccolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX
#undef jsimd_rgb_ycc_convert_altivec

#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
#define RGBG_INDEX0 \
  {  2,  1,  5,  4,  8,  7, 11, 10,  0,  1,  3,  4,  6,  7,  9, 10 }
#define RGBG_INDEX1 \
  { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
#define RGBG_INDEX2 \
  { 10,  9, 13, 12, 16, 15, 19, 18,  8,  9, 11, 12, 14, 15, 17, 18 }
#define RGBG_INDEX3 \
  {  6,  5,  9,  8, 12, 11, 15, 14,  4,  5,  7,  8, 10, 11, 13, 14 }
#define jsimd_rgb_ycc_convert_altivec  jsimd_extbgr_ycc_convert_altivec
#include "jccolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX0
#undef RGBG_INDEX1
#undef RGBG_INDEX2
#undef RGBG_INDEX3
#undef jsimd_rgb_ycc_convert_altivec

#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
#define RGBG_INDEX \
  {  2,  1,  6,  5, 10,  9, 14, 13,  0,  1,  4,  5,  8,  9, 12, 13 }
#define jsimd_rgb_ycc_convert_altivec  jsimd_extbgrx_ycc_convert_altivec
#include "jccolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX
#undef jsimd_rgb_ycc_convert_altivec

#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
#define RGBG_INDEX \
  {  3,  2,  7,  6, 11, 10, 15, 14,  1,  2,  5,  6,  9, 10, 13, 14 }
#define jsimd_rgb_ycc_convert_altivec  jsimd_extxbgr_ycc_convert_altivec
#include "jccolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX
#undef jsimd_rgb_ycc_convert_altivec

#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
#define RGBG_INDEX \
  {  1,  2,  5,  6,  9, 10, 13, 14,  3,  2,  7,  6, 11, 10, 15, 14 }
#define jsimd_rgb_ycc_convert_altivec  jsimd_extxrgb_ycc_convert_altivec
#include "jccolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX
#undef jsimd_rgb_ycc_convert_altivec
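Note: this file is a preprocessor-based template instantiation. jccolext-altivec.c is compiled once per pixel format, with RGB_PIXELSIZE, the RGBG_INDEX* permutation tables, and the function name redefined before each #include. A condensed sketch of the same pattern, using hypothetical file and symbol names rather than the real sources:

/* template.inc -- hypothetical shared body, analogous to jccolext-altivec.c:
 *
 *   int SUM_FIRST_CHANNEL(const unsigned char *p, int npix)
 *   {
 *     int s = 0, i;
 *     for (i = 0; i < npix; i++)
 *       s += p[i * PIXEL_STRIDE];   /* first channel of each pixel */
 *     return s;
 *   }
 */

/* consumer.c -- instantiates the body once per pixel layout: */
#define PIXEL_STRIDE       3                       /* packed RGB */
#define SUM_FIRST_CHANNEL  sum_first_channel_rgb
#include "template.inc"
#undef PIXEL_STRIDE
#undef SUM_FIRST_CHANNEL

#define PIXEL_STRIDE       4                       /* RGBX with padding */
#define SUM_FIRST_CHANNEL  sum_first_channel_rgbx
#include "template.inc"
#undef PIXEL_STRIDE
#undef SUM_FIRST_CHANNEL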
111 TMessagesProj/jni/mozjpeg/simd/powerpc/jcgray-altivec.c Normal file
@@ -0,0 +1,111 @@
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* RGB --> GRAYSCALE CONVERSION */

#include "jsimd_altivec.h"


#define F_0_114  7471                 /* FIX(0.11400) */
#define F_0_250  16384                /* FIX(0.25000) */
#define F_0_299  19595                /* FIX(0.29900) */
#define F_0_587  38470                /* FIX(0.58700) */
#define F_0_337  (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */

#define SCALEBITS  16
#define ONE_HALF  (1 << (SCALEBITS - 1))


#define RGBG_INDEX0 \
  {  0,  1,  3,  4,  6,  7,  9, 10,  2,  1,  5,  4,  8,  7, 11, 10 }
#define RGBG_INDEX1 \
  { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
#define RGBG_INDEX2 \
  {  8,  9, 11, 12, 14, 15, 17, 18, 10,  9, 13, 12, 16, 15, 19, 18 }
#define RGBG_INDEX3 \
  {  4,  5,  7,  8, 10, 11, 13, 14,  6,  5,  9,  8, 12, 11, 15, 14 }
#include "jcgryext-altivec.c"
#undef RGB_PIXELSIZE

#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
#define jsimd_rgb_gray_convert_altivec  jsimd_extrgb_gray_convert_altivec
#include "jcgryext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX0
#undef RGBG_INDEX1
#undef RGBG_INDEX2
#undef RGBG_INDEX3
#undef jsimd_rgb_gray_convert_altivec

#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
#define RGBG_INDEX \
  {  0,  1,  4,  5,  8,  9, 12, 13,  2,  1,  6,  5, 10,  9, 14, 13 }
#define jsimd_rgb_gray_convert_altivec  jsimd_extrgbx_gray_convert_altivec
#include "jcgryext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX
#undef jsimd_rgb_gray_convert_altivec

#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
#define RGBG_INDEX0 \
  {  2,  1,  5,  4,  8,  7, 11, 10,  0,  1,  3,  4,  6,  7,  9, 10 }
#define RGBG_INDEX1 \
  { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
#define RGBG_INDEX2 \
  { 10,  9, 13, 12, 16, 15, 19, 18,  8,  9, 11, 12, 14, 15, 17, 18 }
#define RGBG_INDEX3 \
  {  6,  5,  9,  8, 12, 11, 15, 14,  4,  5,  7,  8, 10, 11, 13, 14 }
#define jsimd_rgb_gray_convert_altivec  jsimd_extbgr_gray_convert_altivec
#include "jcgryext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX0
#undef RGBG_INDEX1
#undef RGBG_INDEX2
#undef RGBG_INDEX3
#undef jsimd_rgb_gray_convert_altivec

#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
#define RGBG_INDEX \
  {  2,  1,  6,  5, 10,  9, 14, 13,  0,  1,  4,  5,  8,  9, 12, 13 }
#define jsimd_rgb_gray_convert_altivec  jsimd_extbgrx_gray_convert_altivec
#include "jcgryext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX
#undef jsimd_rgb_gray_convert_altivec

#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
#define RGBG_INDEX \
  {  3,  2,  7,  6, 11, 10, 15, 14,  1,  2,  5,  6,  9, 10, 13, 14 }
#define jsimd_rgb_gray_convert_altivec  jsimd_extxbgr_gray_convert_altivec
#include "jcgryext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX
#undef jsimd_rgb_gray_convert_altivec

#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
#define RGBG_INDEX \
  {  1,  2,  5,  6,  9, 10, 13, 14,  3,  2,  7,  6, 11, 10, 15, 14 }
#define jsimd_rgb_gray_convert_altivec  jsimd_extxrgb_gray_convert_altivec
#include "jcgryext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX
#undef jsimd_rgb_gray_convert_altivec
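Note: grayscale conversion is just the Y channel of the RGB -> YCC transform, reusing the same fixed-point constants; jcgryext-altivec.c (next) is the corresponding loop body. A one-pixel scalar sketch (illustrative, not part of the library):

/* Y = 0.299*R + 0.337*G + 0.114*B + 0.250*G, with the constants above:
 * 19595 = FIX(0.299), 22086 = F_0_337, 7471 = FIX(0.114), 16384 = FIX(0.25),
 * 32768 = ONE_HALF. */
static unsigned char rgb_gray_pixel(unsigned char r, unsigned char g,
                                    unsigned char b)
{
  return (unsigned char)((19595 * r + 22086 * g + 7471 * b + 16384 * g +
                          32768) >> 16);
}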
228 TMessagesProj/jni/mozjpeg/simd/powerpc/jcgryext-altivec.c Normal file
@@ -0,0 +1,228 @@
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2014, Jay Foad.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jcgray-altivec.c */


void jsimd_rgb_gray_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
                                    JSAMPIMAGE output_buf,
                                    JDIMENSION output_row, int num_rows)
{
  JSAMPROW inptr, outptr;
  int pitch = img_width * RGB_PIXELSIZE, num_cols;
#if __BIG_ENDIAN__
  int offset;
  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
#endif

  __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
    rgbg0, rgbg1, rgbg2, rgbg3, y;
#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
  __vector unsigned char rgb3 = { 0 };
#endif
#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
  __vector unsigned char rgb4 = { 0 };
#endif
  __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
  __vector unsigned short yl, yh;
  __vector int y0, y1, y2, y3;

  /* Constants */
  __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
    pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
  __vector int pd_onehalf = { __4X(ONE_HALF) };
  __vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
    shift_pack_index =
      {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
#else
    shift_pack_index =
      {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
#endif

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr = output_buf[0][output_row];
    output_row++;

    for (num_cols = pitch; num_cols > 0;
         num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
         outptr += 16) {

#if __BIG_ENDIAN__
      /* Load 16 pixels == 48 or 64 bytes */
      offset = (size_t)inptr & 15;
      if (offset) {
        __vector unsigned char unaligned_shift_index;
        int bytes = num_cols + offset;

        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
          /* Slow path to prevent buffer overread.  Since there is no way to
           * read a partial AltiVec register, overread would occur on the last
           * chunk of the last image row if the right edge is not on a 16-byte
           * boundary.  It could also occur on other rows if the bytes per row
           * is low enough.  Since we can't determine whether we're on the last
           * image row, we have to assume every row is the last.
           */
          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
          rgb0 = vec_ld(0, tmpbuf);
          rgb1 = vec_ld(16, tmpbuf);
          rgb2 = vec_ld(32, tmpbuf);
#if RGB_PIXELSIZE == 4
          rgb3 = vec_ld(48, tmpbuf);
#endif
        } else {
          /* Fast path */
          rgb0 = vec_ld(0, inptr);
          if (bytes > 16)
            rgb1 = vec_ld(16, inptr);
          if (bytes > 32)
            rgb2 = vec_ld(32, inptr);
          if (bytes > 48)
            rgb3 = vec_ld(48, inptr);
#if RGB_PIXELSIZE == 4
          if (bytes > 64)
            rgb4 = vec_ld(64, inptr);
#endif
          unaligned_shift_index = vec_lvsl(0, inptr);
          rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
          rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
          rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
#if RGB_PIXELSIZE == 4
          rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
#endif
        }
      } else {
        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
          /* Slow path */
          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
          rgb0 = vec_ld(0, tmpbuf);
          rgb1 = vec_ld(16, tmpbuf);
          rgb2 = vec_ld(32, tmpbuf);
#if RGB_PIXELSIZE == 4
          rgb3 = vec_ld(48, tmpbuf);
#endif
        } else {
          /* Fast path */
          rgb0 = vec_ld(0, inptr);
          if (num_cols > 16)
            rgb1 = vec_ld(16, inptr);
          if (num_cols > 32)
            rgb2 = vec_ld(32, inptr);
#if RGB_PIXELSIZE == 4
          if (num_cols > 48)
            rgb3 = vec_ld(48, inptr);
#endif
        }
      }
#else
      /* Little endian */
      rgb0 = vec_vsx_ld(0, inptr);
      if (num_cols > 16)
        rgb1 = vec_vsx_ld(16, inptr);
      if (num_cols > 32)
        rgb2 = vec_vsx_ld(32, inptr);
#if RGB_PIXELSIZE == 4
      if (num_cols > 48)
        rgb3 = vec_vsx_ld(48, inptr);
#endif
#endif

#if RGB_PIXELSIZE == 3
      /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
       *
       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
       */
      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
      rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
      rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
      rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
#else
      /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
       *
       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
       */
      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
      rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
      rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
      rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
#endif

      /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
       * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
       * ...
       *
       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
       * support unsigned vectors.
       */
      rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
      bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
      rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
      bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
      rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
      bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
      rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
      bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);

      /* (Original)
       * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
       *
       * (This implementation)
       * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
       */

      /* Calculate Y values */

      y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
      y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
      y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
      y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
      y0 = vec_msums(bg0, pw_f0114_f0250, y0);
      y1 = vec_msums(bg1, pw_f0114_f0250, y1);
      y2 = vec_msums(bg2, pw_f0114_f0250, y2);
      y3 = vec_msums(bg3, pw_f0114_f0250, y3);
      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
       * each dword into a new 16-bit vector, which is the equivalent of
       * descaling the 32-bit results (right-shifting by 16 bits) and then
       * packing them.
       */
      yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
                    shift_pack_index);
      yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
                    shift_pack_index);
      y = vec_pack(yl, yh);
      vec_st(y, 0, outptr);
    }
  }
}
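Note: the repeated "NOTE" about vec_merge*() vs. vec_unpack*() is worth spelling out. vec_unpackh()/vec_unpackl() sign-extend, so they are only defined for signed element types; merging with pb_zero zero-extends instead, which is what 8-bit samples need. A scalar model of the assumed VEC_UNPACKHU behavior (not the actual macro from jsimd_altivec.h):

#include <stdint.h>

/* Zero-extend the first 8 bytes of a 16-byte vector to eight 16-bit lanes,
 * as vec_mergeh(pb_zero, v) does on big-endian.  Sign extension (what
 * vec_unpackh() would do) would corrupt sample values >= 128. */
static void unpack_high_u8_to_u16(const uint8_t in[16], uint16_t out[8])
{
  int i;
  for (i = 0; i < 8; i++)
    out[i] = in[i];             /* implicit zero extension */
}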
159 TMessagesProj/jni/mozjpeg/simd/powerpc/jcsample-altivec.c Normal file
@@ -0,0 +1,159 @@
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* CHROMA DOWNSAMPLING */

#include "jsimd_altivec.h"
#include "jcsample.h"


void jsimd_h2v1_downsample_altivec(JDIMENSION image_width,
                                   int max_v_samp_factor,
                                   JDIMENSION v_samp_factor,
                                   JDIMENSION width_in_blocks,
                                   JSAMPARRAY input_data,
                                   JSAMPARRAY output_data)
{
  int outrow, outcol;
  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
  JSAMPROW inptr, outptr;

  __vector unsigned char this0, next0, out;
  __vector unsigned short this0e, this0o, next0e, next0o, outl, outh;

  /* Constants */
  __vector unsigned short pw_bias = { __4X2(0, 1) },
    pw_one = { __8X(1) };
  __vector unsigned char even_odd_index =
    {  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15 },
    pb_zero = { __16X(0) };

  expand_right_edge(input_data, max_v_samp_factor, image_width,
                    output_cols * 2);

  for (outrow = 0; outrow < v_samp_factor; outrow++) {
    outptr = output_data[outrow];
    inptr = input_data[outrow];

    for (outcol = output_cols; outcol > 0;
         outcol -= 16, inptr += 32, outptr += 16) {

      this0 = vec_ld(0, inptr);
      this0 = vec_perm(this0, this0, even_odd_index);
      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
      outl = vec_add(this0e, this0o);
      outl = vec_add(outl, pw_bias);
      outl = vec_sr(outl, pw_one);

      if (outcol > 8) {
        next0 = vec_ld(16, inptr);
        next0 = vec_perm(next0, next0, even_odd_index);
        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
        outh = vec_add(next0e, next0o);
        outh = vec_add(outh, pw_bias);
        outh = vec_sr(outh, pw_one);
      } else
        outh = vec_splat_u16(0);

      out = vec_pack(outl, outh);
      vec_st(out, 0, outptr);
    }
  }
}


void
jsimd_h2v2_downsample_altivec(JDIMENSION image_width, int max_v_samp_factor,
                              JDIMENSION v_samp_factor,
                              JDIMENSION width_in_blocks,
                              JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  int inrow, outrow, outcol;
  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
  JSAMPROW inptr0, inptr1, outptr;

  __vector unsigned char this0, next0, this1, next1, out;
  __vector unsigned short this0e, this0o, next0e, next0o, this1e, this1o,
    next1e, next1o, out0l, out0h, out1l, out1h, outl, outh;

  /* Constants */
  __vector unsigned short pw_bias = { __4X2(1, 2) },
    pw_two = { __8X(2) };
  __vector unsigned char even_odd_index =
    {  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15 },
    pb_zero = { __16X(0) };

  expand_right_edge(input_data, max_v_samp_factor, image_width,
                    output_cols * 2);

  for (inrow = 0, outrow = 0; outrow < v_samp_factor;
       inrow += 2, outrow++) {

    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow + 1];
    outptr = output_data[outrow];

    for (outcol = output_cols; outcol > 0;
         outcol -= 16, inptr0 += 32, inptr1 += 32, outptr += 16) {

      this0 = vec_ld(0, inptr0);
      this0 = vec_perm(this0, this0, even_odd_index);
      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
      out0l = vec_add(this0e, this0o);

      this1 = vec_ld(0, inptr1);
      this1 = vec_perm(this1, this1, even_odd_index);
      this1e = (__vector unsigned short)VEC_UNPACKHU(this1);
      this1o = (__vector unsigned short)VEC_UNPACKLU(this1);
      out1l = vec_add(this1e, this1o);

      outl = vec_add(out0l, out1l);
      outl = vec_add(outl, pw_bias);
      outl = vec_sr(outl, pw_two);

      if (outcol > 8) {
        next0 = vec_ld(16, inptr0);
        next0 = vec_perm(next0, next0, even_odd_index);
        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
        out0h = vec_add(next0e, next0o);

        next1 = vec_ld(16, inptr1);
        next1 = vec_perm(next1, next1, even_odd_index);
        next1e = (__vector unsigned short)VEC_UNPACKHU(next1);
        next1o = (__vector unsigned short)VEC_UNPACKLU(next1);
        out1h = vec_add(next1e, next1o);

        outh = vec_add(out0h, out1h);
        outh = vec_add(outh, pw_bias);
        outh = vec_sr(outh, pw_two);
      } else
        outh = vec_splat_u16(0);

      out = vec_pack(outl, outh);
      vec_st(out, 0, outptr);
    }
  }
}
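Note: scalar equivalents of the two downsamplers clarify the pw_bias constants. The bias alternates per output column (0,1 for h2v1; 1,2 for h2v2) so rounding does not drift in one direction. A sketch assuming 8-bit samples, not part of the library:

#include <stddef.h>

/* h2v1: average each horizontal pixel pair; bias alternates 0,1 as in
 * pw_bias = { __4X2(0, 1) } above. */
static void h2v1_downsample_row(const unsigned char *in, unsigned char *out,
                                size_t output_cols)
{
  size_t i;
  for (i = 0; i < output_cols; i++)
    out[i] = (unsigned char)((in[2 * i] + in[2 * i + 1] + (i & 1)) >> 1);
}

/* h2v2: average each 2x2 block; bias alternates 1,2 and the divide is by 4
 * (pw_bias = { __4X2(1, 2) }, shift by pw_two). */
static void h2v2_downsample_row(const unsigned char *in0,
                                const unsigned char *in1,
                                unsigned char *out, size_t output_cols)
{
  size_t i;
  for (i = 0; i < output_cols; i++)
    out[i] = (unsigned char)((in0[2 * i] + in0[2 * i + 1] +
                              in1[2 * i] + in1[2 * i + 1] +
                              1 + (i & 1)) >> 2);
}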
28 TMessagesProj/jni/mozjpeg/simd/powerpc/jcsample.h Normal file
@@ -0,0 +1,28 @@
/*
 * jcsample.h
 *
 * This file was part of the Independent JPEG Group's software:
 * Copyright (C) 1991-1996, Thomas G. Lane.
 * For conditions of distribution and use, see the accompanying README.ijg
 * file.
 */

LOCAL(void)
expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
                  JDIMENSION output_cols)
{
  register JSAMPROW ptr;
  register JSAMPLE pixval;
  register int count;
  int row;
  int numcols = (int)(output_cols - input_cols);

  if (numcols > 0) {
    for (row = 0; row < num_rows; row++) {
      ptr = image_data[row] + input_cols;
      pixval = ptr[-1];         /* don't need GETJSAMPLE() here */
      for (count = numcols; count > 0; count--)
        *ptr++ = pixval;
    }
  }
}
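Note: the downsamplers call expand_right_edge() so every row is padded out to a multiple of the vector width before the SIMD loop runs, by replicating the last real pixel. A self-contained usage sketch; the typedefs mirror libjpeg's 8-bit-sample definitions and are assumptions here, and the register qualifiers are dropped for brevity:

#include <stdio.h>

typedef unsigned char JSAMPLE;      /* assumes 8-bit samples */
typedef JSAMPLE *JSAMPROW;
typedef JSAMPROW *JSAMPARRAY;
typedef unsigned int JDIMENSION;

/* expand_right_edge(), restated from above */
static void expand_right_edge(JSAMPARRAY image_data, int num_rows,
                              JDIMENSION input_cols, JDIMENSION output_cols)
{
  JSAMPROW ptr;
  JSAMPLE pixval;
  int count, row;
  int numcols = (int)(output_cols - input_cols);

  if (numcols > 0) {
    for (row = 0; row < num_rows; row++) {
      ptr = image_data[row] + input_cols;
      pixval = ptr[-1];
      for (count = numcols; count > 0; count--)
        *ptr++ = pixval;
    }
  }
}

int main(void)
{
  JSAMPLE row[8] = { 1, 2, 3, 4, 5 };   /* 5 real columns, 3 of padding */
  JSAMPROW rows[1] = { row };
  int i;

  expand_right_edge(rows, 1, 5, 8);     /* row becomes 1 2 3 4 5 5 5 5 */
  for (i = 0; i < 8; i++)
    printf("%d ", row[i]);
  printf("\n");
  return 0;
}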
276 TMessagesProj/jni/mozjpeg/simd/powerpc/jdcolext-altivec.c Normal file
@@ -0,0 +1,276 @@
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jdcolor-altivec.c */


void jsimd_ycc_rgb_convert_altivec(JDIMENSION out_width, JSAMPIMAGE input_buf,
                                   JDIMENSION input_row, JSAMPARRAY output_buf,
                                   int num_rows)
{
  JSAMPROW outptr, inptr0, inptr1, inptr2;
  int pitch = out_width * RGB_PIXELSIZE, num_cols;
#if __BIG_ENDIAN__
  int offset;
#endif
  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];

  __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
    y, cb, cr;
#if __BIG_ENDIAN__
  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
#if RGB_PIXELSIZE == 4
  __vector unsigned char out4;
#endif
#endif
#if RGB_PIXELSIZE == 4
  __vector unsigned char rgb3;
#endif
  __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh,
    crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w;
  __vector int g0, g1, g2, g3;

  /* Constants
   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
   * high-order bits, not 16.
   */
  __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
    pw_mf0228 = { __8X(-F_0_228 >> 1) },
    pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
    pw_one = { __8X(1) }, pw_255 = { __8X(255) },
    pw_cj = { __8X(CENTERJSAMPLE) };
  __vector int pd_onehalf = { __4X(ONE_HALF) };
  __vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
    shift_pack_index =
      {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
#else
    shift_pack_index =
      {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
#endif

  while (--num_rows >= 0) {
    inptr0 = input_buf[0][input_row];
    inptr1 = input_buf[1][input_row];
    inptr2 = input_buf[2][input_row];
    input_row++;
    outptr = *output_buf++;

    for (num_cols = pitch; num_cols > 0;
         num_cols -= RGB_PIXELSIZE * 16, outptr += RGB_PIXELSIZE * 16,
         inptr0 += 16, inptr1 += 16, inptr2 += 16) {

      y = vec_ld(0, inptr0);
      /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
       * support unsigned vectors.
       */
      yl = (__vector signed short)VEC_UNPACKHU(y);
      yh = (__vector signed short)VEC_UNPACKLU(y);

      cb = vec_ld(0, inptr1);
      cbl = (__vector signed short)VEC_UNPACKHU(cb);
      cbh = (__vector signed short)VEC_UNPACKLU(cb);
      cbl = vec_sub(cbl, pw_cj);
      cbh = vec_sub(cbh, pw_cj);

      cr = vec_ld(0, inptr2);
      crl = (__vector signed short)VEC_UNPACKHU(cr);
      crh = (__vector signed short)VEC_UNPACKLU(cr);
      crl = vec_sub(crl, pw_cj);
      crh = vec_sub(crh, pw_cj);

      /* (Original)
       * R = Y                + 1.40200 * Cr
       * G = Y - 0.34414 * Cb - 0.71414 * Cr
       * B = Y + 1.77200 * Cb
       *
       * (This implementation)
       * R = Y                + 0.40200 * Cr + Cr
       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
       * B = Y - 0.22800 * Cb + Cb + Cb
       */
      bl = vec_add(cbl, cbl);
      bh = vec_add(cbh, cbh);
      bl = vec_madds(bl, pw_mf0228, pw_one);
      bh = vec_madds(bh, pw_mf0228, pw_one);
      bl = vec_sra(bl, (__vector unsigned short)pw_one);
      bh = vec_sra(bh, (__vector unsigned short)pw_one);
      bl = vec_add(bl, cbl);
      bh = vec_add(bh, cbh);
      bl = vec_add(bl, cbl);
      bh = vec_add(bh, cbh);
      bl = vec_add(bl, yl);
      bh = vec_add(bh, yh);

      rl = vec_add(crl, crl);
      rh = vec_add(crh, crh);
      rl = vec_madds(rl, pw_f0402, pw_one);
      rh = vec_madds(rh, pw_f0402, pw_one);
      rl = vec_sra(rl, (__vector unsigned short)pw_one);
      rh = vec_sra(rh, (__vector unsigned short)pw_one);
      rl = vec_add(rl, crl);
      rh = vec_add(rh, crh);
      rl = vec_add(rl, yl);
      rh = vec_add(rh, yh);

      g0w = vec_mergeh(cbl, crl);
      g1w = vec_mergel(cbl, crl);
      g0 = vec_msums(g0w, pw_mf0344_f0285, pd_onehalf);
      g1 = vec_msums(g1w, pw_mf0344_f0285, pd_onehalf);
      g2w = vec_mergeh(cbh, crh);
      g3w = vec_mergel(cbh, crh);
      g2 = vec_msums(g2w, pw_mf0344_f0285, pd_onehalf);
      g3 = vec_msums(g3w, pw_mf0344_f0285, pd_onehalf);
      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
       * each dword into a new 16-bit vector, which is the equivalent of
       * descaling the 32-bit results (right-shifting by 16 bits) and then
       * packing them.
       */
      gl = vec_perm((__vector short)g0, (__vector short)g1, shift_pack_index);
      gh = vec_perm((__vector short)g2, (__vector short)g3, shift_pack_index);
      gl = vec_sub(gl, crl);
      gh = vec_sub(gh, crh);
      gl = vec_add(gl, yl);
      gh = vec_add(gh, yh);

      rg0 = vec_mergeh(rl, gl);
      bx0 = vec_mergeh(bl, pw_255);
      rg1 = vec_mergel(rl, gl);
      bx1 = vec_mergel(bl, pw_255);
      rg2 = vec_mergeh(rh, gh);
      bx2 = vec_mergeh(bh, pw_255);
      rg3 = vec_mergel(rh, gh);
      bx3 = vec_mergel(bh, pw_255);

      rgbx0 = vec_packsu(rg0, bx0);
      rgbx1 = vec_packsu(rg1, bx1);
      rgbx2 = vec_packsu(rg2, bx2);
      rgbx3 = vec_packsu(rg3, bx3);

#if RGB_PIXELSIZE == 3
      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
       *
       * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
       */
      rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
      rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
      rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
#else
      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
       *
       * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
       */
      rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
      rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
      rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
      rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
#endif

#if __BIG_ENDIAN__
      offset = (size_t)outptr & 15;
      if (offset) {
        __vector unsigned char unaligned_shift_index;
        int bytes = num_cols + offset;

        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
          /* Slow path to prevent buffer overwrite.  Since there is no way to
           * write a partial AltiVec register, overwrite would occur on the
           * last chunk of the last image row if the right edge is not on a
           * 16-byte boundary.  It could also occur on other rows if the bytes
           * per row is low enough.  Since we can't determine whether we're on
           * the last image row, we have to assume every row is the last.
           */
          vec_st(rgb0, 0, tmpbuf);
          vec_st(rgb1, 16, tmpbuf);
          vec_st(rgb2, 32, tmpbuf);
#if RGB_PIXELSIZE == 4
          vec_st(rgb3, 48, tmpbuf);
#endif
          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
        } else {
          /* Fast path */
          unaligned_shift_index = vec_lvsl(0, outptr);
          edgel = vec_ld(0, outptr);
          edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
          edges = vec_perm(edgeh, edgel, unaligned_shift_index);
          unaligned_shift_index = vec_lvsr(0, outptr);
          out0 = vec_perm(edges, rgb0, unaligned_shift_index);
          out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
          out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
#if RGB_PIXELSIZE == 4
          out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
          out4 = vec_perm(rgb3, edges, unaligned_shift_index);
#else
          out3 = vec_perm(rgb2, edges, unaligned_shift_index);
#endif
          vec_st(out0, 0, outptr);
          if (bytes > 16)
            vec_st(out1, 16, outptr);
          if (bytes > 32)
            vec_st(out2, 32, outptr);
          if (bytes > 48)
            vec_st(out3, 48, outptr);
#if RGB_PIXELSIZE == 4
          if (bytes > 64)
            vec_st(out4, 64, outptr);
#endif
        }
      } else {
#endif /* __BIG_ENDIAN__ */
      if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
        /* Slow path */
        VEC_ST(rgb0, 0, tmpbuf);
        VEC_ST(rgb1, 16, tmpbuf);
        VEC_ST(rgb2, 32, tmpbuf);
#if RGB_PIXELSIZE == 4
        VEC_ST(rgb3, 48, tmpbuf);
#endif
        memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
      } else {
        /* Fast path */
        VEC_ST(rgb0, 0, outptr);
        if (num_cols > 16)
          VEC_ST(rgb1, 16, outptr);
        if (num_cols > 32)
          VEC_ST(rgb2, 32, outptr);
#if RGB_PIXELSIZE == 4
        if (num_cols > 48)
          VEC_ST(rgb3, 48, outptr);
#endif
      }
#if __BIG_ENDIAN__
      }
#endif
    }
  }
}
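Note: as on the compression side, a scalar restatement of the rearranged decompression math can help. Coefficients larger than 1.0 are decomposed so the fractional part fits a signed 16-bit multiplier (1.402 = 1 + 0.402; 1.772 = 2 - 0.228; -0.71414 = 0.28586 - 1). The sketch below assumes 8-bit samples, CENTERJSAMPLE == 128, and an arithmetic right shift of negative values (as libjpeg's RIGHT_SHIFT provides on most platforms); rounding differs slightly from the vec_madds()-based code above:

#include <stdint.h>

#define SCALEBITS  16
#define ONE_HALF   (1 << (SCALEBITS - 1))
#define FIX(x)     ((int32_t)((x) * (1L << SCALEBITS) + 0.5))

static uint8_t clamp8(int v)
{
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void ycc_rgb_pixel(uint8_t y, uint8_t cb8, uint8_t cr8,
                          uint8_t *r, uint8_t *g, uint8_t *b)
{
  int cb = cb8 - 128, cr = cr8 - 128;

  /* R = Y + 0.402*Cr + Cr          (= Y + 1.402*Cr) */
  *r = clamp8(y + (((FIX(1.40200) - FIX(1.0)) * cr + ONE_HALF) >> SCALEBITS) +
              cr);
  /* G = Y - 0.34414*Cb + 0.28586*Cr - Cr */
  *g = clamp8(y + ((-FIX(0.34414) * cb + (FIX(1.0) - FIX(0.71414)) * cr +
                    ONE_HALF) >> SCALEBITS) - cr);
  /* B = Y - 0.228*Cb + Cb + Cb     (= Y + 1.772*Cb) */
  *b = clamp8(y + ((-(FIX(2.0) - FIX(1.77200)) * cb + ONE_HALF) >> SCALEBITS) +
              cb + cb);
}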
106 TMessagesProj/jni/mozjpeg/simd/powerpc/jdcolor-altivec.c Normal file
@@ -0,0 +1,106 @@
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* YCC --> RGB CONVERSION */

#include "jsimd_altivec.h"


#define F_0_344  22554               /* FIX(0.34414) */
#define F_0_714  46802               /* FIX(0.71414) */
#define F_1_402  91881               /* FIX(1.40200) */
#define F_1_772  116130              /* FIX(1.77200) */
#define F_0_402  (F_1_402 - 65536)   /* FIX(1.40200) - FIX(1) */
#define F_0_285  (65536 - F_0_714)   /* FIX(1) - FIX(0.71414) */
#define F_0_228  (131072 - F_1_772)  /* FIX(2) - FIX(1.77200) */

#define SCALEBITS  16
#define ONE_HALF  (1 << (SCALEBITS - 1))

#define RGB_INDEX0 \
  {  0,  1,  8,  2,  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18 }
#define RGB_INDEX1 \
  {  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
#define RGB_INDEX2 \
  { 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
#include "jdcolext-altivec.c"
#undef RGB_PIXELSIZE

#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extrgb_convert_altivec
#include "jdcolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX0
#undef RGB_INDEX1
#undef RGB_INDEX2
#undef jsimd_ycc_rgb_convert_altivec

#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
#define RGB_INDEX \
  {  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15 }
#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extrgbx_convert_altivec
#include "jdcolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX
#undef jsimd_ycc_rgb_convert_altivec

#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
#define RGB_INDEX0 \
  {  8,  1,  0, 10,  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26 }
#define RGB_INDEX1 \
  {  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21 }
#define RGB_INDEX2 \
  {  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extbgr_convert_altivec
#include "jdcolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX0
#undef RGB_INDEX1
#undef RGB_INDEX2
#undef jsimd_ycc_rgb_convert_altivec

#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
#define RGB_INDEX \
  {  8,  1,  0,  9, 10,  3,  2, 11, 12,  5,  4, 13, 14,  7,  6, 15 }
#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extbgrx_convert_altivec
#include "jdcolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX
#undef jsimd_ycc_rgb_convert_altivec

#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
#define RGB_INDEX \
  {  9,  8,  1,  0, 11, 10,  3,  2, 13, 12,  5,  4, 15, 14,  7,  6 }
#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extxbgr_convert_altivec
#include "jdcolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX
#undef jsimd_ycc_rgb_convert_altivec

#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
#define RGB_INDEX \
  {  9,  0,  1,  8, 11,  2,  3, 10, 13,  4,  5, 12, 15,  6,  7, 14 }
#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extxrgb_convert_altivec
#include "jdcolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX
#undef jsimd_ycc_rgb_convert_altivec
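Note: all of the RGB_INDEX*/RGBG_INDEX* tables in these files are operands for vec_perm(), which concatenates two 16-byte vectors and selects output bytes by index: 0-15 pick from the first input, 16-31 from the second. A scalar model of that behavior (the hardware uses the low 5 bits of each index byte; the tables above stay within 0..31):

#include <stdint.h>

/* Scalar model of AltiVec vec_perm(): idx selects bytes from the 32-byte
 * concatenation of a and b. */
static void perm16(const uint8_t a[16], const uint8_t b[16],
                   const uint8_t idx[16], uint8_t out[16])
{
  int i;
  for (i = 0; i < 16; i++)
    out[i] = (idx[i] < 16) ? a[idx[i]] : b[idx[i] - 16];
}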
130 TMessagesProj/jni/mozjpeg/simd/powerpc/jdmerge-altivec.c Normal file
@@ -0,0 +1,130 @@
/*
|
||||
* AltiVec optimizations for libjpeg-turbo
|
||||
*
|
||||
* Copyright (C) 2015, D. R. Commander. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */
|
||||
|
||||
#include "jsimd_altivec.h"
|
||||
|
||||
|
||||
#define F_0_344 22554 /* FIX(0.34414) */
|
||||
#define F_0_714 46802 /* FIX(0.71414) */
|
||||
#define F_1_402 91881 /* FIX(1.40200) */
|
||||
#define F_1_772 116130 /* FIX(1.77200) */
|
||||
#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */
|
||||
#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */
|
||||
#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
|
||||
|
||||
#define SCALEBITS 16
|
||||
#define ONE_HALF (1 << (SCALEBITS - 1))
|
||||
|
||||
#define RGB_INDEX0 \
|
||||
{ 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 }
|
||||
#define RGB_INDEX1 \
|
||||
{ 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
|
||||
#define RGB_INDEX2 \
|
||||
{ 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
|
||||
#include "jdmrgext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
|
||||
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
#define jsimd_h2v1_merged_upsample_altivec \
|
||||
jsimd_h2v1_extrgb_merged_upsample_altivec
|
||||
#define jsimd_h2v2_merged_upsample_altivec \
|
||||
jsimd_h2v2_extrgb_merged_upsample_altivec
|
||||
#include "jdmrgext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGB_INDEX0
|
||||
#undef RGB_INDEX1
|
||||
#undef RGB_INDEX2
|
||||
#undef jsimd_h2v1_merged_upsample_altivec
|
||||
#undef jsimd_h2v2_merged_upsample_altivec
|
||||
|
||||
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
#define RGB_INDEX \
|
||||
{ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 }
|
||||
#define jsimd_h2v1_merged_upsample_altivec \
|
||||
jsimd_h2v1_extrgbx_merged_upsample_altivec
|
||||
#define jsimd_h2v2_merged_upsample_altivec \
|
||||
jsimd_h2v2_extrgbx_merged_upsample_altivec
|
||||
#include "jdmrgext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGB_INDEX
|
||||
#undef jsimd_h2v1_merged_upsample_altivec
|
||||
#undef jsimd_h2v2_merged_upsample_altivec
|
||||
|
||||
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
#define RGB_INDEX0 \
|
||||
{ 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 }
|
||||
#define RGB_INDEX1 \
|
||||
{ 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 }
|
||||
#define RGB_INDEX2 \
|
||||
{ 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
|
||||
#define jsimd_h2v1_merged_upsample_altivec \
|
||||
jsimd_h2v1_extbgr_merged_upsample_altivec
|
||||
#define jsimd_h2v2_merged_upsample_altivec \
|
||||
jsimd_h2v2_extbgr_merged_upsample_altivec
|
||||
#include "jdmrgext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGB_INDEX0
|
||||
#undef RGB_INDEX1
|
||||
#undef RGB_INDEX2
|
||||
#undef jsimd_h2v1_merged_upsample_altivec
|
||||
#undef jsimd_h2v2_merged_upsample_altivec
|
||||
|
||||
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
#define RGB_INDEX \
|
||||
{ 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 }
|
||||
#define jsimd_h2v1_merged_upsample_altivec \
|
||||
jsimd_h2v1_extbgrx_merged_upsample_altivec
|
||||
#define jsimd_h2v2_merged_upsample_altivec \
|
||||
jsimd_h2v2_extbgrx_merged_upsample_altivec
|
||||
#include "jdmrgext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGB_INDEX
|
||||
#undef jsimd_h2v1_merged_upsample_altivec
|
||||
#undef jsimd_h2v2_merged_upsample_altivec
|
||||
|
||||
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
#define RGB_INDEX \
|
||||
{ 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 }
|
||||
#define jsimd_h2v1_merged_upsample_altivec \
|
||||
jsimd_h2v1_extxbgr_merged_upsample_altivec
|
||||
#define jsimd_h2v2_merged_upsample_altivec \
|
||||
jsimd_h2v2_extxbgr_merged_upsample_altivec
|
||||
#include "jdmrgext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGB_INDEX
|
||||
#undef jsimd_h2v1_merged_upsample_altivec
|
||||
#undef jsimd_h2v2_merged_upsample_altivec
|
||||
|
||||
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
#define RGB_INDEX \
|
||||
{ 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 }
|
||||
#define jsimd_h2v1_merged_upsample_altivec \
|
||||
jsimd_h2v1_extxrgb_merged_upsample_altivec
|
||||
#define jsimd_h2v2_merged_upsample_altivec \
|
||||
jsimd_h2v2_extxrgb_merged_upsample_altivec
|
||||
#include "jdmrgext-altivec.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef RGB_INDEX
|
||||
#undef jsimd_h2v1_merged_upsample_altivec
|
||||
#undef jsimd_h2v2_merged_upsample_altivec
|
||||
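
The file above never defines the conversion loop itself; it repeatedly re-includes jdmrgext-altivec.c under different RGB_PIXELSIZE / RGB_INDEX* / function-name macros, so one generic body compiles into a separate function per pixel format. For readers unfamiliar with the idiom, here is a minimal, self-contained sketch of the same technique; the macro and function names (DEFINE_PIXEL_PRINTER, print_rgb, etc.) are hypothetical illustrations, not part of libjpeg-turbo:

#include <stdio.h>

/* One generic "body", parameterized entirely by macros, mimicking how
 * jdmrgext-altivec.c is parameterized by RGB_PIXELSIZE / RGB_INDEX*. */
#define DEFINE_PIXEL_PRINTER(fn_name, r_pos, g_pos, b_pos)        \
  static void fn_name(const unsigned char *px) {                  \
    printf(#fn_name ": R=%u G=%u B=%u\n",                         \
           px[r_pos], px[g_pos], px[b_pos]);                      \
  }

/* "Re-include" the body once per pixel format, as jdmerge-altivec.c does. */
DEFINE_PIXEL_PRINTER(print_rgb, 0, 1, 2)   /* RGB byte order */
DEFINE_PIXEL_PRINTER(print_bgr, 2, 1, 0)   /* BGR byte order */
DEFINE_PIXEL_PRINTER(print_xrgb, 1, 2, 3)  /* XRGB byte order */

int main(void)
{
  const unsigned char px[4] = { 255, 128, 0, 64 };
  print_rgb(px);
  print_bgr(px);
  print_xrgb(px);
  return 0;
}

The real code does the stamping with #include rather than a single macro, which keeps the template body readable and debuggable as ordinary C.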
329
TMessagesProj/jni/mozjpeg/simd/powerpc/jdmrgext-altivec.c
Normal file
@@ -0,0 +1,329 @@
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jdmerge-altivec.c */


void jsimd_h2v1_merged_upsample_altivec(JDIMENSION output_width,
                                        JSAMPIMAGE input_buf,
                                        JDIMENSION in_row_group_ctr,
                                        JSAMPARRAY output_buf)
{
  JSAMPROW outptr, inptr0, inptr1, inptr2;
  int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
#if __BIG_ENDIAN__
  int offset;
#endif
  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];

  __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
    y, cb, cr;
#if __BIG_ENDIAN__
  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
#if RGB_PIXELSIZE == 4
  __vector unsigned char out4;
#endif
#endif
#if RGB_PIXELSIZE == 4
  __vector unsigned char rgb3;
#endif
  __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh,
    crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w,
    rl, rh, gl, gh, bl, bh, re, ro, ge, go, be, bo;
  __vector int g_y0, g_y1, g_y2, g_y3;

  /* Constants
   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
   * high-order bits, not 16.
   */
  __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
    pw_mf0228 = { __8X(-F_0_228 >> 1) },
    pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
    pw_one = { __8X(1) }, pw_255 = { __8X(255) },
    pw_cj = { __8X(CENTERJSAMPLE) };
  __vector int pd_onehalf = { __4X(ONE_HALF) };
  __vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
    shift_pack_index =
      {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
    even_index =
      {  0, 16,  0, 18,  0, 20,  0, 22,  0, 24,  0, 26,  0, 28,  0, 30 },
    odd_index =
      {  0, 17,  0, 19,  0, 21,  0, 23,  0, 25,  0, 27,  0, 29,  0, 31 };
#else
    shift_pack_index =
      {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
    even_index =
      { 16,  0, 18,  0, 20,  0, 22,  0, 24,  0, 26,  0, 28,  0, 30,  0 },
    odd_index =
      { 17,  0, 19,  0, 21,  0, 23,  0, 25,  0, 27,  0, 29,  0, 31,  0 };
#endif

  inptr0 = input_buf[0][in_row_group_ctr];
  inptr1 = input_buf[1][in_row_group_ctr];
  inptr2 = input_buf[2][in_row_group_ctr];
  outptr = output_buf[0];

  for (num_cols = pitch; num_cols > 0; inptr1 += 16, inptr2 += 16) {

    cb = vec_ld(0, inptr1);
    /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
     * support unsigned vectors.
     */
    cbl = (__vector signed short)VEC_UNPACKHU(cb);
    cbh = (__vector signed short)VEC_UNPACKLU(cb);
    cbl = vec_sub(cbl, pw_cj);
    cbh = vec_sub(cbh, pw_cj);

    cr = vec_ld(0, inptr2);
    crl = (__vector signed short)VEC_UNPACKHU(cr);
    crh = (__vector signed short)VEC_UNPACKLU(cr);
    crl = vec_sub(crl, pw_cj);
    crh = vec_sub(crh, pw_cj);

    /* (Original)
     * R = Y                + 1.40200 * Cr
     * G = Y - 0.34414 * Cb - 0.71414 * Cr
     * B = Y + 1.77200 * Cb
     *
     * (This implementation)
     * R = Y                + 0.40200 * Cr + Cr
     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
     * B = Y - 0.22800 * Cb + Cb + Cb
     */
    b_yl = vec_add(cbl, cbl);
    b_yh = vec_add(cbh, cbh);
    b_yl = vec_madds(b_yl, pw_mf0228, pw_one);
    b_yh = vec_madds(b_yh, pw_mf0228, pw_one);
    b_yl = vec_sra(b_yl, (__vector unsigned short)pw_one);
    b_yh = vec_sra(b_yh, (__vector unsigned short)pw_one);
    b_yl = vec_add(b_yl, cbl);
    b_yh = vec_add(b_yh, cbh);
    b_yl = vec_add(b_yl, cbl);
    b_yh = vec_add(b_yh, cbh);

    r_yl = vec_add(crl, crl);
    r_yh = vec_add(crh, crh);
    r_yl = vec_madds(r_yl, pw_f0402, pw_one);
    r_yh = vec_madds(r_yh, pw_f0402, pw_one);
    r_yl = vec_sra(r_yl, (__vector unsigned short)pw_one);
    r_yh = vec_sra(r_yh, (__vector unsigned short)pw_one);
    r_yl = vec_add(r_yl, crl);
    r_yh = vec_add(r_yh, crh);

    g_y0w = vec_mergeh(cbl, crl);
    g_y1w = vec_mergel(cbl, crl);
    g_y0 = vec_msums(g_y0w, pw_mf0344_f0285, pd_onehalf);
    g_y1 = vec_msums(g_y1w, pw_mf0344_f0285, pd_onehalf);
    g_y2w = vec_mergeh(cbh, crh);
    g_y3w = vec_mergel(cbh, crh);
    g_y2 = vec_msums(g_y2w, pw_mf0344_f0285, pd_onehalf);
    g_y3 = vec_msums(g_y3w, pw_mf0344_f0285, pd_onehalf);
    /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
     * each dword into a new 16-bit vector, which is the equivalent of
     * descaling the 32-bit results (right-shifting by 16 bits) and then
     * packing them.
     */
    g_yl = vec_perm((__vector short)g_y0, (__vector short)g_y1,
                    shift_pack_index);
    g_yh = vec_perm((__vector short)g_y2, (__vector short)g_y3,
                    shift_pack_index);
    g_yl = vec_sub(g_yl, crl);
    g_yh = vec_sub(g_yh, crh);

    for (yloop = 0; yloop < 2 && num_cols > 0; yloop++,
         num_cols -= RGB_PIXELSIZE * 16,
         outptr += RGB_PIXELSIZE * 16, inptr0 += 16) {

      y = vec_ld(0, inptr0);
      ye = (__vector signed short)vec_perm(pb_zero, y, even_index);
      yo = (__vector signed short)vec_perm(pb_zero, y, odd_index);

      if (yloop == 0) {
        be = vec_add(b_yl, ye);
        bo = vec_add(b_yl, yo);
        re = vec_add(r_yl, ye);
        ro = vec_add(r_yl, yo);
        ge = vec_add(g_yl, ye);
        go = vec_add(g_yl, yo);
      } else {
        be = vec_add(b_yh, ye);
        bo = vec_add(b_yh, yo);
        re = vec_add(r_yh, ye);
        ro = vec_add(r_yh, yo);
        ge = vec_add(g_yh, ye);
        go = vec_add(g_yh, yo);
      }

      rl = vec_mergeh(re, ro);
      rh = vec_mergel(re, ro);
      gl = vec_mergeh(ge, go);
      gh = vec_mergel(ge, go);
      bl = vec_mergeh(be, bo);
      bh = vec_mergel(be, bo);

      rg0 = vec_mergeh(rl, gl);
      bx0 = vec_mergeh(bl, pw_255);
      rg1 = vec_mergel(rl, gl);
      bx1 = vec_mergel(bl, pw_255);
      rg2 = vec_mergeh(rh, gh);
      bx2 = vec_mergeh(bh, pw_255);
      rg3 = vec_mergel(rh, gh);
      bx3 = vec_mergel(bh, pw_255);

      rgbx0 = vec_packsu(rg0, bx0);
      rgbx1 = vec_packsu(rg1, bx1);
      rgbx2 = vec_packsu(rg2, bx2);
      rgbx3 = vec_packsu(rg3, bx3);

#if RGB_PIXELSIZE == 3
      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
       *
       * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
       */
      rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
      rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
      rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
#else
      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
       *
       * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
       */
      rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
      rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
      rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
      rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
#endif

#if __BIG_ENDIAN__
      offset = (size_t)outptr & 15;
      if (offset) {
        __vector unsigned char unaligned_shift_index;
        int bytes = num_cols + offset;

        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
          /* Slow path to prevent buffer overwrite.  Since there is no way to
           * write a partial AltiVec register, overwrite would occur on the
           * last chunk of the last image row if the right edge is not on a
           * 16-byte boundary.  It could also occur on other rows if the bytes
           * per row is low enough.  Since we can't determine whether we're on
           * the last image row, we have to assume every row is the last.
           */
          vec_st(rgb0, 0, tmpbuf);
          vec_st(rgb1, 16, tmpbuf);
          vec_st(rgb2, 32, tmpbuf);
#if RGB_PIXELSIZE == 4
          vec_st(rgb3, 48, tmpbuf);
#endif
          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
        } else {
          /* Fast path */
          unaligned_shift_index = vec_lvsl(0, outptr);
          edgel = vec_ld(0, outptr);
          edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
          edges = vec_perm(edgeh, edgel, unaligned_shift_index);
          unaligned_shift_index = vec_lvsr(0, outptr);
          out0 = vec_perm(edges, rgb0, unaligned_shift_index);
          out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
          out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
#if RGB_PIXELSIZE == 4
          out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
          out4 = vec_perm(rgb3, edges, unaligned_shift_index);
#else
          out3 = vec_perm(rgb2, edges, unaligned_shift_index);
#endif
          vec_st(out0, 0, outptr);
          if (bytes > 16)
            vec_st(out1, 16, outptr);
          if (bytes > 32)
            vec_st(out2, 32, outptr);
          if (bytes > 48)
            vec_st(out3, 48, outptr);
#if RGB_PIXELSIZE == 4
          if (bytes > 64)
            vec_st(out4, 64, outptr);
#endif
        }
      } else {
#endif /* __BIG_ENDIAN__ */
        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
          /* Slow path */
          VEC_ST(rgb0, 0, tmpbuf);
          VEC_ST(rgb1, 16, tmpbuf);
          VEC_ST(rgb2, 32, tmpbuf);
#if RGB_PIXELSIZE == 4
          VEC_ST(rgb3, 48, tmpbuf);
#endif
          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
        } else {
          /* Fast path */
          VEC_ST(rgb0, 0, outptr);
          if (num_cols > 16)
            VEC_ST(rgb1, 16, outptr);
          if (num_cols > 32)
            VEC_ST(rgb2, 32, outptr);
#if RGB_PIXELSIZE == 4
          if (num_cols > 48)
            VEC_ST(rgb3, 48, outptr);
#endif
        }
#if __BIG_ENDIAN__
      }
#endif
    }
  }
}


void jsimd_h2v2_merged_upsample_altivec(JDIMENSION output_width,
                                        JSAMPIMAGE input_buf,
                                        JDIMENSION in_row_group_ctr,
                                        JSAMPARRAY output_buf)
{
  JSAMPROW inptr, outptr;

  inptr = input_buf[0][in_row_group_ctr];
  outptr = output_buf[0];

  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
  jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
                                     output_buf);

  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
  output_buf[0] = output_buf[1];
  jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
                                     output_buf);

  input_buf[0][in_row_group_ctr] = inptr;
  output_buf[0] = outptr;
}
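
As a cross-check on the "(Original) / (This implementation)" equations in the comment above, the following scalar sketch applies the same SCALEBITS = 16 fixed-point constants one pixel at a time. It is an illustration only (the helper names ycc_to_rgb and clamp are hypothetical, not library API), using the original, unrefactored form of the equations:

#include <stdio.h>

#define SCALEBITS  16
#define ONE_HALF   (1 << (SCALEBITS - 1))
#define F_0_344    22554   /* FIX(0.34414) */
#define F_0_714    46802   /* FIX(0.71414) */
#define F_1_402    91881   /* FIX(1.40200) */
#define F_1_772    116130  /* FIX(1.77200) */

static unsigned char clamp(long v)
{
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Scalar YCbCr -> RGB using the same fixed-point constants as the
 * AltiVec code; Cb/Cr are centered around 128 (CENTERJSAMPLE). */
static void ycc_to_rgb(int y, int cb, int cr,
                       unsigned char *r, unsigned char *g, unsigned char *b)
{
  long cbc = cb - 128, crc = cr - 128;

  *r = clamp(y + ((F_1_402 * crc + ONE_HALF) >> SCALEBITS));
  *g = clamp(y - ((F_0_344 * cbc + F_0_714 * crc + ONE_HALF) >> SCALEBITS));
  *b = clamp(y + ((F_1_772 * cbc + ONE_HALF) >> SCALEBITS));
}

int main(void)
{
  unsigned char r, g, b;
  ycc_to_rgb(128, 128, 255, &r, &g, &b);  /* strong red chroma */
  printf("R=%u G=%u B=%u\n", r, g, b);
  return 0;
}

The refactored form in the SIMD code (e.g. "R = Y + 0.40200 * Cr + Cr") exists only to keep every multiplier within the range that vec_madds()/vec_msums() can handle; algebraically it produces the same results as this scalar version.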
400
TMessagesProj/jni/mozjpeg/simd/powerpc/jdsample-altivec.c
Normal file
@@ -0,0 +1,400 @@
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* CHROMA UPSAMPLING */

#include "jsimd_altivec.h"


void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor,
                                       JDIMENSION downsampled_width,
                                       JSAMPARRAY input_data,
                                       JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow, incol;

  __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0,
    out;
  __vector short this0e, this0o, this0l, this0h, last0l, last0h,
    next0l, next0h, outle, outhe, outlo, outho;

  /* Constants */
  __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
    last_index_col0 =
      {  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14 },
    last_index =
      { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 },
    next_index =
      {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16 },
    next_index_lastcol =
      {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15 },
#if __BIG_ENDIAN__
    merge_pack_index =
      {  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31 };
#else
    merge_pack_index =
      {  0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30 };
#endif
  __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];

    if (downsampled_width & 15)
      inptr[downsampled_width] = inptr[downsampled_width - 1];

    this0 = vec_ld(0, inptr);
    p_last0 = vec_perm(this0, this0, last_index_col0);
    last0 = this0;

    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr += 16, outptr += 32) {

      if (downsampled_width - incol > 0) {
        p_last0 = vec_perm(last0, this0, last_index);
        last0 = this0;
      }

      if (incol <= 16)
        p_next0 = vec_perm(this0, this0, next_index_lastcol);
      else {
        next0 = vec_ld(16, inptr);
        p_next0 = vec_perm(this0, next0, next_index);
      }

      this0e = (__vector short)vec_mule(this0, pb_three);
      this0o = (__vector short)vec_mulo(this0, pb_three);
      this0l = vec_mergeh(this0e, this0o);
      this0h = vec_mergel(this0e, this0o);

      last0l = (__vector short)VEC_UNPACKHU(p_last0);
      last0h = (__vector short)VEC_UNPACKLU(p_last0);
      last0l = vec_add(last0l, pw_one);

      next0l = (__vector short)VEC_UNPACKHU(p_next0);
      next0h = (__vector short)VEC_UNPACKLU(p_next0);
      next0l = vec_add(next0l, pw_two);

      outle = vec_add(this0l, last0l);
      outlo = vec_add(this0l, next0l);
      outle = vec_sr(outle, (__vector unsigned short)pw_two);
      outlo = vec_sr(outlo, (__vector unsigned short)pw_two);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr);

      if (incol > 8) {
        last0h = vec_add(last0h, pw_one);
        next0h = vec_add(next0h, pw_two);

        outhe = vec_add(this0h, last0h);
        outho = vec_add(this0h, next0h);
        outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
        outho = vec_sr(outho, (__vector unsigned short)pw_two);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr);
      }

      this0 = next0;
    }
  }
}


void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor,
                                       JDIMENSION downsampled_width,
                                       JSAMPARRAY input_data,
                                       JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
  int inrow, outrow, incol;

  __vector unsigned char this_1, this0, this1, out;
  __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
    lastcolsum_1h, lastcolsum1h,
    p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
    thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
    nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 },
    nextcolsum1l = { 0 }, nextcolsum1h = { 0 },
    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
    tmpl, tmph, outle, outhe, outlo, outho;

  /* Constants */
  __vector unsigned char pb_zero = { __16X(0) },
    last_index_col0 =
      {  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13 },
    last_index =
      { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
    next_index =
      {  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17 },
    next_index_lastcol =
      {  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15 },
#if __BIG_ENDIAN__
    merge_pack_index =
      {  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31 };
#else
    merge_pack_index =
      {  0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30 };
#endif
  __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
    pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
  __vector unsigned short pw_four = { __8X(4) };

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    inptr_1 = input_data[inrow - 1];
    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow + 1];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    if (downsampled_width & 15) {
      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
    }

    this0 = vec_ld(0, inptr0);
    this0l = (__vector short)VEC_UNPACKHU(this0);
    this0h = (__vector short)VEC_UNPACKLU(this0);
    this0l = vec_mladd(this0l, pw_three, pw_zero);
    this0h = vec_mladd(this0h, pw_three, pw_zero);

    this_1 = vec_ld(0, inptr_1);
    this_1l = (__vector short)VEC_UNPACKHU(this_1);
    this_1h = (__vector short)VEC_UNPACKLU(this_1);
    thiscolsum_1l = vec_add(this0l, this_1l);
    thiscolsum_1h = vec_add(this0h, this_1h);
    lastcolsum_1h = thiscolsum_1h;
    p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
    p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);

    this1 = vec_ld(0, inptr1);
    this1l = (__vector short)VEC_UNPACKHU(this1);
    this1h = (__vector short)VEC_UNPACKLU(this1);
    thiscolsum1l = vec_add(this0l, this1l);
    thiscolsum1h = vec_add(this0h, this1h);
    lastcolsum1h = thiscolsum1h;
    p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
    p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);

    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
         outptr0 += 32, outptr1 += 32) {

      if (downsampled_width - incol > 0) {
        p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
        p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
        p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
        p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
        lastcolsum_1h = thiscolsum_1h;  lastcolsum1h = thiscolsum1h;
      }

      if (incol <= 16) {
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
                                   next_index_lastcol);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
                                  next_index_lastcol);
      } else {
        this0 = vec_ld(16, inptr0);
        this0l = (__vector short)VEC_UNPACKHU(this0);
        this0h = (__vector short)VEC_UNPACKLU(this0);
        this0l = vec_mladd(this0l, pw_three, pw_zero);
        this0h = vec_mladd(this0h, pw_three, pw_zero);

        this_1 = vec_ld(16, inptr_1);
        this_1l = (__vector short)VEC_UNPACKHU(this_1);
        this_1h = (__vector short)VEC_UNPACKLU(this_1);
        nextcolsum_1l = vec_add(this0l, this_1l);
        nextcolsum_1h = vec_add(this0h, this_1h);
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);

        this1 = vec_ld(16, inptr1);
        this1l = (__vector short)VEC_UNPACKHU(this1);
        this1h = (__vector short)VEC_UNPACKLU(this1);
        nextcolsum1l = vec_add(this0l, this1l);
        nextcolsum1h = vec_add(this0h, this1h);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
      }

      /* Process the upper row */

      tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum_1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum_1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr0);

      if (incol > 8) {
        tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum_1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum_1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr0);
      }

      /* Process the lower row */

      tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr1);

      if (incol > 8) {
        tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr1);
      }

      thiscolsum_1l = nextcolsum_1l;  thiscolsum_1h = nextcolsum_1h;
      thiscolsum1l = nextcolsum1l;  thiscolsum1h = nextcolsum1h;
    }
  }
}


/* These are rarely used (mainly just for decompressing YCCK images) */

void jsimd_h2v1_upsample_altivec(int max_v_samp_factor,
                                 JDIMENSION output_width,
                                 JSAMPARRAY input_data,
                                 JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow, incol;

  __vector unsigned char in, inl, inh;

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];

    for (incol = (output_width + 31) & (~31); incol > 0;
         incol -= 64, inptr += 32, outptr += 64) {

      in = vec_ld(0, inptr);
      inl = vec_mergeh(in, in);
      inh = vec_mergel(in, in);

      vec_st(inl, 0, outptr);
      vec_st(inh, 16, outptr);

      if (incol > 32) {
        in = vec_ld(16, inptr);
        inl = vec_mergeh(in, in);
        inh = vec_mergel(in, in);

        vec_st(inl, 32, outptr);
        vec_st(inh, 48, outptr);
      }
    }
  }
}


void jsimd_h2v2_upsample_altivec(int max_v_samp_factor,
                                 JDIMENSION output_width,
                                 JSAMPARRAY input_data,
                                 JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr0, outptr1;
  int inrow, outrow, incol;

  __vector unsigned char in, inl, inh;

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    inptr = input_data[inrow];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    for (incol = (output_width + 31) & (~31); incol > 0;
         incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) {

      in = vec_ld(0, inptr);
      inl = vec_mergeh(in, in);
      inh = vec_mergel(in, in);

      vec_st(inl, 0, outptr0);
      vec_st(inl, 0, outptr1);

      vec_st(inh, 16, outptr0);
      vec_st(inh, 16, outptr1);

      if (incol > 32) {
        in = vec_ld(16, inptr);
        inl = vec_mergeh(in, in);
        inh = vec_mergel(in, in);

        vec_st(inl, 32, outptr0);
        vec_st(inl, 32, outptr1);

        vec_st(inh, 48, outptr0);
        vec_st(inh, 48, outptr1);
      }
    }
  }
}
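
The biases added above (pw_one for even outputs and pw_two for odd ones in the h2v1 case; pw_eight and pw_seven in the h2v2 case) implement libjpeg's triangle ("fancy") filter with alternating rounding. A scalar sketch of the h2v1 variant, written by me for illustration (the function name h2v1_fancy_upsample is hypothetical), shows the same 3:1 weighting and the >> 2 descale:

#include <stdio.h>

/* Scalar h2v1 "fancy" (triangle-filter) upsampling: each input sample
 * expands to two outputs weighted 3:1 with its nearer neighbor, with
 * rounding biases of 1 and 2 so adjacent outputs round differently,
 * mirroring the pw_one / pw_two vectors in the AltiVec code. */
static void h2v1_fancy_upsample(const unsigned char *in, int in_width,
                                unsigned char *out)
{
  for (int i = 0; i < in_width; i++) {
    int last = in[i > 0 ? i - 1 : 0];             /* edge replication */
    int next = in[i < in_width - 1 ? i + 1 : i];
    out[2 * i]     = (unsigned char)((3 * in[i] + last + 1) >> 2);
    out[2 * i + 1] = (unsigned char)((3 * in[i] + next + 2) >> 2);
  }
}

int main(void)
{
  const unsigned char in[4] = { 0, 64, 128, 255 };
  unsigned char out[8];

  h2v1_fancy_upsample(in, 4, out);
  for (int i = 0; i < 8; i++)
    printf("%u ", out[i]);
  printf("\n");
  return 0;
}

The SIMD code gets the edge replication for free by duplicating the first/last byte with vec_perm() index vectors (last_index_col0, next_index_lastcol) instead of branching per sample.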
154
TMessagesProj/jni/mozjpeg/simd/powerpc/jfdctfst-altivec.c
Normal file
@@ -0,0 +1,154 @@
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* FAST INTEGER FORWARD DCT
 *
 * This is similar to the SSE2 implementation, except that we left-shift the
 * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
 * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
 *   the elements in arg3 + the most significant 17 bits of
 *     (the elements in arg1 * the elements in arg2).
 */

#include "jsimd_altivec.h"


#define F_0_382  98   /* FIX(0.382683433) */
#define F_0_541  139  /* FIX(0.541196100) */
#define F_0_707  181  /* FIX(0.707106781) */
#define F_1_306  334  /* FIX(1.306562965) */

#define CONST_BITS  8
#define PRE_MULTIPLY_SCALE_BITS  2
#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)


#define DO_FDCT() { \
  /* Even part */ \
  \
  tmp10 = vec_add(tmp0, tmp3); \
  tmp13 = vec_sub(tmp0, tmp3); \
  tmp11 = vec_add(tmp1, tmp2); \
  tmp12 = vec_sub(tmp1, tmp2); \
  \
  out0 = vec_add(tmp10, tmp11); \
  out4 = vec_sub(tmp10, tmp11); \
  \
  z1 = vec_add(tmp12, tmp13); \
  z1 = vec_sl(z1, pre_multiply_scale_bits); \
  z1 = vec_madds(z1, pw_0707, pw_zero); \
  \
  out2 = vec_add(tmp13, z1); \
  out6 = vec_sub(tmp13, z1); \
  \
  /* Odd part */ \
  \
  tmp10 = vec_add(tmp4, tmp5); \
  tmp11 = vec_add(tmp5, tmp6); \
  tmp12 = vec_add(tmp6, tmp7); \
  \
  tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
  z5 = vec_sub(tmp10, tmp12); \
  z5 = vec_madds(z5, pw_0382, pw_zero); \
  \
  z2 = vec_madds(tmp10, pw_0541, z5); \
  z4 = vec_madds(tmp12, pw_1306, z5); \
  \
  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
  z3 = vec_madds(tmp11, pw_0707, pw_zero); \
  \
  z11 = vec_add(tmp7, z3); \
  z13 = vec_sub(tmp7, z3); \
  \
  out5 = vec_add(z13, z2); \
  out3 = vec_sub(z13, z2); \
  out1 = vec_add(z11, z4); \
  out7 = vec_sub(z11, z4); \
}


void jsimd_fdct_ifast_altivec(DCTELEM *data)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    z1, z2, z3, z4, z5, z11, z13,
    out0, out1, out2, out3, out4, out5, out6, out7;

  /* Constants */
  __vector short pw_zero = { __8X(0) },
    pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
    pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
    pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
    pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
  __vector unsigned short
    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };

  /* Pass 1: process rows */

  row0 = vec_ld(0, data);
  row1 = vec_ld(16, data);
  row2 = vec_ld(32, data);
  row3 = vec_ld(48, data);
  row4 = vec_ld(64, data);
  row5 = vec_ld(80, data);
  row6 = vec_ld(96, data);
  row7 = vec_ld(112, data);

  TRANSPOSE(row, col);

  tmp0 = vec_add(col0, col7);
  tmp7 = vec_sub(col0, col7);
  tmp1 = vec_add(col1, col6);
  tmp6 = vec_sub(col1, col6);
  tmp2 = vec_add(col2, col5);
  tmp5 = vec_sub(col2, col5);
  tmp3 = vec_add(col3, col4);
  tmp4 = vec_sub(col3, col4);

  DO_FDCT();

  /* Pass 2: process columns */

  TRANSPOSE(out, row);

  tmp0 = vec_add(row0, row7);
  tmp7 = vec_sub(row0, row7);
  tmp1 = vec_add(row1, row6);
  tmp6 = vec_sub(row1, row6);
  tmp2 = vec_add(row2, row5);
  tmp5 = vec_sub(row2, row5);
  tmp3 = vec_add(row3, row4);
  tmp4 = vec_sub(row3, row4);

  DO_FDCT();

  vec_st(out0, 0, data);
  vec_st(out1, 16, data);
  vec_st(out2, 32, data);
  vec_st(out3, 48, data);
  vec_st(out4, 64, data);
  vec_st(out5, 80, data);
  vec_st(out6, 96, data);
  vec_st(out7, 112, data);
}
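
The CONST_SHIFT note in the header above is subtle: vec_madds() keeps the most significant 17 bits of the 32-bit product (i.e. (a * b) >> 15) before adding the third operand, rather than the 16 bits a plain fixed-point scheme would expect, so the constants are pre-shifted left by one bit less. A scalar model, my own illustration under the assumption that vec_madds() behaves per-element as ((a * b) >> 15) + c with 16-bit saturation, makes the scaling visible:

#include <stdio.h>

/* Scalar model of one element of AltiVec vec_madds(a, b, c):
 * ((a * b) >> 15) + c, saturated to the signed 16-bit range. */
static short madds(short a, short b, short c)
{
  long r = (((long)a * (long)b) >> 15) + (long)c;
  if (r > 32767) r = 32767;
  if (r < -32768) r = -32768;
  return (short)r;
}

#define F_0_707                  181  /* FIX(0.707106781), CONST_BITS = 8 */
#define PRE_MULTIPLY_SCALE_BITS  2
#define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - 8 - 1)

int main(void)
{
  short x = 1000;
  /* The kernel left-shifts the data by PRE_MULTIPLY_SCALE_BITS and the
   * constant by CONST_SHIFT; the top 17 bits of the product then land
   * on the correctly scaled result, approximately x * 0.7071. */
  short y = madds((short)(x << PRE_MULTIPLY_SCALE_BITS),
                  (short)(F_0_707 << CONST_SHIFT), 0);
  printf("%d * 0.7071 ~= %d\n", x, y);  /* prints ~707 */
  return 0;
}

With CONST_SHIFT = 5 here, the combined scaling is 2 + 5 + 8 = 15 bits, exactly the 15-bit descale that the >> 15 inside vec_madds() undoes.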
258
TMessagesProj/jni/mozjpeg/simd/powerpc/jfdctint-altivec.c
Normal file
@@ -0,0 +1,258 @@
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* SLOW INTEGER FORWARD DCT */

#include "jsimd_altivec.h"


#define F_0_298  2446   /* FIX(0.298631336) */
#define F_0_390  3196   /* FIX(0.390180644) */
#define F_0_541  4433   /* FIX(0.541196100) */
#define F_0_765  6270   /* FIX(0.765366865) */
#define F_0_899  7373   /* FIX(0.899976223) */
#define F_1_175  9633   /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

#define CONST_BITS  13
#define PASS1_BITS  2
#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS)


#define DO_FDCT_COMMON(PASS) { \
  /* (Original) \
   * z1 = (tmp12 + tmp13) * 0.541196100; \
   * data2 = z1 + tmp13 * 0.765366865; \
   * data6 = z1 + tmp12 * -1.847759065; \
   * \
   * (This implementation) \
   * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
   * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
   */ \
  \
  tmp1312l = vec_mergeh(tmp13, tmp12); \
  tmp1312h = vec_mergel(tmp13, tmp12); \
  \
  out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \
  out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \
  out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \
  out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \
  \
  out2l = vec_sra(out2l, descale_p##PASS); \
  out2h = vec_sra(out2h, descale_p##PASS); \
  out6l = vec_sra(out6l, descale_p##PASS); \
  out6h = vec_sra(out6h, descale_p##PASS); \
  \
  out2 = vec_pack(out2l, out2h); \
  out6 = vec_pack(out6l, out6h); \
  \
  /* Odd part */ \
  \
  z3 = vec_add(tmp4, tmp6); \
  z4 = vec_add(tmp5, tmp7); \
  \
  /* (Original) \
   * z5 = (z3 + z4) * 1.175875602; \
   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
   * z3 += z5;  z4 += z5; \
   * \
   * (This implementation) \
   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
   */ \
  \
  z34l = vec_mergeh(z3, z4); \
  z34h = vec_mergel(z3, z4); \
  \
  z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \
  z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \
  z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \
  z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \
  \
  /* (Original) \
   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6; \
   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869; \
   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110; \
   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
   * data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4; \
   * data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4; \
   * \
   * (This implementation) \
   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
   * data7 = tmp4 + z3;  data5 = tmp5 + z4; \
   * data3 = tmp6 + z3;  data1 = tmp7 + z4; \
   */ \
  \
  tmp47l = vec_mergeh(tmp4, tmp7); \
  tmp47h = vec_mergel(tmp4, tmp7); \
  \
  out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \
  out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \
  out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \
  out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \
  \
  out7l = vec_sra(out7l, descale_p##PASS); \
  out7h = vec_sra(out7h, descale_p##PASS); \
  out1l = vec_sra(out1l, descale_p##PASS); \
  out1h = vec_sra(out1h, descale_p##PASS); \
  \
  out7 = vec_pack(out7l, out7h); \
  out1 = vec_pack(out1l, out1h); \
  \
  tmp56l = vec_mergeh(tmp5, tmp6); \
  tmp56h = vec_mergel(tmp5, tmp6); \
  \
  out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \
  out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \
  out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \
  out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \
  \
  out5l = vec_sra(out5l, descale_p##PASS); \
  out5h = vec_sra(out5h, descale_p##PASS); \
  out3l = vec_sra(out3l, descale_p##PASS); \
  out3h = vec_sra(out3h, descale_p##PASS); \
  \
  out5 = vec_pack(out5l, out5h); \
  out3 = vec_pack(out3l, out3h); \
}

#define DO_FDCT_PASS1() { \
  /* Even part */ \
  \
  tmp10 = vec_add(tmp0, tmp3); \
  tmp13 = vec_sub(tmp0, tmp3); \
  tmp11 = vec_add(tmp1, tmp2); \
  tmp12 = vec_sub(tmp1, tmp2); \
  \
  out0 = vec_add(tmp10, tmp11); \
  out0 = vec_sl(out0, pass1_bits); \
  out4 = vec_sub(tmp10, tmp11); \
  out4 = vec_sl(out4, pass1_bits); \
  \
  DO_FDCT_COMMON(1); \
}

#define DO_FDCT_PASS2() { \
  /* Even part */ \
  \
  tmp10 = vec_add(tmp0, tmp3); \
  tmp13 = vec_sub(tmp0, tmp3); \
  tmp11 = vec_add(tmp1, tmp2); \
  tmp12 = vec_sub(tmp1, tmp2); \
  \
  out0 = vec_add(tmp10, tmp11); \
  out0 = vec_add(out0, pw_descale_p2x); \
  out0 = vec_sra(out0, pass1_bits); \
  out4 = vec_sub(tmp10, tmp11); \
  out4 = vec_add(out4, pw_descale_p2x); \
  out4 = vec_sra(out4, pass1_bits); \
  \
  DO_FDCT_COMMON(2); \
}


void jsimd_fdct_islow_altivec(DCTELEM *data)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
    z3, z4, z34l, z34h,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector int z3l, z3h, z4l, z4h,
    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
    out7l, out7h;

  /* Constants */
  __vector short
    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
    descale_p2 = { __4X(DESCALE_P2) };

  /* Pass 1: process rows */

  row0 = vec_ld(0, data);
  row1 = vec_ld(16, data);
  row2 = vec_ld(32, data);
  row3 = vec_ld(48, data);
  row4 = vec_ld(64, data);
  row5 = vec_ld(80, data);
  row6 = vec_ld(96, data);
  row7 = vec_ld(112, data);

  TRANSPOSE(row, col);

  tmp0 = vec_add(col0, col7);
  tmp7 = vec_sub(col0, col7);
  tmp1 = vec_add(col1, col6);
  tmp6 = vec_sub(col1, col6);
  tmp2 = vec_add(col2, col5);
  tmp5 = vec_sub(col2, col5);
  tmp3 = vec_add(col3, col4);
  tmp4 = vec_sub(col3, col4);

  DO_FDCT_PASS1();

  /* Pass 2: process columns */

  TRANSPOSE(out, row);

  tmp0 = vec_add(row0, row7);
  tmp7 = vec_sub(row0, row7);
  tmp1 = vec_add(row1, row6);
  tmp6 = vec_sub(row1, row6);
  tmp2 = vec_add(row2, row5);
  tmp5 = vec_sub(row2, row5);
  tmp3 = vec_add(row3, row4);
  tmp4 = vec_sub(row3, row4);

  DO_FDCT_PASS2();

  vec_st(out0, 0, data);
  vec_st(out1, 16, data);
  vec_st(out2, 32, data);
  vec_st(out3, 48, data);
  vec_st(out4, 64, data);
  vec_st(out5, 80, data);
  vec_st(out6, 96, data);
  vec_st(out7, 112, data);
}
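
The "(Original) / (This implementation)" comments in DO_FDCT_COMMON describe an algebraic regrouping: a rotation with a shared term z1 becomes two pairwise dot products, which is exactly the shape vec_msums() consumes after vec_mergeh()/vec_mergel() interleave the operands. A quick scalar check that the two forms agree, written for illustration only:

#include <stdio.h>

#define F_0_541  4433   /* FIX(0.541196100) */
#define F_0_765  6270   /* FIX(0.765366865) */
#define F_1_847  15137  /* FIX(1.847759065) */

int main(void)
{
  long tmp12 = 1234, tmp13 = -567;

  /* Original form: shared intermediate z1, then two more multiplies. */
  long z1 = (tmp12 + tmp13) * F_0_541;
  long data2_orig = z1 + tmp13 * F_0_765;
  long data6_orig = z1 - tmp12 * F_1_847;

  /* Regrouped form used by the AltiVec code: each output is the dot
   * product of (tmp13, tmp12) with a pair of merged constants, which
   * maps directly onto one vec_msums() per output. */
  long data2_simd = tmp13 * (F_0_541 + F_0_765) + tmp12 * F_0_541;
  long data6_simd = tmp13 * F_0_541 + tmp12 * (F_0_541 - F_1_847);

  printf("data2: %ld == %ld\n", data2_orig, data2_simd);
  printf("data6: %ld == %ld\n", data6_orig, data6_simd);
  return 0;
}

The regrouping trades one shared multiply for fully independent dot products; on AltiVec that is a win because vec_msums() also folds in the rounding constant (pd_descale_p1/p2) for free.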
255
TMessagesProj/jni/mozjpeg/simd/powerpc/jidctfst-altivec.c
Normal file
@@ -0,0 +1,255 @@
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* FAST INTEGER INVERSE DCT
 *
 * This is similar to the SSE2 implementation, except that we left-shift the
 * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
 * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
 *   the elements in arg3 + the most significant 17 bits of
 *     (the elements in arg1 * the elements in arg2).
 */

#include "jsimd_altivec.h"


#define F_1_082  277              /* FIX(1.082392200) */
#define F_1_414  362              /* FIX(1.414213562) */
#define F_1_847  473              /* FIX(1.847759065) */
#define F_2_613  669              /* FIX(2.613125930) */
#define F_1_613  (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */

#define CONST_BITS  8
#define PASS1_BITS  2
#define PRE_MULTIPLY_SCALE_BITS  2
#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)


#define DO_IDCT(in) { \
  /* Even part */ \
  \
  tmp10 = vec_add(in##0, in##4); \
  tmp11 = vec_sub(in##0, in##4); \
  tmp13 = vec_add(in##2, in##6); \
  \
  tmp12 = vec_sub(in##2, in##6); \
  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
  tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
  tmp12 = vec_sub(tmp12, tmp13); \
  \
  tmp0 = vec_add(tmp10, tmp13); \
  tmp3 = vec_sub(tmp10, tmp13); \
  tmp1 = vec_add(tmp11, tmp12); \
  tmp2 = vec_sub(tmp11, tmp12); \
  \
  /* Odd part */ \
  \
  z13 = vec_add(in##5, in##3); \
  z10 = vec_sub(in##5, in##3); \
  z10s = vec_sl(z10, pre_multiply_scale_bits); \
  z11 = vec_add(in##1, in##7); \
  z12s = vec_sub(in##1, in##7); \
  z12s = vec_sl(z12s, pre_multiply_scale_bits); \
  \
  tmp11 = vec_sub(z11, z13); \
  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
  \
  tmp7 = vec_add(z11, z13); \
  \
  /* To avoid overflow... \
   * \
   * (Original) \
   * tmp12 = -2.613125930 * z10 + z5; \
   * \
   * (This implementation) \
   * tmp12 = (-1.613125930 - 1) * z10 + z5; \
   *       = -1.613125930 * z10 - z10 + z5; \
   */ \
  \
  z5 = vec_add(z10s, z12s); \
  z5 = vec_madds(z5, pw_F1847, pw_zero); \
  \
  tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
  tmp10 = vec_sub(tmp10, z5); \
  tmp12 = vec_madds(z10s, pw_MF1613, z5); \
  tmp12 = vec_sub(tmp12, z10); \
  \
  tmp6 = vec_sub(tmp12, tmp7); \
  tmp5 = vec_sub(tmp11, tmp6); \
  tmp4 = vec_add(tmp10, tmp5); \
  \
  out0 = vec_add(tmp0, tmp7); \
  out1 = vec_add(tmp1, tmp6); \
  out2 = vec_add(tmp2, tmp5); \
  out3 = vec_sub(tmp3, tmp4); \
  out4 = vec_add(tmp3, tmp4); \
  out5 = vec_sub(tmp2, tmp5); \
  out6 = vec_sub(tmp1, tmp6); \
  out7 = vec_sub(tmp0, tmp7); \
}


void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block,
                              JSAMPARRAY output_buf, JDIMENSION output_col)
{
  short *dct_table = (short *)dct_table_;
  int *outptr;

  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    z5, z10, z10s, z11, z12s, z13,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector signed char outb;

  /* Constants */
  __vector short pw_zero = { __8X(0) },
    pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
    pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
    pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },
    pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
  __vector unsigned short
    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
    pass1_bits3 = { __8X(PASS1_BITS + 3) };
  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

  /* Pass 1: process columns */

  col0 = vec_ld(0, coef_block);
  col1 = vec_ld(16, coef_block);
  col2 = vec_ld(32, coef_block);
  col3 = vec_ld(48, coef_block);
  col4 = vec_ld(64, coef_block);
  col5 = vec_ld(80, coef_block);
  col6 = vec_ld(96, coef_block);
  col7 = vec_ld(112, coef_block);

  tmp1 = vec_or(col1, col2);
  tmp2 = vec_or(col3, col4);
  tmp1 = vec_or(tmp1, tmp2);
  tmp3 = vec_or(col5, col6);
  tmp3 = vec_or(tmp3, col7);
  tmp1 = vec_or(tmp1, tmp3);

  quant0 = vec_ld(0, dct_table);
  col0 = vec_mladd(col0, quant0, pw_zero);

  if (vec_all_eq(tmp1, pw_zero)) {
    /* AC terms all zero */

    row0 = vec_splat(col0, 0);
    row1 = vec_splat(col0, 1);
    row2 = vec_splat(col0, 2);
    row3 = vec_splat(col0, 3);
    row4 = vec_splat(col0, 4);
    row5 = vec_splat(col0, 5);
    row6 = vec_splat(col0, 6);
    row7 = vec_splat(col0, 7);

  } else {

    quant1 = vec_ld(16, dct_table);
    quant2 = vec_ld(32, dct_table);
    quant3 = vec_ld(48, dct_table);
    quant4 = vec_ld(64, dct_table);
    quant5 = vec_ld(80, dct_table);
    quant6 = vec_ld(96, dct_table);
    quant7 = vec_ld(112, dct_table);

    col1 = vec_mladd(col1, quant1, pw_zero);
    col2 = vec_mladd(col2, quant2, pw_zero);
    col3 = vec_mladd(col3, quant3, pw_zero);
    col4 = vec_mladd(col4, quant4, pw_zero);
    col5 = vec_mladd(col5, quant5, pw_zero);
    col6 = vec_mladd(col6, quant6, pw_zero);
    col7 = vec_mladd(col7, quant7, pw_zero);

    DO_IDCT(col);

    TRANSPOSE(out, row);
  }

  /* Pass 2: process rows */

  DO_IDCT(row);

  out0 = vec_sra(out0, pass1_bits3);
  out1 = vec_sra(out1, pass1_bits3);
  out2 = vec_sra(out2, pass1_bits3);
  out3 = vec_sra(out3, pass1_bits3);
  out4 = vec_sra(out4, pass1_bits3);
  out5 = vec_sra(out5, pass1_bits3);
  out6 = vec_sra(out6, pass1_bits3);
  out7 = vec_sra(out7, pass1_bits3);

  TRANSPOSE(out, col);

  outb = vec_packs(col0, col0);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[0] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col1, col1);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[1] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col2, col2);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[2] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col3, col3);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[3] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col4, col4);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[4] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col5, col5);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[5] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col6, col6);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[6] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col7, col7);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[7] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);
}
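
The vec_all_eq() test above implements a standard IDCT shortcut: when every AC coefficient in a block is zero, the inverse transform is flat, so the whole column pass collapses to broadcasting the dequantized DC term. A scalar sketch of the same shortcut (my own illustration; the names dc_only_shortcut and the simplified scaling are hypothetical, and descaling/level shift are omitted for brevity):

#include <stdio.h>

/* If every AC coefficient is zero, the 8x8 IDCT output is constant:
 * skip the full transform and broadcast the dequantized DC value,
 * as the AltiVec code does with vec_splat(). */
static int dc_only_shortcut(const short coef[64], const short quant[64],
                            short out[64])
{
  for (int i = 1; i < 64; i++)
    if (coef[i] != 0)
      return 0;               /* AC terms present; run the full IDCT */

  short dc = (short)(coef[0] * quant[0]);
  for (int i = 0; i < 64; i++)
    out[i] = dc;
  return 1;
}

int main(void)
{
  short coef[64] = { 5 };     /* DC = 5, all AC coefficients zero */
  short quant[64], out[64];
  for (int i = 0; i < 64; i++) quant[i] = 16;

  if (dc_only_shortcut(coef, quant, out))
    printf("flat block, value %d\n", out[0]);
  return 0;
}

In the vector version the zero test itself is cheap: six vec_or() operations reduce the seven AC rows to one vector, and a single vec_all_eq() against zero decides the branch.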
357
TMessagesProj/jni/mozjpeg/simd/powerpc/jidctint-altivec.c
Normal file
357
TMessagesProj/jni/mozjpeg/simd/powerpc/jidctint-altivec.c
Normal file
|
|
@ -0,0 +1,357 @@
|
|||
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* SLOW INTEGER INVERSE DCT */

#include "jsimd_altivec.h"


#define F_0_298  2446   /* FIX(0.298631336) */
#define F_0_390  3196   /* FIX(0.390180644) */
#define F_0_541  4433   /* FIX(0.541196100) */
#define F_0_765  6270   /* FIX(0.765366865) */
#define F_0_899  7373   /* FIX(0.899976223) */
#define F_1_175  9633   /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

#define CONST_BITS  13
#define PASS1_BITS  2
#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
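
/* Illustrative note (not part of the original commit): the F_* values
 * above are the DCT rotation factors in CONST_BITS-bit fixed point,
 * i.e. FIX(x) = round(x * 2^13).  A minimal scalar sketch:
 *
 *   #define FIX(x)  ((int)((x) * (1 << 13) + 0.5))   -- FIX(0.298631336) == 2446
 *
 * and descaling by n bits with rounding is
 *
 *   static inline int descale(int x, int n) { return (x + (1 << (n - 1))) >> n; }
 *
 * DESCALE_P1/DESCALE_P2 are the n used after pass 1 and pass 2. */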


#define DO_IDCT(in, PASS) { \
  /* Even part \
   * \
   * (Original) \
   * z1 = (z2 + z3) * 0.541196100; \
   * tmp2 = z1 + z3 * -1.847759065; \
   * tmp3 = z1 + z2 * 0.765366865; \
   * \
   * (This implementation) \
   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
   */ \
  \
  in##26l = vec_mergeh(in##2, in##6); \
  in##26h = vec_mergel(in##2, in##6); \
  \
  tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
  tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
  tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
  tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
  \
  tmp0 = vec_add(in##0, in##4); \
  tmp1 = vec_sub(in##0, in##4); \
  \
  tmp0l = vec_unpackh(tmp0); \
  tmp0h = vec_unpackl(tmp0); \
  tmp0l = vec_sl(tmp0l, const_bits); \
  tmp0h = vec_sl(tmp0h, const_bits); \
  tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
  tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
  \
  tmp10l = vec_add(tmp0l, tmp3l); \
  tmp10h = vec_add(tmp0h, tmp3h); \
  tmp13l = vec_sub(tmp0l, tmp3l); \
  tmp13h = vec_sub(tmp0h, tmp3h); \
  \
  tmp1l = vec_unpackh(tmp1); \
  tmp1h = vec_unpackl(tmp1); \
  tmp1l = vec_sl(tmp1l, const_bits); \
  tmp1h = vec_sl(tmp1h, const_bits); \
  tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
  tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
  \
  tmp11l = vec_add(tmp1l, tmp2l); \
  tmp11h = vec_add(tmp1h, tmp2h); \
  tmp12l = vec_sub(tmp1l, tmp2l); \
  tmp12h = vec_sub(tmp1h, tmp2h); \
  \
  /* Odd part */ \
  \
  z3 = vec_add(in##3, in##7); \
  z4 = vec_add(in##1, in##5); \
  \
  /* (Original) \
   * z5 = (z3 + z4) * 1.175875602; \
   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
   * z3 += z5;  z4 += z5; \
   * \
   * (This implementation) \
   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
   */ \
  \
  z34l = vec_mergeh(z3, z4); \
  z34h = vec_mergel(z3, z4); \
  \
  z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
  z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
  z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
  z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
  \
  /* (Original) \
   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2; \
   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869; \
   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110; \
   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
   * tmp0 += z1 + z3;  tmp1 += z2 + z4; \
   * tmp2 += z2 + z3;  tmp3 += z1 + z4; \
   * \
   * (This implementation) \
   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
   * tmp0 += z3;  tmp1 += z4; \
   * tmp2 += z3;  tmp3 += z4; \
   */ \
  \
  in##71l = vec_mergeh(in##7, in##1); \
  in##71h = vec_mergel(in##7, in##1); \
  \
  tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
  tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
  tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
  tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
  \
  in##53l = vec_mergeh(in##5, in##3); \
  in##53h = vec_mergel(in##5, in##3); \
  \
  tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
  tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
  tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
  tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
  \
  /* Final output stage */ \
  \
  out0l = vec_add(tmp10l, tmp3l); \
  out0h = vec_add(tmp10h, tmp3h); \
  out7l = vec_sub(tmp10l, tmp3l); \
  out7h = vec_sub(tmp10h, tmp3h); \
  \
  out0l = vec_sra(out0l, descale_p##PASS); \
  out0h = vec_sra(out0h, descale_p##PASS); \
  out7l = vec_sra(out7l, descale_p##PASS); \
  out7h = vec_sra(out7h, descale_p##PASS); \
  \
  out0 = vec_pack(out0l, out0h); \
  out7 = vec_pack(out7l, out7h); \
  \
  out1l = vec_add(tmp11l, tmp2l); \
  out1h = vec_add(tmp11h, tmp2h); \
  out6l = vec_sub(tmp11l, tmp2l); \
  out6h = vec_sub(tmp11h, tmp2h); \
  \
  out1l = vec_sra(out1l, descale_p##PASS); \
  out1h = vec_sra(out1h, descale_p##PASS); \
  out6l = vec_sra(out6l, descale_p##PASS); \
  out6h = vec_sra(out6h, descale_p##PASS); \
  \
  out1 = vec_pack(out1l, out1h); \
  out6 = vec_pack(out6l, out6h); \
  \
  out2l = vec_add(tmp12l, tmp1l); \
  out2h = vec_add(tmp12h, tmp1h); \
  out5l = vec_sub(tmp12l, tmp1l); \
  out5h = vec_sub(tmp12h, tmp1h); \
  \
  out2l = vec_sra(out2l, descale_p##PASS); \
  out2h = vec_sra(out2h, descale_p##PASS); \
  out5l = vec_sra(out5l, descale_p##PASS); \
  out5h = vec_sra(out5h, descale_p##PASS); \
  \
  out2 = vec_pack(out2l, out2h); \
  out5 = vec_pack(out5l, out5h); \
  \
  out3l = vec_add(tmp13l, tmp0l); \
  out3h = vec_add(tmp13h, tmp0h); \
  out4l = vec_sub(tmp13l, tmp0l); \
  out4h = vec_sub(tmp13h, tmp0h); \
  \
  out3l = vec_sra(out3l, descale_p##PASS); \
  out3h = vec_sra(out3h, descale_p##PASS); \
  out4l = vec_sra(out4l, descale_p##PASS); \
  out4h = vec_sra(out4h, descale_p##PASS); \
  \
  out3 = vec_pack(out3l, out3h); \
  out4 = vec_pack(out4l, out4h); \
}
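
/* Illustrative scalar model (not part of the original commit) of the
 * even-part factorization above: both forms compute the same rotation,
 * but the rewritten form maps directly onto vec_msums multiply-sum ops.
 * A minimal sketch in plain integer C:
 *
 *   void rotate_even(int z2, int z3, int *tmp2, int *tmp3)
 *   {
 *     *tmp2 = z2 * F_0_541 + z3 * (F_0_541 - F_1_847);
 *     *tmp3 = z2 * (F_0_541 + F_0_765) + z3 * F_0_541;
 *   }
 *
 * which equals z1 = (z2 + z3) * 0.541196100; tmp2 = z1 - z3 * 1.847759065;
 * tmp3 = z1 + z2 * 0.765366865 in the original notation, up to
 * fixed-point rounding. */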


void jsimd_idct_islow_altivec(void *dct_table_, JCOEFPTR coef_block,
                              JSAMPARRAY output_buf, JDIMENSION output_col)
{
  short *dct_table = (short *)dct_table_;
  int *outptr;

  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
    tmp0, tmp1, tmp2, tmp3, z3, z4,
    z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
    row71l, row71h, row26l, row26h, row53l, row53h,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
    tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
    z3l, z3h, z4l, z4h,
    out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
    out5l, out5h, out6l, out6h, out7l, out7h;
  __vector signed char outb;

  /* Constants */
  __vector short pw_zero = { __8X(0) },
    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
  __vector int pd_zero = { __4X(0) },
    pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
    descale_p2 = { __4X(DESCALE_P2) },
    const_bits = { __4X(CONST_BITS) };
  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

  /* Pass 1: process columns */

  col0 = vec_ld(0, coef_block);
  col1 = vec_ld(16, coef_block);
  col2 = vec_ld(32, coef_block);
  col3 = vec_ld(48, coef_block);
  col4 = vec_ld(64, coef_block);
  col5 = vec_ld(80, coef_block);
  col6 = vec_ld(96, coef_block);
  col7 = vec_ld(112, coef_block);

  tmp1 = vec_or(col1, col2);
  tmp2 = vec_or(col3, col4);
  tmp1 = vec_or(tmp1, tmp2);
  tmp3 = vec_or(col5, col6);
  tmp3 = vec_or(tmp3, col7);
  tmp1 = vec_or(tmp1, tmp3);

  quant0 = vec_ld(0, dct_table);
  col0 = vec_mladd(col0, quant0, pw_zero);

  if (vec_all_eq(tmp1, pw_zero)) {
    /* AC terms all zero */

    col0 = vec_sl(col0, pass1_bits);

    row0 = vec_splat(col0, 0);
    row1 = vec_splat(col0, 1);
    row2 = vec_splat(col0, 2);
    row3 = vec_splat(col0, 3);
    row4 = vec_splat(col0, 4);
    row5 = vec_splat(col0, 5);
    row6 = vec_splat(col0, 6);
    row7 = vec_splat(col0, 7);

  } else {

    quant1 = vec_ld(16, dct_table);
    quant2 = vec_ld(32, dct_table);
    quant3 = vec_ld(48, dct_table);
    quant4 = vec_ld(64, dct_table);
    quant5 = vec_ld(80, dct_table);
    quant6 = vec_ld(96, dct_table);
    quant7 = vec_ld(112, dct_table);

    col1 = vec_mladd(col1, quant1, pw_zero);
    col2 = vec_mladd(col2, quant2, pw_zero);
    col3 = vec_mladd(col3, quant3, pw_zero);
    col4 = vec_mladd(col4, quant4, pw_zero);
    col5 = vec_mladd(col5, quant5, pw_zero);
    col6 = vec_mladd(col6, quant6, pw_zero);
    col7 = vec_mladd(col7, quant7, pw_zero);

    DO_IDCT(col, 1);

    TRANSPOSE(out, row);
  }

  /* Pass 2: process rows */

  DO_IDCT(row, 2);

  TRANSPOSE(out, col);

  outb = vec_packs(col0, col0);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[0] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);
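  /* Illustrative note (not part of the original commit): vec_packs
   * saturates the descaled 16-bit results to signed bytes, and adding
   * pb_centerjsamp (CENTERJSAMPLE per byte, wrapping mod 256) re-centers
   * them to the unsigned 0..255 sample range.  A scalar model per sample:
   * out[i] = (unsigned char)(clamp(x, -128, 127) + 128).  The two vec_ste
   * element stores then write the row's eight packed bytes as two 32-bit
   * words. */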

  outb = vec_packs(col1, col1);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[1] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col2, col2);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[2] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col3, col3);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[3] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col4, col4);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[4] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col5, col5);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[5] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col6, col6);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[6] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col7, col7);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[7] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);
}
250
TMessagesProj/jni/mozjpeg/simd/powerpc/jquanti-altivec.c
Normal file
@@ -0,0 +1,250 @@
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */

#include "jsimd_altivec.h"


/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
 * always get the data we want by using a single vector load (although we may
 * have to permute the result.)
 */
#if __BIG_ENDIAN__

#define LOAD_ROW(row) { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_ld(0, elemptr); \
  if ((size_t)elemptr & 15) \
    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
}

#else

#define LOAD_ROW(row) { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_vsx_ld(0, elemptr); \
}

#endif
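
/* Illustrative note (not part of the original commit): on big-endian
 * AltiVec, vec_ld masks off the low 4 address bits, so an address that is
 * only 8-byte aligned returns the enclosing 16-byte block; the
 * vec_lvsl/vec_perm pair then rotates the eight wanted samples to the
 * front of the vector.  A scalar model of the whole macro is simply:
 *
 *   memcpy(&in_row, sample_data[row] + start_col, 8);
 *
 * On VSX (little-endian path) vec_vsx_ld handles the unaligned load
 * directly, so no permute is needed. */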


void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col,
                            DCTELEM *workspace)
{
  JSAMPROW elemptr;

  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
  __vector short out0, out1, out2, out3, out4, out5, out6, out7;

  /* Constants */
  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
  __vector unsigned char pb_zero = { __16X(0) };

  LOAD_ROW(0);
  LOAD_ROW(1);
  LOAD_ROW(2);
  LOAD_ROW(3);
  LOAD_ROW(4);
  LOAD_ROW(5);
  LOAD_ROW(6);
  LOAD_ROW(7);

  out0 = (__vector short)VEC_UNPACKHU(in0);
  out1 = (__vector short)VEC_UNPACKHU(in1);
  out2 = (__vector short)VEC_UNPACKHU(in2);
  out3 = (__vector short)VEC_UNPACKHU(in3);
  out4 = (__vector short)VEC_UNPACKHU(in4);
  out5 = (__vector short)VEC_UNPACKHU(in5);
  out6 = (__vector short)VEC_UNPACKHU(in6);
  out7 = (__vector short)VEC_UNPACKHU(in7);

  out0 = vec_sub(out0, pw_centerjsamp);
  out1 = vec_sub(out1, pw_centerjsamp);
  out2 = vec_sub(out2, pw_centerjsamp);
  out3 = vec_sub(out3, pw_centerjsamp);
  out4 = vec_sub(out4, pw_centerjsamp);
  out5 = vec_sub(out5, pw_centerjsamp);
  out6 = vec_sub(out6, pw_centerjsamp);
  out7 = vec_sub(out7, pw_centerjsamp);

  vec_st(out0, 0, workspace);
  vec_st(out1, 16, workspace);
  vec_st(out2, 32, workspace);
  vec_st(out3, 48, workspace);
  vec_st(out4, 64, workspace);
  vec_st(out5, 80, workspace);
  vec_st(out6, 96, workspace);
  vec_st(out7, 112, workspace);
}
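
/* Illustrative scalar equivalent (not part of the original commit) of
 * jsimd_convsamp_altivec: widen each 8-bit sample to 16 bits and
 * re-center it around zero before the forward DCT.
 *
 *   for (int row = 0; row < 8; row++)
 *     for (int col = 0; col < 8; col++)
 *       workspace[row * 8 + col] =
 *         (DCTELEM)sample_data[row][start_col + col] - CENTERJSAMPLE;
 */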


#define WORD_BIT  16

/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
   We basically need an unsigned equivalent of vec_madds(). */

#define MULTIPLY(vs0, vs1, out) { \
  tmpe = vec_mule((__vector unsigned short)vs0, \
                  (__vector unsigned short)vs1); \
  tmpo = vec_mulo((__vector unsigned short)vs0, \
                  (__vector unsigned short)vs1); \
  out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
                                 (__vector unsigned short)tmpo, \
                                 shift_pack_index); \
}
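
/* Illustrative scalar model (not part of the original commit): MULTIPLY
 * keeps the high 16 bits of each unsigned 16x16-bit product, i.e. per lane
 *
 *   out[i] = (unsigned short)(((unsigned int)vs0[i] * vs1[i]) >> 16);
 *
 * vec_mule/vec_mulo produce the even/odd 32-bit products, and the
 * endian-specific shift_pack_index permute picks each product's upper
 * halfword and re-interleaves the lanes into their original order. */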

void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
                            DCTELEM *workspace)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
    corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
  __vector unsigned int tmpe, tmpo;

  /* Constants */
  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
#if __BIG_ENDIAN__
  __vector unsigned char shift_pack_index =
    { 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 };
#else
  __vector unsigned char shift_pack_index =
    { 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
#endif

  row0 = vec_ld(0, workspace);
  row1 = vec_ld(16, workspace);
  row2 = vec_ld(32, workspace);
  row3 = vec_ld(48, workspace);
  row4 = vec_ld(64, workspace);
  row5 = vec_ld(80, workspace);
  row6 = vec_ld(96, workspace);
  row7 = vec_ld(112, workspace);

  /* Branch-less absolute value */
  row0s = vec_sra(row0, pw_word_bit_m1);
  row1s = vec_sra(row1, pw_word_bit_m1);
  row2s = vec_sra(row2, pw_word_bit_m1);
  row3s = vec_sra(row3, pw_word_bit_m1);
  row4s = vec_sra(row4, pw_word_bit_m1);
  row5s = vec_sra(row5, pw_word_bit_m1);
  row6s = vec_sra(row6, pw_word_bit_m1);
  row7s = vec_sra(row7, pw_word_bit_m1);
  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);
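
  /* Illustrative note (not part of the original commit): this is the
   * classic shift/xor/subtract absolute value.  Per 16-bit lane,
   *
   *   s   = x >> 15;        -- arithmetic shift: 0 if x >= 0, -1 if x < 0
   *   |x| = (x ^ s) - s;    -- identity for s in {0, -1}
   *
   * The sign masks rowNs are kept so the signs can be restored after
   * quantization below. */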

  corr0 = vec_ld(DCTSIZE2 * 2, divisors);
  corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
  corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
  corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
  corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
  corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
  corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
  corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);

  row0 = vec_add(row0, corr0);
  row1 = vec_add(row1, corr1);
  row2 = vec_add(row2, corr2);
  row3 = vec_add(row3, corr3);
  row4 = vec_add(row4, corr4);
  row5 = vec_add(row5, corr5);
  row6 = vec_add(row6, corr6);
  row7 = vec_add(row7, corr7);

  recip0 = vec_ld(0, divisors);
  recip1 = vec_ld(16, divisors);
  recip2 = vec_ld(32, divisors);
  recip3 = vec_ld(48, divisors);
  recip4 = vec_ld(64, divisors);
  recip5 = vec_ld(80, divisors);
  recip6 = vec_ld(96, divisors);
  recip7 = vec_ld(112, divisors);

  MULTIPLY(row0, recip0, row0);
  MULTIPLY(row1, recip1, row1);
  MULTIPLY(row2, recip2, row2);
  MULTIPLY(row3, recip3, row3);
  MULTIPLY(row4, recip4, row4);
  MULTIPLY(row5, recip5, row5);
  MULTIPLY(row6, recip6, row6);
  MULTIPLY(row7, recip7, row7);

  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);

  MULTIPLY(row0, scale0, row0);
  MULTIPLY(row1, scale1, row1);
  MULTIPLY(row2, scale2, row2);
  MULTIPLY(row3, scale3, row3);
  MULTIPLY(row4, scale4, row4);
  MULTIPLY(row5, scale5, row5);
  MULTIPLY(row6, scale6, row6);
  MULTIPLY(row7, scale7, row7);

  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

  vec_st(row0, 0, coef_block);
  vec_st(row1, 16, coef_block);
  vec_st(row2, 32, coef_block);
  vec_st(row3, 48, coef_block);
  vec_st(row4, 64, coef_block);
  vec_st(row5, 80, coef_block);
  vec_st(row6, 96, coef_block);
  vec_st(row7, 112, coef_block);
}
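
/* Illustrative scalar model (not part of the original commit) of the
 * three-table divisor scheme above, with the layout implied by the byte
 * offsets in the loads (reciprocals at element 0, corrections at
 * DCTSIZE2 elements, scales at 2 * DCTSIZE2 elements):
 *
 *   unsigned short recip = divisors[i];
 *   unsigned short corr  = divisors[DCTSIZE2 + i];
 *   unsigned short scale = divisors[DCTSIZE2 * 2 + i];
 *   int x = workspace[i], s = x >> 15;
 *   unsigned int t = (unsigned int)((x ^ s) - s) + corr;   -- |x| + correction
 *   t = (((t * recip) >> 16) * scale) >> 16;               -- two high-half multiplies
 *   coef_block[i] = (JCOEF)(((int)t ^ s) - s);             -- restore the sign
 */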
872
TMessagesProj/jni/mozjpeg/simd/powerpc/jsimd.c
Normal file
@@ -0,0 +1,872 @@
/*
 * jsimd_powerpc.c
 *
 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
 *
 * Based on the x86 SIMD extension for IJG JPEG library,
 * Copyright (C) 1999-2006, MIYASAKA Masaru.
 * For conditions of distribution and use, see copyright notice in jsimdext.inc
 *
 * This file contains the interface between the "normal" portions
 * of the library and the SIMD implementations when running on a
 * PowerPC architecture.
 */

#ifdef __amigaos4__
/* This must be defined first as it re-defines GLOBAL otherwise */
#include <proto/exec.h>
#endif

#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"

#include <stdio.h>
#include <string.h>
#include <ctype.h>

#if defined(__OpenBSD__)
#include <sys/param.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#endif

static unsigned int simd_support = ~0;

#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))

#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)

LOCAL(int)
check_feature(char *buffer, char *feature)
{
  char *p;

  if (*feature == 0)
    return 0;
  if (strncmp(buffer, "cpu", 3) != 0)
    return 0;
  buffer += 3;
  while (isspace(*buffer))
    buffer++;

  /* Check if 'feature' is present in the buffer as a separate word */
  while ((p = strstr(buffer, feature))) {
    if (p > buffer && !isspace(*(p - 1))) {
      buffer++;
      continue;
    }
    p += strlen(feature);
    if (*p != 0 && !isspace(*p)) {
      buffer++;
      continue;
    }
    return 1;
  }
  return 0;
}
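
/* Illustrative example (not part of the original commit): given a
 * /proc/cpuinfo line such as
 *
 *   cpu    : 7447A, altivec supported
 *
 * check_feature(line, "altivec") returns 1: the line starts with "cpu",
 * and "altivec" appears later delimited by whitespace on both sides.
 * A substring hit inside a longer token (e.g. "noaltivec") is rejected
 * by the isspace() checks on either side. */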

LOCAL(int)
parse_proc_cpuinfo(int bufsize)
{
  char *buffer = (char *)malloc(bufsize);
  FILE *fd;

  simd_support = 0;

  if (!buffer)
    return 0;

  fd = fopen("/proc/cpuinfo", "r");
  if (fd) {
    while (fgets(buffer, bufsize, fd)) {
      if (!strchr(buffer, '\n') && !feof(fd)) {
        /* "impossible" happened - insufficient size of the buffer! */
        fclose(fd);
        free(buffer);
        return 0;
      }
      if (check_feature(buffer, "altivec"))
        simd_support |= JSIMD_ALTIVEC;
    }
    fclose(fd);
  }
  free(buffer);
  return 1;
}

#endif

/*
 * Check what SIMD accelerations are supported.
 *
 * FIXME: This code is racy under a multi-threaded environment.
 */
LOCAL(void)
init_simd(void)
{
#ifndef NO_GETENV
  char *env = NULL;
#endif
#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
  int bufsize = 1024;  /* an initial guess for the line buffer size limit */
#elif defined(__amigaos4__)
  uint32 altivec = 0;
#elif defined(__OpenBSD__)
  int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
  int altivec;
  size_t len = sizeof(altivec);
#endif

  if (simd_support != ~0U)
    return;

  simd_support = 0;

#if defined(__ALTIVEC__) || defined(__APPLE__)
  simd_support |= JSIMD_ALTIVEC;
#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
  while (!parse_proc_cpuinfo(bufsize)) {
    bufsize *= 2;
    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
      break;
  }
#elif defined(__amigaos4__)
  IExec->GetCPUInfoTags(GCIT_VectorUnit, &altivec, TAG_DONE);
  if (altivec == VECTORTYPE_ALTIVEC)
    simd_support |= JSIMD_ALTIVEC;
#elif defined(__OpenBSD__)
  if (sysctl(mib, 2, &altivec, &len, NULL, 0) == 0 && altivec != 0)
    simd_support |= JSIMD_ALTIVEC;
#endif

#ifndef NO_GETENV
  /* Force different settings through environment variables */
  env = getenv("JSIMD_FORCEALTIVEC");
  if ((env != NULL) && (strcmp(env, "1") == 0))
    simd_support = JSIMD_ALTIVEC;
  env = getenv("JSIMD_FORCENONE");
  if ((env != NULL) && (strcmp(env, "1") == 0))
    simd_support = 0;
#endif
}
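
/* Illustrative usage (not part of the original commit): the environment
 * overrides above let the dispatch be exercised without matching
 * hardware, e.g. in shell syntax
 *
 *   JSIMD_FORCENONE=1    ./cjpeg ...   -- force the plain-C fallback paths
 *   JSIMD_FORCEALTIVEC=1 ./djpeg ...   -- claim AltiVec regardless of probing
 *
 * Every jsimd_can_*() entry point calls init_simd() first, and the
 * simd_support != ~0U guard makes repeat calls cheap (modulo the FIXME
 * about thread safety noted above). */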

GLOBAL(int)
jsimd_can_rgb_ycc(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8)
    return 0;
  if (sizeof(JDIMENSION) != 4)
    return 0;
  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(int)
jsimd_can_rgb_gray(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8)
    return 0;
  if (sizeof(JDIMENSION) != 4)
    return 0;
  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(int)
jsimd_can_ycc_rgb(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8)
    return 0;
  if (sizeof(JDIMENSION) != 4)
    return 0;
  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(int)
jsimd_can_ycc_rgb565(void)
{
  return 0;
}

GLOBAL(void)
jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
                      JSAMPIMAGE output_buf, JDIMENSION output_row,
                      int num_rows)
{
  void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);

  switch (cinfo->in_color_space) {
  case JCS_EXT_RGB:
    altivecfct = jsimd_extrgb_ycc_convert_altivec;
    break;
  case JCS_EXT_RGBX:
  case JCS_EXT_RGBA:
    altivecfct = jsimd_extrgbx_ycc_convert_altivec;
    break;
  case JCS_EXT_BGR:
    altivecfct = jsimd_extbgr_ycc_convert_altivec;
    break;
  case JCS_EXT_BGRX:
  case JCS_EXT_BGRA:
    altivecfct = jsimd_extbgrx_ycc_convert_altivec;
    break;
  case JCS_EXT_XBGR:
  case JCS_EXT_ABGR:
    altivecfct = jsimd_extxbgr_ycc_convert_altivec;
    break;
  case JCS_EXT_XRGB:
  case JCS_EXT_ARGB:
    altivecfct = jsimd_extxrgb_ycc_convert_altivec;
    break;
  default:
    altivecfct = jsimd_rgb_ycc_convert_altivec;
    break;
  }

  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
}
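
/* Illustrative note (not part of the original commit): each jsimd_can_*()
 * predicate is paired with a worker, so the caller pattern in the library
 * is, in sketch form:
 *
 *   if (jsimd_can_rgb_ycc())
 *     jsimd_rgb_ycc_convert(cinfo, input_buf, output_buf, output_row, n);
 *   else
 *     fall_back_to_plain_c_converter(cinfo);   -- hypothetical fallback name
 *
 * The switch above only selects a pixel-layout-specialized AltiVec
 * kernel; every variant shares the same argument signature, which is
 * what makes the single function-pointer dispatch possible. */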

GLOBAL(void)
jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
                       JSAMPIMAGE output_buf, JDIMENSION output_row,
                       int num_rows)
{
  void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);

  switch (cinfo->in_color_space) {
  case JCS_EXT_RGB:
    altivecfct = jsimd_extrgb_gray_convert_altivec;
    break;
  case JCS_EXT_RGBX:
  case JCS_EXT_RGBA:
    altivecfct = jsimd_extrgbx_gray_convert_altivec;
    break;
  case JCS_EXT_BGR:
    altivecfct = jsimd_extbgr_gray_convert_altivec;
    break;
  case JCS_EXT_BGRX:
  case JCS_EXT_BGRA:
    altivecfct = jsimd_extbgrx_gray_convert_altivec;
    break;
  case JCS_EXT_XBGR:
  case JCS_EXT_ABGR:
    altivecfct = jsimd_extxbgr_gray_convert_altivec;
    break;
  case JCS_EXT_XRGB:
  case JCS_EXT_ARGB:
    altivecfct = jsimd_extxrgb_gray_convert_altivec;
    break;
  default:
    altivecfct = jsimd_rgb_gray_convert_altivec;
    break;
  }

  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
}

GLOBAL(void)
jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                      JDIMENSION input_row, JSAMPARRAY output_buf,
                      int num_rows)
{
  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);

  switch (cinfo->out_color_space) {
  case JCS_EXT_RGB:
    altivecfct = jsimd_ycc_extrgb_convert_altivec;
    break;
  case JCS_EXT_RGBX:
  case JCS_EXT_RGBA:
    altivecfct = jsimd_ycc_extrgbx_convert_altivec;
    break;
  case JCS_EXT_BGR:
    altivecfct = jsimd_ycc_extbgr_convert_altivec;
    break;
  case JCS_EXT_BGRX:
  case JCS_EXT_BGRA:
    altivecfct = jsimd_ycc_extbgrx_convert_altivec;
    break;
  case JCS_EXT_XBGR:
  case JCS_EXT_ABGR:
    altivecfct = jsimd_ycc_extxbgr_convert_altivec;
    break;
  case JCS_EXT_XRGB:
  case JCS_EXT_ARGB:
    altivecfct = jsimd_ycc_extxrgb_convert_altivec;
    break;
  default:
    altivecfct = jsimd_ycc_rgb_convert_altivec;
    break;
  }

  altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
}

GLOBAL(void)
jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                         JDIMENSION input_row, JSAMPARRAY output_buf,
                         int num_rows)
{
}

GLOBAL(int)
jsimd_can_h2v2_downsample(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8)
    return 0;
  if (sizeof(JDIMENSION) != 4)
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(int)
jsimd_can_h2v1_downsample(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8)
    return 0;
  if (sizeof(JDIMENSION) != 4)
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(void)
jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
                      JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
                                compptr->v_samp_factor,
                                compptr->width_in_blocks, input_data,
                                output_data);
}

GLOBAL(void)
jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
                      JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
                                compptr->v_samp_factor,
                                compptr->width_in_blocks, input_data,
                                output_data);
}

GLOBAL(int)
jsimd_can_h2v2_upsample(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8)
    return 0;
  if (sizeof(JDIMENSION) != 4)
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(int)
jsimd_can_h2v1_upsample(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8)
    return 0;
  if (sizeof(JDIMENSION) != 4)
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(void)
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
  jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
                              input_data, output_data_ptr);
}

GLOBAL(void)
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
  jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
                              input_data, output_data_ptr);
}

GLOBAL(int)
jsimd_can_h2v2_fancy_upsample(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8)
    return 0;
  if (sizeof(JDIMENSION) != 4)
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(int)
jsimd_can_h2v1_fancy_upsample(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8)
    return 0;
  if (sizeof(JDIMENSION) != 4)
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
  jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
                                    compptr->downsampled_width, input_data,
                                    output_data_ptr);
}

GLOBAL(void)
jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
  jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
                                    compptr->downsampled_width, input_data,
                                    output_data_ptr);
}

GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8)
    return 0;
  if (sizeof(JDIMENSION) != 4)
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(int)
jsimd_can_h2v1_merged_upsample(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8)
    return 0;
  if (sizeof(JDIMENSION) != 4)
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(void)
jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);

  switch (cinfo->out_color_space) {
  case JCS_EXT_RGB:
    altivecfct = jsimd_h2v2_extrgb_merged_upsample_altivec;
    break;
  case JCS_EXT_RGBX:
  case JCS_EXT_RGBA:
    altivecfct = jsimd_h2v2_extrgbx_merged_upsample_altivec;
    break;
  case JCS_EXT_BGR:
    altivecfct = jsimd_h2v2_extbgr_merged_upsample_altivec;
    break;
  case JCS_EXT_BGRX:
  case JCS_EXT_BGRA:
    altivecfct = jsimd_h2v2_extbgrx_merged_upsample_altivec;
    break;
  case JCS_EXT_XBGR:
  case JCS_EXT_ABGR:
    altivecfct = jsimd_h2v2_extxbgr_merged_upsample_altivec;
    break;
  case JCS_EXT_XRGB:
  case JCS_EXT_ARGB:
    altivecfct = jsimd_h2v2_extxrgb_merged_upsample_altivec;
    break;
  default:
    altivecfct = jsimd_h2v2_merged_upsample_altivec;
    break;
  }

  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
}

GLOBAL(void)
jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);

  switch (cinfo->out_color_space) {
  case JCS_EXT_RGB:
    altivecfct = jsimd_h2v1_extrgb_merged_upsample_altivec;
    break;
  case JCS_EXT_RGBX:
  case JCS_EXT_RGBA:
    altivecfct = jsimd_h2v1_extrgbx_merged_upsample_altivec;
    break;
  case JCS_EXT_BGR:
    altivecfct = jsimd_h2v1_extbgr_merged_upsample_altivec;
    break;
  case JCS_EXT_BGRX:
  case JCS_EXT_BGRA:
    altivecfct = jsimd_h2v1_extbgrx_merged_upsample_altivec;
    break;
  case JCS_EXT_XBGR:
  case JCS_EXT_ABGR:
    altivecfct = jsimd_h2v1_extxbgr_merged_upsample_altivec;
    break;
  case JCS_EXT_XRGB:
  case JCS_EXT_ARGB:
    altivecfct = jsimd_h2v1_extxrgb_merged_upsample_altivec;
    break;
  default:
    altivecfct = jsimd_h2v1_merged_upsample_altivec;
    break;
  }

  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
}

GLOBAL(int)
jsimd_can_convsamp(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (DCTSIZE != 8)
    return 0;
  if (BITS_IN_JSAMPLE != 8)
    return 0;
  if (sizeof(JDIMENSION) != 4)
    return 0;
  if (sizeof(DCTELEM) != 2)
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(int)
jsimd_can_convsamp_float(void)
{
  return 0;
}

GLOBAL(void)
jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
               DCTELEM *workspace)
{
  jsimd_convsamp_altivec(sample_data, start_col, workspace);
}

GLOBAL(void)
jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
                     FAST_FLOAT *workspace)
{
}

GLOBAL(int)
jsimd_can_fdct_islow(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (DCTSIZE != 8)
    return 0;
  if (sizeof(DCTELEM) != 2)
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(int)
jsimd_can_fdct_ifast(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (DCTSIZE != 8)
    return 0;
  if (sizeof(DCTELEM) != 2)
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(int)
jsimd_can_fdct_float(void)
{
  return 0;
}

GLOBAL(void)
jsimd_fdct_islow(DCTELEM *data)
{
  jsimd_fdct_islow_altivec(data);
}

GLOBAL(void)
jsimd_fdct_ifast(DCTELEM *data)
{
  jsimd_fdct_ifast_altivec(data);
}

GLOBAL(void)
jsimd_fdct_float(FAST_FLOAT *data)
{
}

GLOBAL(int)
jsimd_can_quantize(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (DCTSIZE != 8)
    return 0;
  if (sizeof(JCOEF) != 2)
    return 0;
  if (sizeof(DCTELEM) != 2)
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(int)
jsimd_can_quantize_float(void)
{
  return 0;
}

GLOBAL(void)
jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
{
  jsimd_quantize_altivec(coef_block, divisors, workspace);
}

GLOBAL(void)
jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
                     FAST_FLOAT *workspace)
{
}

GLOBAL(int)
jsimd_can_idct_2x2(void)
{
  return 0;
}

GLOBAL(int)
jsimd_can_idct_4x4(void)
{
  return 0;
}

GLOBAL(void)
jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
               JCOEFPTR coef_block, JSAMPARRAY output_buf,
               JDIMENSION output_col)
{
}

GLOBAL(void)
jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
               JCOEFPTR coef_block, JSAMPARRAY output_buf,
               JDIMENSION output_col)
{
}

GLOBAL(int)
jsimd_can_idct_islow(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (DCTSIZE != 8)
    return 0;
  if (sizeof(JCOEF) != 2)
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(int)
jsimd_can_idct_ifast(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (DCTSIZE != 8)
    return 0;
  if (sizeof(JCOEF) != 2)
    return 0;

  if (simd_support & JSIMD_ALTIVEC)
    return 1;

  return 0;
}

GLOBAL(int)
jsimd_can_idct_float(void)
{
  return 0;
}

GLOBAL(void)
jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
{
  jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
                           output_col);
}

GLOBAL(void)
jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
{
  jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
                           output_col);
}

GLOBAL(void)
jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
{
}

GLOBAL(int)
jsimd_can_huff_encode_one_block(void)
{
  return 0;
}

GLOBAL(JOCTET *)
jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
                            int last_dc_val, c_derived_tbl *dctbl,
                            c_derived_tbl *actbl)
{
  return NULL;
}

GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
  return 0;
}

GLOBAL(void)
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
                                  const int *jpeg_natural_order_start, int Sl,
                                  int Al, JCOEF *values, size_t *zerobits)
{
}

GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
  return 0;
}

GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
                                   int Al, JCOEF *absvalues, size_t *bits)
{
  return 0;
}
98
TMessagesProj/jni/mozjpeg/simd/powerpc/jsimd_altivec.h
Normal file
@@ -0,0 +1,98 @@
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include <altivec.h>


/* Common code */

#define __4X(a)      a, a, a, a
#define __4X2(a, b)  a, b, a, b, a, b, a, b
#define __8X(a)      __4X(a), __4X(a)
#define __16X(a)     __8X(a), __8X(a)

#define TRANSPOSE(row, col) { \
  __vector short row04l, row04h, row15l, row15h, \
                 row26l, row26h, row37l, row37h; \
  __vector short col01e, col01o, col23e, col23o, \
                 col45e, col45o, col67e, col67o; \
  \
  /* transpose coefficients (phase 1) */ \
  row04l = vec_mergeh(row##0, row##4);  /* row04l=(00 40 01 41 02 42 03 43) */ \
  row04h = vec_mergel(row##0, row##4);  /* row04h=(04 44 05 45 06 46 07 47) */ \
  row15l = vec_mergeh(row##1, row##5);  /* row15l=(10 50 11 51 12 52 13 53) */ \
  row15h = vec_mergel(row##1, row##5);  /* row15h=(14 54 15 55 16 56 17 57) */ \
  row26l = vec_mergeh(row##2, row##6);  /* row26l=(20 60 21 61 22 62 23 63) */ \
  row26h = vec_mergel(row##2, row##6);  /* row26h=(24 64 25 65 26 66 27 67) */ \
  row37l = vec_mergeh(row##3, row##7);  /* row37l=(30 70 31 71 32 72 33 73) */ \
  row37h = vec_mergel(row##3, row##7);  /* row37h=(34 74 35 75 36 76 37 77) */ \
  \
  /* transpose coefficients (phase 2) */ \
  col01e = vec_mergeh(row04l, row26l);  /* col01e=(00 20 40 60 01 21 41 61) */ \
  col23e = vec_mergel(row04l, row26l);  /* col23e=(02 22 42 62 03 23 43 63) */ \
  col45e = vec_mergeh(row04h, row26h);  /* col45e=(04 24 44 64 05 25 45 65) */ \
  col67e = vec_mergel(row04h, row26h);  /* col67e=(06 26 46 66 07 27 47 67) */ \
  col01o = vec_mergeh(row15l, row37l);  /* col01o=(10 30 50 70 11 31 51 71) */ \
  col23o = vec_mergel(row15l, row37l);  /* col23o=(12 32 52 72 13 33 53 73) */ \
  col45o = vec_mergeh(row15h, row37h);  /* col45o=(14 34 54 74 15 35 55 75) */ \
  col67o = vec_mergel(row15h, row37h);  /* col67o=(16 36 56 76 17 37 57 77) */ \
  \
  /* transpose coefficients (phase 3) */ \
  col##0 = vec_mergeh(col01e, col01o);  /* col0=(00 10 20 30 40 50 60 70) */ \
  col##1 = vec_mergel(col01e, col01o);  /* col1=(01 11 21 31 41 51 61 71) */ \
  col##2 = vec_mergeh(col23e, col23o);  /* col2=(02 12 22 32 42 52 62 72) */ \
  col##3 = vec_mergel(col23e, col23o);  /* col3=(03 13 23 33 43 53 63 73) */ \
  col##4 = vec_mergeh(col45e, col45o);  /* col4=(04 14 24 34 44 54 64 74) */ \
  col##5 = vec_mergel(col45e, col45o);  /* col5=(05 15 25 35 45 55 65 75) */ \
  col##6 = vec_mergeh(col67e, col67o);  /* col6=(06 16 26 36 46 56 66 76) */ \
  col##7 = vec_mergel(col67e, col67o);  /* col7=(07 17 27 37 47 57 67 77) */ \
}
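
/* Illustrative scalar equivalent (not part of the original commit): the
 * three merge phases above implement a plain 8x8 transpose of 16-bit
 * elements,
 *
 *   for (int i = 0; i < 8; i++)
 *     for (int j = 0; j < 8; j++)
 *       col[j][i] = row[i][j];
 *
 * done as log2(8) = 3 rounds of vec_mergeh/vec_mergel interleaves so the
 * whole block stays in vector registers. */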

#ifndef min
#define min(a, b)  ((a) < (b) ? (a) : (b))
#endif


/* Macros to abstract big/little endian bit twiddling */

#if __BIG_ENDIAN__

#define VEC_LD(a, b)     vec_ld(a, b)
#define VEC_ST(a, b, c)  vec_st(a, b, c)
#define VEC_UNPACKHU(a)  vec_mergeh(pb_zero, a)
#define VEC_UNPACKLU(a)  vec_mergel(pb_zero, a)

#else

#define VEC_LD(a, b)     vec_vsx_ld(a, b)
#define VEC_ST(a, b, c)  vec_vsx_st(a, b, c)
#define VEC_UNPACKHU(a)  vec_mergeh(a, pb_zero)
#define VEC_UNPACKLU(a)  vec_mergel(a, pb_zero)

#endif
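
/* Illustrative note (not part of the original commit): VEC_UNPACKHU /
 * VEC_UNPACKLU zero-extend the first/last eight unsigned bytes of a
 * vector to eight 16-bit lanes -- scalar model: out[i] = (unsigned
 * short)in[i].  Merging with pb_zero supplies the zero bytes; which
 * operand comes first differs by endianness because vec_mergeh
 * interleaves physical byte positions, and the zero byte must land in
 * each lane's most significant half on either layout. */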