Repo created

This commit is contained in:
Fr4nz D13trich 2025-11-22 14:04:28 +01:00
parent 81b91f4139
commit f8c34fa5ee
22732 changed files with 4815320 additions and 2 deletions

View file

@ -0,0 +1,269 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
* Copyright (C) 2014, Jay Foad. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jccolor-altivec.c */
/*
 * Convert rows of interleaved RGB samples (RGB_PIXELSIZE bytes per pixel)
 * into separate Y, Cb, and Cr planes, 16 pixels per loop iteration.
 *
 * img_width  = row width, in pixels
 * input_buf  = array of input rows
 * output_buf = three output component planes (Y = [0], Cb = [1], Cr = [2])
 * output_row = index of the first output row to write
 * num_rows   = number of rows to convert
 *
 * NOTE(review): this file is compiled once per pixel format via
 * jccolor-altivec.c, which defines RGB_PIXELSIZE, the RGBG_INDEX* tables,
 * and renames this function for each instantiation.
 */
void jsimd_rgb_ycc_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows)
{
JSAMPROW inptr, outptr0, outptr1, outptr2;
/* pitch = input bytes consumed per row */
int pitch = img_width * RGB_PIXELSIZE, num_cols;
#if __BIG_ENDIAN__
int offset;
#endif
/* 16-byte-aligned staging buffer for the partial-chunk slow paths below */
unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
__vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
__vector unsigned char rgb3 = { 0 };
#endif
#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
__vector unsigned char rgb4 = { 0 };
#endif
__vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
__vector unsigned short yl, yh, crl, crh, cbl, cbh;
__vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3;
/* Constants */
__vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) },
pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) },
pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) };
__vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) };
/* pd_onehalfm1_cj folds the CENTERJSAMPLE bias of the Cb/Cr equations into
 * the rounding constant. */
__vector int pd_onehalf = { __4X(ONE_HALF) },
pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
/* shift_pack_index selects the high-order 16 bits of each 32-bit lane; the
 * byte positions differ between big- and little-endian.  pb_zero is
 * referenced by the VEC_UNPACK*U() macros. */
__vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
shift_pack_index =
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
#else
shift_pack_index =
{ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
#endif
while (--num_rows >= 0) {
inptr = *input_buf++;
outptr0 = output_buf[0][output_row];
outptr1 = output_buf[1][output_row];
outptr2 = output_buf[2][output_row];
output_row++;
/* Each iteration converts 16 pixels into 16 Y + 16 Cb + 16 Cr samples. */
for (num_cols = pitch; num_cols > 0;
num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
outptr0 += 16, outptr1 += 16, outptr2 += 16) {
#if __BIG_ENDIAN__
/* Load 16 pixels == 48 or 64 bytes */
offset = (size_t)inptr & 15;
if (offset) {
__vector unsigned char unaligned_shift_index;
int bytes = num_cols + offset;
if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
/* Slow path to prevent buffer overread. Since there is no way to
 * read a partial AltiVec register, overread would occur on the last
 * chunk of the last image row if the right edge is not on a 16-byte
 * boundary. It could also occur on other rows if the bytes per row
 * is low enough. Since we can't determine whether we're on the last
 * image row, we have to assume every row is the last.
 */
memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
rgb0 = vec_ld(0, tmpbuf);
rgb1 = vec_ld(16, tmpbuf);
rgb2 = vec_ld(32, tmpbuf);
#if RGB_PIXELSIZE == 4
rgb3 = vec_ld(48, tmpbuf);
#endif
} else {
/* Fast path */
rgb0 = vec_ld(0, inptr);
if (bytes > 16)
rgb1 = vec_ld(16, inptr);
if (bytes > 32)
rgb2 = vec_ld(32, inptr);
if (bytes > 48)
rgb3 = vec_ld(48, inptr);
#if RGB_PIXELSIZE == 4
if (bytes > 64)
rgb4 = vec_ld(64, inptr);
#endif
/* vec_ld() truncates the address to a 16-byte boundary, so realign
 * the loaded data with vec_lvsl()/vec_perm(). */
unaligned_shift_index = vec_lvsl(0, inptr);
rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
#if RGB_PIXELSIZE == 4
rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
#endif
}
} else {
#endif /* __BIG_ENDIAN__ */
if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
/* Slow path */
memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
rgb0 = VEC_LD(0, tmpbuf);
rgb1 = VEC_LD(16, tmpbuf);
rgb2 = VEC_LD(32, tmpbuf);
#if RGB_PIXELSIZE == 4
rgb3 = VEC_LD(48, tmpbuf);
#endif
} else {
/* Fast path */
rgb0 = VEC_LD(0, inptr);
if (num_cols > 16)
rgb1 = VEC_LD(16, inptr);
if (num_cols > 32)
rgb2 = VEC_LD(32, inptr);
#if RGB_PIXELSIZE == 4
if (num_cols > 48)
rgb3 = VEC_LD(48, inptr);
#endif
}
#if __BIG_ENDIAN__
}
#endif
#if RGB_PIXELSIZE == 3
/* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
 * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
 * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
 *
 * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
 * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
 * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
 * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
 */
rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
#else
/* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
 * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
 * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
 * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
 *
 * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
 * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
 * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
 * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
 */
rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
#endif
/* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
 * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
 * ...
 *
 * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
 * support unsigned vectors.
 */
rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
/* (Original)
 * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
 * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
 * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
 *
 * (This implementation)
 * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
 * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
 * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
 */
/* Calculate Y values */
y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
y0 = vec_msums(bg0, pw_f0114_f0250, y0);
y1 = vec_msums(bg1, pw_f0114_f0250, y1);
y2 = vec_msums(bg2, pw_f0114_f0250, y2);
y3 = vec_msums(bg3, pw_f0114_f0250, y3);
/* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
 * each dword into a new 16-bit vector, which is the equivalent of
 * descaling the 32-bit results (right-shifting by 16 bits) and then
 * packing them.
 */
yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
shift_pack_index);
yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
shift_pack_index);
y = vec_pack(yl, yh);
/* Store 16 Y samples */
vec_st(y, 0, outptr0);
/* Calculate Cb values */
cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj);
cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj);
cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj);
cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj);
cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000,
(__vector unsigned int)cb0);
cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000,
(__vector unsigned int)cb1);
cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000,
(__vector unsigned int)cb2);
cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000,
(__vector unsigned int)cb3);
cbl = vec_perm((__vector unsigned short)cb0,
(__vector unsigned short)cb1, shift_pack_index);
cbh = vec_perm((__vector unsigned short)cb2,
(__vector unsigned short)cb3, shift_pack_index);
cb = vec_pack(cbl, cbh);
/* Store 16 Cb samples */
vec_st(cb, 0, outptr1);
/* Calculate Cr values */
cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj);
cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj);
cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj);
cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj);
cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000,
(__vector unsigned int)cr0);
cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000,
(__vector unsigned int)cr1);
cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000,
(__vector unsigned int)cr2);
cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000,
(__vector unsigned int)cr3);
crl = vec_perm((__vector unsigned short)cr0,
(__vector unsigned short)cr1, shift_pack_index);
crh = vec_perm((__vector unsigned short)cr2,
(__vector unsigned short)cr3, shift_pack_index);
cr = vec_pack(crl, crh);
/* Store 16 Cr samples */
vec_st(cr, 0, outptr2);
}
}
}

View file

@ -0,0 +1,116 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2014, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* RGB --> YCC CONVERSION */
#include "jsimd_altivec.h"
/* Fixed-point equivalents (scaled by 2^16) of the RGB->YCbCr coefficients */
#define F_0_081 5329 /* FIX(0.08131) */
#define F_0_114 7471 /* FIX(0.11400) */
#define F_0_168 11059 /* FIX(0.16874) */
#define F_0_250 16384 /* FIX(0.25000) */
#define F_0_299 19595 /* FIX(0.29900) */
#define F_0_331 21709 /* FIX(0.33126) */
#define F_0_418 27439 /* FIX(0.41869) */
#define F_0_500 32768 /* FIX(0.50000) */
#define F_0_587 38470 /* FIX(0.58700) */
#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
/* Intermediate results carry SCALEBITS fraction bits; ONE_HALF rounds them */
#define SCALEBITS 16
#define ONE_HALF (1 << (SCALEBITS - 1))
/* vec_perm() tables used by jccolext-altivec.c to shuffle 3-byte R G B
 * pixels into the interleaved RG.../BG... order shown in the rgbg0..rgbg3
 * diagrams there */
#define RGBG_INDEX0 \
{ 0, 1, 3, 4, 6, 7, 9, 10, 2, 1, 5, 4, 8, 7, 11, 10 }
#define RGBG_INDEX1 \
{ 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
#define RGBG_INDEX2 \
{ 8, 9, 11, 12, 14, 15, 17, 18, 10, 9, 13, 12, 16, 15, 19, 18 }
#define RGBG_INDEX3 \
{ 4, 5, 7, 8, 10, 11, 13, 14, 6, 5, 9, 8, 12, 11, 15, 14 }
/* Instantiation for the default RGB colorspace (RGB_PIXELSIZE presumably
 * inherited from the library headers -- confirm against jmorecfg.h) */
#include "jccolext-altivec.c"
#undef RGB_PIXELSIZE
/* jsimd_extrgb_ycc_convert_altivec(): extended 3-byte R G B (reuses the
 * tables above) */
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec
#include "jccolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX0
#undef RGBG_INDEX1
#undef RGBG_INDEX2
#undef RGBG_INDEX3
#undef jsimd_rgb_ycc_convert_altivec
/* jsimd_extrgbx_ycc_convert_altivec(): 4-byte R G B X pixels (one table;
 * the padding byte is never selected) */
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
#define RGBG_INDEX \
{ 0, 1, 4, 5, 8, 9, 12, 13, 2, 1, 6, 5, 10, 9, 14, 13 }
#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec
#include "jccolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX
#undef jsimd_rgb_ycc_convert_altivec
/* jsimd_extbgr_ycc_convert_altivec(): 3-byte B G R pixels (R/B byte
 * positions swapped relative to the RGB tables) */
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
#define RGBG_INDEX0 \
{ 2, 1, 5, 4, 8, 7, 11, 10, 0, 1, 3, 4, 6, 7, 9, 10 }
#define RGBG_INDEX1 \
{ 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
#define RGBG_INDEX2 \
{ 10, 9, 13, 12, 16, 15, 19, 18, 8, 9, 11, 12, 14, 15, 17, 18 }
#define RGBG_INDEX3 \
{ 6, 5, 9, 8, 12, 11, 15, 14, 4, 5, 7, 8, 10, 11, 13, 14 }
#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec
#include "jccolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX0
#undef RGBG_INDEX1
#undef RGBG_INDEX2
#undef RGBG_INDEX3
#undef jsimd_rgb_ycc_convert_altivec
/* jsimd_extbgrx_ycc_convert_altivec(): 4-byte B G R X pixels */
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
#define RGBG_INDEX \
{ 2, 1, 6, 5, 10, 9, 14, 13, 0, 1, 4, 5, 8, 9, 12, 13 }
#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec
#include "jccolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX
#undef jsimd_rgb_ycc_convert_altivec
/* jsimd_extxbgr_ycc_convert_altivec(): 4-byte X B G R pixels */
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
#define RGBG_INDEX \
{ 3, 2, 7, 6, 11, 10, 15, 14, 1, 2, 5, 6, 9, 10, 13, 14 }
#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec
#include "jccolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX
#undef jsimd_rgb_ycc_convert_altivec
/* jsimd_extxrgb_ycc_convert_altivec(): 4-byte X R G B pixels */
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
#define RGBG_INDEX \
{ 1, 2, 5, 6, 9, 10, 13, 14, 3, 2, 7, 6, 11, 10, 15, 14 }
#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec
#include "jccolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX
#undef jsimd_rgb_ycc_convert_altivec

View file

@ -0,0 +1,111 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2014, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* RGB --> GRAYSCALE CONVERSION */
#include "jsimd_altivec.h"
/* Fixed-point equivalents (scaled by 2^16) of the luminance coefficients */
#define F_0_114 7471 /* FIX(0.11400) */
#define F_0_250 16384 /* FIX(0.25000) */
#define F_0_299 19595 /* FIX(0.29900) */
#define F_0_587 38470 /* FIX(0.58700) */
#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
/* Intermediate results carry SCALEBITS fraction bits; ONE_HALF rounds them */
#define SCALEBITS 16
#define ONE_HALF (1 << (SCALEBITS - 1))
/* vec_perm() tables used by jcgryext-altivec.c to shuffle 3-byte R G B
 * pixels into the interleaved RG.../BG... order shown in the rgbg0..rgbg3
 * diagrams there */
#define RGBG_INDEX0 \
{ 0, 1, 3, 4, 6, 7, 9, 10, 2, 1, 5, 4, 8, 7, 11, 10 }
#define RGBG_INDEX1 \
{ 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
#define RGBG_INDEX2 \
{ 8, 9, 11, 12, 14, 15, 17, 18, 10, 9, 13, 12, 16, 15, 19, 18 }
#define RGBG_INDEX3 \
{ 4, 5, 7, 8, 10, 11, 13, 14, 6, 5, 9, 8, 12, 11, 15, 14 }
/* Instantiation for the default RGB colorspace (RGB_PIXELSIZE presumably
 * inherited from the library headers -- confirm against jmorecfg.h) */
#include "jcgryext-altivec.c"
#undef RGB_PIXELSIZE
/* jsimd_extrgb_gray_convert_altivec(): extended 3-byte R G B (reuses the
 * tables above) */
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec
#include "jcgryext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX0
#undef RGBG_INDEX1
#undef RGBG_INDEX2
#undef RGBG_INDEX3
#undef jsimd_rgb_gray_convert_altivec
/* jsimd_extrgbx_gray_convert_altivec(): 4-byte R G B X pixels (one table;
 * the padding byte is never selected) */
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
#define RGBG_INDEX \
{ 0, 1, 4, 5, 8, 9, 12, 13, 2, 1, 6, 5, 10, 9, 14, 13 }
#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec
#include "jcgryext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX
#undef jsimd_rgb_gray_convert_altivec
/* jsimd_extbgr_gray_convert_altivec(): 3-byte B G R pixels (R/B byte
 * positions swapped relative to the RGB tables) */
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
#define RGBG_INDEX0 \
{ 2, 1, 5, 4, 8, 7, 11, 10, 0, 1, 3, 4, 6, 7, 9, 10 }
#define RGBG_INDEX1 \
{ 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
#define RGBG_INDEX2 \
{ 10, 9, 13, 12, 16, 15, 19, 18, 8, 9, 11, 12, 14, 15, 17, 18 }
#define RGBG_INDEX3 \
{ 6, 5, 9, 8, 12, 11, 15, 14, 4, 5, 7, 8, 10, 11, 13, 14 }
#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec
#include "jcgryext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX0
#undef RGBG_INDEX1
#undef RGBG_INDEX2
#undef RGBG_INDEX3
#undef jsimd_rgb_gray_convert_altivec
/* jsimd_extbgrx_gray_convert_altivec(): 4-byte B G R X pixels */
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
#define RGBG_INDEX \
{ 2, 1, 6, 5, 10, 9, 14, 13, 0, 1, 4, 5, 8, 9, 12, 13 }
#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec
#include "jcgryext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX
#undef jsimd_rgb_gray_convert_altivec
/* jsimd_extxbgr_gray_convert_altivec(): 4-byte X B G R pixels */
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
#define RGBG_INDEX \
{ 3, 2, 7, 6, 11, 10, 15, 14, 1, 2, 5, 6, 9, 10, 13, 14 }
#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec
#include "jcgryext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX
#undef jsimd_rgb_gray_convert_altivec
/* jsimd_extxrgb_gray_convert_altivec(): 4-byte X R G B pixels */
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
#define RGBG_INDEX \
{ 1, 2, 5, 6, 9, 10, 13, 14, 3, 2, 7, 6, 11, 10, 15, 14 }
#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec
#include "jcgryext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGBG_INDEX
#undef jsimd_rgb_gray_convert_altivec

View file

@ -0,0 +1,228 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
* Copyright (C) 2014, Jay Foad. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jcgray-altivec.c */
/*
 * Convert rows of interleaved RGB samples (RGB_PIXELSIZE bytes per pixel)
 * into a single grayscale (luminance) plane, 16 pixels per loop iteration.
 *
 * img_width  = row width, in pixels
 * input_buf  = array of input rows
 * output_buf = output planes (only plane [0] is written)
 * output_row = index of the first output row to write
 * num_rows   = number of rows to convert
 *
 * NOTE(review): this file is compiled once per pixel format via
 * jcgray-altivec.c, which defines RGB_PIXELSIZE, the RGBG_INDEX* tables,
 * and renames this function for each instantiation.
 */
void jsimd_rgb_gray_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows)
{
JSAMPROW inptr, outptr;
/* pitch = input bytes consumed per row */
int pitch = img_width * RGB_PIXELSIZE, num_cols;
#if __BIG_ENDIAN__
int offset;
/* 16-byte-aligned staging buffer for the partial-chunk slow paths */
unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
#endif
__vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
rgbg0, rgbg1, rgbg2, rgbg3, y;
#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
__vector unsigned char rgb3 = { 0 };
#endif
#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
__vector unsigned char rgb4 = { 0 };
#endif
__vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
__vector unsigned short yl, yh;
__vector int y0, y1, y2, y3;
/* Constants */
__vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
__vector int pd_onehalf = { __4X(ONE_HALF) };
/* shift_pack_index selects the high-order 16 bits of each 32-bit lane; the
 * byte positions differ between big- and little-endian.  pb_zero is
 * referenced by the VEC_UNPACK*U() macros. */
__vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
shift_pack_index =
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
#else
shift_pack_index =
{ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
#endif
while (--num_rows >= 0) {
inptr = *input_buf++;
outptr = output_buf[0][output_row];
output_row++;
/* Each iteration converts 16 pixels into 16 luminance samples. */
for (num_cols = pitch; num_cols > 0;
num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
outptr += 16) {
#if __BIG_ENDIAN__
/* Load 16 pixels == 48 or 64 bytes */
offset = (size_t)inptr & 15;
if (offset) {
__vector unsigned char unaligned_shift_index;
int bytes = num_cols + offset;
if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
/* Slow path to prevent buffer overread. Since there is no way to
 * read a partial AltiVec register, overread would occur on the last
 * chunk of the last image row if the right edge is not on a 16-byte
 * boundary. It could also occur on other rows if the bytes per row
 * is low enough. Since we can't determine whether we're on the last
 * image row, we have to assume every row is the last.
 */
memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
rgb0 = vec_ld(0, tmpbuf);
rgb1 = vec_ld(16, tmpbuf);
rgb2 = vec_ld(32, tmpbuf);
#if RGB_PIXELSIZE == 4
rgb3 = vec_ld(48, tmpbuf);
#endif
} else {
/* Fast path */
rgb0 = vec_ld(0, inptr);
if (bytes > 16)
rgb1 = vec_ld(16, inptr);
if (bytes > 32)
rgb2 = vec_ld(32, inptr);
if (bytes > 48)
rgb3 = vec_ld(48, inptr);
#if RGB_PIXELSIZE == 4
if (bytes > 64)
rgb4 = vec_ld(64, inptr);
#endif
/* vec_ld() truncates the address to a 16-byte boundary, so realign
 * the loaded data with vec_lvsl()/vec_perm(). */
unaligned_shift_index = vec_lvsl(0, inptr);
rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
#if RGB_PIXELSIZE == 4
rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
#endif
}
} else {
if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
/* Slow path */
memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
rgb0 = vec_ld(0, tmpbuf);
rgb1 = vec_ld(16, tmpbuf);
rgb2 = vec_ld(32, tmpbuf);
#if RGB_PIXELSIZE == 4
rgb3 = vec_ld(48, tmpbuf);
#endif
} else {
/* Fast path */
rgb0 = vec_ld(0, inptr);
if (num_cols > 16)
rgb1 = vec_ld(16, inptr);
if (num_cols > 32)
rgb2 = vec_ld(32, inptr);
#if RGB_PIXELSIZE == 4
if (num_cols > 48)
rgb3 = vec_ld(48, inptr);
#endif
}
}
#else
/* Little endian */
/* NOTE(review): unlike jccolext-altivec.c, this little-endian path has no
 * tmpbuf slow path, so the final chunk of a row may read past the end of
 * the row buffer if pitch is not a multiple of 16 -- verify against
 * upstream that this overread is intentional/safe. */
rgb0 = vec_vsx_ld(0, inptr);
if (num_cols > 16)
rgb1 = vec_vsx_ld(16, inptr);
if (num_cols > 32)
rgb2 = vec_vsx_ld(32, inptr);
#if RGB_PIXELSIZE == 4
if (num_cols > 48)
rgb3 = vec_vsx_ld(48, inptr);
#endif
#endif
#if RGB_PIXELSIZE == 3
/* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
 * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
 * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
 *
 * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
 * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
 * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
 * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
 */
rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
#else
/* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
 * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
 * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
 * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
 *
 * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
 * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
 * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
 * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
 */
rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
#endif
/* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
 * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
 * ...
 *
 * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
 * support unsigned vectors.
 */
rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
/* (Original)
 * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
 *
 * (This implementation)
 * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
 */
/* Calculate Y values */
y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
y0 = vec_msums(bg0, pw_f0114_f0250, y0);
y1 = vec_msums(bg1, pw_f0114_f0250, y1);
y2 = vec_msums(bg2, pw_f0114_f0250, y2);
y3 = vec_msums(bg3, pw_f0114_f0250, y3);
/* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
 * each dword into a new 16-bit vector, which is the equivalent of
 * descaling the 32-bit results (right-shifting by 16 bits) and then
 * packing them.
 */
yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
shift_pack_index);
yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
shift_pack_index);
y = vec_pack(yl, yh);
/* Store 16 luminance samples */
vec_st(y, 0, outptr);
}
}
}

View file

@ -0,0 +1,159 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2015, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* CHROMA DOWNSAMPLING */
#include "jsimd_altivec.h"
#include "jcsample.h"
/*
 * 2:1 horizontal chroma downsampling (no vertical subsampling).  Each output
 * sample is the average of two adjacent input samples, with an alternating
 * rounding bias (0, 1, 0, 1, ...) to avoid a systematic bias.
 */
void jsimd_h2v1_downsample_altivec(JDIMENSION image_width,
                                   int max_v_samp_factor,
                                   JDIMENSION v_samp_factor,
                                   JDIMENSION width_in_blocks,
                                   JSAMPARRAY input_data,
                                   JSAMPARRAY output_data)
{
  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
  JSAMPROW src, dst;
  int row, col;
  __vector unsigned char v_lo, v_hi, packed;
  __vector unsigned short lo_even, lo_odd, hi_even, hi_odd, lo, hi;
  /* Constants */
  __vector unsigned short bias = { __4X2(0, 1) },  /* alternating rounding */
    shift_one = { __8X(1) };                       /* >> 1 == divide by 2 */
  __vector unsigned char deinterleave =
    { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
    pb_zero = { __16X(0) };  /* referenced by the VEC_UNPACK*U() macros */

  /* Pad every row on the right so that whole vectors can be consumed. */
  expand_right_edge(input_data, max_v_samp_factor, image_width,
                    output_cols * 2);

  for (row = 0; row < v_samp_factor; row++) {
    src = input_data[row];
    dst = output_data[row];
    /* 32 input samples -> 16 output samples per iteration */
    for (col = output_cols; col > 0; col -= 16, src += 32, dst += 16) {
      v_lo = vec_ld(0, src);
      v_lo = vec_perm(v_lo, v_lo, deinterleave);
      lo_even = (__vector unsigned short)VEC_UNPACKHU(v_lo);
      lo_odd = (__vector unsigned short)VEC_UNPACKLU(v_lo);
      lo = vec_sr(vec_add(vec_add(lo_even, lo_odd), bias), shift_one);
      if (col <= 8) {
        /* Fewer than 9 output samples left; upper half is don't-care. */
        hi = vec_splat_u16(0);
      } else {
        v_hi = vec_ld(16, src);
        v_hi = vec_perm(v_hi, v_hi, deinterleave);
        hi_even = (__vector unsigned short)VEC_UNPACKHU(v_hi);
        hi_odd = (__vector unsigned short)VEC_UNPACKLU(v_hi);
        hi = vec_sr(vec_add(vec_add(hi_even, hi_odd), bias), shift_one);
      }
      packed = vec_pack(lo, hi);
      vec_st(packed, 0, dst);
    }
  }
}
/*
 * 2:1 horizontal and 2:1 vertical chroma downsampling.  Each output sample
 * is the average of a 2x2 block of input samples, with an alternating
 * rounding bias (1, 2, 1, 2, ...) to avoid a systematic bias.
 */
void
jsimd_h2v2_downsample_altivec(JDIMENSION image_width, int max_v_samp_factor,
                              JDIMENSION v_samp_factor,
                              JDIMENSION width_in_blocks,
                              JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
  JSAMPROW row0, row1, dst;
  int in_idx, out_idx, col;
  __vector unsigned char v00, v01, v10, v11, packed;
  __vector unsigned short e00, o00, e01, o01, e10, o10, e11, o11,
    sum_lo0, sum_lo1, sum_hi0, sum_hi1, lo, hi;
  /* Constants */
  __vector unsigned short bias = { __4X2(1, 2) },  /* alternating rounding */
    shift_two = { __8X(2) };                       /* >> 2 == divide by 4 */
  __vector unsigned char deinterleave =
    { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
    pb_zero = { __16X(0) };  /* referenced by the VEC_UNPACK*U() macros */

  /* Pad every row on the right so that whole vectors can be consumed. */
  expand_right_edge(input_data, max_v_samp_factor, image_width,
                    output_cols * 2);

  for (in_idx = 0, out_idx = 0; out_idx < v_samp_factor;
       in_idx += 2, out_idx++) {
    row0 = input_data[in_idx];
    row1 = input_data[in_idx + 1];
    dst = output_data[out_idx];
    /* 2 x 32 input samples -> 16 output samples per iteration */
    for (col = output_cols; col > 0;
         col -= 16, row0 += 32, row1 += 32, dst += 16) {
      /* First 16 bytes of each row -> 8 output samples */
      v00 = vec_ld(0, row0);
      v00 = vec_perm(v00, v00, deinterleave);
      e00 = (__vector unsigned short)VEC_UNPACKHU(v00);
      o00 = (__vector unsigned short)VEC_UNPACKLU(v00);
      sum_lo0 = vec_add(e00, o00);
      v10 = vec_ld(0, row1);
      v10 = vec_perm(v10, v10, deinterleave);
      e10 = (__vector unsigned short)VEC_UNPACKHU(v10);
      o10 = (__vector unsigned short)VEC_UNPACKLU(v10);
      sum_lo1 = vec_add(e10, o10);
      lo = vec_sr(vec_add(vec_add(sum_lo0, sum_lo1), bias), shift_two);
      if (col <= 8) {
        /* Fewer than 9 output samples left; upper half is don't-care. */
        hi = vec_splat_u16(0);
      } else {
        /* Next 16 bytes of each row -> 8 more output samples */
        v01 = vec_ld(16, row0);
        v01 = vec_perm(v01, v01, deinterleave);
        e01 = (__vector unsigned short)VEC_UNPACKHU(v01);
        o01 = (__vector unsigned short)VEC_UNPACKLU(v01);
        sum_hi0 = vec_add(e01, o01);
        v11 = vec_ld(16, row1);
        v11 = vec_perm(v11, v11, deinterleave);
        e11 = (__vector unsigned short)VEC_UNPACKHU(v11);
        o11 = (__vector unsigned short)VEC_UNPACKLU(v11);
        sum_hi1 = vec_add(e11, o11);
        hi = vec_sr(vec_add(vec_add(sum_hi0, sum_hi1), bias), shift_two);
      }
      packed = vec_pack(lo, hi);
      vec_st(packed, 0, dst);
    }
  }
}

View file

@ -0,0 +1,28 @@
/*
* jcsample.h
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1996, Thomas G. Lane.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*/
/*
 * Replicate the rightmost sample of each row until the row is output_cols
 * samples wide.  Called before downsampling so that vector reads that run
 * past the true image width see meaningful data.
 */
LOCAL(void)
expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
                  JDIMENSION output_cols)
{
  int pad = (int)(output_cols - input_cols);
  int row;

  if (pad <= 0)
    return;                     /* rows are already wide enough */

  for (row = 0; row < num_rows; row++) {
    JSAMPROW dest = image_data[row] + input_cols;
    JSAMPLE edge = dest[-1];    /* last real sample in the row */
    int i;

    for (i = 0; i < pad; i++)
      dest[i] = edge;
  }
}

View file

@ -0,0 +1,276 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2015, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jdcolor-altivec.c */
/*
 * Convert a series of YCbCr rows to the RGB pixel format selected at
 * compile time.  RGB_PIXELSIZE and the RGB_INDEX* permutation constants
 * (defined by the including file) determine the component order and whether
 * a filler byte is emitted.
 *
 * out_width  = image width, in pixels
 * input_buf  = 3-element array of component planes (Y, Cb, Cr)
 * input_row  = index of the first row to convert within the input planes
 * output_buf = array of pointers to the destination rows
 * num_rows   = number of rows to convert
 *
 * 16 pixels are processed per iteration.  See the slow-path comments below
 * for how overwrite past the end of the output buffer is avoided.
 */
void jsimd_ycc_rgb_convert_altivec(JDIMENSION out_width, JSAMPIMAGE input_buf,
                                   JDIMENSION input_row, JSAMPARRAY output_buf,
                                   int num_rows)
{
  JSAMPROW outptr, inptr0, inptr1, inptr2;
  int pitch = out_width * RGB_PIXELSIZE, num_cols;
#if __BIG_ENDIAN__
  int offset;
#endif
  /* Staging buffer used by the partial-chunk slow path */
  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];

  __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
    y, cb, cr;
#if __BIG_ENDIAN__
  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
#if RGB_PIXELSIZE == 4
  __vector unsigned char out4;
#endif
#endif
#if RGB_PIXELSIZE == 4
  __vector unsigned char rgb3;
#endif
  __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh,
    crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w;
  __vector int g0, g1, g2, g3;

  /* Constants
   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
   * high-order bits, not 16.
   */
  __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
    pw_mf0228 = { __8X(-F_0_228 >> 1) },
    pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
    pw_one = { __8X(1) }, pw_255 = { __8X(255) },
    pw_cj = { __8X(CENTERJSAMPLE) };
  __vector int pd_onehalf = { __4X(ONE_HALF) };
  __vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
    shift_pack_index =
      { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
#else
    shift_pack_index =
      { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
#endif

  while (--num_rows >= 0) {
    inptr0 = input_buf[0][input_row];
    inptr1 = input_buf[1][input_row];
    inptr2 = input_buf[2][input_row];
    input_row++;
    outptr = *output_buf++;

    for (num_cols = pitch; num_cols > 0;
         num_cols -= RGB_PIXELSIZE * 16, outptr += RGB_PIXELSIZE * 16,
         inptr0 += 16, inptr1 += 16, inptr2 += 16) {

      /* Load 16 Y samples and zero-extend them to two vectors of shorts. */
      y = vec_ld(0, inptr0);
      /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
       * support unsigned vectors.
       */
      yl = (__vector signed short)VEC_UNPACKHU(y);
      yh = (__vector signed short)VEC_UNPACKLU(y);

      /* Load 16 Cb samples and center them around zero. */
      cb = vec_ld(0, inptr1);
      cbl = (__vector signed short)VEC_UNPACKHU(cb);
      cbh = (__vector signed short)VEC_UNPACKLU(cb);
      cbl = vec_sub(cbl, pw_cj);
      cbh = vec_sub(cbh, pw_cj);

      /* Load 16 Cr samples and center them around zero. */
      cr = vec_ld(0, inptr2);
      crl = (__vector signed short)VEC_UNPACKHU(cr);
      crh = (__vector signed short)VEC_UNPACKLU(cr);
      crl = vec_sub(crl, pw_cj);
      crh = vec_sub(crh, pw_cj);

      /* (Original)
       * R = Y + 1.40200 * Cr
       * G = Y - 0.34414 * Cb - 0.71414 * Cr
       * B = Y + 1.77200 * Cb
       *
       * (This implementation)
       * R = Y + 0.40200 * Cr + Cr
       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
       * B = Y - 0.22800 * Cb + Cb + Cb
       */

      /* B */
      bl = vec_add(cbl, cbl);
      bh = vec_add(cbh, cbh);
      bl = vec_madds(bl, pw_mf0228, pw_one);
      bh = vec_madds(bh, pw_mf0228, pw_one);
      bl = vec_sra(bl, (__vector unsigned short)pw_one);
      bh = vec_sra(bh, (__vector unsigned short)pw_one);
      bl = vec_add(bl, cbl);
      bh = vec_add(bh, cbh);
      bl = vec_add(bl, cbl);
      bh = vec_add(bh, cbh);
      bl = vec_add(bl, yl);
      bh = vec_add(bh, yh);

      /* R */
      rl = vec_add(crl, crl);
      rh = vec_add(crh, crh);
      rl = vec_madds(rl, pw_f0402, pw_one);
      rh = vec_madds(rh, pw_f0402, pw_one);
      rl = vec_sra(rl, (__vector unsigned short)pw_one);
      rh = vec_sra(rh, (__vector unsigned short)pw_one);
      rl = vec_add(rl, crl);
      rh = vec_add(rh, crh);
      rl = vec_add(rl, yl);
      rh = vec_add(rh, yh);

      /* G */
      g0w = vec_mergeh(cbl, crl);
      g1w = vec_mergel(cbl, crl);
      g0 = vec_msums(g0w, pw_mf0344_f0285, pd_onehalf);
      g1 = vec_msums(g1w, pw_mf0344_f0285, pd_onehalf);
      g2w = vec_mergeh(cbh, crh);
      g3w = vec_mergel(cbh, crh);
      g2 = vec_msums(g2w, pw_mf0344_f0285, pd_onehalf);
      g3 = vec_msums(g3w, pw_mf0344_f0285, pd_onehalf);
      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word
       * from each dword into a new 16-bit vector, which is the equivalent of
       * descaling the 32-bit results (right-shifting by 16 bits) and then
       * packing them.
       */
      gl = vec_perm((__vector short)g0, (__vector short)g1, shift_pack_index);
      gh = vec_perm((__vector short)g2, (__vector short)g3, shift_pack_index);
      gl = vec_sub(gl, crl);
      gh = vec_sub(gh, crh);
      gl = vec_add(gl, yl);
      gh = vec_add(gh, yh);

      /* Interleave the R/G/B (and filler) words, then clamp and pack them to
       * unsigned bytes.
       */
      rg0 = vec_mergeh(rl, gl);
      bx0 = vec_mergeh(bl, pw_255);
      rg1 = vec_mergel(rl, gl);
      bx1 = vec_mergel(bl, pw_255);
      rg2 = vec_mergeh(rh, gh);
      bx2 = vec_mergeh(bh, pw_255);
      rg3 = vec_mergel(rh, gh);
      bx3 = vec_mergel(bh, pw_255);

      rgbx0 = vec_packsu(rg0, bx0);
      rgbx1 = vec_packsu(rg1, bx1);
      rgbx2 = vec_packsu(rg2, bx2);
      rgbx3 = vec_packsu(rg3, bx3);

#if RGB_PIXELSIZE == 3
      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
       *
       * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
       */
      rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
      rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
      rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
#else
      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
       *
       * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
       */
      rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
      rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
      rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
      rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
#endif

#if __BIG_ENDIAN__
      offset = (size_t)outptr & 15;
      if (offset) {
        /* Unaligned destination: shift the pixels into place with vec_perm()
         * and preserve the existing bytes at the edges of the span.
         */
        __vector unsigned char unaligned_shift_index;
        int bytes = num_cols + offset;

        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
          /* Slow path to prevent buffer overwrite.  Since there is no way to
           * write a partial AltiVec register, overwrite would occur on the
           * last chunk of the last image row if the right edge is not on a
           * 16-byte boundary.  It could also occur on other rows if the bytes
           * per row is low enough.  Since we can't determine whether we're on
           * the last image row, we have to assume every row is the last.
           */
          vec_st(rgb0, 0, tmpbuf);
          vec_st(rgb1, 16, tmpbuf);
          vec_st(rgb2, 32, tmpbuf);
#if RGB_PIXELSIZE == 4
          vec_st(rgb3, 48, tmpbuf);
#endif
          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
        } else {
          /* Fast path */
          unaligned_shift_index = vec_lvsl(0, outptr);
          edgel = vec_ld(0, outptr);
          edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
          edges = vec_perm(edgeh, edgel, unaligned_shift_index);
          unaligned_shift_index = vec_lvsr(0, outptr);
          out0 = vec_perm(edges, rgb0, unaligned_shift_index);
          out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
          out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
#if RGB_PIXELSIZE == 4
          out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
          out4 = vec_perm(rgb3, edges, unaligned_shift_index);
#else
          out3 = vec_perm(rgb2, edges, unaligned_shift_index);
#endif
          vec_st(out0, 0, outptr);
          if (bytes > 16)
            vec_st(out1, 16, outptr);
          if (bytes > 32)
            vec_st(out2, 32, outptr);
          if (bytes > 48)
            vec_st(out3, 48, outptr);
#if RGB_PIXELSIZE == 4
          if (bytes > 64)
            vec_st(out4, 64, outptr);
#endif
        }
      } else {
#endif /* __BIG_ENDIAN__ */
        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
          /* Slow path */
          VEC_ST(rgb0, 0, tmpbuf);
          VEC_ST(rgb1, 16, tmpbuf);
          VEC_ST(rgb2, 32, tmpbuf);
#if RGB_PIXELSIZE == 4
          VEC_ST(rgb3, 48, tmpbuf);
#endif
          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
        } else {
          /* Fast path */
          VEC_ST(rgb0, 0, outptr);
          if (num_cols > 16)
            VEC_ST(rgb1, 16, outptr);
          if (num_cols > 32)
            VEC_ST(rgb2, 32, outptr);
#if RGB_PIXELSIZE == 4
          if (num_cols > 48)
            VEC_ST(rgb3, 48, outptr);
#endif
        }
#if __BIG_ENDIAN__
      }
#endif
    }
  }
}

View file

@ -0,0 +1,106 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2015, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* YCC --> RGB CONVERSION */
#include "jsimd_altivec.h"

/* Fixed-point equivalents of the YCbCr -> RGB conversion coefficients,
 * scaled by 2^SCALEBITS.
 */
#define F_0_344  22554              /* FIX(0.34414) */
#define F_0_714  46802              /* FIX(0.71414) */
#define F_1_402  91881              /* FIX(1.40200) */
#define F_1_772  116130             /* FIX(1.77200) */
#define F_0_402  (F_1_402 - 65536)  /* FIX(1.40200) - FIX(1) */
#define F_0_285  (65536 - F_0_714)  /* FIX(1) - FIX(0.71414) */
#define F_0_228  (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */

#define SCALEBITS  16
#define ONE_HALF  (1 << (SCALEBITS - 1))

/* jdcolext-altivec.c is a code template:  it is #included once per output
 * pixel format.  RGB_PIXELSIZE selects 3- or 4-byte pixels;
 * RGB_INDEX0..RGB_INDEX2 (3-byte formats) or RGB_INDEX (4-byte formats) are
 * the vec_perm() control vectors that interleave the packed R/G/B planes
 * into pixels; jsimd_ycc_rgb_convert_altivec is re-#defined to give each
 * instantiation a unique name.
 */

/* Default RGB variant (RGB_PIXELSIZE is assumed to be defined by a prior
 * header -- the first #undef below discards it.)
 */
#define RGB_INDEX0 \
  { 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 }
#define RGB_INDEX1 \
  { 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
#define RGB_INDEX2 \
  { 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
#include "jdcolext-altivec.c"
#undef RGB_PIXELSIZE

/* Extended RGB (same byte order and index vectors as the default variant) */
#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extrgb_convert_altivec
#include "jdcolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX0
#undef RGB_INDEX1
#undef RGB_INDEX2
#undef jsimd_ycc_rgb_convert_altivec

/* Extended RGBX (4-byte pixels with trailing filler byte) */
#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
#define RGB_INDEX \
  { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 }
#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extrgbx_convert_altivec
#include "jdcolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX
#undef jsimd_ycc_rgb_convert_altivec

/* Extended BGR */
#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
#define RGB_INDEX0 \
  { 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 }
#define RGB_INDEX1 \
  { 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 }
#define RGB_INDEX2 \
  { 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extbgr_convert_altivec
#include "jdcolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX0
#undef RGB_INDEX1
#undef RGB_INDEX2
#undef jsimd_ycc_rgb_convert_altivec

/* Extended BGRX */
#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
#define RGB_INDEX \
  { 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 }
#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extbgrx_convert_altivec
#include "jdcolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX
#undef jsimd_ycc_rgb_convert_altivec

/* Extended XBGR (leading filler byte) */
#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
#define RGB_INDEX \
  { 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 }
#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extxbgr_convert_altivec
#include "jdcolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX
#undef jsimd_ycc_rgb_convert_altivec

/* Extended XRGB (leading filler byte) */
#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
#define RGB_INDEX \
  { 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 }
#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extxrgb_convert_altivec
#include "jdcolext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX
#undef jsimd_ycc_rgb_convert_altivec

View file

@ -0,0 +1,130 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2015, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */
#include "jsimd_altivec.h"

/* Fixed-point equivalents of the YCbCr -> RGB conversion coefficients,
 * scaled by 2^SCALEBITS.
 */
#define F_0_344  22554              /* FIX(0.34414) */
#define F_0_714  46802              /* FIX(0.71414) */
#define F_1_402  91881              /* FIX(1.40200) */
#define F_1_772  116130             /* FIX(1.77200) */
#define F_0_402  (F_1_402 - 65536)  /* FIX(1.40200) - FIX(1) */
#define F_0_285  (65536 - F_0_714)  /* FIX(1) - FIX(0.71414) */
#define F_0_228  (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */

#define SCALEBITS  16
#define ONE_HALF  (1 << (SCALEBITS - 1))

/* jdmrgext-altivec.c is a code template:  it is #included once per output
 * pixel format.  RGB_PIXELSIZE selects 3- or 4-byte pixels;
 * RGB_INDEX0..RGB_INDEX2 (3-byte formats) or RGB_INDEX (4-byte formats) are
 * the vec_perm() control vectors that interleave the packed R/G/B planes
 * into pixels; the jsimd_h2v*_merged_upsample_altivec names are re-#defined
 * to give each instantiation unique function names.
 */

/* Default RGB variant (RGB_PIXELSIZE is assumed to be defined by a prior
 * header -- the first #undef below discards it.)
 */
#define RGB_INDEX0 \
  { 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 }
#define RGB_INDEX1 \
  { 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
#define RGB_INDEX2 \
  { 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
#include "jdmrgext-altivec.c"
#undef RGB_PIXELSIZE

/* Extended RGB (same byte order and index vectors as the default variant) */
#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
#define jsimd_h2v1_merged_upsample_altivec \
  jsimd_h2v1_extrgb_merged_upsample_altivec
#define jsimd_h2v2_merged_upsample_altivec \
  jsimd_h2v2_extrgb_merged_upsample_altivec
#include "jdmrgext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX0
#undef RGB_INDEX1
#undef RGB_INDEX2
#undef jsimd_h2v1_merged_upsample_altivec
#undef jsimd_h2v2_merged_upsample_altivec

/* Extended RGBX (4-byte pixels with trailing filler byte) */
#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
#define RGB_INDEX \
  { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 }
#define jsimd_h2v1_merged_upsample_altivec \
  jsimd_h2v1_extrgbx_merged_upsample_altivec
#define jsimd_h2v2_merged_upsample_altivec \
  jsimd_h2v2_extrgbx_merged_upsample_altivec
#include "jdmrgext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX
#undef jsimd_h2v1_merged_upsample_altivec
#undef jsimd_h2v2_merged_upsample_altivec

/* Extended BGR */
#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
#define RGB_INDEX0 \
  { 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 }
#define RGB_INDEX1 \
  { 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 }
#define RGB_INDEX2 \
  { 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
#define jsimd_h2v1_merged_upsample_altivec \
  jsimd_h2v1_extbgr_merged_upsample_altivec
#define jsimd_h2v2_merged_upsample_altivec \
  jsimd_h2v2_extbgr_merged_upsample_altivec
#include "jdmrgext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX0
#undef RGB_INDEX1
#undef RGB_INDEX2
#undef jsimd_h2v1_merged_upsample_altivec
#undef jsimd_h2v2_merged_upsample_altivec

/* Extended BGRX */
#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
#define RGB_INDEX \
  { 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 }
#define jsimd_h2v1_merged_upsample_altivec \
  jsimd_h2v1_extbgrx_merged_upsample_altivec
#define jsimd_h2v2_merged_upsample_altivec \
  jsimd_h2v2_extbgrx_merged_upsample_altivec
#include "jdmrgext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX
#undef jsimd_h2v1_merged_upsample_altivec
#undef jsimd_h2v2_merged_upsample_altivec

/* Extended XBGR (leading filler byte) */
#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
#define RGB_INDEX \
  { 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 }
#define jsimd_h2v1_merged_upsample_altivec \
  jsimd_h2v1_extxbgr_merged_upsample_altivec
#define jsimd_h2v2_merged_upsample_altivec \
  jsimd_h2v2_extxbgr_merged_upsample_altivec
#include "jdmrgext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX
#undef jsimd_h2v1_merged_upsample_altivec
#undef jsimd_h2v2_merged_upsample_altivec

/* Extended XRGB (leading filler byte) */
#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
#define RGB_INDEX \
  { 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 }
#define jsimd_h2v1_merged_upsample_altivec \
  jsimd_h2v1_extxrgb_merged_upsample_altivec
#define jsimd_h2v2_merged_upsample_altivec \
  jsimd_h2v2_extxrgb_merged_upsample_altivec
#include "jdmrgext-altivec.c"
#undef RGB_PIXELSIZE
#undef RGB_INDEX
#undef jsimd_h2v1_merged_upsample_altivec
#undef jsimd_h2v2_merged_upsample_altivec

View file

@ -0,0 +1,329 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2015, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jdmerge-altivec.c */
/*
 * Merged h2v1 upsampling + YCbCr -> RGB conversion:  each Cb/Cr sample is
 * shared by two horizontally adjacent Y samples (nearest-neighbor chroma
 * upsampling), and the result is written directly as RGB pixels in the
 * format selected at compile time (RGB_PIXELSIZE / RGB_INDEX*).
 *
 * output_width     = image width, in pixels
 * input_buf        = 3-element array of component planes (Y, Cb, Cr)
 * in_row_group_ctr = row group index within the input planes
 * output_buf       = array of pointers to the destination rows
 *
 * Each outer iteration consumes 16 Cb/Cr samples; the inner yloop consumes
 * two 16-sample chunks of Y, producing up to 32 output pixels.
 */
void jsimd_h2v1_merged_upsample_altivec(JDIMENSION output_width,
                                        JSAMPIMAGE input_buf,
                                        JDIMENSION in_row_group_ctr,
                                        JSAMPARRAY output_buf)
{
  JSAMPROW outptr, inptr0, inptr1, inptr2;
  int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
#if __BIG_ENDIAN__
  int offset;
#endif
  /* Staging buffer used by the partial-chunk slow path */
  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];

  __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
    y, cb, cr;
#if __BIG_ENDIAN__
  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
#if RGB_PIXELSIZE == 4
  __vector unsigned char out4;
#endif
#endif
#if RGB_PIXELSIZE == 4
  __vector unsigned char rgb3;
#endif
  __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh,
    crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w,
    rl, rh, gl, gh, bl, bh, re, ro, ge, go, be, bo;
  __vector int g_y0, g_y1, g_y2, g_y3;

  /* Constants
   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
   * high-order bits, not 16.
   */
  __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
    pw_mf0228 = { __8X(-F_0_228 >> 1) },
    pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
    pw_one = { __8X(1) }, pw_255 = { __8X(255) },
    pw_cj = { __8X(CENTERJSAMPLE) };
  __vector int pd_onehalf = { __4X(ONE_HALF) };
  __vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
    shift_pack_index =
      { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
    even_index =
      { 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30 },
    odd_index =
      { 0, 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31 };
#else
    shift_pack_index =
      { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
    even_index =
      { 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30, 0 },
    odd_index =
      { 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31, 0 };
#endif

  inptr0 = input_buf[0][in_row_group_ctr];
  inptr1 = input_buf[1][in_row_group_ctr];
  inptr2 = input_buf[2][in_row_group_ctr];
  outptr = output_buf[0];

  for (num_cols = pitch; num_cols > 0; inptr1 += 16, inptr2 += 16) {

    /* Load 16 Cb samples and center them around zero.
     * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
     * support unsigned vectors.
     */
    cb = vec_ld(0, inptr1);
    cbl = (__vector signed short)VEC_UNPACKHU(cb);
    cbh = (__vector signed short)VEC_UNPACKLU(cb);
    cbl = vec_sub(cbl, pw_cj);
    cbh = vec_sub(cbh, pw_cj);

    /* Load 16 Cr samples and center them around zero. */
    cr = vec_ld(0, inptr2);
    crl = (__vector signed short)VEC_UNPACKHU(cr);
    crh = (__vector signed short)VEC_UNPACKLU(cr);
    crl = vec_sub(crl, pw_cj);
    crh = vec_sub(crh, pw_cj);

    /* (Original)
     * R = Y + 1.40200 * Cr
     * G = Y - 0.34414 * Cb - 0.71414 * Cr
     * B = Y + 1.77200 * Cb
     *
     * (This implementation)
     * R = Y + 0.40200 * Cr + Cr
     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
     * B = Y - 0.22800 * Cb + Cb + Cb
     *
     * Only the chroma-dependent terms are computed here; the Y term is added
     * per 16-sample Y chunk inside the yloop below.
     */

    /* B - Y */
    b_yl = vec_add(cbl, cbl);
    b_yh = vec_add(cbh, cbh);
    b_yl = vec_madds(b_yl, pw_mf0228, pw_one);
    b_yh = vec_madds(b_yh, pw_mf0228, pw_one);
    b_yl = vec_sra(b_yl, (__vector unsigned short)pw_one);
    b_yh = vec_sra(b_yh, (__vector unsigned short)pw_one);
    b_yl = vec_add(b_yl, cbl);
    b_yh = vec_add(b_yh, cbh);
    b_yl = vec_add(b_yl, cbl);
    b_yh = vec_add(b_yh, cbh);

    /* R - Y */
    r_yl = vec_add(crl, crl);
    r_yh = vec_add(crh, crh);
    r_yl = vec_madds(r_yl, pw_f0402, pw_one);
    r_yh = vec_madds(r_yh, pw_f0402, pw_one);
    r_yl = vec_sra(r_yl, (__vector unsigned short)pw_one);
    r_yh = vec_sra(r_yh, (__vector unsigned short)pw_one);
    r_yl = vec_add(r_yl, crl);
    r_yh = vec_add(r_yh, crh);

    /* G - Y */
    g_y0w = vec_mergeh(cbl, crl);
    g_y1w = vec_mergel(cbl, crl);
    g_y0 = vec_msums(g_y0w, pw_mf0344_f0285, pd_onehalf);
    g_y1 = vec_msums(g_y1w, pw_mf0344_f0285, pd_onehalf);
    g_y2w = vec_mergeh(cbh, crh);
    g_y3w = vec_mergel(cbh, crh);
    g_y2 = vec_msums(g_y2w, pw_mf0344_f0285, pd_onehalf);
    g_y3 = vec_msums(g_y3w, pw_mf0344_f0285, pd_onehalf);
    /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
     * each dword into a new 16-bit vector, which is the equivalent of
     * descaling the 32-bit results (right-shifting by 16 bits) and then
     * packing them.
     */
    g_yl = vec_perm((__vector short)g_y0, (__vector short)g_y1,
                    shift_pack_index);
    g_yh = vec_perm((__vector short)g_y2, (__vector short)g_y3,
                    shift_pack_index);
    g_yl = vec_sub(g_yl, crl);
    g_yh = vec_sub(g_yh, crh);

    for (yloop = 0; yloop < 2 && num_cols > 0; yloop++,
         num_cols -= RGB_PIXELSIZE * 16,
         outptr += RGB_PIXELSIZE * 16, inptr0 += 16) {

      /* Load 16 Y samples and split them into even/odd positions, each
       * zero-extended to shorts.  Even/odd Y samples pair with the same
       * chroma sample (h2v1 upsampling.)
       */
      y = vec_ld(0, inptr0);
      ye = (__vector signed short)vec_perm(pb_zero, y, even_index);
      yo = (__vector signed short)vec_perm(pb_zero, y, odd_index);

      if (yloop == 0) {
        be = vec_add(b_yl, ye);
        bo = vec_add(b_yl, yo);
        re = vec_add(r_yl, ye);
        ro = vec_add(r_yl, yo);
        ge = vec_add(g_yl, ye);
        go = vec_add(g_yl, yo);
      } else {
        be = vec_add(b_yh, ye);
        bo = vec_add(b_yh, yo);
        re = vec_add(r_yh, ye);
        ro = vec_add(r_yh, yo);
        ge = vec_add(g_yh, ye);
        go = vec_add(g_yh, yo);
      }

      /* Re-interleave even/odd results back into pixel order. */
      rl = vec_mergeh(re, ro);
      rh = vec_mergel(re, ro);
      gl = vec_mergeh(ge, go);
      gh = vec_mergel(ge, go);
      bl = vec_mergeh(be, bo);
      bh = vec_mergel(be, bo);

      rg0 = vec_mergeh(rl, gl);
      bx0 = vec_mergeh(bl, pw_255);
      rg1 = vec_mergel(rl, gl);
      bx1 = vec_mergel(bl, pw_255);
      rg2 = vec_mergeh(rh, gh);
      bx2 = vec_mergeh(bh, pw_255);
      rg3 = vec_mergel(rh, gh);
      bx3 = vec_mergel(bh, pw_255);

      rgbx0 = vec_packsu(rg0, bx0);
      rgbx1 = vec_packsu(rg1, bx1);
      rgbx2 = vec_packsu(rg2, bx2);
      rgbx3 = vec_packsu(rg3, bx3);

#if RGB_PIXELSIZE == 3
      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
       *
       * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
       */
      rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
      rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
      rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
#else
      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
       *
       * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
       */
      rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
      rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
      rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
      rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
#endif

#if __BIG_ENDIAN__
      offset = (size_t)outptr & 15;
      if (offset) {
        /* Unaligned destination: shift the pixels into place with vec_perm()
         * and preserve the existing bytes at the edges of the span.
         */
        __vector unsigned char unaligned_shift_index;
        int bytes = num_cols + offset;

        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
          /* Slow path to prevent buffer overwrite.  Since there is no way to
           * write a partial AltiVec register, overwrite would occur on the
           * last chunk of the last image row if the right edge is not on a
           * 16-byte boundary.  It could also occur on other rows if the bytes
           * per row is low enough.  Since we can't determine whether we're on
           * the last image row, we have to assume every row is the last.
           */
          vec_st(rgb0, 0, tmpbuf);
          vec_st(rgb1, 16, tmpbuf);
          vec_st(rgb2, 32, tmpbuf);
#if RGB_PIXELSIZE == 4
          vec_st(rgb3, 48, tmpbuf);
#endif
          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
        } else {
          /* Fast path */
          unaligned_shift_index = vec_lvsl(0, outptr);
          edgel = vec_ld(0, outptr);
          edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
          edges = vec_perm(edgeh, edgel, unaligned_shift_index);
          unaligned_shift_index = vec_lvsr(0, outptr);
          out0 = vec_perm(edges, rgb0, unaligned_shift_index);
          out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
          out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
#if RGB_PIXELSIZE == 4
          out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
          out4 = vec_perm(rgb3, edges, unaligned_shift_index);
#else
          out3 = vec_perm(rgb2, edges, unaligned_shift_index);
#endif
          vec_st(out0, 0, outptr);
          if (bytes > 16)
            vec_st(out1, 16, outptr);
          if (bytes > 32)
            vec_st(out2, 32, outptr);
          if (bytes > 48)
            vec_st(out3, 48, outptr);
#if RGB_PIXELSIZE == 4
          if (bytes > 64)
            vec_st(out4, 64, outptr);
#endif
        }
      } else {
#endif /* __BIG_ENDIAN__ */
        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
          /* Slow path */
          VEC_ST(rgb0, 0, tmpbuf);
          VEC_ST(rgb1, 16, tmpbuf);
          VEC_ST(rgb2, 32, tmpbuf);
#if RGB_PIXELSIZE == 4
          VEC_ST(rgb3, 48, tmpbuf);
#endif
          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
        } else {
          /* Fast path */
          VEC_ST(rgb0, 0, outptr);
          if (num_cols > 16)
            VEC_ST(rgb1, 16, outptr);
          if (num_cols > 32)
            VEC_ST(rgb2, 32, outptr);
#if RGB_PIXELSIZE == 4
          if (num_cols > 48)
            VEC_ST(rgb3, 48, outptr);
#endif
        }
#if __BIG_ENDIAN__
      }
#endif
    }
  }
}
/*
 * Merged h2v2 upsampling + YCbCr -> RGB conversion, implemented as two
 * h2v1 passes: each chroma row is paired first with the even luma row of
 * the group, then with the odd one.  The shared row pointers are patched
 * temporarily and restored before returning.
 */
void jsimd_h2v2_merged_upsample_altivec(JDIMENSION output_width,
                                        JSAMPIMAGE input_buf,
                                        JDIMENSION in_row_group_ctr,
                                        JSAMPARRAY output_buf)
{
  JSAMPROW saved_y_row = input_buf[0][in_row_group_ctr];
  JSAMPROW saved_out_row = output_buf[0];

  /* First pass: even luma row of the group -> first output row */
  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
  jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
                                     output_buf);

  /* Second pass: odd luma row, same chroma rows -> second output row */
  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
  output_buf[0] = output_buf[1];
  jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
                                     output_buf);

  /* Undo the temporary pointer shuffling. */
  input_buf[0][in_row_group_ctr] = saved_y_row;
  output_buf[0] = saved_out_row;
}

View file

@ -0,0 +1,400 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2015, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* CHROMA UPSAMPLING */
#include "jsimd_altivec.h"
/*
 * Fancy (triangle-filter) h2v1 chroma upsampling: each input sample
 * produces two output samples, computed as
 *
 *   even output = (3 * this + previous + 1) >> 2
 *   odd  output = (3 * this + next     + 2) >> 2
 *
 * (see the pb_three multiply and the >> 2 below; the +1/+2 terms give the
 * alternating rounding used by the scalar fancy upsampler.)  Column 0 uses
 * itself as its left neighbor and the last column uses itself as its right
 * neighbor, via the special *_col0/*_lastcol permutation vectors.
 *
 * Processes 16 input samples (32 output samples) per iteration.
 */
void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor,
                                       JDIMENSION downsampled_width,
                                       JSAMPARRAY input_data,
                                       JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow, incol;

  __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0,
    out;
  __vector short this0e, this0o, this0l, this0h, last0l, last0h,
    next0l, next0h, outle, outhe, outlo, outho;

  /* Constants */
  __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
    last_index_col0 =
      { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    last_index =
      { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 },
    next_index =
      { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 },
    next_index_lastcol =
      { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 },
#if __BIG_ENDIAN__
    merge_pack_index =
      { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 };
#else
    merge_pack_index =
      { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 };
#endif
  __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];

    /* If the width is not a multiple of 16, duplicate the last sample so
     * that the final vector load sees a defined right neighbor.
     */
    if (downsampled_width & 15)
      inptr[downsampled_width] = inptr[downsampled_width - 1];

    /* Prime the pipeline: for column 0, the "previous" sample is the sample
     * itself (last_index_col0 duplicates element 0.)
     */
    this0 = vec_ld(0, inptr);
    p_last0 = vec_perm(this0, this0, last_index_col0);
    last0 = this0;

    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr += 16, outptr += 32) {

      if (downsampled_width - incol > 0) {
        /* Not the first chunk: previous samples straddle last0/this0. */
        p_last0 = vec_perm(last0, this0, last_index);
        last0 = this0;
      }

      if (incol <= 16)
        /* Last chunk: the final sample is its own right neighbor. */
        p_next0 = vec_perm(this0, this0, next_index_lastcol);
      else {
        next0 = vec_ld(16, inptr);
        p_next0 = vec_perm(this0, next0, next_index);
      }

      /* 3 * this, widened to shorts (even/odd products re-interleaved) */
      this0e = (__vector short)vec_mule(this0, pb_three);
      this0o = (__vector short)vec_mulo(this0, pb_three);
      this0l = vec_mergeh(this0e, this0o);
      this0h = vec_mergel(this0e, this0o);

      /* previous + 1 (rounding term for even outputs) */
      last0l = (__vector short)VEC_UNPACKHU(p_last0);
      last0h = (__vector short)VEC_UNPACKLU(p_last0);
      last0l = vec_add(last0l, pw_one);

      /* next + 2 (rounding term for odd outputs) */
      next0l = (__vector short)VEC_UNPACKHU(p_next0);
      next0h = (__vector short)VEC_UNPACKLU(p_next0);
      next0l = vec_add(next0l, pw_two);

      /* Low half: descale and interleave even/odd outputs */
      outle = vec_add(this0l, last0l);
      outlo = vec_add(this0l, next0l);
      outle = vec_sr(outle, (__vector unsigned short)pw_two);
      outlo = vec_sr(outlo, (__vector unsigned short)pw_two);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr);

      if (incol > 8) {
        /* High half (only needed if more than 8 input samples remain) */
        last0h = vec_add(last0h, pw_one);
        next0h = vec_add(next0h, pw_two);

        outhe = vec_add(this0h, last0h);
        outho = vec_add(this0h, next0h);
        outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
        outho = vec_sr(outho, (__vector unsigned short)pw_two);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr);
      }

      this0 = next0;
    }
  }
}
/*
 * Fancy (triangle-filter) upsampling: 2x in both directions.
 *
 * Each input row produces two output rows.  Vertically, the column sums
 * 3 * current_row + row_above and 3 * current_row + row_below are formed;
 * horizontally, each column sum is combined 3:1 with its left or right
 * neighbor, then descaled by 16 with rounding.  16 input samples are
 * consumed per loop iteration, producing 32 output samples per row.
 */
void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor,
                                       JDIMENSION downsampled_width,
                                       JSAMPARRAY input_data,
                                       JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
  int inrow, outrow, incol;

  __vector unsigned char this_1, this0, this1, out;
  __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
    lastcolsum_1h, lastcolsum1h,
    p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
    thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
    nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 },
    nextcolsum1l = { 0 }, nextcolsum1h = { 0 },
    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
    tmpl, tmph, outle, outhe, outlo, outho;

  /* Constants */
  __vector unsigned char pb_zero = { __16X(0) },
    /* Shift permutation for column 0: there is no previous column, so the
     * first element is reused in its place. */
    last_index_col0 =
      { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
    /* Shift permutation selecting the previous column's sum. */
    last_index =
      { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
    /* Shift permutation selecting the next column's sum. */
    next_index =
      { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 },
    /* As above, but the final column reuses the last element because there
     * is no next column. */
    next_index_lastcol =
      { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15 },
#if __BIG_ENDIAN__
    /* Interleaves the even/odd 16-bit results and keeps the low-order byte
     * of each (byte 1 of each big-endian short.) */
    merge_pack_index =
      { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 };
#else
    merge_pack_index =
      { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 };
#endif
  __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
    pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
  __vector unsigned short pw_four = { __8X(4) };

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    inptr_1 = input_data[inrow - 1];
    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow + 1];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    if (downsampled_width & 15) {
      /* Width is not a multiple of 16: duplicate the rightmost sample into
       * the padding column so the full-vector loads below read valid data. */
      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
    }

    /* Prime the pipeline: vertical column sums for the first 16 columns.
     * thiscolsum_1* = 3 * current row + row above;
     * thiscolsum1*  = 3 * current row + row below. */
    this0 = vec_ld(0, inptr0);
    this0l = (__vector short)VEC_UNPACKHU(this0);
    this0h = (__vector short)VEC_UNPACKLU(this0);
    this0l = vec_mladd(this0l, pw_three, pw_zero);
    this0h = vec_mladd(this0h, pw_three, pw_zero);

    this_1 = vec_ld(0, inptr_1);
    this_1l = (__vector short)VEC_UNPACKHU(this_1);
    this_1h = (__vector short)VEC_UNPACKLU(this_1);
    thiscolsum_1l = vec_add(this0l, this_1l);
    thiscolsum_1h = vec_add(this0h, this_1h);
    lastcolsum_1h = thiscolsum_1h;
    p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
    p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);

    this1 = vec_ld(0, inptr1);
    this1l = (__vector short)VEC_UNPACKHU(this1);
    this1h = (__vector short)VEC_UNPACKLU(this1);
    thiscolsum1l = vec_add(this0l, this1l);
    thiscolsum1h = vec_add(this0h, this1h);
    lastcolsum1h = thiscolsum1h;
    p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
    p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);

    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
         outptr0 += 32, outptr1 += 32) {

      if (downsampled_width - incol > 0) {
        /* Not the first iteration: rebuild the "previous column" vectors
         * across the 16-sample boundary using the high part carried over
         * from the previous iteration. */
        p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
        p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
        p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
        p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
        lastcolsum_1h = thiscolsum_1h;  lastcolsum1h = thiscolsum1h;
      }

      if (incol <= 16) {
        /* Last iteration: the final column has no right neighbor, so
         * next_index_lastcol replicates it. */
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
                                   next_index_lastcol);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
                                  next_index_lastcol);
      } else {
        /* Compute the column sums for the next 16 columns, then derive the
         * "next column" vectors spanning into them. */
        this0 = vec_ld(16, inptr0);
        this0l = (__vector short)VEC_UNPACKHU(this0);
        this0h = (__vector short)VEC_UNPACKLU(this0);
        this0l = vec_mladd(this0l, pw_three, pw_zero);
        this0h = vec_mladd(this0h, pw_three, pw_zero);

        this_1 = vec_ld(16, inptr_1);
        this_1l = (__vector short)VEC_UNPACKHU(this_1);
        this_1h = (__vector short)VEC_UNPACKLU(this_1);
        nextcolsum_1l = vec_add(this0l, this_1l);
        nextcolsum_1h = vec_add(this0h, this_1h);
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);

        this1 = vec_ld(16, inptr1);
        this1l = (__vector short)VEC_UNPACKHU(this1);
        this1h = (__vector short)VEC_UNPACKLU(this1);
        nextcolsum1l = vec_add(this0l, this1l);
        nextcolsum1h = vec_add(this0h, this1h);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
      }

      /* Process the upper row.
       * Even (left) outputs:  (3 * colsum + prev_colsum + 8) >> 4
       * Odd (right) outputs:  (3 * colsum + next_colsum + 7) >> 4
       * merge_pack_index interleaves them and narrows to bytes. */
      tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum_1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum_1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr0);

      if (incol > 8) {
        /* High halves are only needed if more than 8 input columns remain. */
        tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum_1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum_1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr0);
      }

      /* Process the lower row (same arithmetic, using the sums formed with
       * the row below instead of the row above.) */
      tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr1);

      if (incol > 8) {
        tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr1);
      }

      /* Carry the precomputed next-block sums into the next iteration. */
      thiscolsum_1l = nextcolsum_1l;  thiscolsum_1h = nextcolsum_1h;
      thiscolsum1l = nextcolsum1l;  thiscolsum1h = nextcolsum1h;
    }
  }
}
/* These are rarely used (mainly just for decompressing YCCK images) */
/*
 * Plain (non-smoothing) 2x horizontal upsampling: every input sample is
 * simply duplicated.  Processes up to 32 input samples (64 output samples)
 * per loop iteration.
 */
void jsimd_h2v1_upsample_altivec(int max_v_samp_factor,
                                 JDIMENSION output_width,
                                 JSAMPARRAY input_data,
                                 JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW src, dst;
  int row, cols_left, k;
  __vector unsigned char pixels, dup_lo, dup_hi;

  for (row = 0; row < max_v_samp_factor; row++) {
    src = input_data[row];
    dst = output_data[row];

    /* Round the output width up to a multiple of 32 samples; each
     * iteration consumes 32 input samples and emits 64. */
    for (cols_left = (output_width + 31) & (~31); cols_left > 0;
         cols_left -= 64, src += 32, dst += 64) {
      /* First 16-sample vector is always processed; the second only when
       * more than 32 output samples remain. */
      for (k = 0; k < 2; k++) {
        if (k == 1 && cols_left <= 32)
          break;
        pixels = vec_ld(k * 16, src);
        /* Interleaving a vector with itself doubles each byte. */
        dup_lo = vec_mergeh(pixels, pixels);
        dup_hi = vec_mergel(pixels, pixels);
        vec_st(dup_lo, k * 32, dst);
        vec_st(dup_hi, k * 32 + 16, dst);
      }
    }
  }
}
/*
 * Plain (non-smoothing) 2x horizontal and 2x vertical upsampling: every
 * input sample is duplicated horizontally, and each resulting row is
 * written to two consecutive output rows.
 */
void jsimd_h2v2_upsample_altivec(int max_v_samp_factor,
                                 JDIMENSION output_width,
                                 JSAMPARRAY input_data,
                                 JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW src, dst0, dst1;
  int row, outrow, cols_left, k;
  __vector unsigned char pixels, dup_lo, dup_hi;

  for (row = 0, outrow = 0; outrow < max_v_samp_factor; row++) {
    src = input_data[row];
    dst0 = output_data[outrow++];
    dst1 = output_data[outrow++];

    /* Round the output width up to a multiple of 32 samples; each
     * iteration consumes 32 input samples and emits 64 per output row. */
    for (cols_left = (output_width + 31) & (~31); cols_left > 0;
         cols_left -= 64, src += 32, dst0 += 64, dst1 += 64) {
      /* First 16-sample vector is always processed; the second only when
       * more than 32 output samples remain. */
      for (k = 0; k < 2; k++) {
        if (k == 1 && cols_left <= 32)
          break;
        pixels = vec_ld(k * 16, src);
        /* Interleaving a vector with itself doubles each byte. */
        dup_lo = vec_mergeh(pixels, pixels);
        dup_hi = vec_mergel(pixels, pixels);
        vec_st(dup_lo, k * 32, dst0);
        vec_st(dup_lo, k * 32, dst1);
        vec_st(dup_hi, k * 32 + 16, dst0);
        vec_st(dup_hi, k * 32 + 16, dst1);
      }
    }
  }
}

View file

@ -0,0 +1,154 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2014, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* FAST INTEGER FORWARD DCT
*
* This is similar to the SSE2 implementation, except that we left-shift the
* constants by 1 less bit (the -1 in CONST_SHIFT.) This is because
* vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
* the elements in arg3 + the most significant 17 bits of
* (the elements in arg1 * the elements in arg2).
*/
#include "jsimd_altivec.h"
/* 8-bit fixed-point representations (FIX(x) = round(x * 2^8)) of the
 * multipliers used by the fast (AA&N) DCT. */
#define F_0_382 98 /* FIX(0.382683433) */
#define F_0_541 139 /* FIX(0.541196100) */
#define F_0_707 181 /* FIX(0.707106781) */
#define F_1_306 334 /* FIX(1.306562965) */

#define CONST_BITS 8            /* fractional bits in the constants above */
#define PRE_MULTIPLY_SCALE_BITS 2  /* operands are pre-shifted left by this */
/* The extra -1 compensates for vec_madds() keeping the most significant 17
 * bits of the 32-bit product (see the comment at the top of this file.) */
#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
#define DO_FDCT() { \
/* Even part */ \
\
tmp10 = vec_add(tmp0, tmp3); \
tmp13 = vec_sub(tmp0, tmp3); \
tmp11 = vec_add(tmp1, tmp2); \
tmp12 = vec_sub(tmp1, tmp2); \
\
out0 = vec_add(tmp10, tmp11); \
out4 = vec_sub(tmp10, tmp11); \
\
z1 = vec_add(tmp12, tmp13); \
z1 = vec_sl(z1, pre_multiply_scale_bits); \
z1 = vec_madds(z1, pw_0707, pw_zero); \
\
out2 = vec_add(tmp13, z1); \
out6 = vec_sub(tmp13, z1); \
\
/* Odd part */ \
\
tmp10 = vec_add(tmp4, tmp5); \
tmp11 = vec_add(tmp5, tmp6); \
tmp12 = vec_add(tmp6, tmp7); \
\
tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
z5 = vec_sub(tmp10, tmp12); \
z5 = vec_madds(z5, pw_0382, pw_zero); \
\
z2 = vec_madds(tmp10, pw_0541, z5); \
z4 = vec_madds(tmp12, pw_1306, z5); \
\
tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
z3 = vec_madds(tmp11, pw_0707, pw_zero); \
\
z11 = vec_add(tmp7, z3); \
z13 = vec_sub(tmp7, z3); \
\
out5 = vec_add(z13, z2); \
out3 = vec_sub(z13, z2); \
out1 = vec_add(z11, z4); \
out7 = vec_sub(z11, z4); \
}
/*
 * Perform the forward DCT on one block of samples using the fast (AA&N)
 * algorithm.  data points to a 64-element DCTELEM array, which is
 * transformed in place: a row pass, a transpose, a column pass, and a
 * final store of the coefficients.
 */
void jsimd_fdct_ifast_altivec(DCTELEM *data)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    z1, z2, z3, z4, z5, z11, z13,
    out0, out1, out2, out3, out4, out5, out6, out7;

  /* Constants (multipliers pre-shifted so vec_madds() scales correctly) */
  __vector short pw_zero = { __8X(0) },
    pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
    pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
    pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
    pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
  __vector unsigned short
    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };

  /* Pass 1: process rows */

  row0 = vec_ld(0, data);
  row1 = vec_ld(16, data);
  row2 = vec_ld(32, data);
  row3 = vec_ld(48, data);
  row4 = vec_ld(64, data);
  row5 = vec_ld(80, data);
  row6 = vec_ld(96, data);
  row7 = vec_ld(112, data);

  /* Transpose so each vector holds one (former) row's worth of columns. */
  TRANSPOSE(row, col);

  /* Butterflies: sums and differences of mirrored samples. */
  tmp0 = vec_add(col0, col7);
  tmp7 = vec_sub(col0, col7);
  tmp1 = vec_add(col1, col6);
  tmp6 = vec_sub(col1, col6);
  tmp2 = vec_add(col2, col5);
  tmp5 = vec_sub(col2, col5);
  tmp3 = vec_add(col3, col4);
  tmp4 = vec_sub(col3, col4);

  DO_FDCT();

  /* Pass 2: process columns */

  TRANSPOSE(out, row);

  tmp0 = vec_add(row0, row7);
  tmp7 = vec_sub(row0, row7);
  tmp1 = vec_add(row1, row6);
  tmp6 = vec_sub(row1, row6);
  tmp2 = vec_add(row2, row5);
  tmp5 = vec_sub(row2, row5);
  tmp3 = vec_add(row3, row4);
  tmp4 = vec_sub(row3, row4);

  DO_FDCT();

  vec_st(out0, 0, data);
  vec_st(out1, 16, data);
  vec_st(out2, 32, data);
  vec_st(out3, 48, data);
  vec_st(out4, 64, data);
  vec_st(out5, 80, data);
  vec_st(out6, 96, data);
  vec_st(out7, 112, data);
}

View file

@ -0,0 +1,258 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2014, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* SLOW INTEGER FORWARD DCT */
#include "jsimd_altivec.h"
/* 13-bit fixed-point representations (FIX(x) = round(x * 2^13)) of the
 * multipliers used by the slow (accurate) DCT. */
#define F_0_298 2446 /* FIX(0.298631336) */
#define F_0_390 3196 /* FIX(0.390180644) */
#define F_0_541 4433 /* FIX(0.541196100) */
#define F_0_765 6270 /* FIX(0.765366865) */
#define F_0_899 7373 /* FIX(0.899976223) */
#define F_1_175 9633 /* FIX(1.175875602) */
#define F_1_501 12299 /* FIX(1.501321110) */
#define F_1_847 15137 /* FIX(1.847759065) */
#define F_1_961 16069 /* FIX(1.961570560) */
#define F_2_053 16819 /* FIX(2.053119869) */
#define F_2_562 20995 /* FIX(2.562915447) */
#define F_3_072 25172 /* FIX(3.072711026) */

#define CONST_BITS 13  /* fractional bits in the constants above */
#define PASS1_BITS 2   /* extra fractional bits carried between passes */
/* Right-shift amounts used to descale the 32-bit products in each pass. */
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
/*
 * Portion of the 1-D slow DCT shared by both passes.  Consumes tmp4-tmp7,
 * tmp12, and tmp13 (prepared by DO_FDCT_PASS1/DO_FDCT_PASS2) and produces
 * out1-out3 and out5-out7 (out0/out4 come from the pass-specific wrappers.)
 * PASS (1 or 2) selects, via token pasting, the rounding constant
 * pd_descale_p<PASS> and the shift amount descale_p<PASS>.
 */
#define DO_FDCT_COMMON(PASS) { \
  /* (Original) \
   * z1 = (tmp12 + tmp13) * 0.541196100; \
   * data2 = z1 + tmp13 * 0.765366865; \
   * data6 = z1 + tmp12 * -1.847759065; \
   * \
   * (This implementation) \
   * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
   * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
   */ \
  \
  tmp1312l = vec_mergeh(tmp13, tmp12); \
  tmp1312h = vec_mergel(tmp13, tmp12); \
  \
  out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \
  out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \
  out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \
  out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \
  \
  out2l = vec_sra(out2l, descale_p##PASS); \
  out2h = vec_sra(out2h, descale_p##PASS); \
  out6l = vec_sra(out6l, descale_p##PASS); \
  out6h = vec_sra(out6h, descale_p##PASS); \
  \
  out2 = vec_pack(out2l, out2h); \
  out6 = vec_pack(out6l, out6h); \
  \
  /* Odd part */ \
  \
  z3 = vec_add(tmp4, tmp6); \
  z4 = vec_add(tmp5, tmp7); \
  \
  /* (Original) \
   * z5 = (z3 + z4) * 1.175875602; \
   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
   * z3 += z5;  z4 += z5; \
   * \
   * (This implementation) \
   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
   */ \
  \
  z34l = vec_mergeh(z3, z4); \
  z34h = vec_mergel(z3, z4); \
  \
  z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \
  z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \
  z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \
  z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \
  \
  /* (Original) \
   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6; \
   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869; \
   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110; \
   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
   * data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4; \
   * data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4; \
   * \
   * (This implementation) \
   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
   * data7 = tmp4 + z3;  data5 = tmp5 + z4; \
   * data3 = tmp6 + z3;  data1 = tmp7 + z4; \
   */ \
  \
  tmp47l = vec_mergeh(tmp4, tmp7); \
  tmp47h = vec_mergel(tmp4, tmp7); \
  \
  out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \
  out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \
  out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \
  out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \
  \
  out7l = vec_sra(out7l, descale_p##PASS); \
  out7h = vec_sra(out7h, descale_p##PASS); \
  out1l = vec_sra(out1l, descale_p##PASS); \
  out1h = vec_sra(out1h, descale_p##PASS); \
  \
  out7 = vec_pack(out7l, out7h); \
  out1 = vec_pack(out1l, out1h); \
  \
  tmp56l = vec_mergeh(tmp5, tmp6); \
  tmp56h = vec_mergel(tmp5, tmp6); \
  \
  out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \
  out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \
  out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \
  out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \
  \
  out5l = vec_sra(out5l, descale_p##PASS); \
  out5h = vec_sra(out5h, descale_p##PASS); \
  out3l = vec_sra(out3l, descale_p##PASS); \
  out3h = vec_sra(out3h, descale_p##PASS); \
  \
  out5 = vec_pack(out5l, out5h); \
  out3 = vec_pack(out3l, out3h); \
}
/*
 * First (row) pass of the slow DCT.  Computes the even-part butterflies and
 * out0/out4, left-shifting them by PASS1_BITS to retain extra precision for
 * the second pass, then delegates to DO_FDCT_COMMON.
 */
#define DO_FDCT_PASS1() { \
  /* Even part */ \
  \
  tmp10 = vec_add(tmp0, tmp3); \
  tmp13 = vec_sub(tmp0, tmp3); \
  tmp11 = vec_add(tmp1, tmp2); \
  tmp12 = vec_sub(tmp1, tmp2); \
  \
  out0 = vec_add(tmp10, tmp11); \
  out0 = vec_sl(out0, pass1_bits); \
  out4 = vec_sub(tmp10, tmp11); \
  out4 = vec_sl(out4, pass1_bits); \
  \
  DO_FDCT_COMMON(1); \
}
/*
 * Second (column) pass of the slow DCT.  Like DO_FDCT_PASS1, but out0/out4
 * are descaled: a rounding constant (pw_descale_p2x) is added and the
 * PASS1_BITS carried from pass 1 are shifted back out.
 */
#define DO_FDCT_PASS2() { \
  /* Even part */ \
  \
  tmp10 = vec_add(tmp0, tmp3); \
  tmp13 = vec_sub(tmp0, tmp3); \
  tmp11 = vec_add(tmp1, tmp2); \
  tmp12 = vec_sub(tmp1, tmp2); \
  \
  out0 = vec_add(tmp10, tmp11); \
  out0 = vec_add(out0, pw_descale_p2x); \
  out0 = vec_sra(out0, pass1_bits); \
  out4 = vec_sub(tmp10, tmp11); \
  out4 = vec_add(out4, pw_descale_p2x); \
  out4 = vec_sra(out4, pass1_bits); \
  \
  DO_FDCT_COMMON(2); \
}
/*
 * Perform the forward DCT on one block of samples using the slow-but-
 * accurate integer algorithm.  data points to a 64-element DCTELEM array,
 * which is transformed in place: a row pass, a transpose, a column pass,
 * and a final store of the coefficients.
 */
void jsimd_fdct_islow_altivec(DCTELEM *data)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
    z3, z4, z34l, z34h,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector int z3l, z3h, z4l, z4h,
    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
    out7l, out7h;

  /* Constants: interleaved multiplier pairs for vec_msums() (naming:
   * pw_<mult for even element>_<mult for odd element>, m = minus.) */
  __vector short
    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
    descale_p2 = { __4X(DESCALE_P2) };

  /* Pass 1: process rows */

  row0 = vec_ld(0, data);
  row1 = vec_ld(16, data);
  row2 = vec_ld(32, data);
  row3 = vec_ld(48, data);
  row4 = vec_ld(64, data);
  row5 = vec_ld(80, data);
  row6 = vec_ld(96, data);
  row7 = vec_ld(112, data);

  /* Transpose so each vector holds one (former) row's worth of columns. */
  TRANSPOSE(row, col);

  /* Butterflies: sums and differences of mirrored samples. */
  tmp0 = vec_add(col0, col7);
  tmp7 = vec_sub(col0, col7);
  tmp1 = vec_add(col1, col6);
  tmp6 = vec_sub(col1, col6);
  tmp2 = vec_add(col2, col5);
  tmp5 = vec_sub(col2, col5);
  tmp3 = vec_add(col3, col4);
  tmp4 = vec_sub(col3, col4);

  DO_FDCT_PASS1();

  /* Pass 2: process columns */

  TRANSPOSE(out, row);

  tmp0 = vec_add(row0, row7);
  tmp7 = vec_sub(row0, row7);
  tmp1 = vec_add(row1, row6);
  tmp6 = vec_sub(row1, row6);
  tmp2 = vec_add(row2, row5);
  tmp5 = vec_sub(row2, row5);
  tmp3 = vec_add(row3, row4);
  tmp4 = vec_sub(row3, row4);

  DO_FDCT_PASS2();

  vec_st(out0, 0, data);
  vec_st(out1, 16, data);
  vec_st(out2, 32, data);
  vec_st(out3, 48, data);
  vec_st(out4, 64, data);
  vec_st(out5, 80, data);
  vec_st(out6, 96, data);
  vec_st(out7, 112, data);
}

View file

@ -0,0 +1,255 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* FAST INTEGER INVERSE DCT
*
* This is similar to the SSE2 implementation, except that we left-shift the
* constants by 1 less bit (the -1 in CONST_SHIFT.) This is because
* vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
* the elements in arg3 + the most significant 17 bits of
* (the elements in arg1 * the elements in arg2).
*/
#include "jsimd_altivec.h"
/* 8-bit fixed-point representations (FIX(x) = round(x * 2^8)) of the
 * multipliers used by the fast (AA&N) IDCT. */
#define F_1_082 277 /* FIX(1.082392200) */
#define F_1_414 362 /* FIX(1.414213562) */
#define F_1_847 473 /* FIX(1.847759065) */
#define F_2_613 669 /* FIX(2.613125930) */
/* 2.613125930 split as (1.613125930 + 1) to avoid 16-bit overflow (see the
 * comment inside DO_IDCT.) */
#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */

#define CONST_BITS 8            /* fractional bits in the constants above */
#define PASS1_BITS 2            /* extra precision bits between passes */
#define PRE_MULTIPLY_SCALE_BITS 2  /* operands are pre-shifted left by this */
/* The extra -1 compensates for vec_madds() keeping the most significant 17
 * bits of the 32-bit product (see the comment at the top of this file.) */
#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
#define DO_IDCT(in) { \
/* Even part */ \
\
tmp10 = vec_add(in##0, in##4); \
tmp11 = vec_sub(in##0, in##4); \
tmp13 = vec_add(in##2, in##6); \
\
tmp12 = vec_sub(in##2, in##6); \
tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
tmp12 = vec_sub(tmp12, tmp13); \
\
tmp0 = vec_add(tmp10, tmp13); \
tmp3 = vec_sub(tmp10, tmp13); \
tmp1 = vec_add(tmp11, tmp12); \
tmp2 = vec_sub(tmp11, tmp12); \
\
/* Odd part */ \
\
z13 = vec_add(in##5, in##3); \
z10 = vec_sub(in##5, in##3); \
z10s = vec_sl(z10, pre_multiply_scale_bits); \
z11 = vec_add(in##1, in##7); \
z12s = vec_sub(in##1, in##7); \
z12s = vec_sl(z12s, pre_multiply_scale_bits); \
\
tmp11 = vec_sub(z11, z13); \
tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
\
tmp7 = vec_add(z11, z13); \
\
/* To avoid overflow... \
* \
* (Original) \
* tmp12 = -2.613125930 * z10 + z5; \
* \
* (This implementation) \
* tmp12 = (-1.613125930 - 1) * z10 + z5; \
* = -1.613125930 * z10 - z10 + z5; \
*/ \
\
z5 = vec_add(z10s, z12s); \
z5 = vec_madds(z5, pw_F1847, pw_zero); \
\
tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
tmp10 = vec_sub(tmp10, z5); \
tmp12 = vec_madds(z10s, pw_MF1613, z5); \
tmp12 = vec_sub(tmp12, z10); \
\
tmp6 = vec_sub(tmp12, tmp7); \
tmp5 = vec_sub(tmp11, tmp6); \
tmp4 = vec_add(tmp10, tmp5); \
\
out0 = vec_add(tmp0, tmp7); \
out1 = vec_add(tmp1, tmp6); \
out2 = vec_add(tmp2, tmp5); \
out3 = vec_sub(tmp3, tmp4); \
out4 = vec_add(tmp3, tmp4); \
out5 = vec_sub(tmp2, tmp5); \
out6 = vec_sub(tmp1, tmp6); \
out7 = vec_sub(tmp0, tmp7); \
}
/*
 * Perform dequantization and the inverse DCT on one block of coefficients
 * using the fast (AA&N) algorithm.
 *
 * dct_table_   pointer to the dequantization multiplier table (16-bit
 *              entries, multiplied into the coefficients with vec_mladd())
 * coef_block   the 8x8 block of DCT coefficients
 * output_buf / output_col
 *              destination: 8 rows of 8 samples, written at output_col
 *              within each row
 */
void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block,
                              JSAMPARRAY output_buf, JDIMENSION output_col)
{
  short *dct_table = (short *)dct_table_;
  int *outptr;

  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    z5, z10, z10s, z11, z12s, z13,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector signed char outb;

  /* Constants */
  __vector short pw_zero = { __8X(0) },
    pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
    pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
    pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },
    pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
  __vector unsigned short
    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
    pass1_bits3 = { __8X(PASS1_BITS + 3) };
  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

  /* Pass 1: process columns */

  col0 = vec_ld(0, coef_block);
  col1 = vec_ld(16, coef_block);
  col2 = vec_ld(32, coef_block);
  col3 = vec_ld(48, coef_block);
  col4 = vec_ld(64, coef_block);
  col5 = vec_ld(80, coef_block);
  col6 = vec_ld(96, coef_block);
  col7 = vec_ld(112, coef_block);

  /* OR together all AC coefficient vectors to test for the all-zero case. */
  tmp1 = vec_or(col1, col2);
  tmp2 = vec_or(col3, col4);
  tmp1 = vec_or(tmp1, tmp2);
  tmp3 = vec_or(col5, col6);
  tmp3 = vec_or(tmp3, col7);
  tmp1 = vec_or(tmp1, tmp3);

  quant0 = vec_ld(0, dct_table);
  col0 = vec_mladd(col0, quant0, pw_zero);

  if (vec_all_eq(tmp1, pw_zero)) {
    /* AC terms all zero: each column's IDCT is just its (dequantized) DC
     * value, so replicate it and skip the column pass entirely. */
    row0 = vec_splat(col0, 0);
    row1 = vec_splat(col0, 1);
    row2 = vec_splat(col0, 2);
    row3 = vec_splat(col0, 3);
    row4 = vec_splat(col0, 4);
    row5 = vec_splat(col0, 5);
    row6 = vec_splat(col0, 6);
    row7 = vec_splat(col0, 7);
  } else {
    /* Dequantize the remaining rows of coefficients. */
    quant1 = vec_ld(16, dct_table);
    quant2 = vec_ld(32, dct_table);
    quant3 = vec_ld(48, dct_table);
    quant4 = vec_ld(64, dct_table);
    quant5 = vec_ld(80, dct_table);
    quant6 = vec_ld(96, dct_table);
    quant7 = vec_ld(112, dct_table);

    col1 = vec_mladd(col1, quant1, pw_zero);
    col2 = vec_mladd(col2, quant2, pw_zero);
    col3 = vec_mladd(col3, quant3, pw_zero);
    col4 = vec_mladd(col4, quant4, pw_zero);
    col5 = vec_mladd(col5, quant5, pw_zero);
    col6 = vec_mladd(col6, quant6, pw_zero);
    col7 = vec_mladd(col7, quant7, pw_zero);

    DO_IDCT(col);

    TRANSPOSE(out, row);
  }

  /* Pass 2: process rows */

  DO_IDCT(row);

  /* Descale and range-limit: shift out the PASS1_BITS + 3 scaling bits. */
  out0 = vec_sra(out0, pass1_bits3);
  out1 = vec_sra(out1, pass1_bits3);
  out2 = vec_sra(out2, pass1_bits3);
  out3 = vec_sra(out3, pass1_bits3);
  out4 = vec_sra(out4, pass1_bits3);
  out5 = vec_sra(out5, pass1_bits3);
  out6 = vec_sra(out6, pass1_bits3);
  out7 = vec_sra(out7, pass1_bits3);

  TRANSPOSE(out, col);

  /* For each row: saturate to signed bytes, recenter the samples by adding
   * CENTERJSAMPLE, and store the 8 output bytes with two 4-byte vec_ste()
   * element stores (offsets 0 and 4.) */
  outb = vec_packs(col0, col0);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[0] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col1, col1);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[1] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col2, col2);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[2] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col3, col3);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[3] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col4, col4);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[4] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col5, col5);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[5] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col6, col6);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[6] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col7, col7);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[7] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);
}

View file

@ -0,0 +1,357 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* SLOW INTEGER INVERSE DCT */
#include "jsimd_altivec.h"
/* 13-bit fixed-point representations (FIX(x) = round(x * 2^13)) of the
 * multipliers used by the slow (accurate) IDCT. */
#define F_0_298 2446 /* FIX(0.298631336) */
#define F_0_390 3196 /* FIX(0.390180644) */
#define F_0_541 4433 /* FIX(0.541196100) */
#define F_0_765 6270 /* FIX(0.765366865) */
#define F_0_899 7373 /* FIX(0.899976223) */
#define F_1_175 9633 /* FIX(1.175875602) */
#define F_1_501 12299 /* FIX(1.501321110) */
#define F_1_847 15137 /* FIX(1.847759065) */
#define F_1_961 16069 /* FIX(1.961570560) */
#define F_2_053 16819 /* FIX(2.053119869) */
#define F_2_562 20995 /* FIX(2.562915447) */
#define F_3_072 25172 /* FIX(3.072711026) */

#define CONST_BITS 13  /* fractional bits in the constants above */
#define PASS1_BITS 2   /* extra fractional bits carried between passes */
/* Right-shift amounts used to descale the 32-bit products in each pass;
 * pass 2 also removes 3 bits of DCT scaling. */
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
#define DO_IDCT(in, PASS) { \
/* Even part \
* \
* (Original) \
* z1 = (z2 + z3) * 0.541196100; \
* tmp2 = z1 + z3 * -1.847759065; \
* tmp3 = z1 + z2 * 0.765366865; \
* \
* (This implementation) \
* tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
* tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
*/ \
\
in##26l = vec_mergeh(in##2, in##6); \
in##26h = vec_mergel(in##2, in##6); \
\
tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
\
tmp0 = vec_add(in##0, in##4); \
tmp1 = vec_sub(in##0, in##4); \
\
tmp0l = vec_unpackh(tmp0); \
tmp0h = vec_unpackl(tmp0); \
tmp0l = vec_sl(tmp0l, const_bits); \
tmp0h = vec_sl(tmp0h, const_bits); \
tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
\
tmp10l = vec_add(tmp0l, tmp3l); \
tmp10h = vec_add(tmp0h, tmp3h); \
tmp13l = vec_sub(tmp0l, tmp3l); \
tmp13h = vec_sub(tmp0h, tmp3h); \
\
tmp1l = vec_unpackh(tmp1); \
tmp1h = vec_unpackl(tmp1); \
tmp1l = vec_sl(tmp1l, const_bits); \
tmp1h = vec_sl(tmp1h, const_bits); \
tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
\
tmp11l = vec_add(tmp1l, tmp2l); \
tmp11h = vec_add(tmp1h, tmp2h); \
tmp12l = vec_sub(tmp1l, tmp2l); \
tmp12h = vec_sub(tmp1h, tmp2h); \
\
/* Odd part */ \
\
z3 = vec_add(in##3, in##7); \
z4 = vec_add(in##1, in##5); \
\
/* (Original) \
* z5 = (z3 + z4) * 1.175875602; \
* z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
* z3 += z5; z4 += z5; \
* \
* (This implementation) \
* z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
* z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
*/ \
\
z34l = vec_mergeh(z3, z4); \
z34h = vec_mergel(z3, z4); \
\
z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
\
/* (Original) \
* z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
* tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
* tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
* z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
* tmp0 += z1 + z3; tmp1 += z2 + z4; \
* tmp2 += z2 + z3; tmp3 += z1 + z4; \
* \
* (This implementation) \
* tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
* tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
* tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
* tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
* tmp0 += z3; tmp1 += z4; \
* tmp2 += z3; tmp3 += z4; \
*/ \
\
in##71l = vec_mergeh(in##7, in##1); \
in##71h = vec_mergel(in##7, in##1); \
\
tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
\
in##53l = vec_mergeh(in##5, in##3); \
in##53h = vec_mergel(in##5, in##3); \
\
tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
\
/* Final output stage */ \
\
out0l = vec_add(tmp10l, tmp3l); \
out0h = vec_add(tmp10h, tmp3h); \
out7l = vec_sub(tmp10l, tmp3l); \
out7h = vec_sub(tmp10h, tmp3h); \
\
out0l = vec_sra(out0l, descale_p##PASS); \
out0h = vec_sra(out0h, descale_p##PASS); \
out7l = vec_sra(out7l, descale_p##PASS); \
out7h = vec_sra(out7h, descale_p##PASS); \
\
out0 = vec_pack(out0l, out0h); \
out7 = vec_pack(out7l, out7h); \
\
out1l = vec_add(tmp11l, tmp2l); \
out1h = vec_add(tmp11h, tmp2h); \
out6l = vec_sub(tmp11l, tmp2l); \
out6h = vec_sub(tmp11h, tmp2h); \
\
out1l = vec_sra(out1l, descale_p##PASS); \
out1h = vec_sra(out1h, descale_p##PASS); \
out6l = vec_sra(out6l, descale_p##PASS); \
out6h = vec_sra(out6h, descale_p##PASS); \
\
out1 = vec_pack(out1l, out1h); \
out6 = vec_pack(out6l, out6h); \
\
out2l = vec_add(tmp12l, tmp1l); \
out2h = vec_add(tmp12h, tmp1h); \
out5l = vec_sub(tmp12l, tmp1l); \
out5h = vec_sub(tmp12h, tmp1h); \
\
out2l = vec_sra(out2l, descale_p##PASS); \
out2h = vec_sra(out2h, descale_p##PASS); \
out5l = vec_sra(out5l, descale_p##PASS); \
out5h = vec_sra(out5h, descale_p##PASS); \
\
out2 = vec_pack(out2l, out2h); \
out5 = vec_pack(out5l, out5h); \
\
out3l = vec_add(tmp13l, tmp0l); \
out3h = vec_add(tmp13h, tmp0h); \
out4l = vec_sub(tmp13l, tmp0l); \
out4h = vec_sub(tmp13h, tmp0h); \
\
out3l = vec_sra(out3l, descale_p##PASS); \
out3h = vec_sra(out3h, descale_p##PASS); \
out4l = vec_sra(out4l, descale_p##PASS); \
out4h = vec_sra(out4h, descale_p##PASS); \
\
out3 = vec_pack(out3l, out3h); \
out4 = vec_pack(out4l, out4h); \
}
/*
 * Accurate (slow) integer inverse DCT on one 8x8 coefficient block,
 * AltiVec implementation.
 *
 * dct_table_  - 64 16-bit dequantization multipliers (cast to short *)
 * coef_block  - 64 16-bit DCT coefficients, row-major
 * output_buf  - array of 8 output sample rows
 * output_col  - starting column within each output row
 *
 * The transform is performed in two passes (columns, then rows), with a
 * transpose after each pass; see the DO_IDCT/TRANSPOSE macros above.
 */
void jsimd_idct_islow_altivec(void *dct_table_, JCOEFPTR coef_block,
                              JSAMPARRAY output_buf, JDIMENSION output_col)
{
  short *dct_table = (short *)dct_table_;
  int *outptr;

  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
    tmp0, tmp1, tmp2, tmp3, z3, z4,
    z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
    row71l, row71h, row26l, row26h, row53l, row53h,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
    tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
    z3l, z3h, z4l, z4h,
    out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
    out5l, out5h, out6l, out6h, out7l, out7h;
  __vector signed char outb;

  /* Constants */
  __vector short pw_zero = { __8X(0) },
    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
  __vector int pd_zero = { __4X(0) },
    pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
    descale_p2 = { __4X(DESCALE_P2) },
    const_bits = { __4X(CONST_BITS) };
  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

  /* Pass 1: process columns */

  /* Each vec_ld pulls in one 8-coefficient row of the block. */
  col0 = vec_ld(0, coef_block);
  col1 = vec_ld(16, coef_block);
  col2 = vec_ld(32, coef_block);
  col3 = vec_ld(48, coef_block);
  col4 = vec_ld(64, coef_block);
  col5 = vec_ld(80, coef_block);
  col6 = vec_ld(96, coef_block);
  col7 = vec_ld(112, coef_block);

  /* OR all AC rows together so a single compare detects the all-zero case. */
  tmp1 = vec_or(col1, col2);
  tmp2 = vec_or(col3, col4);
  tmp1 = vec_or(tmp1, tmp2);
  tmp3 = vec_or(col5, col6);
  tmp3 = vec_or(tmp3, col7);
  tmp1 = vec_or(tmp1, tmp3);

  /* Dequantize the DC row (vec_mladd = col0 * quant0 + 0). */
  quant0 = vec_ld(0, dct_table);
  col0 = vec_mladd(col0, quant0, pw_zero);

  if (vec_all_eq(tmp1, pw_zero)) {
    /* AC terms all zero */
    /* Shortcut: the IDCT of a DC-only block is flat, so each output row is
     * just the corresponding dequantized DC value replicated 8 times.
     */
    col0 = vec_sl(col0, pass1_bits);

    row0 = vec_splat(col0, 0);
    row1 = vec_splat(col0, 1);
    row2 = vec_splat(col0, 2);
    row3 = vec_splat(col0, 3);
    row4 = vec_splat(col0, 4);
    row5 = vec_splat(col0, 5);
    row6 = vec_splat(col0, 6);
    row7 = vec_splat(col0, 7);
  } else {
    /* Dequantize the AC rows, then run the column IDCT and transpose so
     * that Pass 2 can process what are now the rows.
     */
    quant1 = vec_ld(16, dct_table);
    quant2 = vec_ld(32, dct_table);
    quant3 = vec_ld(48, dct_table);
    quant4 = vec_ld(64, dct_table);
    quant5 = vec_ld(80, dct_table);
    quant6 = vec_ld(96, dct_table);
    quant7 = vec_ld(112, dct_table);

    col1 = vec_mladd(col1, quant1, pw_zero);
    col2 = vec_mladd(col2, quant2, pw_zero);
    col3 = vec_mladd(col3, quant3, pw_zero);
    col4 = vec_mladd(col4, quant4, pw_zero);
    col5 = vec_mladd(col5, quant5, pw_zero);
    col6 = vec_mladd(col6, quant6, pw_zero);
    col7 = vec_mladd(col7, quant7, pw_zero);

    DO_IDCT(col, 1);

    TRANSPOSE(out, row);
  }

  /* Pass 2: process rows */

  DO_IDCT(row, 2);

  TRANSPOSE(out, col);

  /* Range-limit (saturating pack to signed bytes), recenter around
   * CENTERJSAMPLE, and store each 8-sample row as two 4-byte stores.
   * NOTE(review): vec_ste through an int * assumes output_buf rows are
   * 4-byte aligned at output_col -- confirm against caller.
   */
  outb = vec_packs(col0, col0);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[0] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col1, col1);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[1] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col2, col2);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[2] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col3, col3);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[3] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col4, col4);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[4] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col5, col5);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[5] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col6, col6);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[6] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col7, col7);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[7] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);
}

View file

@ -0,0 +1,250 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
#include "jsimd_altivec.h"
/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
* always get the data we want by using a single vector load (although we may
* have to permute the result.)
*/
#if __BIG_ENDIAN__
/* Big endian: vec_ld only performs 16-byte-aligned loads, so if the row
 * address is misaligned (by 8 bytes -- see NOTE above), rotate the loaded
 * vector into place with a vec_lvsl-generated permute.
 */
#define LOAD_ROW(row) { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_ld(0, elemptr); \
  if ((size_t)elemptr & 15) \
    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
}
#else
/* Little endian (VSX): vec_vsx_ld handles unaligned addresses directly. */
#define LOAD_ROW(row) { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_vsx_ld(0, elemptr); \
}
#endif
/*
 * Convert one 8x8 block of 8-bit samples to 16-bit DCT workspace elements,
 * recentering them around zero (sample - CENTERJSAMPLE).
 *
 * sample_data - 8 input sample rows
 * start_col   - starting column within each input row
 * workspace   - receives 64 16-bit elements (must be 16-byte aligned)
 */
void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col,
                            DCTELEM *workspace)
{
  JSAMPROW elemptr;
  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
  __vector short out0, out1, out2, out3, out4, out5, out6, out7;

  /* Constants */
  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
  __vector unsigned char pb_zero = { __16X(0) };

  /* Each row is independent: load, zero-extend the 8 samples to 16 bits,
   * recenter, and store, one row at a time.
   */
  LOAD_ROW(0);
  out0 = (__vector short)VEC_UNPACKHU(in0);
  out0 = vec_sub(out0, pw_centerjsamp);
  vec_st(out0, 0, workspace);

  LOAD_ROW(1);
  out1 = (__vector short)VEC_UNPACKHU(in1);
  out1 = vec_sub(out1, pw_centerjsamp);
  vec_st(out1, 16, workspace);

  LOAD_ROW(2);
  out2 = (__vector short)VEC_UNPACKHU(in2);
  out2 = vec_sub(out2, pw_centerjsamp);
  vec_st(out2, 32, workspace);

  LOAD_ROW(3);
  out3 = (__vector short)VEC_UNPACKHU(in3);
  out3 = vec_sub(out3, pw_centerjsamp);
  vec_st(out3, 48, workspace);

  LOAD_ROW(4);
  out4 = (__vector short)VEC_UNPACKHU(in4);
  out4 = vec_sub(out4, pw_centerjsamp);
  vec_st(out4, 64, workspace);

  LOAD_ROW(5);
  out5 = (__vector short)VEC_UNPACKHU(in5);
  out5 = vec_sub(out5, pw_centerjsamp);
  vec_st(out5, 80, workspace);

  LOAD_ROW(6);
  out6 = (__vector short)VEC_UNPACKHU(in6);
  out6 = vec_sub(out6, pw_centerjsamp);
  vec_st(out6, 96, workspace);

  LOAD_ROW(7);
  out7 = (__vector short)VEC_UNPACKHU(in7);
  out7 = vec_sub(out7, pw_centerjsamp);
  vec_st(out7, 112, workspace);
}
#define WORD_BIT 16

/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
   We basically need an unsigned equivalent of vec_madds().
   vec_mule/vec_mulo produce the full 32-bit products of the even/odd
   halfwords; the permute with shift_pack_index then re-interleaves the
   high 16 bits of each product back into element order.
   NOTE: function-like macro -- vs0 and vs1 are each evaluated twice, so
   do not pass expressions with side effects. */
#define MULTIPLY(vs0, vs1, out) { \
  tmpe = vec_mule((__vector unsigned short)vs0, \
                  (__vector unsigned short)vs1); \
  tmpo = vec_mulo((__vector unsigned short)vs0, \
                  (__vector unsigned short)vs1); \
  out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
                                 (__vector unsigned short)tmpo, \
                                 shift_pack_index); \
}
/*
 * Quantize one 8x8 block of DCT coefficients.
 *
 * coef_block - receives the 64 quantized coefficients
 * divisors   - packed divisor tables: 64 reciprocals at byte offset 0,
 *              64 correction terms at DCTSIZE2*2, and 64 scale factors at
 *              DCTSIZE2*4 (layout inferred from the load offsets below --
 *              confirm against the divisor-table builder)
 * workspace  - the 64 16-bit DCT coefficients to quantize
 *
 * Division is performed as |x| -> (|x| + corr) * recip * scale (two
 * high-halfword multiplies via MULTIPLY), with the sign restored at the end.
 */
void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
                            DCTELEM *workspace)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
    corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
  __vector unsigned int tmpe, tmpo;

  /* Constants */
  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
#if __BIG_ENDIAN__
  __vector unsigned char shift_pack_index =
    { 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 };
#else
  __vector unsigned char shift_pack_index =
    { 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
#endif

  row0 = vec_ld(0, workspace);
  row1 = vec_ld(16, workspace);
  row2 = vec_ld(32, workspace);
  row3 = vec_ld(48, workspace);
  row4 = vec_ld(64, workspace);
  row5 = vec_ld(80, workspace);
  row6 = vec_ld(96, workspace);
  row7 = vec_ld(112, workspace);

  /* Branch-less absolute value */
  /* rowNs = sign mask (all ones if negative); |x| = (x ^ s) - s */
  row0s = vec_sra(row0, pw_word_bit_m1);
  row1s = vec_sra(row1, pw_word_bit_m1);
  row2s = vec_sra(row2, pw_word_bit_m1);
  row3s = vec_sra(row3, pw_word_bit_m1);
  row4s = vec_sra(row4, pw_word_bit_m1);
  row5s = vec_sra(row5, pw_word_bit_m1);
  row6s = vec_sra(row6, pw_word_bit_m1);
  row7s = vec_sra(row7, pw_word_bit_m1);
  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

  /* Add the rounding/correction terms (second table in divisors). */
  corr0 = vec_ld(DCTSIZE2 * 2, divisors);
  corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
  corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
  corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
  corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
  corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
  corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
  corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);

  row0 = vec_add(row0, corr0);
  row1 = vec_add(row1, corr1);
  row2 = vec_add(row2, corr2);
  row3 = vec_add(row3, corr3);
  row4 = vec_add(row4, corr4);
  row5 = vec_add(row5, corr5);
  row6 = vec_add(row6, corr6);
  row7 = vec_add(row7, corr7);

  /* Multiply by the reciprocals (first table in divisors). */
  recip0 = vec_ld(0, divisors);
  recip1 = vec_ld(16, divisors);
  recip2 = vec_ld(32, divisors);
  recip3 = vec_ld(48, divisors);
  recip4 = vec_ld(64, divisors);
  recip5 = vec_ld(80, divisors);
  recip6 = vec_ld(96, divisors);
  recip7 = vec_ld(112, divisors);

  MULTIPLY(row0, recip0, row0);
  MULTIPLY(row1, recip1, row1);
  MULTIPLY(row2, recip2, row2);
  MULTIPLY(row3, recip3, row3);
  MULTIPLY(row4, recip4, row4);
  MULTIPLY(row5, recip5, row5);
  MULTIPLY(row6, recip6, row6);
  MULTIPLY(row7, recip7, row7);

  /* Multiply by the scale factors (third table in divisors). */
  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);

  MULTIPLY(row0, scale0, row0);
  MULTIPLY(row1, scale1, row1);
  MULTIPLY(row2, scale2, row2);
  MULTIPLY(row3, scale3, row3);
  MULTIPLY(row4, scale4, row4);
  MULTIPLY(row5, scale5, row5);
  MULTIPLY(row6, scale6, row6);
  MULTIPLY(row7, scale7, row7);

  /* Restore the original signs ((x ^ s) - s again). */
  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

  vec_st(row0, 0, coef_block);
  vec_st(row1, 16, coef_block);
  vec_st(row2, 32, coef_block);
  vec_st(row3, 48, coef_block);
  vec_st(row4, 64, coef_block);
  vec_st(row5, 80, coef_block);
  vec_st(row6, 96, coef_block);
  vec_st(row7, 112, coef_block);
}

View file

@ -0,0 +1,872 @@
/*
* jsimd_powerpc.c
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
* For conditions of distribution and use, see copyright notice in jsimdext.inc
*
* This file contains the interface between the "normal" portions
* of the library and the SIMD implementations when running on a
* PowerPC architecture.
*/
#ifdef __amigaos4__
/* This must be defined first as it re-defines GLOBAL otherwise */
#include <proto/exec.h>
#endif
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#if defined(__OpenBSD__)
#include <sys/param.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#endif
/* Bitmask of detected SIMD extensions; ~0 means "not yet initialized"
 * (see init_simd()). */
static unsigned int simd_support = ~0;
#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
/*
 * Return 1 if a /proc/cpuinfo line (buffer) is a "cpu" line that contains
 * 'feature' as a separate whitespace-delimited word, else 0.
 *
 * Fix: the <ctype.h> functions require an argument representable as
 * unsigned char (or EOF); passing a plain char that may be negative is
 * undefined behavior (CERT STR37-C), so each character is cast first.
 */
LOCAL(int)
check_feature(char *buffer, char *feature)
{
  char *p;

  if (*feature == 0)
    return 0;
  if (strncmp(buffer, "cpu", 3) != 0)
    return 0;
  buffer += 3;
  while (isspace((unsigned char)*buffer))
    buffer++;

  /* Check if 'feature' is present in the buffer as a separate word */
  while ((p = strstr(buffer, feature))) {
    /* Reject a match preceded by a non-space (e.g. "xaltivec"). */
    if (p > buffer && !isspace((unsigned char)*(p - 1))) {
      buffer++;
      continue;
    }
    /* Reject a match followed by a non-space (e.g. "altivecx"). */
    p += strlen(feature);
    if (*p != 0 && !isspace((unsigned char)*p)) {
      buffer++;
      continue;
    }
    return 1;
  }
  return 0;
}
/*
 * Scan /proc/cpuinfo for the "altivec" CPU feature, setting JSIMD_ALTIVEC
 * in simd_support if found.  Returns 0 if the line buffer of 'bufsize'
 * bytes could not be allocated or was too small for some line (the caller
 * retries with a larger buffer); returns 1 otherwise.
 */
LOCAL(int)
parse_proc_cpuinfo(int bufsize)
{
  char *line;
  FILE *f;
  int ok = 1;

  simd_support = 0;

  line = (char *)malloc(bufsize);
  if (line == NULL)
    return 0;

  f = fopen("/proc/cpuinfo", "r");
  if (f != NULL) {
    while (fgets(line, bufsize, f)) {
      if (!strchr(line, '\n') && !feof(f)) {
        /* "impossible" happened - insufficient size of the buffer! */
        ok = 0;
        break;
      }
      if (check_feature(line, "altivec"))
        simd_support |= JSIMD_ALTIVEC;
    }
    fclose(f);
  }
  free(line);
  return ok;
}
#endif
/*
* Check what SIMD accelerations are supported.
*
* FIXME: This code is racy under a multi-threaded environment.
*/
/*
 * Detect SIMD support once and cache it in simd_support.  Detection is
 * per-platform: compile-time on AltiVec/Apple builds, /proc/cpuinfo on
 * Linux/Android, GetCPUInfoTags() on AmigaOS 4, sysctl() on OpenBSD.
 * JSIMD_FORCEALTIVEC=1 / JSIMD_FORCENONE=1 override the detected value.
 */
LOCAL(void)
init_simd(void)
{
#ifndef NO_GETENV
  char *env = NULL;
#endif
#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
  int bufsize = 1024;  /* an initial guess for the line buffer size limit */
#elif defined(__amigaos4__)
  uint32 altivec = 0;
#elif defined(__OpenBSD__)
  int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
  int altivec;
  size_t len = sizeof(altivec);
#endif

  /* Already initialized (sentinel is ~0)? */
  if (simd_support != ~0U)
    return;

  simd_support = 0;

#if defined(__ALTIVEC__) || defined(__APPLE__)
  simd_support |= JSIMD_ALTIVEC;
#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
  /* Retry with a doubled line buffer until parsing succeeds or the
   * buffer limit is exceeded. */
  while (!parse_proc_cpuinfo(bufsize)) {
    bufsize *= 2;
    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
      break;
  }
#elif defined(__amigaos4__)
  IExec->GetCPUInfoTags(GCIT_VectorUnit, &altivec, TAG_DONE);
  if (altivec == VECTORTYPE_ALTIVEC)
    simd_support |= JSIMD_ALTIVEC;
#elif defined(__OpenBSD__)
  if (sysctl(mib, 2, &altivec, &len, NULL, 0) == 0 && altivec != 0)
    simd_support |= JSIMD_ALTIVEC;
#endif

#ifndef NO_GETENV
  /* Force different settings through environment variables */
  env = getenv("JSIMD_FORCEALTIVEC");
  if ((env != NULL) && (strcmp(env, "1") == 0))
    simd_support = JSIMD_ALTIVEC;
  env = getenv("JSIMD_FORCENONE");
  if ((env != NULL) && (strcmp(env, "1") == 0))
    simd_support = 0;
#endif
}
/* Capability probe: AltiVec RGB->YCC color conversion is available only
 * for the sample/dimension sizes the SIMD code was written for. */
GLOBAL(int)
jsimd_can_rgb_ycc(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* Capability probe: AltiVec RGB->grayscale conversion. */
GLOBAL(int)
jsimd_can_rgb_gray(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* Capability probe: AltiVec YCC->RGB conversion. */
GLOBAL(int)
jsimd_can_ycc_rgb(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* No AltiVec implementation of YCC->RGB565 conversion. */
GLOBAL(int)
jsimd_can_ycc_rgb565(void)
{
  return 0;
}
/* RGB->YCC conversion: dispatch to the AltiVec kernel matching the input
 * pixel layout, then run it over num_rows rows. */
GLOBAL(void)
jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
                      JSAMPIMAGE output_buf, JDIMENSION output_row,
                      int num_rows)
{
  void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);

  switch (cinfo->in_color_space) {
  case JCS_EXT_RGB:   altivecfct = jsimd_extrgb_ycc_convert_altivec;   break;
  case JCS_EXT_RGBX:
  case JCS_EXT_RGBA:  altivecfct = jsimd_extrgbx_ycc_convert_altivec;  break;
  case JCS_EXT_BGR:   altivecfct = jsimd_extbgr_ycc_convert_altivec;   break;
  case JCS_EXT_BGRX:
  case JCS_EXT_BGRA:  altivecfct = jsimd_extbgrx_ycc_convert_altivec;  break;
  case JCS_EXT_XBGR:
  case JCS_EXT_ABGR:  altivecfct = jsimd_extxbgr_ycc_convert_altivec;  break;
  case JCS_EXT_XRGB:
  case JCS_EXT_ARGB:  altivecfct = jsimd_extxrgb_ycc_convert_altivec;  break;
  default:            altivecfct = jsimd_rgb_ycc_convert_altivec;      break;
  }

  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
}

/* RGB->grayscale conversion: dispatch on input pixel layout. */
GLOBAL(void)
jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
                       JSAMPIMAGE output_buf, JDIMENSION output_row,
                       int num_rows)
{
  void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);

  switch (cinfo->in_color_space) {
  case JCS_EXT_RGB:   altivecfct = jsimd_extrgb_gray_convert_altivec;   break;
  case JCS_EXT_RGBX:
  case JCS_EXT_RGBA:  altivecfct = jsimd_extrgbx_gray_convert_altivec;  break;
  case JCS_EXT_BGR:   altivecfct = jsimd_extbgr_gray_convert_altivec;   break;
  case JCS_EXT_BGRX:
  case JCS_EXT_BGRA:  altivecfct = jsimd_extbgrx_gray_convert_altivec;  break;
  case JCS_EXT_XBGR:
  case JCS_EXT_ABGR:  altivecfct = jsimd_extxbgr_gray_convert_altivec;  break;
  case JCS_EXT_XRGB:
  case JCS_EXT_ARGB:  altivecfct = jsimd_extxrgb_gray_convert_altivec;  break;
  default:            altivecfct = jsimd_rgb_gray_convert_altivec;      break;
  }

  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
}

/* YCC->RGB conversion: dispatch on output pixel layout. */
GLOBAL(void)
jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                      JDIMENSION input_row, JSAMPARRAY output_buf,
                      int num_rows)
{
  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);

  switch (cinfo->out_color_space) {
  case JCS_EXT_RGB:   altivecfct = jsimd_ycc_extrgb_convert_altivec;   break;
  case JCS_EXT_RGBX:
  case JCS_EXT_RGBA:  altivecfct = jsimd_ycc_extrgbx_convert_altivec;  break;
  case JCS_EXT_BGR:   altivecfct = jsimd_ycc_extbgr_convert_altivec;   break;
  case JCS_EXT_BGRX:
  case JCS_EXT_BGRA:  altivecfct = jsimd_ycc_extbgrx_convert_altivec;  break;
  case JCS_EXT_XBGR:
  case JCS_EXT_ABGR:  altivecfct = jsimd_ycc_extxbgr_convert_altivec;  break;
  case JCS_EXT_XRGB:
  case JCS_EXT_ARGB:  altivecfct = jsimd_ycc_extxrgb_convert_altivec;  break;
  default:            altivecfct = jsimd_ycc_rgb_convert_altivec;      break;
  }

  altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
}
/* No AltiVec RGB565 implementation (jsimd_can_ycc_rgb565() returns 0, so
 * this stub is never selected). */
GLOBAL(void)
jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                         JDIMENSION input_row, JSAMPARRAY output_buf,
                         int num_rows)
{
}

/* Capability probe: AltiVec 2x2 chroma downsampling. */
GLOBAL(int)
jsimd_can_h2v2_downsample(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* Capability probe: AltiVec 2x1 chroma downsampling. */
GLOBAL(int)
jsimd_can_h2v1_downsample(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* Thin wrapper forwarding the relevant cinfo/compptr fields. */
GLOBAL(void)
jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
                      JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
                                compptr->v_samp_factor,
                                compptr->width_in_blocks,
                                input_data, output_data);
}

/* Thin wrapper forwarding the relevant cinfo/compptr fields. */
GLOBAL(void)
jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
                      JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
                                compptr->v_samp_factor,
                                compptr->width_in_blocks,
                                input_data, output_data);
}
/* Capability probe: AltiVec 2x2 (replicating) upsampling. */
GLOBAL(int)
jsimd_can_h2v2_upsample(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* Capability probe: AltiVec 2x1 (replicating) upsampling. */
GLOBAL(int)
jsimd_can_h2v1_upsample(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* Thin wrapper around the AltiVec 2x2 upsampling kernel. */
GLOBAL(void)
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
  jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
                              input_data, output_data_ptr);
}

/* Thin wrapper around the AltiVec 2x1 upsampling kernel. */
GLOBAL(void)
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
  jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
                              input_data, output_data_ptr);
}

/* Capability probe: AltiVec 2x2 fancy (triangle-filter) upsampling. */
GLOBAL(int)
jsimd_can_h2v2_fancy_upsample(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* Capability probe: AltiVec 2x1 fancy (triangle-filter) upsampling. */
GLOBAL(int)
jsimd_can_h2v1_fancy_upsample(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* Thin wrapper around the AltiVec 2x2 fancy upsampling kernel. */
GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
  jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
                                    compptr->downsampled_width,
                                    input_data, output_data_ptr);
}

/* Thin wrapper around the AltiVec 2x1 fancy upsampling kernel. */
GLOBAL(void)
jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
  jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
                                    compptr->downsampled_width,
                                    input_data, output_data_ptr);
}
/* Capability probe: AltiVec 2x2 merged upsampling/color conversion. */
GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* Capability probe: AltiVec 2x1 merged upsampling/color conversion. */
GLOBAL(int)
jsimd_can_h2v1_merged_upsample(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* 2x2 merged upsampling: dispatch on output pixel layout. */
GLOBAL(void)
jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);

  switch (cinfo->out_color_space) {
  case JCS_EXT_RGB:
    altivecfct = jsimd_h2v2_extrgb_merged_upsample_altivec;   break;
  case JCS_EXT_RGBX:
  case JCS_EXT_RGBA:
    altivecfct = jsimd_h2v2_extrgbx_merged_upsample_altivec;  break;
  case JCS_EXT_BGR:
    altivecfct = jsimd_h2v2_extbgr_merged_upsample_altivec;   break;
  case JCS_EXT_BGRX:
  case JCS_EXT_BGRA:
    altivecfct = jsimd_h2v2_extbgrx_merged_upsample_altivec;  break;
  case JCS_EXT_XBGR:
  case JCS_EXT_ABGR:
    altivecfct = jsimd_h2v2_extxbgr_merged_upsample_altivec;  break;
  case JCS_EXT_XRGB:
  case JCS_EXT_ARGB:
    altivecfct = jsimd_h2v2_extxrgb_merged_upsample_altivec;  break;
  default:
    altivecfct = jsimd_h2v2_merged_upsample_altivec;          break;
  }

  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
}

/* 2x1 merged upsampling: dispatch on output pixel layout. */
GLOBAL(void)
jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);

  switch (cinfo->out_color_space) {
  case JCS_EXT_RGB:
    altivecfct = jsimd_h2v1_extrgb_merged_upsample_altivec;   break;
  case JCS_EXT_RGBX:
  case JCS_EXT_RGBA:
    altivecfct = jsimd_h2v1_extrgbx_merged_upsample_altivec;  break;
  case JCS_EXT_BGR:
    altivecfct = jsimd_h2v1_extbgr_merged_upsample_altivec;   break;
  case JCS_EXT_BGRX:
  case JCS_EXT_BGRA:
    altivecfct = jsimd_h2v1_extbgrx_merged_upsample_altivec;  break;
  case JCS_EXT_XBGR:
  case JCS_EXT_ABGR:
    altivecfct = jsimd_h2v1_extxbgr_merged_upsample_altivec;  break;
  case JCS_EXT_XRGB:
  case JCS_EXT_ARGB:
    altivecfct = jsimd_h2v1_extxrgb_merged_upsample_altivec;  break;
  default:
    altivecfct = jsimd_h2v1_merged_upsample_altivec;          break;
  }

  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
}
/* Capability probe: AltiVec sample conversion for the forward DCT. */
GLOBAL(int)
jsimd_can_convsamp(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8 ||
      sizeof(JDIMENSION) != 4 || sizeof(DCTELEM) != 2)
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* No AltiVec floating-point sample conversion. */
GLOBAL(int)
jsimd_can_convsamp_float(void)
{
  return 0;
}

/* Thin wrapper around the AltiVec sample-conversion kernel. */
GLOBAL(void)
jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
               DCTELEM *workspace)
{
  jsimd_convsamp_altivec(sample_data, start_col, workspace);
}

/* Unused stub (jsimd_can_convsamp_float() returns 0). */
GLOBAL(void)
jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
                     FAST_FLOAT *workspace)
{
}

/* Capability probe: AltiVec accurate forward DCT. */
GLOBAL(int)
jsimd_can_fdct_islow(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* Capability probe: AltiVec fast forward DCT. */
GLOBAL(int)
jsimd_can_fdct_ifast(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* No AltiVec floating-point forward DCT. */
GLOBAL(int)
jsimd_can_fdct_float(void)
{
  return 0;
}

/* Thin wrapper around the AltiVec accurate forward DCT. */
GLOBAL(void)
jsimd_fdct_islow(DCTELEM *data)
{
  jsimd_fdct_islow_altivec(data);
}

/* Thin wrapper around the AltiVec fast forward DCT. */
GLOBAL(void)
jsimd_fdct_ifast(DCTELEM *data)
{
  jsimd_fdct_ifast_altivec(data);
}

/* Unused stub (jsimd_can_fdct_float() returns 0). */
GLOBAL(void)
jsimd_fdct_float(FAST_FLOAT *data)
{
}
/* Capability probe: AltiVec coefficient quantization. */
GLOBAL(int)
jsimd_can_quantize(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || sizeof(DCTELEM) != 2)
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* No AltiVec floating-point quantization. */
GLOBAL(int)
jsimd_can_quantize_float(void)
{
  return 0;
}

/* Thin wrapper around the AltiVec quantization kernel. */
GLOBAL(void)
jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
{
  jsimd_quantize_altivec(coef_block, divisors, workspace);
}

/* Unused stub (jsimd_can_quantize_float() returns 0). */
GLOBAL(void)
jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
                     FAST_FLOAT *workspace)
{
}

/* No AltiVec scaled 2x2 IDCT. */
GLOBAL(int)
jsimd_can_idct_2x2(void)
{
  return 0;
}

/* No AltiVec scaled 4x4 IDCT. */
GLOBAL(int)
jsimd_can_idct_4x4(void)
{
  return 0;
}

/* Unused stub (jsimd_can_idct_2x2() returns 0). */
GLOBAL(void)
jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
               JCOEFPTR coef_block, JSAMPARRAY output_buf,
               JDIMENSION output_col)
{
}

/* Unused stub (jsimd_can_idct_4x4() returns 0). */
GLOBAL(void)
jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
               JCOEFPTR coef_block, JSAMPARRAY output_buf,
               JDIMENSION output_col)
{
}
/* Capability probe: AltiVec accurate IDCT. */
GLOBAL(int)
jsimd_can_idct_islow(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* Capability probe: AltiVec fast IDCT. */
GLOBAL(int)
jsimd_can_idct_ifast(void)
{
  init_simd();

  /* The code is optimised for these values only */
  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
    return 0;

  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
}

/* No AltiVec floating-point IDCT. */
GLOBAL(int)
jsimd_can_idct_float(void)
{
  return 0;
}

/* Thin wrapper around the AltiVec accurate IDCT kernel. */
GLOBAL(void)
jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
{
  jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
                           output_col);
}

/* Thin wrapper around the AltiVec fast IDCT kernel. */
GLOBAL(void)
jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
{
  jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
                           output_col);
}

/* Unused stub (jsimd_can_idct_float() returns 0). */
GLOBAL(void)
jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
{
}

/* No AltiVec Huffman encoder. */
GLOBAL(int)
jsimd_can_huff_encode_one_block(void)
{
  return 0;
}

/* Unused stub (jsimd_can_huff_encode_one_block() returns 0). */
GLOBAL(JOCTET *)
jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
                            int last_dc_val, c_derived_tbl *dctbl,
                            c_derived_tbl *actbl)
{
  return NULL;
}

/* No AltiVec progressive-Huffman first-pass preparation. */
GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
  return 0;
}

/* Unused stub (jsimd_can_encode_mcu_AC_first_prepare() returns 0). */
GLOBAL(void)
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
                                  const int *jpeg_natural_order_start, int Sl,
                                  int Al, JCOEF *values, size_t *zerobits)
{
}

/* No AltiVec progressive-Huffman refinement-pass preparation. */
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
  return 0;
}

/* Unused stub (jsimd_can_encode_mcu_AC_refine_prepare() returns 0). */
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
                                   int Al, JCOEF *absvalues, size_t *bits)
{
  return 0;
}

View file

@ -0,0 +1,98 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
* Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include <altivec.h>
/* Common code */
/* Vector-initializer helpers: replicate a value (or a pair) to fill a
 * 128-bit vector literal. */
#define __4X(a) a, a, a, a
#define __4X2(a, b) a, b, a, b, a, b, a, b
#define __8X(a) __4X(a), __4X(a)
#define __16X(a) __8X(a), __8X(a)

/* Transpose an 8x8 matrix of 16-bit elements held in the eight vectors
 * row##0..row##7, writing the result to col##0..col##7, using three
 * rounds of merge (interleave) operations.  'row' and 'col' are name
 * prefixes pasted onto 0..7. */
#define TRANSPOSE(row, col) { \
  __vector short row04l, row04h, row15l, row15h, \
    row26l, row26h, row37l, row37h; \
  __vector short col01e, col01o, col23e, col23o, \
    col45e, col45o, col67e, col67o; \
  \
  /* transpose coefficients (phase 1) */ \
  row04l = vec_mergeh(row##0, row##4);  /* row04l=(00 40 01 41 02 42 03 43) */ \
  row04h = vec_mergel(row##0, row##4);  /* row04h=(04 44 05 45 06 46 07 47) */ \
  row15l = vec_mergeh(row##1, row##5);  /* row15l=(10 50 11 51 12 52 13 53) */ \
  row15h = vec_mergel(row##1, row##5);  /* row15h=(14 54 15 55 16 56 17 57) */ \
  row26l = vec_mergeh(row##2, row##6);  /* row26l=(20 60 21 61 22 62 23 63) */ \
  row26h = vec_mergel(row##2, row##6);  /* row26h=(24 64 25 65 26 66 27 67) */ \
  row37l = vec_mergeh(row##3, row##7);  /* row37l=(30 70 31 71 32 72 33 73) */ \
  row37h = vec_mergel(row##3, row##7);  /* row37h=(34 74 35 75 36 76 37 77) */ \
  \
  /* transpose coefficients (phase 2) */ \
  col01e = vec_mergeh(row04l, row26l);  /* col01e=(00 20 40 60 01 21 41 61) */ \
  col23e = vec_mergel(row04l, row26l);  /* col23e=(02 22 42 62 03 23 43 63) */ \
  col45e = vec_mergeh(row04h, row26h);  /* col45e=(04 24 44 64 05 25 45 65) */ \
  col67e = vec_mergel(row04h, row26h);  /* col67e=(06 26 46 66 07 27 47 67) */ \
  col01o = vec_mergeh(row15l, row37l);  /* col01o=(10 30 50 70 11 31 51 71) */ \
  col23o = vec_mergel(row15l, row37l);  /* col23o=(12 32 52 72 13 33 53 73) */ \
  col45o = vec_mergeh(row15h, row37h);  /* col45o=(14 34 54 74 15 35 55 75) */ \
  col67o = vec_mergel(row15h, row37h);  /* col67o=(16 36 56 76 17 37 57 77) */ \
  \
  /* transpose coefficients (phase 3) */ \
  col##0 = vec_mergeh(col01e, col01o);  /* col0=(00 10 20 30 40 50 60 70) */ \
  col##1 = vec_mergel(col01e, col01o);  /* col1=(01 11 21 31 41 51 61 71) */ \
  col##2 = vec_mergeh(col23e, col23o);  /* col2=(02 12 22 32 42 52 62 72) */ \
  col##3 = vec_mergel(col23e, col23o);  /* col3=(03 13 23 33 43 53 63 73) */ \
  col##4 = vec_mergeh(col45e, col45o);  /* col4=(04 14 24 34 44 54 64 74) */ \
  col##5 = vec_mergel(col45e, col45o);  /* col5=(05 15 25 35 45 55 65 75) */ \
  col##6 = vec_mergeh(col67e, col67o);  /* col6=(06 16 26 36 46 56 66 76) */ \
  col##7 = vec_mergel(col67e, col67o);  /* col7=(07 17 27 37 47 57 67 77) */ \
}
#ifndef min
/* NOTE: function-like macro -- both arguments may be evaluated twice. */
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif

/* Macros to abstract big/little endian bit twiddling */
#if __BIG_ENDIAN__
/* Big endian: classic AltiVec aligned load/store; zero-extend bytes to
 * halfwords by merging the zero vector in as the high bytes. */
#define VEC_LD(a, b) vec_ld(a, b)
#define VEC_ST(a, b, c) vec_st(a, b, c)
#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a)
#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a)
#else
/* Little endian: VSX unaligned load/store; the zero bytes merge in on the
 * opposite side.  These macros assume a vector named pb_zero is in scope. */
#define VEC_LD(a, b) vec_vsx_ld(a, b)
#define VEC_ST(a, b, c) vec_vsx_st(a, b, c)
#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero)
#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero)
#endif