Elliott's Firefox Builds for Mac OS X
Here are Firefox builds for Mac OS X. They run on Intel processors.
May
3, 2008 Firefox 3.0Pre Trunk 20080503 for Mac OS X JPEG,
rounding
August
23, 2007 firefox 2.0.0.6 x08 for Mac OS X optimized PNG
decompresser and GIF decoder
August
23, 2007 firefox 2.0.0.6 x07 for Mac OS X optimized PNG
decoder copying
August
18, 2007 firefox 2.0.0.6 x06 for Mac OS X fixed cnet.com bug
in x05, added gfxImageFrame optimization, added -mfpmath=sse
-ftree-vectorize
August
17, 2007 firefox 2.0.0.6 x05 for Mac OS X added JPEG decoding
SSE2 optimization (crash bug at cnet.com)
August
16, 2007 firefox 2.0.0.6 x04 for Mac OS X added JPEG SSE2
Color Optimization
August
12, 2007 firefox 2.0.0.6 x03 for Mac OS X added JPEG
Inverse Discrete Cosine Transform SSE2 acceleration
August
8, 2007 firefox 2.0.0.6 x01 for Mac OS X added
jslock lightweight locking and lightweight rounding
August
8, 2007 firefox 2.0.0.6 for Mac OS X stock build
with -o3 optimization
Things to do:
- Port range limit tables from x64 (JPEG)
- Use SSE2 processing for jdsample box filters (JPEG)
- Use integer work variable instead of a double for JS
conversion of string ints to doubles
- Port PNG MMX optimizations to intrinsics (work in progress)
- Consider writing a highly optimized memcpy routine
- Get PowerPC G5 builds up and running
Patches applied to
x08 build (excluding IJG code):
Index: Makefile.in
===================================================================
RCS file: /cvsroot/mozilla/modules/libimg/png/Makefile.in,v
retrieving revision 1.27.18.1
diff -u -r1.27.18.1 Makefile.in
--- Makefile.in 20 May 2006 17:20:48
-0000 1.27.18.1
+++ Makefile.in 25 Aug 2007 23:01:07 -0000
@@ -68,6 +68,7 @@
pngwtran.c \
pngrtran.c \
pngwutil.c \
+
pnggccrd.c \
$(NULL)
EXPORTS
= png.h pngconf.h mozpngconf.h
@@ -85,6 +86,8 @@
DEFINES
+= -DMOZ_PNG_WRITE
endif
+DEFINES += -DPNG_USE_PNGGCCRD
+
ifeq ($(OS_ARCH),WINNT)
DEFINES
+= -DZLIB_DLL
ifdef BUILD_DEBUG_GC
Index: pnggccrd.c
===================================================================
RCS file: /cvsroot/mozilla/modules/libimg/png/pnggccrd.c,v
retrieving revision 3.7
diff -u -r3.7 pnggccrd.c
--- pnggccrd.c 7 Oct 2004 19:21:29
-0000 3.7
+++ pnggccrd.c 25 Aug 2007 23:00:41 -0000
@@ -236,13 +236,15 @@
* - rewrite all MMX
interlacing code so it's aligned with beginning
*
of the row buffer, not the end (see 19991007 for details)
* x pick one version
of mmxsupport() and get rid of the other
- * - add error messages to any
remaining bogus default cases
+ * - add error messages to any
remaining bogus defau>lt cases
* - enable
pixel_depth == 8 cases in png_read_filter_row()? (test speed)
* x add support for
runtime enable/disable/query of various MMX routines
*/
#define PNG_INTERNAL
#include "png.h"
+#include "emmintrin.h"
+#include "xmmintrin.h"
#if defined(PNG_USE_PNGGCCRD)
@@ -254,6 +256,8 @@
static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2,
1, 1};
#endif
+#undef PNG_ASSEMBLER_CODE_SUPPORTED
+
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
/* djgpp, Win32, and Cygwin add their own underscores to
global variables,
* so define them without: */
@@ -5190,12 +5194,32 @@
else
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
{
-
png_uint_32 i;
-
png_uint_32 istop = row_info->rowbytes;
-
png_bytep rp = row;
-
png_bytep pp = prev_row;
+ png_uint_32 i, j;
+ png_uint_32 istop
= row_info->rowbytes;
+ png_bytep rp = row;
+ png_bytep pp =
prev_row;
+ __m128i rp_vec1,
pp_vec1;
+ unsigned int diff;
+
+ if (istop >
40) {
+
diff = ((unsigned int) rp)
& 0xf;
+
diff = (16 - diff) &
0xf;
+
for (i = 0; i < diff;
++i)
+
{
+
*rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
+
rp++;
+
}
+
for (j = diff; j <
istop - 16; j += 16) {
+
rp_vec1 =
_mm_load_si128((__m128i const *) rp);
+
pp_vec1 =
_mm_loadu_si128((__m128i const *) pp);
+
rp_vec1 =
_mm_add_epi8(rp_vec1, pp_vec1);
+
_mm_store_si128((__m128i *) rp, rp_vec1);
+
rp += 16;
+
pp += 16;
+
}
+ }
-
for (i = 0; i < istop; ++i)
+
for (i = j; i < istop; ++i)
{
*rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
rp++;
@@ -5338,69 +5362,7 @@
int PNGAPI
png_mmx_support(void)
{
-#if defined(PNG_MMX_CODE_SUPPORTED)
- __asm__ __volatile__ (
-
"pushl
%%ebx
\n\t"
// ebx gets clobbered by CPUID instruction
-
"pushl
%%ecx
\n\t"
// so does ecx...
-
"pushl
%%edx
\n\t"
// ...and edx (but ecx & edx safe on Linux)
-// ".byte
0x66
\n\t"
// convert 16-bit pushf to 32-bit pushfd
-//
"pushf
\n\t" // 16-bit pushf
-
"pushfl
\n\t" // save Eflag to stack
-
"popl
%%eax
\n\t" // get Eflag from stack into eax
-
"movl %%eax,
%%ecx \n\t" // make another
copy of Eflag in ecx
-
"xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
-
"pushl
%%eax
\n\t"
// save modified Eflag back to stack
-// ".byte
0x66
\n\t"
// convert 16-bit popf to 32-bit popfd
-//
"popf
\n\t" // 16-bit popf
-
"popfl
\n\t" // restore modified value to Eflag reg
-
"pushfl
\n\t" // save Eflag to stack
-
"popl
%%eax
\n\t" // get Eflag from stack
-
"pushl
%%ecx
\n\t"
// save original Eflag to stack
-
"popfl
\n\t" // restore original Eflag
-
"xorl %%ecx,
%%eax \n\t" // compare new
Eflag with original
Eflag
-
"jz
0f
\n\t" // if same, CPUID instr. is not supported
-
-
"xorl %%eax, %%eax \n\t" // set
eax to zero
-// ".byte
0x0f, 0xa2 \n\t" // CPUID
instruction (two-byte opcode)
-
"cpuid
\n\t" // get the CPU identification info
-
"cmpl $1,
%%eax
\n\t" // make sure eax
return non-zero value
-
"jl
0f
\n\t" // if eax is zero, MMX is not supported
-
-
"xorl %%eax, %%eax \n\t" // set
eax to zero and...
-
"incl
%%eax
\n\t" // ...increment eax to 1. This pair is
-
// faster than the instruction "mov eax, 1"
-
"cpuid
\n\t" // get the CPU identification info again
-
"andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
-
"cmpl $0,
%%edx
\n\t" // 0 = MMX not
supported
-
"jz
0f
\n\t" // non-zero = yes, MMX IS supported
-
-
"movl $1,
%%eax
\n\t" // set return
value to 1
-
"jmp
1f
\n\t" // DONE: have MMX support
-
-
"0:
\n\t" // .NOT_SUPPORTED: target label for jump instructions
-
"movl $0,
%%eax
\n\t" // set return
value to 0
-
"1:
\n\t" // .RETURN: target label for jump instructions
-
"movl %%eax, _mmx_supported \n\t" // save in global static variable, too
-
"popl
%%edx
\n\t" // restore edx
-
"popl
%%ecx
\n\t" // restore ecx
-
"popl
%%ebx
\n\t" // restore ebx
-
-//
"ret
\n\t" // DONE: no MMX support
-
// (fall through to standard C "ret")
-
-
:
// output list (none)
-
-
:
// any variables used on input (none)
-
-
:
"%eax"
// clobber list
-// , "%ebx", "%ecx",
"%edx" // GRR: we
handle these manually
-// ,
"memory" // if write to a variable gcc thought was
in a reg
-// ,
"cc" //
"condition codes" (flag bits)
- );
-#else
_mmx_supported = 0;
-#endif /* PNG_MMX_CODE_SUPPORTED */
-
return _mmx_supported;
}
Index: nsJPEGDecoder.cpp
===================================================================
RCS file:
/cvsroot/mozilla/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp,v
retrieving revision 1.62.18.1
diff -u -r1.62.18.1 nsJPEGDecoder.cpp
--- nsJPEGDecoder.cpp 3 Feb 2006 14:41:10
-0000 1.62.18.1
+++ nsJPEGDecoder.cpp 18 Aug 2007
17:33:20 -0000
@@ -49,6 +49,8 @@
#include "ImageLogging.h"
#include "jerror.h"
+#include "emmintrin.h"
+#include "xmmintrin.h"
NS_IMPL_ISUPPORTS1(nsJPEGDecoder, imgIDecoder)
@@ -361,7 +363,7 @@
mSamples =
(*mInfo.mem->alloc_sarray)((j_common_ptr) &mInfo,
JPOOL_IMAGE,
-
row_stride, 1);
+
mInfo.output_width * 3, 1);
#if defined(XP_WIN) || defined(XP_OS2) || defined(XP_BEOS) ||
defined(XP_MAC) || defined(XP_MACOSX) || defined(MOZ_WIDGET_PHOTON)
// allocate buffer to do byte
flipping / padding
@@ -379,7 +381,7 @@
/* FIXME -- Should reset
dct_method and dither mode
* for final pass of
progressive JPEG
*/
- mInfo.dct_method = JDCT_ISLOW;
+ mInfo.dct_method = JDCT_IFAST;
mInfo.dither_mode = JDITHER_FS;
mInfo.do_fancy_upsampling =
TRUE;
mInfo.enable_2pass_quant =
FALSE;
@@ -499,12 +501,12 @@
return NS_OK;
}
-
int
nsJPEGDecoder::OutputScanlines()
{
PRUint32 top = mInfo.output_scanline;
PRBool rv = PR_TRUE;
+ PRUint32 i;
while ((mInfo.output_scanline <
mInfo.output_height)) {
JSAMPROW
samples;
@@ -529,11 +531,29 @@
}
samples =
mRGBRow;
-#elif defined(XP_MAC) || defined(XP_MACOSX)
+#elif defined(XP_MAC1) || defined(XP_MACOSX1)
PRUint8
*ptrOutputBuf = mRGBRow;
JSAMPLE
*j1 = mSamples[0];
- for (PRUint32
i=0;i<mInfo.output_width;++i) {
+ mask1 = *((__m128i
*) color_mask_1a);
+ mask2 = *((__m128i
*) color_mask_2a);
+ for (i=0; i
< mInfo.output_width - 4; i += 4) {
+
tmp1 = mask1;
+
tmp2 = mask2;
+
read1 = _mm_loadl_epi64((__m128i *) j1);
+
read1 = (__m128i) _mm_loadh_pi((__m128) read1, (__m64 *)(j1+6));
+
tmp1 = _mm_and_si128(tmp1, read1);
+
tmp2 = _mm_and_si128(tmp2, read1);
+
tmp1 = _mm_slli_si128(tmp1, 1);
+
tmp2 = _mm_slli_si128(tmp2, 2);
+
tmp1 = _mm_or_si128(tmp1, tmp2);
+
_mm_storel_pi((__m64 *) ptrOutputBuf, (__m128) tmp1);
+
_mm_storeh_pi((__m64 *) (ptrOutputBuf + 8), (__m128) tmp1);
+
ptrOutputBuf += 16;
+
j1 += 12;
+ }
+
+ for (; i <
mInfo.output_width; ++i) {
ptrOutputBuf[0] = 0;
ptrOutputBuf[1] = *j1++;
ptrOutputBuf[2] = *j1++;
@@ -557,8 +577,8 @@
PRUint32
bpr;
mFrame->GetImageBytesPerRow(&bpr);
mFrame->SetImageData(
-
samples,
// data
-
row_stride,
// length
+
samples,
// data
+
-mInfo.output_width,
// length
(mInfo.output_scanline-1) * bpr); // offset
}
Index: gfxImageFrame.cpp
===================================================================
RCS file: /cvsroot/mozilla/gfx/src/shared/gfxImageFrame.cpp,v
retrieving revision 1.32
diff -u -r1.32 gfxImageFrame.cpp
--- gfxImageFrame.cpp 8 Mar 2005 03:44:27
-0000 1.32
+++ gfxImageFrame.cpp 18 Aug 2007
23:51:51 -0000
@@ -39,6 +39,8 @@
#include "gfxImageFrame.h"
#include "nsIServiceManager.h"
+#include "emmintrin.h"
+#include "xmmintrin.h"
NS_IMPL_ISUPPORTS2(gfxImageFrame, gfxIImageFrame,
nsIInterfaceRequestor)
@@ -279,9 +281,15 @@
return NS_OK;
}
+unsigned char color_mask_1b[16] __attribute__ ((aligned(16))) =
{255,255,255,0,0,0,0,0,255,255,255,0,0,0,0,0};
+unsigned char color_mask_2b[16] __attribute__ ((aligned(16))) =
{0,0,0,255,255,255,0,0,0,0,0,255,255,255,0,0};
+
/* void setImageData ([array, size_is (length), const] in
PRUint8 data, in unsigned long length, in long offset); */
NS_IMETHODIMP gfxImageFrame::SetImageData(const PRUint8
*aData, PRUint32 aLength, PRInt32 aOffset)
{
+ int express_flag = 0;
+ __m128i mask1, mask2, tmp1, tmp2, read1;
+
if (!mInitalized)
return
NS_ERROR_NOT_INITIALIZED;
@@ -292,6 +300,11 @@
if (aLength == 0)
return NS_OK;
+ if ((int) aLength < 0) {
+ express_flag = - ((int) aLength);
+ aLength = (-4) * ((int) aLength);
+ }
+
PRInt32 row_stride =
mImage->GetLineStride();
mImage->LockImagePixels(PR_FALSE);
@@ -313,7 +326,40 @@
}
if (aData)
- memcpy(imgData + newOffset, aData,
aLength);
+ if (express_flag == 0)
+ memcpy(imgData +
newOffset, aData, aLength);
+ else {
+ const PRUint8 *j1 =
aData;
+ PRUint8
*ptrOutputBuf = imgData + newOffset;
+ int i;
+
+ mask1 = *((__m128i
*) color_mask_1b);
+ mask2 = *((__m128i
*) color_mask_2b);
+ for (i=0; i
< express_flag - 4; i += 4) {
+
tmp1 = mask1;
+
tmp2 = mask2;
+
read1 = _mm_loadl_epi64((__m128i *) j1);
+
read1 = (__m128i) _mm_loadh_pi((__m128) read1, (__m64 *)(j1+6));
+
tmp1 = _mm_and_si128(tmp1, read1);
+
tmp2 = _mm_and_si128(tmp2, read1);
+
tmp1 = _mm_slli_si128(tmp1, 1);
+
tmp2 = _mm_slli_si128(tmp2, 2);
+
tmp1 = _mm_or_si128(tmp1, tmp2);
+
_mm_storel_pi((__m64 *) ptrOutputBuf, (__m128) tmp1);
+
_mm_storeh_pi((__m64 *) (ptrOutputBuf + 8), (__m128) tmp1);
+
ptrOutputBuf += 16;
+
j1 += 12;
+ }
+
+ for (; i <
express_flag; ++i) {
+
ptrOutputBuf[0] = 0;
+
ptrOutputBuf[1] = *j1++;
+
ptrOutputBuf[2] = *j1++;
+
ptrOutputBuf[3] = *j1++;
+
ptrOutputBuf += 4;
+ }
+
+ }
else
memset(imgData + newOffset, 0,
aLength);
mImage->UnlockImagePixels(PR_FALSE);
Index: nsUnitConversion.h
===================================================================
RCS file: /cvsroot/mozilla/xpcom/ds/nsUnitConversion.h,v
retrieving revision 3.12
diff -r3.12 nsUnitConversion.h
43a44
> #include <xmmintrin.h>
82c83,85
< return ((0.0f <= aValue) ?
nscoord(aValue + ROUND_CONST_FLOAT) : nscoord(aValue -
ROUND_CONST_FLOAT));
---
> /* return ((0.0f <= aValue) ?
nscoord(aValue + ROUND_CONST_FLOAT) : nscoord(aValue -
ROUND_CONST_FLOAT)); */
>
return(_mm_cvtss_si32(_mm_load_ss(&aValue)));
>
101c104,105
< return ((0.0f <= aValue) ?
PRInt32(aValue + ROUND_CONST_FLOAT) : PRInt32(aValue -
ROUND_CONST_FLOAT));
---
> /* return ((0.0f <= aValue) ?
PRInt32(aValue + ROUND_CONST_FLOAT) : PRInt32(aValue -
ROUND_CONST_FLOAT)); */
>
return(_mm_cvtss_si32(_mm_load_ss(&aValue)));
Index: GIF2.cpp
===================================================================
RCS file: /cvsroot/mozilla/modules/libpr0n/decoders/gif/Attic/GIF2.cpp,v
retrieving revision 1.52.8.2
diff -u -r1.52.8.2 GIF2.cpp
--- GIF2.cpp 4 Apr 2007 19:30:42
-0000 1.52.8.2
+++ GIF2.cpp 29 Aug 2007 02:38:57 -0000
@@ -565,7 +565,18 @@
case gif_global_colormap:
{
-
memcpy(gs->global_colormap, q, 3 *
gs->global_colormap_size);
+ int i, target;
+ PRUint8 * gcmap =
gs->global_colormap;
+
+ target =
gs->global_colormap_size;
+ for (i = 0; i
< target; i++) {
+
*gcmap++ = 0;
+
*gcmap++ = *q++;
+
*gcmap++ = *q++;
+
*gcmap++ = *q++;
+ }
+
+ /*
memcpy(gs->global_colormap, q, 3 *
gs->global_colormap_size); */
GETN(1,
gif_image_start);
}
@@ -858,15 +869,23 @@
case gif_image_colormap:
{
PRUint8
*map = gs->local_colormap;
+ int i, target;
if (!map) {
-
map = gs->local_colormap = (PRUint8*)PR_MALLOC(3 *
gs->local_colormap_size);
+
map = gs->local_colormap = (PRUint8*)PR_MALLOC(4 *
gs->local_colormap_size);
if (!map) {
gs->state = gif_oom;
break;
}
}
+ target =
gs->local_colormap_size;
+ for (i = 0; i
< target; i++) {
+
*map++ = 0;
+
*map++ = *q++;
+
*map++ = *q++;
+
*map++ = *q++;
+ }
- memcpy(map, q, 3 *
gs->local_colormap_size);
+ /* memcpy(map, q, 3
* gs->local_colormap_size); */
GETN(1,
gif_lzw_start);
}
Index: nsGIFDecoder2.cpp
===================================================================
RCS file:
/cvsroot/mozilla/modules/libpr0n/decoders/gif/nsGIFDecoder2.cpp,v
retrieving revision 1.65
diff -u -r1.65 nsGIFDecoder2.cpp
--- nsGIFDecoder2.cpp 10 Mar 2005
13:31:26 -0000 1.65
+++ nsGIFDecoder2.cpp 29 Aug 2007
02:38:58 -0000
@@ -484,10 +484,11 @@
if(decoder->mGIFStruct->global_colormap &&
decoder->mGIFStruct->screen_bgcolor < cmapsize) {
gfx_color
bgColor = 0;
- PRUint32 bgIndex =
decoder->mGIFStruct->screen_bgcolor * 3;
- bgColor |=
cmap[bgIndex];
- bgColor |=
cmap[bgIndex + 1] << 8;
- bgColor |=
cmap[bgIndex + 2] << 16;
+ PRUint32 bgIndex =
decoder->mGIFStruct->screen_bgcolor * 4;
+ bgColor =
((PRUint32 *) cmap)[decoder->mGIFStruct->screen_bgcolor];
+ // bgColor |=
cmap[bgIndex];
+ // bgColor |=
cmap[bgIndex + 1] << 8;
+ // bgColor |=
cmap[bgIndex + 2] << 16;
decoder->mImageFrame->SetBackgroundColor(bgColor);
}
if
(decoder->mGIFStruct->is_local_colormap_defined) {
@@ -514,8 +515,10 @@
case gfxIFormats::BGR:
{
while (rowBufIndex != decoder->mGIFStruct->rowend) {
+
PRUint32 * rgbRowIndexLong = (PRUint32 *) rgbRowIndex;
+
PRUint32 * cmaplong = (PRUint32 *) cmap;
#if defined(XP_MAC) || defined(XP_MACOSX)
-
*rgbRowIndex++ = 0; // Mac is always 32bits per pixel, this is pad
+
//*rgbRowIndex++ = 0; // Mac is always 32bits per pixel, this is pad
#endif
if (*rowBufIndex < cmapsize) {
PRUint32 colorIndex = *rowBufIndex * 3;
@@ -524,14 +527,18 @@
*rgbRowIndex++ = cmap[colorIndex + 1]; // green
*rgbRowIndex++ =
cmap[colorIndex]; // red
#else
-
*rgbRowIndex++ =
cmap[colorIndex]; // red
-
*rgbRowIndex++ = cmap[colorIndex + 1]; // green
-
*rgbRowIndex++ = cmap[colorIndex + 2]; // blue
+
//*rgbRowIndex++ =
cmap[colorIndex]; // red
+
//*rgbRowIndex++ = cmap[colorIndex + 1]; // green
+
//*rgbRowIndex++ = cmap[colorIndex + 2]; // blue
+
*rgbRowIndexLong = cmaplong[*rowBufIndex];
+
rgbRowIndex += 4;
#endif
} else {
-
*rgbRowIndex++ =
0;
// red
-
*rgbRowIndex++ =
0;
// green
-
*rgbRowIndex++ =
0;
// blue
+
*rgbRowIndexLong = 0;
+
// *rgbRowIndex++ =
0;
// red
+
// *rgbRowIndex++ =
0;
// green
+
// *rgbRowIndex++ =
0;
// blue
+
rgbRowIndex += 4;
}
++rowBufIndex;
}
@@ -547,9 +554,11 @@
memset(decoder->mRGBLine, 0, bpr);
memset(decoder->mAlphaLine, 0, abpr);
for (PRUint32 x = 0; x < (PRUint32)width; ++x) {
+
PRUint32 * rgbRowIndexLong = (PRUint32 *) rgbRowIndex;
+
PRUint32 * cmaplong = (PRUint32 *) cmap;
if (*rowBufIndex != decoder->mGIFStruct->tpixel) {
#if defined(XP_MAC) || defined(XP_MACOSX)
-
*rgbRowIndex++ = 0; // Mac is always 32bits per pixel, this is pad
+
//*rgbRowIndex++ = 0; // Mac is always 32bits per pixel, this is pad
#endif
if (*rowBufIndex < cmapsize) {
PRUint32 colorIndex = *rowBufIndex * 3;
@@ -558,14 +567,18 @@
*rgbRowIndex++ = cmap[colorIndex + 1]; // green
*rgbRowIndex++ =
cmap[colorIndex]; // red
#else
-
*rgbRowIndex++ =
cmap[colorIndex]; // red
-
*rgbRowIndex++ = cmap[colorIndex + 1]; // green
-
*rgbRowIndex++ = cmap[colorIndex + 2]; // blue
+
//*rgbRowIndex++ =
cmap[colorIndex]; // red
+
//*rgbRowIndex++ = cmap[colorIndex + 1]; // green
+
//*rgbRowIndex++ = cmap[colorIndex + 2]; // blue
+
*rgbRowIndexLong = cmaplong[*rowBufIndex];
+
rgbRowIndex += 4;
#endif
} else {
-
*rgbRowIndex++ =
0;
// red
-
*rgbRowIndex++ =
0;
// green
-
*rgbRowIndex++ =
0;
// blue
+
*rgbRowIndexLong = 0;
+
//*rgbRowIndex++ =
0;
// red
+
//*rgbRowIndex++ =
0;
// green
+
//*rgbRowIndex++ =
0;
// blue
+
rgbRowIndex += 4;
}
decoder->mAlphaLine[x>>3] |=
1<<(7-x&0x7);
} else {
Index: GIF2.cpp
===================================================================
RCS file: /cvsroot/mozilla/modules/libpr0n/decoders/gif/Attic/GIF2.cpp,v
retrieving revision 1.52.8.2
diff -u -r1.52.8.2 GIF2.cpp
--- GIF2.cpp 4 Apr 2007 19:30:42
-0000 1.52.8.2
+++ GIF2.cpp 29 Aug 2007 02:38:57 -0000
@@ -565,7 +565,18 @@
case gif_global_colormap:
{
-
memcpy(gs->global_colormap, q, 3 *
gs->global_colormap_size);
+ int i, target;
+ PRUint8 * gcmap =
gs->global_colormap;
+
+ target =
gs->global_colormap_size;
+ for (i = 0; i
< target; i++) {
+
*gcmap++ = 0;
+
*gcmap++ = *q++;
+
*gcmap++ = *q++;
+
*gcmap++ = *q++;
+ }
+
+ /*
memcpy(gs->global_colormap, q, 3 *
gs->global_colormap_size); */
GETN(1,
gif_image_start);
}
@@ -858,15 +869,23 @@
case gif_image_colormap:
{
PRUint8
*map = gs->local_colormap;
+ int i, target;
if (!map) {
-
map = gs->local_colormap = (PRUint8*)PR_MALLOC(3 *
gs->local_colormap_size);
+
map = gs->local_colormap = (PRUint8*)PR_MALLOC(4 *
gs->local_colormap_size);
if (!map) {
gs->state = gif_oom;
break;
}
}
+ target =
gs->local_colormap_size;
+ for (i = 0; i
< target; i++) {
+
*map++ = 0;
+
*map++ = *q++;
+
*map++ = *q++;
+
*map++ = *q++;
+ }
- memcpy(map, q, 3 *
gs->local_colormap_size);
+ /* memcpy(map, q, 3
* gs->local_colormap_size); */
GETN(1,
gif_lzw_start);
}
Index: nsGIFDecoder2.cpp
===================================================================
RCS file:
/cvsroot/mozilla/modules/libpr0n/decoders/gif/nsGIFDecoder2.cpp,v
retrieving revision 1.65
diff -u -r1.65 nsGIFDecoder2.cpp
--- nsGIFDecoder2.cpp 10 Mar 2005
13:31:26 -0000 1.65
+++ nsGIFDecoder2.cpp 29 Aug 2007
02:38:58 -0000
@@ -484,10 +484,11 @@
if(decoder->mGIFStruct->global_colormap &&
decoder->mGIFStruct->screen_bgcolor < cmapsize) {
gfx_color
bgColor = 0;
- PRUint32 bgIndex =
decoder->mGIFStruct->screen_bgcolor * 3;
- bgColor |=
cmap[bgIndex];
- bgColor |=
cmap[bgIndex + 1] << 8;
- bgColor |=
cmap[bgIndex + 2] << 16;
+ PRUint32 bgIndex =
decoder->mGIFStruct->screen_bgcolor * 4;
+ bgColor =
((PRUint32 *) cmap)[decoder->mGIFStruct->screen_bgcolor];
+ // bgColor |=
cmap[bgIndex];
+ // bgColor |=
cmap[bgIndex + 1] << 8;
+ // bgColor |=
cmap[bgIndex + 2] << 16;
decoder->mImageFrame->SetBackgroundColor(bgColor);
}
if
(decoder->mGIFStruct->is_local_colormap_defined) {
@@ -514,8 +515,10 @@
case gfxIFormats::BGR:
{
while (rowBufIndex != decoder->mGIFStruct->rowend) {
+
PRUint32 * rgbRowIndexLong = (PRUint32 *) rgbRowIndex;
+
PRUint32 * cmaplong = (PRUint32 *) cmap;
#if defined(XP_MAC) || defined(XP_MACOSX)
-
*rgbRowIndex++ = 0; // Mac is always 32bits per pixel, this is pad
+
//*rgbRowIndex++ = 0; // Mac is always 32bits per pixel, this is pad
#endif
if (*rowBufIndex < cmapsize) {
PRUint32 colorIndex = *rowBufIndex * 3;
@@ -524,14 +527,18 @@
*rgbRowIndex++ = cmap[colorIndex + 1]; // green
*rgbRowIndex++ =
cmap[colorIndex]; // red
#else
-
*rgbRowIndex++ =
cmap[colorIndex]; // red
-
*rgbRowIndex++ = cmap[colorIndex + 1]; // green
-
*rgbRowIndex++ = cmap[colorIndex + 2]; // blue
+
//*rgbRowIndex++ =
cmap[colorIndex]; // red
+
//*rgbRowIndex++ = cmap[colorIndex + 1]; // green
+
//*rgbRowIndex++ = cmap[colorIndex + 2]; // blue
+
*rgbRowIndexLong = cmaplong[*rowBufIndex];
+
rgbRowIndex += 4;
#endif
} else {
-
*rgbRowIndex++ =
0;
// red
-
*rgbRowIndex++ =
0;
// green
-
*rgbRowIndex++ =
0;
// blue
+
*rgbRowIndexLong = 0;
+
// *rgbRowIndex++ =
0;
// red
+
// *rgbRowIndex++ =
0;
// green
+
// *rgbRowIndex++ =
0;
// blue
+
rgbRowIndex += 4;
}
++rowBufIndex;
}
@@ -547,9 +554,11 @@
memset(decoder->mRGBLine, 0, bpr);
memset(decoder->mAlphaLine, 0, abpr);
for (PRUint32 x = 0; x < (PRUint32)width; ++x) {
+
PRUint32 * rgbRowIndexLong = (PRUint32 *) rgbRowIndex;
+
PRUint32 * cmaplong = (PRUint32 *) cmap;
if (*rowBufIndex != decoder->mGIFStruct->tpixel) {
#if defined(XP_MAC) || defined(XP_MACOSX)
-
*rgbRowIndex++ = 0; // Mac is always 32bits per pixel, this is pad
+
//*rgbRowIndex++ = 0; // Mac is always 32bits per pixel, this is pad
#endif
if (*rowBufIndex < cmapsize) {
PRUint32 colorIndex = *rowBufIndex * 3;
@@ -558,14 +567,18 @@
*rgbRowIndex++ = cmap[colorIndex + 1]; // green
*rgbRowIndex++ =
cmap[colorIndex]; // red
#else
-
*rgbRowIndex++ =
cmap[colorIndex]; // red
-
*rgbRowIndex++ = cmap[colorIndex + 1]; // green
-
*rgbRowIndex++ = cmap[colorIndex + 2]; // blue
+
//*rgbRowIndex++ =
cmap[colorIndex]; // red
+
//*rgbRowIndex++ = cmap[colorIndex + 1]; // green
+
//*rgbRowIndex++ = cmap[colorIndex + 2]; // blue
+
*rgbRowIndexLong = cmaplong[*rowBufIndex];
+
rgbRowIndex += 4;
#endif
} else {
-
*rgbRowIndex++ =
0;
// red
-
*rgbRowIndex++ =
0;
// green
-
*rgbRowIndex++ =
0;
// blue
+
*rgbRowIndexLong = 0;
+
//*rgbRowIndex++ =
0;
// red
+
//*rgbRowIndex++ =
0;
// green
+
//*rgbRowIndex++ =
0;
// blue
+
rgbRowIndex += 4;
}
decoder->mAlphaLine[x>>3] |=
1<<(7-x&0x7);
} else {
3.0 patches
===================================================================
RCS file: /cvsroot/mozilla/gfx/thebes/public/gfxPlatform.h,v
retrieving revision 1.29
diff -u -8 -p -r1.29 gfxPlatform.h
--- gfx/thebes/public/gfxPlatform.h 4 Apr 2008 04:23:19 -0000 1.29
+++ gfx/thebes/public/gfxPlatform.h 3 May 2008 18:44:47 -0000
@@ -44,16 +44,68 @@
#include "gfxTypes.h"
#include "gfxASurface.h"
#ifdef XP_OS2
#undef OS2EMX_PLAIN_CHAR
#endif
+/* SSE2 Support Defines */
+
+/* GFX_HAVE_SSE2_INTEL_MNEMONICS means that the platform */
+/* architecture supports SSE2 but the processor that we're */
+/* running on may or may not have SSE2 capability and means */
+/* that we have to check processor capabilities before using */
+/*
SSE2.
*/
+
+/* GFX_HAVE_SSE2_INTRINSICS means that the platform always */
+/* supports SSE2. Mac OSX Intel is an example of this as their */
+/* Intel Macs started with the Intel Core Duo processor which */
+/*
supports
SSE2.
*/
+
+/* GFX_HAVE_SSE2_INTEL_MNEMONICS and GFX_HAVE_SSE2_INTRINSICS */
+/* are mutually exclusive - they should not be both defined at */
+/*
the same
time.
*/
+
+#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
+#define GFX_HAVE_SSE2_INTEL_MNEMONICS
+#endif
+
+#if defined(__GNUC__) && defined(__i386__) && defined(XP_MACOSX)
+#define GFX_HAVE_SSE2_INTRINSICS
+#endif
+
+#if defined(GFX_HAVE_SSE2_INTEL_MNEMONICS) && defined(GFX_HAVE_SSE2_INTRINSICS)
+#error Can't have both GFX_HAVE_SSE2_INTEL_MNEMONICS and GFX_HAVE_SSE2_INTRINSICS
+#endif
+
+#ifdef GFX_HAVE_SSE2_INTRINSICS
+#define GFX_SSE2_ALIGN __attribute__ ((aligned (16)))
+#endif
+
+#ifdef GFX_HAVE_SSE2_INTEL_MNEMONICS
+#define GFX_SSE2_ALIGN __declspec(align(16))
+#endif
+
+/* Define GFX_DEFINE_SSE2Present before the include of this file in the module */
+/*
where SSE2Present will be
declared.
*/
+
+#if defined(GFX_HAVE_SSE2_INTEL_MNEMONICS)
+#if defined(GFX_DEFINE_SSE2Present)
+int SSE2Present = 0;
+#else
+extern int SSE2Present;
+#endif /* GFX_DEFINE_SSE2Present */
+#elif defined(GFX_HAVE_SSE2_INTRINSICS)
+#define SSE2Present 1
+#else
+#define SSE2Present 0
+#endif
+
typedef void* cmsHPROFILE;
typedef void* cmsHTRANSFORM;
class gfxImageSurface;
class gfxFontGroup;
struct gfxFontStyle;
// pref lang id's for font prefs
@@ -230,16 +282,23 @@ public:
*/
static cmsHTRANSFORM GetCMSInverseRGBTransform();
/**
* Return sRGBA -> output device transform.
*/
static cmsHTRANSFORM GetCMSRGBATransform();
+ /**
+ * Runtime check for SSE2 Support
+ */
+#if defined(GFX_HAVE_SSE2_INTEL_MNEMONICS)
+ static int SSE2Support();
+#endif
+
protected:
gfxPlatform() { }
virtual ~gfxPlatform();
private:
virtual cmsHPROFILE GetPlatformCMSOutputProfile();
};
Index: gfx/thebes/src/gfxPlatform.cpp
===================================================================
RCS file: /cvsroot/mozilla/gfx/thebes/src/gfxPlatform.cpp,v
retrieving revision 1.36
diff -u -8 -p -r1.36 gfxPlatform.cpp
--- gfx/thebes/src/gfxPlatform.cpp 4 Apr 2008 04:23:19 -0000 1.36
+++ gfx/thebes/src/gfxPlatform.cpp 3 May 2008 18:44:47 -0000
@@ -30,16 +30,19 @@
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
+/* This declares SSE2Present = 0 */
+#define GFX_DEFINE_SSE2Present
+
#include "gfxPlatform.h"
#if defined(XP_WIN)
#include "gfxWindowsPlatform.h"
#elif defined(XP_MACOSX)
#include "gfxPlatformMac.h"
#include "gfxQuartzFontCache.h"
#elif defined(MOZ_WIDGET_GTK2)
@@ -64,16 +67,20 @@
#include "cairo.h"
#include "lcms.h"
#include "plstr.h"
#include "nsIPrefService.h"
#include "nsIPrefBranch.h"
+#ifdef GFX_HAVE_SSE2_INTEL_MNEMONICS
+#include "intrin.h"
+#endif
+
gfxPlatform *gPlatform = nsnull;
int gGlitzState = -1;
static cmsHPROFILE gCMSOutputProfile = nsnull;
static cmsHTRANSFORM gCMSRGBTransform = nsnull;
static cmsHTRANSFORM gCMSInverseRGBTransform = nsnull;
static cmsHTRANSFORM gCMSRGBATransform = nsnull;
// this needs to match the list of pref font.default.xx entries listed in all.js!
@@ -536,8 +543,23 @@ gfxPlatform::GetCMSRGBATransform()
gCMSRGBATransform = cmsCreateTransform(inProfile, TYPE_RGBA_8,
outProfile, TYPE_RGBA_8,
INTENT_PERCEPTUAL, 0);
}
return gCMSRGBATransform;
}
+
+/* SSE2 runtime support check */
+
+#ifdef GFX_HAVE_SSE2_INTEL_MNEMONICS
+int gfxPlatform::SSE2Support()
+{
+ int CPUInfo[4];
+
+ __cpuid(CPUInfo, 1);
+ if (CPUInfo[3] & (0x1 << 26))
+ return 1;
+ else
+ return 2;
+}
+#endif
Index: jpeg/jconfig.h
===================================================================
RCS file: /cvsroot/mozilla/jpeg/jconfig.h,v
retrieving revision 3.14
diff -u -8 -p -r3.14 jconfig.h
--- jpeg/jconfig.h 30 Jan 2008 06:29:24 -0000 3.14
+++ jpeg/jconfig.h 3 May 2008 18:44:49 -0000
@@ -90,18 +90,8 @@
#undef TWO_FILE_COMMANDLINE
#undef NEED_SIGNAL_CATCHER
#undef DONT_USE_B_MODE
#undef PROGRESS_REPORT
#endif /* JPEG_CJPEG_DJPEG */
-/* SSE* alignment support - only use on platforms that support declspec and __attribute__ */
-
-#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
-#define ALIGN16_const_vector_short(name) __declspec(align(16)) const short name[8]
-#define ALIGN16_const_vector_uchar(name) __declspec(align(16)) const unsigned char name[16]
-#else
-#define ALIGN16_const_vector_short(name) const short name[8] __attribute__ ((aligned (16)))
-#define ALIGN16_const_vector_uchar(name) const unsigned char name[16] __attribute__ ((aligned (16)))
-#endif /* ! XP_WIN32 && _M_IX86 && !__GNUC */
-
Index: jpeg/jdapimin.c
===================================================================
RCS file: /cvsroot/mozilla/jpeg/jdapimin.c,v
retrieving revision 3.11
diff -u -8 -p -r3.11 jdapimin.c
--- jpeg/jdapimin.c 12 Mar 2008 21:36:47 -0000 3.11
+++ jpeg/jdapimin.c 3 May 2008 18:44:49 -0000
@@ -62,24 +62,20 @@ NOT_SUPPORTED:
CPUInfo[3] = my_edx;
}
#endif /* _MSC_VER >= 1400 */
int MMXAvailable;
static int mmxsupport();
#endif
-#ifdef HAVE_SSE2_INTRINSICS
-int SSE2Available = 0;
#ifdef HAVE_SSE2_INTEL_MNEMONICS
+int SSE2Available = 0;
static int sse2support();
-#else
-static int sse2supportGCC();
#endif /* HAVE_SSE2_INTEL_MNEMONICS */
-#endif /* HAVE_SSE2_INTRINSICS */
/*
* Initialization of a JPEG decompression object.
* The error manager must already be set up (in case memory manager fails).
*/
GLOBAL(void)
@@ -87,37 +83,27 @@ jpeg_CreateDecompress (j_decompress_ptr
{
int i;
#ifdef HAVE_MMX_INTEL_MNEMONICS
static int cpuidDetected = 0;
if(!cpuidDetected)
{
- MMXAvailable = mmxsupport();
+ MMXAvailable = mmxsupport();
#ifdef HAVE_SSE2_INTEL_MNEMONICS
/* only do the sse2 support check if mmx is supported (so
we know the processor supports cpuid) */
if (MMXAvailable)
SSE2Available = sse2support();
-#endif
+#endif /* HAVE_SSE2_INTEL_MNEMONICS */
cpuidDetected = 1;
}
-#else
-#ifdef HAVE_SSE2_INTRINSICS
- static int cpuidDetected = 0;
-
- if(!cpuidDetected) {
- SSE2Available = sse2supportGCC();
- cpuidDetected = 1;
- }
-
-#endif /* HAVE_SSE2_INTRINSICS */
#endif /* HAVE_MMX_INTEL_MNEMONICS */
/* For debugging purposes, zero the whole master structure.
* But error manager pointer is already there, so save and restore it.
*/
/* Guard against version mismatches between library and caller. */
cinfo->mem = NULL; /* so jpeg_destroy knows mem mgr not called */
@@ -488,43 +474,23 @@ static int mmxsupport()
int CPUInfo[4];
__cpuid(CPUInfo, 1);
if (CPUInfo[3] & (0x1 << 23))
return 1;
else
return 0;
}
-#endif
+#endif /* HAVE_MMX_INTEL_MNEMONICS */
#ifdef HAVE_SSE2_INTEL_MNEMONICS
static int sse2support()
{
int CPUInfo[4];
__cpuid(CPUInfo, 1);
if (CPUInfo[3] & (0x1 << 26))
return 1;
else
return 2;
}
-#else
-#ifdef HAVE_SSE2_INTRINSICS
-static int sse2supportGCC()
-{
-
- /* Mac Intel started with Core Duo chips which have SSE2 Support */
-
-#if defined(__GNUC__) && defined(__i386__)
-#if defined(XP_MACOSX)
- return 1;
-#endif /* XP_MACOSX */
-#endif /* GNUC && i386 */
-
- /* Add checking for SSE2 support for other platforms here */
-
- /* We don't have SSE2 intrinsics support */
-
- return 2;
-}
-#endif /* HAVE_SSE2_INTRINSICS */
#endif /* HAVE_SSE2_INTEL_MNEMONICS */
Index: jpeg/jdcoefct.c
===================================================================
RCS file: /cvsroot/mozilla/jpeg/jdcoefct.c,v
retrieving revision 3.2
diff -u -8 -p -r3.2 jdcoefct.c
--- jpeg/jdcoefct.c 22 Jul 1998 21:15:10 -0000 3.2
+++ jpeg/jdcoefct.c 3 May 2008 18:44:49 -0000
@@ -40,21 +40,21 @@ typedef struct {
* We allocate a workspace of D_MAX_BLOCKS_IN_MCU coefficient blocks,
* and let the entropy decoder write into that workspace each time.
* (On 80x86, the workspace is FAR even though it's not really very big;
* this is to keep the module interfaces unchanged when a large coefficient
* buffer is necessary.)
* In multi-pass modes, this array points to the current MCU's blocks
* within the virtual arrays; it is used only by the input side.
*/
- JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
+ SSE2_ALIGN JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
#ifdef D_MULTISCAN_FILES_SUPPORTED
/* In multi-pass modes, we need a virtual block array for each component. */
- jvirt_barray_ptr whole_image[MAX_COMPONENTS];
+ SSE2_ALIGN jvirt_barray_ptr whole_image[MAX_COMPONENTS];
#endif
#ifdef BLOCK_SMOOTHING_SUPPORTED
/* When doing block smoothing, we latch coefficient Al values here */
int * coef_bits_latch;
#define SAVED_COEFS 6 /* we save coef_bits[0..5] */
#endif
} my_coef_controller;
@@ -466,17 +466,17 @@ decompress_smooth_data (j_decompress_ptr
int ci, block_row, block_rows, access_rows;
JBLOCKARRAY buffer;
JBLOCKROW buffer_ptr, prev_block_row, next_block_row;
JSAMPARRAY output_ptr;
JDIMENSION output_col;
jpeg_component_info *compptr;
inverse_DCT_method_ptr inverse_DCT;
boolean first_row, last_row;
- JBLOCK workspace;
+ SSE2_ALIGN JBLOCK workspace;
int *coef_bits;
JQUANT_TBL *quanttbl;
INT32 Q00,Q01,Q02,Q10,Q11,Q20, num;
int DC1,DC2,DC3,DC4,DC5,DC6,DC7,DC8,DC9;
int Al, pred;
/* Force some input to be done if we are getting ahead of the input. */
while (cinfo->input_scan_number <= cinfo->output_scan_number &&
@@ -718,19 +718,31 @@ jinit_d_coef_controller (j_decompress_pt
#else
ERREXIT(cinfo, JERR_NOT_COMPILED);
#endif
} else {
/* We only need a single-MCU buffer. */
JBLOCKROW buffer;
int i;
+#ifdef HAVE_SSE2_INTRINSICS
+ /* Align on 16-byte boundaries for the SSE2 code. */
+ /* This won't work for 64-bit systems so conditional code it. */
+
+ PRUptrdiff buffer_shift;
+
+ buffer_shift = (PRUptrdiff)
+ (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+
D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK) + 15);
+ buffer = (JBLOCKROW) ((buffer_shift + 15) & ~15);
+#else
buffer = (JBLOCKROW)
(*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
+#endif /* HAVE_SSE2_INTRINSICS */
for (i = 0; i < D_MAX_BLOCKS_IN_MCU; i++) {
coef->MCU_buffer[i] = buffer + i;
}
coef->pub.consume_data = dummy_consume_data;
coef->pub.decompress_data = decompress_onepass;
coef->pub.coef_arrays = NULL; /* flag for no virtual arrays */
}
}
Index: jpeg/jddctmgr.c
===================================================================
RCS file: /cvsroot/mozilla/jpeg/jddctmgr.c,v
retrieving revision 3.4
diff -u -8 -p -r3.4 jddctmgr.c
--- jpeg/jddctmgr.c 25 Jan 2008 06:52:01 -0000 3.4
+++ jpeg/jddctmgr.c 3 May 2008 18:44:49 -0000
@@ -14,17 +14,17 @@
* dequantization as well as the IDCT proper. This module sets up the
* dequantization multiplier table needed by the IDCT routine.
*/
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h" /* Private declarations for DCT subsystem */
-#ifdef HAVE_SSE2_INTRINSICS
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
extern int SSE2Available;
#endif
/*
* The decompressor input side (jdinput.c) saves away the appropriate
* quantization table for each component at the start of the first scan
* involving that component. (This is necessary in order to correctly
* decode files that reuse Q-table slots.)
@@ -75,24 +75,34 @@ typedef union {
#ifdef DCT_ISLOW_SUPPORTED
#define PROVIDE_ISLOW_TABLES
#else
#ifdef IDCT_SCALING_SUPPORTED
#define PROVIDE_ISLOW_TABLES
#endif
#endif
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
GLOBAL(void)
jpeg_idct_islow_sse2 (
j_decompress_ptr cinfo,
jpeg_component_info * compptr,
JCOEFPTR coef_block,
JSAMPARRAY output_buf,
JDIMENSION output_col);
+#endif /* HAVE_SSE2_INTEL_MNEMONICS */
+#ifdef HAVE_SSE2_INTRINSICS
+jpeg_idct_ifast_sse2 (
+ j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+#endif /* HAVE_SSE2_INTRINSICS */
/*
* Prepare for an output pass.
* Here we select the proper IDCT routine for each component and build
* a matching multiplier table.
*/
METHODDEF(void)
@@ -142,31 +152,23 @@ start_pass (j_decompress_ptr cinfo)
method_ptr = jpeg_idct_islow;
method = JDCT_ISLOW;
#endif /* HAVE_SSE2_INTEL_MNEMONICS */
break;
#endif
#ifdef DCT_IFAST_SUPPORTED
case JDCT_IFAST:
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
- if (SSE2Available==1)
- {
- method_ptr = jpeg_idct_islow_sse2;
- method = JDCT_ISLOW;
- }
- else
- {
- method_ptr = jpeg_idct_ifast;
- method = JDCT_IFAST;
- }
+#ifdef HAVE_SSE2_INTRINSICS
+ method_ptr = jpeg_idct_ifast_sse2;
+ method = JDCT_IFAST;
#else
- method_ptr = jpeg_idct_ifast;
- method = JDCT_IFAST;
-#endif /* HAVE_SSE2_INTEL_MNEMONICS */
+ method_ptr = jpeg_idct_ifast;
+ method = JDCT_IFAST;
+#endif /* HAVE_SSE2_INTRINSICS */
break;
#endif
#ifdef DCT_FLOAT_SUPPORTED
case JDCT_FLOAT:
method_ptr = jpeg_idct_float;
method = JDCT_FLOAT;
break;
@@ -292,16 +294,28 @@ jinit_inverse_dct (j_decompress_ptr cinf
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
SIZEOF(my_idct_controller));
cinfo->idct = (struct jpeg_inverse_dct *) idct;
idct->pub.start_pass = start_pass;
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
/* Allocate and pre-zero a multiplier table for each component */
+#ifdef HAVE_SSE2_INTRINSICS
+
+ /* Align dct_table on 16 bytes for SSE2 IDCT IFAST optimization */
+
+ PRUptrdiff buffer_pointer;
+
+ buffer_pointer = (PRUptrdiff)
+ (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+
SIZEOF(multiplier_table) + 15);
+ compptr->dct_table = (void *) ((buffer_pointer + 15) & ~15);
+#else
compptr->dct_table =
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
SIZEOF(multiplier_table));
+#endif /* HAVE_SSE2_INTRINSICS */
MEMZERO(compptr->dct_table, SIZEOF(multiplier_table));
/* Mark multiplier table not yet set up for any method */
idct->cur_method[ci] = -1;
}
}
Index: jpeg/jidctfst.c
===================================================================
RCS file: /cvsroot/mozilla/jpeg/jidctfst.c,v
retrieving revision 3.4
diff -u -8 -p -r3.4 jidctfst.c
--- jpeg/jidctfst.c 27 Mar 2002 06:02:19 -0000 3.4
+++ jpeg/jidctfst.c 3 May 2008 18:44:50 -0000
@@ -166,16 +166,23 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci
JCOEFPTR coef_block,
JSAMPARRAY output_buf, JDIMENSION output_col);
__inline GLOBAL(void)
jpeg_idct_ifast_orig (j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block,
JSAMPARRAY output_buf, JDIMENSION output_col);
#endif
+#ifdef HAVE_SSE2_INTRINSICS
+GLOBAL(void)
+jpeg_idct_ifast_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+
JSAMPARRAY output_buf, JDIMENSION output_col);
+#endif /* HAVE_SSE2_INTRINSICS */
+
GLOBAL(void)
jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block,
JSAMPARRAY output_buf, JDIMENSION output_col);
#ifdef HAVE_MMX_INTEL_MNEMONICS
GLOBAL(void)
@@ -1642,9 +1649,463 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci
movq [ebx], mm3
emms
}
}
#endif
+#ifdef HAVE_SSE2_INTRINSICS
+#include "xmmintrin.h"
+#include "emmintrin.h"
+
+/* These values are taken from the decimal shifted left 14. When multiplying */
+/* by these constants, we have to shift left the other multiplier so that we */
+/*
we can use pmulhw which will descale us by the 16
bits.
*/
+
+SSE2_ALIGN unsigned short SSE2_1_414213562[8] =
+ {23170, 23170, 23170, 23170, 23170, 23170, 23170, 23170};
+SSE2_ALIGN unsigned short SSE2_1_847759065[8] =
+ {30274, 30274, 30274, 30274, 30274, 30274, 30274, 30274};
+SSE2_ALIGN unsigned short SSE2_1_082392200[8] =
+ {17734, 17734, 17734, 17734, 17734, 17734, 17734, 17734};
+SSE2_ALIGN unsigned short SSE2_NEG_0_765366865[8] =
+ {52996, 52996, 52996, 52996, 52996, 52996, 52996, 52996};
+
+/* This following is to compensate for range_limit and is shifted five to */
+/*
the left as that's what we're scaled for at the
time.
*/
+
+SSE2_ALIGN unsigned short SSE2_ADD_128[8] =
+ {4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096};
+
+/*---------------------------------------------------------------------------*/
+
+inline GLOBAL(void)
+ jpeg_idct_ifast_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+
JCOEFPTR coef_block,
+
JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i tmp10, tmp11, tmp12, tmp13, z10, z11, z12, z13;
+ __m128i xps0, xps1, xps2, xps3, rowa;
+ __m128i * quantptr;
+ __m128i * coefptr;
+ JSAMPROW outptr0, outptr1, outptr2, outptr3, outptr4, outptr5, outptr6, outptr7;
+
+ /* ====================================================================== */
+
/* Load registers from the coefficient
block
*/
+ /* ====================================================================== */
+
+ quantptr = (__m128i *) compptr->dct_table;
+ coefptr = (__m128i *) coef_block;
+
+ row0 = *(coefptr+0);
+ row1 = *(coefptr+1);
+ row2 = *(coefptr+2);
+ row3 = *(coefptr+3);
+ row4 = *(coefptr+4);
+ row5 = *(coefptr+5);
+ row6 = *(coefptr+6);
+ row7 = *(coefptr+7);
+ row0 = _mm_mullo_epi16(row0, *(quantptr+0));
+ row1 = _mm_mullo_epi16(row1, *(quantptr+1));
+ row2 = _mm_mullo_epi16(row2, *(quantptr+2));
+ row3 = _mm_mullo_epi16(row3, *(quantptr+3));
+ row4 = _mm_mullo_epi16(row4, *(quantptr+4));
+ row5 = _mm_mullo_epi16(row5, *(quantptr+5));
+ row6 = _mm_mullo_epi16(row6, *(quantptr+6));
+ row7 = _mm_mullo_epi16(row7, *(quantptr+7));
+
+ /* ====================================================================== */
+
/* Inverse Discrete Cosine Transform Column
Processing
*/
+ /* ====================================================================== */
+
+ /* Even part */
+
+ /* tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */
+ /* tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */
+
+ tmp10 = _mm_add_epi16(row0, row4);
+ tmp11 = _mm_sub_epi16(row0, row4);
+
+ /* tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */
+ /* tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) */
+ /* - tmp13; */
+
+ tmp13 = _mm_add_epi16(row2, row6);
+ tmp12 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(row2, row6), 2),
+
*((__m128i *) SSE2_1_414213562));
+ tmp12 = _mm_sub_epi16(tmp12, tmp13);
+
+ /* tmp0 = tmp10 + tmp13; */
+ /* tmp3 = tmp10 - tmp13; */
+ /* tmp1 = tmp11 + tmp12; */
+ /* tmp2 = tmp11 - tmp12; */
+
+ tmp0 = _mm_add_epi16(tmp10, tmp13);
+ tmp3 = _mm_sub_epi16(tmp10, tmp13);
+ tmp1 = _mm_add_epi16(tmp11, tmp12);
+ tmp2 = _mm_sub_epi16(tmp11, tmp12);
+
+ /* Odd part */
+
+ /* z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */
+ /* z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */
+ /* z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */
+ /* z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */
+
+ z13 = _mm_add_epi16(row5, row3);
+ z10 = _mm_sub_epi16(row5, row3);
+ z11 = _mm_add_epi16(row1, row7);
+ z12 = _mm_sub_epi16(row1, row7);
+
+ /* phase 5 */
+
+ /* tmp7 = z11 + z13; */
+ /* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); */
+
+ tmp7 = _mm_add_epi16(z11, z13);
+ tmp11 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(z11, z13), 2),
+
*((__m128i *) SSE2_1_414213562));
+
+ /* z5 = MULTIPLY(z10 + z12, FIX_1_847759065); */
+ /* tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; */
+ /* tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; */
+
+ /* This is really a mess as the negative 2.61 doesn't work well */
+ /* as we're working with 16-bit integers. So we're going to apply */
+ /* a little algebra to get: */
+
+ /* tmp10 = z12 * (FIX_1_082392200 - FIX_1_847759065) - z10*FIX_1_847759065 */
+ /* tmp12 = z10 * (FIX_1_847759065 - FIX_2_613125930) + z12*FIX_1_847759065 */
+
+ /* or */
+
+ /* tmp10 = z12 * FIX_NEG_0_765366865 - z10 * FIX_1_847759065 */
+ /* tmp12 = z10 * FIX_NEG_0_765366865 + z12 * FIX_1_847759065 */
+
+ z10 = _mm_slli_epi16(z10, 2);
+ z12 = _mm_slli_epi16(z12, 2);
+
+ tmp10 = _mm_sub_epi16(_mm_mulhi_epi16(z12, *((__m128i *)SSE2_NEG_0_765366865)),
+
_mm_mulhi_epi16(z10, *((__m128i *)SSE2_1_847759065)));
+ tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10, *((__m128i *)SSE2_NEG_0_765366865)),
+
_mm_mulhi_epi16(z12, *((__m128i *)SSE2_1_847759065)));
+
+ /* phase 2 */
+
+ /* tmp6 = tmp12 - tmp7; */
+ /* tmp5 = tmp11 - tmp6; */
+ /* tmp4 = tmp10 + tmp5; */
+
+ tmp6 = _mm_sub_epi16(tmp12, tmp7);
+ tmp5 = _mm_sub_epi16(tmp11, tmp6);
+ tmp4 = _mm_add_epi16(tmp10, tmp5);
+
+ /* outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */
+
/*
& RANGE_MASK]; */
+ /* outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */
+
/*
& RANGE_MASK]; */
+
+ row0 = _mm_add_epi16(tmp0, tmp7);
+ row7 = _mm_sub_epi16(tmp0, tmp7);
+
+ /* outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */
+
/*
& RANGE_MASK]; */
+ /* outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */
+
/*
& RANGE_MASK]; */
+
+ row1 = _mm_add_epi16(tmp1, tmp6);
+ row6 = _mm_sub_epi16(tmp1, tmp6);
+
+ /* outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */
+
/*
& RANGE_MASK]; */
+ /* outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */
+
/*
& RANGE_MASK]; */
+
+ row2 = _mm_add_epi16(tmp2, tmp5);
+ row5 = _mm_sub_epi16(tmp2, tmp5);
+
+ /* outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */
+
/*
& RANGE_MASK]; */
+ /* outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */
+
/*
& RANGE_MASK]; */
+
+ row4 = _mm_add_epi16(tmp3, tmp4);
+ row3 = _mm_sub_epi16(tmp3, tmp4);
+
+ /* ====================================================================== */
+
/* Transpose Matrix
(Word)
*/
+ /* ====================================================================== */
+
+ /* Save off the low quadwords from the even rows */
+
+ xps0 = _mm_move_epi64(row0);
+ xps1 = _mm_move_epi64(row2);
+ xps2 = _mm_move_epi64(row4);
+ xps3 = _mm_move_epi64(row6);
+
+ /* Interleave the high quadwords */
+
+ row0 = _mm_unpackhi_epi16(row0, row1); /* 04 14 05 15 06 16 07 17 */
+ row2 = _mm_unpackhi_epi16(row2, row3); /* 24 34 25 35 26 36 27 37 */
+ row4 = _mm_unpackhi_epi16(row4, row5); /* 44 54 45 55 46 56 47 57 */
+ row6 = _mm_unpackhi_epi16(row6, row7); /* 64 74 65 75 66 76 67 77 */
+
+ rowa = row0;
+ row0 = _mm_unpacklo_epi32(row0, row2); /* 04 14 24 34 05 15 25 35 */
+ rowa = _mm_unpackhi_epi32(rowa, row2); /* 06 16 26 36 07 17 27 37 */
+
+ row2 = row4;
+ row4 = _mm_unpacklo_epi32(row4, row6); /* 44 54 64 74 45 55 65 75 */
+ row2 = _mm_unpackhi_epi32(row2, row6); /* 46 56 66 76 47 57 67 77 */
+
+ row6 = row0;
+ row0 = _mm_unpacklo_epi64(row0, row4); /* 04 14 24 34 44 54 64 74 */
+ row6 = _mm_unpackhi_epi64(row6, row4); /* 05 15 25 35 45 55 65 75 */
+
+ row4 = rowa;
+ rowa = _mm_unpacklo_epi64(rowa, row2); /* 06 16 26 36 46 56 66 76 */
+ row4 = _mm_unpackhi_epi64(row4, row2); /* 07 17 27 37 47 57 67 77 */
+
+ /* row0, row6, rowa, row4 now contain *4, *5, *6, *7 */
+
+ /* Interleave the low quadwords */
+
+ xps0 = _mm_unpacklo_epi16(xps0, row1); /* 00 10 01 11 02 12 03 13 */
+ xps1 = _mm_unpacklo_epi16(xps1, row3); /* 20 30 21 31 22 32 23 33 */
+ xps2 = _mm_unpacklo_epi16(xps2, row5); /* 40 50 41 51 42 52 43 43 */
+ xps3 = _mm_unpacklo_epi16(xps3, row7); /* 60 70 61 71 62 72 63 73 */
+
+ row2 = xps0;
+ xps0 = _mm_unpacklo_epi32(xps0, xps1); /* 00 10 20 30 01 11 21 31 */
+ row2 = _mm_unpackhi_epi32(row2, xps1); /* 02 12 22 32 03 13 23 33 */
+
+ xps1 = xps2;
+ xps2 = _mm_unpacklo_epi32(xps2, xps3); /* 40 50 60 70 41 51 61 71 */
+ xps1 = _mm_unpackhi_epi32(xps1, xps3); /* 42 52 62 72 43 53 63 73 */
+
+ xps3 = xps0;
+ xps0 = _mm_unpacklo_epi64(xps0, xps2); /* 00 10 20 30 40 50 60 70 */
+ xps3 = _mm_unpackhi_epi64(xps3, xps2); /* 01 11 21 31 41 51 61 71 */
+
+ xps2 = row2;
+ row2 = _mm_unpacklo_epi64(row2, xps1); /* 02 12 22 32 42 52 62 72 */
+ xps2 = _mm_unpackhi_epi64(xps2, xps1); /* 03 13 23 33 43 53 63 73 */
+
+ /* xps0, xps3, row2, xps2 now contain *0, *1, *2, *3 */
+
+ /* Restructure back to row0 to row7. */
+
+ row7 = row4;
+ row5 = row6;
+ row6 = rowa;
+ row4 = row0;
+ row0 = xps0;
+ row1 = xps3;
+ row3 = xps2;
+
+ /* ====================================================================== */
+
/* Inverse Discrete Cosine Transform Row
Processing
*/
+ /* ====================================================================== */
+
+ /* Even part */
+
+ /* tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */
+ /* tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */
+
+ tmp10 = _mm_add_epi16(row0, row4);
+ tmp11 = _mm_sub_epi16(row0, row4);
+
+ /* tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */
+ /* tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) */
+ /* - tmp13; */
+
+ tmp13 = _mm_add_epi16(row2, row6);
+ tmp12 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(row2, row6), 2),
+
*((__m128i *) SSE2_1_414213562));
+ tmp12 = _mm_sub_epi16(tmp12, tmp13);
+
+ /* tmp0 = tmp10 + tmp13; */
+ /* tmp3 = tmp10 - tmp13; */
+ /* tmp1 = tmp11 + tmp12; */
+ /* tmp2 = tmp11 - tmp12; */
+
+ tmp0 = _mm_add_epi16(tmp10, tmp13);
+ tmp3 = _mm_sub_epi16(tmp10, tmp13);
+ tmp1 = _mm_add_epi16(tmp11, tmp12);
+ tmp2 = _mm_sub_epi16(tmp11, tmp12);
+
+ /* Odd part */
+
+ /* z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */
+ /* z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */
+ /* z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */
+ /* z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */
+
+ z13 = _mm_add_epi16(row5, row3);
+ z10 = _mm_sub_epi16(row5, row3);
+ z11 = _mm_add_epi16(row1, row7);
+ z12 = _mm_sub_epi16(row1, row7);
+
+ /* phase 5 */
+
+ /* tmp7 = z11 + z13; */
+ /* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); */
+
+ tmp7 = _mm_add_epi16(z11, z13);
+ tmp11 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(z11, z13), 2),
+
*((__m128i *) SSE2_1_414213562));
+
+ /* z5 = MULTIPLY(z10 + z12, FIX_1_847759065); */
+ /* tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; */
+ /* tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; */
+
+ /* This is really a mess as the negative 2.61 doesn't work well */
+ /* as we're working with 16-bit integers. So we're going to apply */
+ /* a little algebra to get: */
+
+ /* tmp10 = z12 * (FIX_1_082392200 - FIX_1_847759065) - z10*FIX_1_847759065 */
+ /* tmp12 = z10 * (FIX_1_847759065 - FIX_2_613125930) + z12*FIX_1_847759065 */
+
+ /* or */
+
+ /* tmp10 = z12 * FIX_NEG_0_765366865 - z10 * FIX_1_847759065 */
+ /* tmp12 = z10 * FIX_NEG_0_765366865 + z12 * FIX_1_847759065 */
+
+ z10 = _mm_slli_epi16(z10, 2);
+ z12 = _mm_slli_epi16(z12, 2);
+
+ tmp10 = _mm_sub_epi16(_mm_mulhi_epi16(z12, *((__m128i *)SSE2_NEG_0_765366865)),
+
_mm_mulhi_epi16(z10, *((__m128i *)SSE2_1_847759065)));
+ tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10, *((__m128i *)SSE2_NEG_0_765366865)),
+
_mm_mulhi_epi16(z12, *((__m128i *)SSE2_1_847759065)));
+
+ /* phase 2 */
+
+ /* tmp6 = tmp12 - tmp7; */
+ /* tmp5 = tmp11 - tmp6; */
+ /* tmp4 = tmp10 + tmp5; */
+
+ tmp6 = _mm_sub_epi16(tmp12, tmp7);
+ tmp5 = _mm_sub_epi16(tmp11, tmp6);
+ tmp4 = _mm_add_epi16(tmp10, tmp5);
+
+ /* outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */
+
/*
& RANGE_MASK]; */
+ /* outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */
+
/*
& RANGE_MASK]; */
+
+ row0 = _mm_add_epi16(tmp0, tmp7);
+ row7 = _mm_sub_epi16(tmp0, tmp7);
+
+ /* outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */
+
/*
& RANGE_MASK]; */
+ /* outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */
+
/*
& RANGE_MASK]; */
+
+ row1 = _mm_add_epi16(tmp1, tmp6);
+ row6 = _mm_sub_epi16(tmp1, tmp6);
+
+ /* outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */
+
/*
& RANGE_MASK]; */
+ /* outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */
+
/*
& RANGE_MASK]; */
+
+ row2 = _mm_add_epi16(tmp2, tmp5);
+ row5 = _mm_sub_epi16(tmp2, tmp5);
+
+ /* outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */
+
/*
& RANGE_MASK]; */
+ /* outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */
+
/*
& RANGE_MASK]; */
+
+ row4 = _mm_add_epi16(tmp3, tmp4);
+ row3 = _mm_sub_epi16(tmp3, tmp4);
+
+ /* ====================================================================== */
+
/* Transpose Matrix
(Byte)
*/
+ /* ====================================================================== */
+
+ /* Shift right after adding half to descale */
+
+ row0 = _mm_add_epi16(row0, *((__m128i *) SSE2_ADD_128));
+ row1 = _mm_add_epi16(row1, *((__m128i *) SSE2_ADD_128));
+ row2 = _mm_add_epi16(row2, *((__m128i *) SSE2_ADD_128));
+ row3 = _mm_add_epi16(row3, *((__m128i *) SSE2_ADD_128));
+ row4 = _mm_add_epi16(row4, *((__m128i *) SSE2_ADD_128));
+ row5 = _mm_add_epi16(row5, *((__m128i *) SSE2_ADD_128));
+ row6 = _mm_add_epi16(row6, *((__m128i *) SSE2_ADD_128));
+ row7 = _mm_add_epi16(row7, *((__m128i *) SSE2_ADD_128));
+
+ row0 = _mm_srai_epi16(row0, 5);
+ row1 = _mm_srai_epi16(row1, 5);
+ row2 = _mm_srai_epi16(row2, 5);
+ row3 = _mm_srai_epi16(row3, 5);
+ row4 = _mm_srai_epi16(row4, 5);
+ row5 = _mm_srai_epi16(row5, 5);
+ row6 = _mm_srai_epi16(row6, 5);
+ row7 = _mm_srai_epi16(row7, 5);
+
+ /* Range Limit and convert Words to Bytes */
+
+ row0 = _mm_packus_epi16(row0, row4);
+ row1 = _mm_packus_epi16(row1, row5);
+ row2 = _mm_packus_epi16(row2, row6);
+ row3 = _mm_packus_epi16(row3, row7);
+
+ /* Transpose from rows back to columns */
+
+ row4 = _mm_unpackhi_epi8(row0, row1); /* 40 50 41 51 42 52 43 43 ... */
+ row5 = _mm_unpackhi_epi8(row2, row3); /* 60 70 61 71 62 72 63 73 ... */
+ row0 = _mm_unpacklo_epi8(row0, row1); /* 00 10 01 11 02 12 03 13 ... */
+ row2 = _mm_unpacklo_epi8(row2, row3); /* 20 30 21 31 22 32 23 33 ... */
+
+ row6 = _mm_unpackhi_epi16(row0, row2); /* 04 14 24 34 05 15 25 35 ... */
+ row7 = _mm_unpackhi_epi16(row4, row5); /* 44 54 64 74 45 55 65 75 ... */
+ row0 = _mm_unpacklo_epi16(row0, row2); /* 00 10 20 30 01 11 21 31 ... */
+ row4 = _mm_unpacklo_epi16(row4, row5); /* 40 50 60 70 41 51 61 71 ... */
+
+ row1 = _mm_unpackhi_epi32(row0, row4); /* 02 12 22 32 42 52 62 72 ... */
+ row2 = _mm_unpackhi_epi32(row6, row7); /* 06 16 26 36 46 56 66 76 ... */
+ row0 = _mm_unpacklo_epi32(row0, row4); /* 00 10 20 30 40 50 60 70 ... */
+ row6 = _mm_unpacklo_epi32(row6, row7); /* 04 14 24 34 55 54 64 74 ... */
+
+ /* Rearrange for readability */
+
+ row3 = row2;
+ row2 = row6;
+
+ /* ====================================================================== */
+
/* Final
output
*/
+ /* ====================================================================== */
+
+ outptr0 = output_buf[0] + output_col;
+ outptr1 = output_buf[1] + output_col;
+ outptr2 = output_buf[2] + output_col;
+ outptr3 = output_buf[3] + output_col;
+ outptr4 = output_buf[4] + output_col;
+ outptr5 = output_buf[5] + output_col;
+ outptr6 = output_buf[6] + output_col;
+ outptr7 = output_buf[7] + output_col;
+
+ /* We can't use the faster FP instructions to write out the high quad */
+
/* so do a rotate right and write them out as the low
quad
*/
+
+ row4 = _mm_srli_si128(row0, 8);
+ row5 = _mm_srli_si128(row1, 8);
+ row6 = _mm_srli_si128(row2, 8);
+ row7 = _mm_srli_si128(row3, 8);
+
+ _mm_storel_epi64((__m128i *) outptr0, row0);
+ _mm_storel_epi64((__m128i *) outptr2, row1);
+ _mm_storel_epi64((__m128i *) outptr4, row2);
+ _mm_storel_epi64((__m128i *) outptr6, row3);
+ _mm_storel_epi64((__m128i *) outptr1, row4);
+ _mm_storel_epi64((__m128i *) outptr3, row5);
+ _mm_storel_epi64((__m128i *) outptr5, row6);
+ _mm_storel_epi64((__m128i *) outptr7, row7);
+}
+#endif /* HAVE_SSE2_INTRINSICS */
+
#endif /* DCT_IFAST_SUPPORTED */
Index: jpeg/jmemmgr.c
===================================================================
RCS file: /cvsroot/mozilla/jpeg/jmemmgr.c,v
retrieving revision 3.2
diff -u -8 -p -r3.2 jmemmgr.c
--- jpeg/jmemmgr.c 22 Jul 1998 21:15:41 -0000 3.2
+++ jpeg/jmemmgr.c 3 May 2008 18:44:50 -0000
@@ -444,16 +444,19 @@ alloc_barray (j_common_ptr cinfo, int po
JDIMENSION blocksperrow, JDIMENSION numrows)
/* Allocate a 2-D coefficient-block array */
{
my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
JBLOCKARRAY result;
JBLOCKROW workspace;
JDIMENSION rowsperchunk, currow, i;
long ltemp;
+#ifdef HAVE_SSE2_INTRINSICS
+ PRUptrdiff workspace_raw;
+#endif
/* Calculate max # of rows allowed in one allocation chunk */
ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) /
((long) blocksperrow * SIZEOF(JBLOCK));
if (ltemp <= 0)
ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
if (ltemp < (long) numrows)
rowsperchunk = (JDIMENSION) ltemp;
@@ -464,19 +467,26 @@ alloc_barray (j_common_ptr cinfo, int po
/* Get space for row pointers (small object) */
result = (JBLOCKARRAY) alloc_small(cinfo, pool_id,
(size_t) (numrows * SIZEOF(JBLOCKROW)));
/* Get the rows themselves (large objects) */
currow = 0;
while (currow < numrows) {
rowsperchunk = MIN(rowsperchunk, numrows - currow);
+#ifdef HAVE_SSE2_INTRINSICS
+ workspace_raw = (JBLOCKROW) alloc_large(cinfo, pool_id,
+ (size_t) ((size_t) rowsperchunk * (size_t) blocksperrow
+ * SIZEOF(JBLOCK) + 15));
+ workspace = (JBLOCKROW) ((workspace_raw + 15) & ~15);
+#else
workspace = (JBLOCKROW) alloc_large(cinfo, pool_id,
(size_t) ((size_t) rowsperchunk * (size_t) blocksperrow
* SIZEOF(JBLOCK)));
+#endif /* HAVE_SSE2_INTRINSICS */
for (i = rowsperchunk; i > 0; i--) {
result[currow++] = workspace;
workspace += blocksperrow;
}
}
return result;
}
Index: jpeg/jmorecfg.h
===================================================================
RCS file: /cvsroot/mozilla/jpeg/jmorecfg.h,v
retrieving revision 3.22
diff -u -8 -p -r3.22 jmorecfg.h
--- jpeg/jmorecfg.h 30 Jan 2008 06:29:24 -0000 3.22
+++ jpeg/jmorecfg.h 3 May 2008 18:44:50 -0000
@@ -102,32 +102,40 @@ typedef short JSAMPLE;
* Again, we allocate large arrays of these, but you can change to int
* if you have memory to burn and "short" is really slow.
*/
typedef short JCOEF;
/* Defines for MMX/SSE2 support. */
-#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
-#define HAVE_MMX_INTEL_MNEMONICS
+/* HAVE_SSE2_INTEL_MNEMONICS means Win32 and MSVC. HAVE_SSE2_INTRINSICS */
+/* means that the platform supports SSE2 or higher so that we don't */
+/*
have to check runtime CPU
capabilities.
*/
-/* SSE2 code appears broken for some cpus (bug 247437) */
+#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
+#define HAVE_MMX_INTEL_MNEMONICS
#define HAVE_SSE2_INTEL_MNEMONICS
-#define HAVE_SSE2_INTRINSICS
#endif
-#if defined(__GNUC__) && defined(__i386__)
-#if defined(XP_MACOSX)
+#if defined(__GNUC__) && defined(__i386__) && defined(XP_MACOSX)
#define HAVE_SSE2_INTRINSICS
-#endif /* ! XP_MACOSX */
-#endif /* ! GNUC && i386 */
+#endif /* GNUC && i386 && XP_MACOSX */
/* Add support for other platforms here */
+/* SSE* alignment support - only use on platforms that support declspec and __attribute__ */
+/* We're only aligning Mac OSX structures right now */
+
+#ifdef HAVE_SSE2_INTRINSICS
+#define SSE2_ALIGN __attribute__ ((aligned (16)))
+#else
+#define SSE2_ALIGN
+#endif
+
/* Compressed datastreams are represented as arrays of JOCTET.
* These must be EXACTLY 8 bits wide, at least once they are written to
* external storage. Note that when using the stdio data source/destination
* managers, this is also the data type passed to fread/fwrite.
*/
#ifdef HAVE_UNSIGNED_CHAR
@@ -315,17 +323,17 @@ typedef unsigned char boolean;
* QUANT_1PASS_SUPPORTED QUANT_2PASS_SUPPORTED
*/
/* Arithmetic coding is unsupported for legal reasons. Complaints to IBM. */
/* Capability options common to encoder and decoder: */
#define DCT_ISLOW_SUPPORTED /* slow but accurate integer algorithm */
-#undef DCT_IFAST_SUPPORTED /* faster, less accurate integer method */
+#define DCT_IFAST_SUPPORTED /* faster, less accurate integer method */
#undef DCT_FLOAT_SUPPORTED /* floating-point: accurate, fast on fast HW */
/* Encoder capability options: */
#undef C_ARITH_CODING_SUPPORTED /* Arithmetic coding back end? */
#define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
#define C_PROGRESSIVE_SUPPORTED /* Progressive JPEG? (Requires MULTISCAN)*/
#define ENTROPY_OPT_SUPPORTED /* Optimization of entropy coding parms? */
Index: modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp
===================================================================
RCS file: /cvsroot/mozilla/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp,v
retrieving revision 1.94
diff -u -8 -p -r1.94 nsJPEGDecoder.cpp
--- modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp 4 Apr 2008 16:02:44 -0000 1.94
+++ modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp 3 May 2008 18:44:58 -0000
@@ -51,16 +51,20 @@
#include "nsIImage.h"
#include "nsIInterfaceRequestorUtils.h"
#include "gfxColor.h"
#include "jerror.h"
#include "gfxPlatform.h"
+#if defined(GFX_HAVE_SSE2_INTEL_MNEMONICS) || defined(GFX_HAVE_SSE2_INTRINSICS)
+#include "emmintrin.h"
+#endif /* GFX_HAVE_SSE2_INTEL_MNEMONICS || GFX_HAVE_SSE2_INTRINSICS */
+
extern "C" {
#include "iccjpeg.h"
/* Colorspace conversion (copied from jpegint.h) */
struct jpeg_color_deconverter {
JMETHOD(void, start_pass, (j_decompress_ptr cinfo));
JMETHOD(void, color_convert, (j_decompress_ptr cinfo,
JSAMPIMAGE input_buf, JDIMENSION input_row,
@@ -519,17 +523,21 @@ nsresult nsJPEGDecoder::ProcessData(cons
case JPEG_START_DECOMPRESS:
{
LOG_SCOPE(gJPEGlog, "nsJPEGDecoder::ProcessData -- entering JPEG_START_DECOMPRESS case");
/* Step 4: set parameters for decompression */
/* FIXME -- Should reset dct_method and dither mode
* for final pass of progressive JPEG
*/
+#ifdef GFX_HAVE_SSE2_INTRINSICS
+ mInfo.dct_method = JDCT_IFAST;
+#else
mInfo.dct_method = JDCT_ISLOW;
+#endif /* GFX_HAVE_SSE2_INTRINSICS */
mInfo.dither_mode = JDITHER_FS;
mInfo.do_fancy_upsampling = TRUE;
mInfo.enable_2pass_quant = FALSE;
mInfo.do_block_smoothing = TRUE;
/* Step 5: Start decompressor */
if (jpeg_start_decompress(&mInfo) == FALSE) {
PR_LOG(gJPEGDecoderAccountingLog, PR_LOG_DEBUG,
@@ -1245,50 +1253,175 @@ const int Cb_g_tab[(MAXJSAMPLE+1) * size
((shift_temp = (x)) < 0 ? \
(shift_temp >> (shft)) | ((~((INT32) 0)) << (32-(shft))) : \
(shift_temp >> (shft)))
#else
#define SHIFT_TEMPS
#define RIGHT_SHIFT(x,shft) ((x) >> (shft))
#endif
+/* Constants for SSE2 support */
+
+GFX_SSE2_ALIGN const short color_constants_1[8] = {128, 128, 128, 128, 128, 128, 128, 128};
+GFX_SSE2_ALIGN const short color_constants_2[8] = {359, 359, 359, 359, 359, 359, 359, 359};
+GFX_SSE2_ALIGN const short color_constants_3[8] = {88, 88, 88, 88, 88, 88, 88, 88};
+GFX_SSE2_ALIGN const short color_constants_4[8] = {183, 183, 183, 183, 183, 183, 183, 183};
+GFX_SSE2_ALIGN const short color_constants_5[8] = {454, 454, 454, 454, 454, 454, 454, 454};
+GFX_SSE2_ALIGN const short color_constants_6[8] = {11382, 11382, 11382, 11382, 11382, 11382, 11382, 11382};
METHODDEF(void)
ycc_rgb_convert_argb (j_decompress_ptr cinfo,
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows)
{
JDIMENSION num_cols = cinfo->output_width;
JSAMPLE * range_limit = cinfo->sample_range_limit;
+/* Locals for SSE2 support */
+
+#if defined(GFX_HAVE_SSE2_INTRINSICS) || defined(GFX_HAVE_SSE2_INTEL_MNEMONICS)
+ JSAMPROW inptr0, inptr1, inptr2, outptr;
+ JDIMENSION col;
+ __m128i row_Y, row_cb, row_cr, zeroes, ones;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ __m128i row_B, row_G, row_R;
+#endif /* GFX_HAVE_SSE2_INTRINSICS || GFX_HAVE_SSE2_INTEL_MNEMONICS */
+
SHIFT_TEMPS
- /* This is used if we don't have SSE2 */
+#ifdef GFX_HAVE_SSE2_INTEL_MNEMONICS
+
+ /* See if SSE2 is available if we haven't done so */
- while (--num_rows >= 0) {
- JSAMPROW inptr0 = input_buf[0][input_row];
- JSAMPROW inptr1 = input_buf[1][input_row];
- JSAMPROW inptr2 = input_buf[2][input_row];
- input_row++;
- PRUint32 *outptr = (PRUint32 *) *output_buf++;
- for (JDIMENSION col = 0; col < num_cols; col++) {
- int y = GETJSAMPLE(inptr0[col]);
- int cb = GETJSAMPLE(inptr1[col]);
- int cr = GETJSAMPLE(inptr2[col]);
- JSAMPLE * range_limit_y = range_limit + y;
- /* Range-limiting is essential due to noise introduced by DCT losses. */
- outptr[col] = 0xFF000000 |
-
( range_limit_y[Cr_r_tab[cr]] << 16 ) |
-
( range_limit_y[((int) RIGHT_SHIFT(Cb_g_tab[cb] + Cr_g_tab[cr],
SCALEBITS))] << 8 ) |
-
( range_limit_y[Cb_b_tab[cb]] );
+ if (SSE2Present == 0)
+ SSE2Present = gfxPlatform::SSE2Support();
+#endif /* GFX_HAVE_SSE2_INTEL_MNEMONICS */
+
+ if (SSE2Present == 1) {
+
+ /* Load registers with constants if we're going to do at least one quad */
+
+ if (num_cols >= 8) {
+ zeroes = _mm_setzero_si128();
+ ones = _mm_cmpeq_epi8(zeroes, zeroes);
+ }
+
+ while (--num_rows >= 0) {
+ inptr0 = input_buf[0][input_row];
+ inptr1 = input_buf[1][input_row];
+ inptr2 = input_buf[2][input_row];
+ input_row++;
+ outptr = *output_buf++;
+
+ /* YCbCr -> RGB using scaled multiplication */
+
+ for (col = 0; col + 8 <= num_cols; col += 8) {
+
+ /* Read a row of each color component */
+
+ row_Y = _mm_loadl_epi64((__m128i *) (inptr0 + col));
+ row_cb = _mm_loadl_epi64((__m128i *) inptr1);
+ row_cr = _mm_loadl_epi64((__m128i *) inptr2);
+
+ /* Convert bytes to words */
+
+ row_Y = _mm_unpacklo_epi8(row_Y, zeroes);
+ row_cb = _mm_unpacklo_epi8(row_cb, zeroes);
+ row_cr = _mm_unpacklo_epi8(row_cr, zeroes);
+
+ /* Compute B and R */
+
+
tmp0 = _mm_sub_epi16(row_cb, *((__m128i *)
color_constants_1)); /* cb - 128 */
+
tmp1 = _mm_sub_epi16(row_cr, *((__m128i *)
color_constants_1)); /* cr - 128 */
+
tmp2 = _mm_slli_epi16(tmp0,
8);
/* (cb - 128) << 8 */
+
tmp3 = _mm_slli_epi16(tmp1,
8);
/* (cr - 128) << 8 */
+
tmp2 = _mm_mulhi_epi16(tmp2, *((__m128i *)
color_constants_5)); /* (454 * ((cb - 128) <<
8)) >> 16 */
+ tmp3
= _mm_mulhi_epi16(tmp3, *((__m128i *)
color_constants_2)); /* (359 * ((cr - 128) <<
8)) >> 16 */
+
row_B
= _mm_add_epi16(tmp2,
row_Y);
/* B = y + (454 * ((cb - 128) << 8)) >> 16 */
+
row_R = _mm_add_epi16(tmp3,
row_Y);
/* R = y + (359 * ((cr - 128) << 8)) >> 16 */
+
+ /* Compute G */
+
+
tmp4 = _mm_mullo_epi16(row_cb, *((__m128i *)
color_constants_3)); /* 88 * cb */
+
tmp5 = _mm_mullo_epi16(tmp1, *((__m128i *)
color_constants_4)); /* 183 * (cr - 128)
*/
+ tmp4 =
_mm_sub_epi16(tmp4, *((__m128i *)
color_constants_6)); /* 88 *
cb - (34806 - 128*256) */
+
tmp4 = _mm_add_epi16(tmp4,
tmp5);
/* (88 * cb) + 183 * (cr - 128) - (34806 - 128*256) */
+
tmp4 = _mm_srai_epi16(tmp4,
8);
/* Shift the above right by 8 to divide by 256 */
+
row_G = _mm_sub_epi16(row_Y,
tmp4);
/* G */
+
+ /* Pack word to byte with unsigned saturation which range limits too */
+
+ row_R = _mm_packus_epi16(row_R, zeroes);
+ row_G = _mm_packus_epi16(row_G, zeroes);
+ row_B = _mm_packus_epi16(row_B, zeroes);
+
+ /* Pack horizontally */
+
+
tmp0 = _mm_unpacklo_epi8(row_B,
row_G); /* ABAB
ABAB ABAB ABAB */
+
tmp1 = _mm_unpacklo_epi8(row_R,
ones); /*
GRGR GRGR GRGR GRGR */
+
tmp2 = _mm_unpacklo_epi16(tmp0,
tmp1); /*
ARGB ARGB ARGB ARGB (low) */
+
tmp3 = _mm_unpackhi_epi16(tmp0,
tmp1); /*
ARGB ARGB ARGB ARGB (high) */
+
+ /* Write out 32 bytes of ARGB */
+
+ _mm_storel_epi64((__m128i *) outptr, tmp2);
+ _mm_storel_epi64((__m128i *) (outptr + 8), _mm_srli_si128(tmp2, 8));
+ _mm_storel_epi64((__m128i *) (outptr + 16), tmp3);
+ _mm_storel_epi64((__m128i *) (outptr + 24), _mm_srli_si128(tmp3, 8));
+
+ /* Slide the pointers */
+
+ /* inptr0 += 8; */
+ inptr1 += 8;
+ inptr2 += 8;
+ outptr += 32;
+ }
+
+ /* Process remainder using scalar code */
+ for (; col < num_cols; col++) {
+ int y = GETJSAMPLE(inptr0[col]);
+ int cb = GETJSAMPLE(inptr1[0]);
+ int cr = GETJSAMPLE(inptr2[0]);
+ inptr1++;
+ inptr2++;
+ JSAMPLE * range_limit_y = range_limit + y;
+ /* Range-limiting is essential due to noise introduced by DCT losses. */
+ *((PRUint32 *) outptr) = 0xFF000000 |
+
( range_limit_y[Cr_r_tab[cr]] << 16 ) |
+
( range_limit_y[((int) RIGHT_SHIFT(Cb_g_tab[cb] + Cr_g_tab[cr],
SCALEBITS))] << 8 ) |
+