Elliott's Firefox Builds for Mac OS X
Here are Firefox builds for Mac OS X. They run on Intel processors.
August 18, 2007 - Firefox 2.0.0.6 x06 for Mac OS X: fixed the cnet.com bug from x05, added the gfxImageFrame optimization, and added the -mfpmath=sse -ftree-vectorize compiler flags
August 17, 2007 - Firefox 2.0.0.6 x05 for Mac OS X: added JPEG decoding SSE2 optimization (this build has a crash bug at cnet.com)
August 16, 2007 - Firefox 2.0.0.6 x04 for Mac OS X: added JPEG SSE2 color optimization
August 12, 2007 - Firefox 2.0.0.6 x03 for Mac OS X: added JPEG Inverse Discrete Cosine Transform SSE2 acceleration
August 8, 2007 - Firefox 2.0.0.6 x01 for Mac OS X: added jslock lightweight locking and lightweight rounding
August 8, 2007 - Firefox 2.0.0.6 for Mac OS X: stock build with -O3 optimization
Things to do:
- Port color and range limit tables from x64 (JPEG)
- Use SSE2 processing for jdsample box filters (JPEG)
- Optimize decoder routines to use SSE2 for RGB to ARGB conversion
- See if we can optimize out the extra copying stage in the decoders
- Use an integer work variable instead of a double when JS converts integer strings to doubles
- Port PNG MMX optimizations to intrinsics
- Consider writing a highly optimized memcpy routine
- Get PowerPC G5 builds up and running
- Get performance preferences in the build
Patches applied to the x01 build:
Index: nsUnitConversion.h
===================================================================
RCS file: /cvsroot/mozilla/xpcom/ds/nsUnitConversion.h,v
retrieving revision 3.12
diff -r3.12 nsUnitConversion.h
43a44
> #include <xmmintrin.h>
82c83,85
< return ((0.0f <= aValue) ?
nscoord(aValue + ROUND_CONST_FLOAT) : nscoord(aValue -
ROUND_CONST_FLOAT));
---
> /* return ((0.0f <= aValue) ?
nscoord(aValue + ROUND_CONST_FLOAT) : nscoord(aValue -
ROUND_CONST_FLOAT)); */
>
return(_mm_cvtss_si32(_mm_load_ss(&aValue)));
>
101c104,105
< return ((0.0f <= aValue) ?
PRInt32(aValue + ROUND_CONST_FLOAT) : PRInt32(aValue -
ROUND_CONST_FLOAT));
---
> /* return ((0.0f <= aValue) ?
PRInt32(aValue + ROUND_CONST_FLOAT) : PRInt32(aValue -
ROUND_CONST_FLOAT)); */
>
return(_mm_cvtss_si32(_mm_load_ss(&aValue)));
Index: jslock.c
===================================================================
RCS file: /cvsroot/mozilla/js/src/jslock.c,v
retrieving revision 3.55.20.3
diff -u -r3.55.20.3 jslock.c
--- jslock.c 29 Mar 2007 21:55:35
-0000 3.55.20.3
+++ jslock.c 10 Aug 2007 19:38:14 -0000
@@ -54,6 +54,7 @@
#include "jslock.h"
#include "jsscope.h"
#include "jsstr.h"
+#include <libkern/OSAtomic.h>
#define ReadWord(W) (W)
@@ -102,11 +103,17 @@
#elif defined(__GNUC__) && defined(__i386__)
/* Note: This fails on 386 cpus, cmpxchgl is a >= 486
instruction */
+
static JS_INLINE int
js_CompareAndSwap(jsword *w, jsword ov, jsword nv)
{
- unsigned int res;
+ /* unsigned int res; */
+
+ return(OSAtomicCompareAndSwap32(ov, nv,
(int32 *) w));
+ /*
return(_InterlockedCompareExchange(w, nv, ov) == ov); */
+
+ /*
__asm__ __volatile__ (
"lock\n"
"cmpxchgl %2, (%1)\n"
@@ -116,6 +123,7 @@
: "r" (w), "r" (nv), "a" (ov)
: "cc", "memory");
return (int)res;
+ */
}
#elif (defined(__USLC__) || defined(_SCO_DS))
&& defined(i386)
Index: jslock.h
===================================================================
RCS file: /cvsroot/mozilla/js/src/jslock.h,v
retrieving revision 3.27.22.2
diff -u -r3.27.22.2 jslock.h
--- jslock.h 28 Nov 2006 19:54:57
-0000 3.27.22.2
+++ jslock.h 10 Aug 2007 19:38:15 -0000
@@ -183,7 +183,7 @@
JS_END_MACRO
/* FIXME: bug 353962 hackaround */
-#define JS_USE_ONLY_NSPR_LOCKS 1
+/* #define JS_USE_ONLY_NSPR_LOCKS 1 */
#if defined(JS_USE_ONLY_NSPR_LOCKS)
||
\
!( (defined(_WIN32)
&&
defined(_M_IX86))
||
\
=========================================
Index: gfxImageFrame.cpp
===================================================================
RCS file: /cvsroot/mozilla/gfx/src/shared/gfxImageFrame.cpp,v
retrieving revision 1.32
diff -u -r1.32 gfxImageFrame.cpp
--- gfxImageFrame.cpp 8 Mar 2005 03:44:27 -0000 1.32
+++ gfxImageFrame.cpp 18 Aug 2007 23:51:51 -0000
@@ -39,6 +39,8 @@
#include "gfxImageFrame.h"
#include "nsIServiceManager.h"
+#include "emmintrin.h"
+#include "xmmintrin.h"
NS_IMPL_ISUPPORTS2(gfxImageFrame, gfxIImageFrame, nsIInterfaceRequestor)
@@ -279,9 +281,15 @@
return NS_OK;
}
+unsigned char color_mask_1b[16] __attribute__ ((aligned(16))) = {255,255,255,0,0,0,0,0,255,255,255,0,0,0,0,0};
+unsigned char color_mask_2b[16] __attribute__ ((aligned(16))) = {0,0,0,255,255,255,0,0,0,0,0,255,255,255,0,0};
+
/* void setImageData ([array, size_is (length), const] in PRUint8 data, in unsigned long length, in long offset); */
NS_IMETHODIMP gfxImageFrame::SetImageData(const PRUint8 *aData, PRUint32 aLength, PRInt32 aOffset)
{
+ int express_flag = 0;
+ __m128i mask1, mask2, tmp1, tmp2, read1;
+
if (!mInitalized)
return NS_ERROR_NOT_INITIALIZED;
@@ -292,6 +300,11 @@
if (aLength == 0)
return NS_OK;
+ if ((int) aLength < 0) {
+ express_flag = - ((int) aLength);
+ aLength = (-4) * ((int) aLength);
+ }
+
PRInt32 row_stride = mImage->GetLineStride();
mImage->LockImagePixels(PR_FALSE);
@@ -313,7 +326,40 @@
}
if (aData)
- memcpy(imgData + newOffset, aData, aLength);
+ if (express_flag == 0)
+ memcpy(imgData + newOffset, aData, aLength);
+ else {
+ const PRUint8 *j1 = aData;
+ PRUint8 *ptrOutputBuf = imgData + newOffset;
+ int i;
+
+ mask1 = *((__m128i *) color_mask_1b);
+ mask2 = *((__m128i *) color_mask_2b);
+ for (i=0; i < express_flag - 4; i += 4) {
+ tmp1 = mask1;
+ tmp2 = mask2;
+ read1 = _mm_loadl_epi64((__m128i *) j1);
+ read1 = (__m128i) _mm_loadh_pi((__m128) read1, (__m64 *)(j1+6));
+ tmp1 = _mm_and_si128(tmp1, read1);
+ tmp2 = _mm_and_si128(tmp2, read1);
+ tmp1 = _mm_slli_si128(tmp1, 1);
+ tmp2 = _mm_slli_si128(tmp2, 2);
+ tmp1 = _mm_or_si128(tmp1, tmp2);
+ _mm_storel_pi((__m64 *) ptrOutputBuf, (__m128) tmp1);
+ _mm_storeh_pi((__m64 *) (ptrOutputBuf + 8), (__m128) tmp1);
+ ptrOutputBuf += 16;
+ j1 += 12;
+ }
+
+ for (; i < express_flag; ++i) {
+ ptrOutputBuf[0] = 0;
+ ptrOutputBuf[1] = *j1++;
+ ptrOutputBuf[2] = *j1++;
+ ptrOutputBuf[3] = *j1++;
+ ptrOutputBuf += 4;
+ }
+
+ }
else
memset(imgData + newOffset, 0, aLength);
mImage->UnlockImagePixels(PR_FALSE);
Index: nsJPEGDecoder.cpp
===================================================================
RCS file: /cvsroot/mozilla/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp,v
retrieving revision 1.62.18.1
diff -u -r1.62.18.1 nsJPEGDecoder.cpp
--- nsJPEGDecoder.cpp 3 Feb 2006 14:41:10 -0000 1.62.18.1
+++ nsJPEGDecoder.cpp 18 Aug 2007 17:33:20 -0000
@@ -49,6 +49,8 @@
#include "ImageLogging.h"
#include "jerror.h"
+#include "emmintrin.h"
+#include "xmmintrin.h"
NS_IMPL_ISUPPORTS1(nsJPEGDecoder, imgIDecoder)
@@ -361,7 +363,7 @@
mSamples = (*mInfo.mem->alloc_sarray)((j_common_ptr) &mInfo,
JPOOL_IMAGE,
-
row_stride, 1);
+
mInfo.output_width * 3, 1);
#if defined(XP_WIN) || defined(XP_OS2) || defined(XP_BEOS) ||
defined(XP_MAC) || defined(XP_MACOSX) || defined(MOZ_WIDGET_PHOTON)
// allocate buffer to do byte flipping / padding
@@ -379,7 +381,7 @@
/* FIXME -- Should reset dct_method and dither mode
* for final pass of progressive JPEG
*/
- mInfo.dct_method = JDCT_ISLOW;
+ mInfo.dct_method = JDCT_IFAST;
mInfo.dither_mode = JDITHER_FS;
mInfo.do_fancy_upsampling = TRUE;
mInfo.enable_2pass_quant = FALSE;
@@ -499,12 +501,12 @@
return NS_OK;
}
-
int
nsJPEGDecoder::OutputScanlines()
{
PRUint32 top = mInfo.output_scanline;
PRBool rv = PR_TRUE;
+ PRUint32 i;
while ((mInfo.output_scanline < mInfo.output_height)) {
JSAMPROW samples;
@@ -529,11 +531,29 @@
}
samples = mRGBRow;
-#elif defined(XP_MAC) || defined(XP_MACOSX)
+#elif defined(XP_MAC1) || defined(XP_MACOSX1)
PRUint8 *ptrOutputBuf = mRGBRow;
JSAMPLE *j1 = mSamples[0];
- for (PRUint32 i=0;i<mInfo.output_width;++i) {
+ mask1 = *((__m128i *) color_mask_1a);
+ mask2 = *((__m128i *) color_mask_2a);
+ for (i=0; i < mInfo.output_width - 4; i += 4) {
+ tmp1 = mask1;
+ tmp2 = mask2;
+ read1 = _mm_loadl_epi64((__m128i *) j1);
+ read1 = (__m128i) _mm_loadh_pi((__m128) read1, (__m64 *)(j1+6));
+ tmp1 = _mm_and_si128(tmp1, read1);
+ tmp2 = _mm_and_si128(tmp2, read1);
+ tmp1 = _mm_slli_si128(tmp1, 1);
+ tmp2 = _mm_slli_si128(tmp2, 2);
+ tmp1 = _mm_or_si128(tmp1, tmp2);
+ _mm_storel_pi((__m64 *) ptrOutputBuf, (__m128) tmp1);
+ _mm_storeh_pi((__m64 *) (ptrOutputBuf + 8), (__m128) tmp1);
+ ptrOutputBuf += 16;
+ j1 += 12;
+ }
+
+ for (; i < mInfo.output_width; ++i) {
ptrOutputBuf[0] = 0;
ptrOutputBuf[1] = *j1++;
ptrOutputBuf[2] = *j1++;
@@ -557,8 +577,8 @@
PRUint32 bpr;
mFrame->GetImageBytesPerRow(&bpr);
mFrame->SetImageData(
-
samples,
// data
- row_stride, // length
+
samples,
// data
+
-mInfo.output_width,
// length
(mInfo.output_scanline-1) * bpr); // offset
}
Updated by Elliott on August 18, 2007.