Elliott's Firefox Builds for Mac OS X



Here are optimized Firefox builds for Mac OS X. They run only on Intel processors, because the optimizations listed below rely on SSE/SSE2 instructions.

August 18, 2007 - firefox 2.0.0.6 x06 for Mac OS X: fixed the cnet.com crash bug introduced in x05, added gfxImageFrame optimization, added the compiler flags -mfpmath=sse -ftree-vectorize
August 17, 2007 - firefox 2.0.0.6 x05 for Mac OS X: added JPEG decoding SSE2 optimization (known issue: this build crashes at cnet.com; fixed in x06)
August 16, 2007 firefox 2.0.0.6 x04 for Mac OS X added JPEG SSE2 Color Optimization
August 12, 2007 firefox 2.0.0.6 x03 for Mac OS X added JPEG Inverse Discrete Cosine Transform SSE2 acceleration
August 8, 2007 firefox 2.0.0.6 x01 for Mac OS X added jslock lightweight locking and lightweight rounding
August 8, 2007 - firefox 2.0.0.6 for Mac OS X: stock build with -O3 optimization

Things to do:





Patches applied to x01 build:

Index: nsUnitConversion.h
===================================================================
RCS file: /cvsroot/mozilla/xpcom/ds/nsUnitConversion.h,v
retrieving revision 3.12
diff -r3.12 nsUnitConversion.h
43a44
> #include <xmmintrin.h>
82c83,85
<   return ((0.0f <= aValue) ? nscoord(aValue + ROUND_CONST_FLOAT) : nscoord(aValue - ROUND_CONST_FLOAT));
---
>   /* return ((0.0f <= aValue) ? nscoord(aValue + ROUND_CONST_FLOAT) : nscoord(aValue - ROUND_CONST_FLOAT)); */
>   return(_mm_cvtss_si32(_mm_load_ss(&aValue)));
>
101c104,105
<   return ((0.0f <= aValue) ? PRInt32(aValue + ROUND_CONST_FLOAT) : PRInt32(aValue - ROUND_CONST_FLOAT));
---
>   /* return ((0.0f <= aValue) ? PRInt32(aValue + ROUND_CONST_FLOAT) : PRInt32(aValue - ROUND_CONST_FLOAT)); */
>   return(_mm_cvtss_si32(_mm_load_ss(&aValue)));

Index: jslock.c
===================================================================
RCS file: /cvsroot/mozilla/js/src/jslock.c,v
retrieving revision 3.55.20.3
diff -u -r3.55.20.3 jslock.c
--- jslock.c    29 Mar 2007 21:55:35 -0000      3.55.20.3
+++ jslock.c    10 Aug 2007 19:38:14 -0000
@@ -54,6 +54,7 @@
 #include "jslock.h"
 #include "jsscope.h"
 #include "jsstr.h"
+#include <libkern/OSAtomic.h>
 
 #define ReadWord(W) (W)
 
@@ -102,11 +103,17 @@
 #elif defined(__GNUC__) && defined(__i386__)
 
 /* Note: This fails on 386 cpus, cmpxchgl is a >= 486 instruction */
+
 static JS_INLINE int
 js_CompareAndSwap(jsword *w, jsword ov, jsword nv)
 {
-    unsigned int res;
+    /* unsigned int res; */
+
+    return(OSAtomicCompareAndSwap32(ov, nv, (int32 *) w));
 
+    /* return(_InterlockedCompareExchange(w, nv, ov) == ov); */
+
+    /*
     __asm__ __volatile__ (
                           "lock\n"
                           "cmpxchgl %2, (%1)\n"
@@ -116,6 +123,7 @@
                           : "r" (w), "r" (nv), "a" (ov)
                           : "cc", "memory");
     return (int)res;
+    */
 }
 
 #elif (defined(__USLC__) || defined(_SCO_DS)) && defined(i386)
Index: jslock.h
===================================================================
RCS file: /cvsroot/mozilla/js/src/jslock.h,v
retrieving revision 3.27.22.2
diff -u -r3.27.22.2 jslock.h
--- jslock.h    28 Nov 2006 19:54:57 -0000      3.27.22.2
+++ jslock.h    10 Aug 2007 19:38:15 -0000
@@ -183,7 +183,7 @@
     JS_END_MACRO
 
 /* FIXME: bug 353962 hackaround */
-#define JS_USE_ONLY_NSPR_LOCKS  1
+/* #define JS_USE_ONLY_NSPR_LOCKS  1 */
 
 #if defined(JS_USE_ONLY_NSPR_LOCKS) ||                                        \
     !( (defined(_WIN32) && defined(_M_IX86)) ||                               \

=========================================

Index: gfxImageFrame.cpp
===================================================================
RCS file: /cvsroot/mozilla/gfx/src/shared/gfxImageFrame.cpp,v
retrieving revision 1.32
diff -u -r1.32 gfxImageFrame.cpp
--- gfxImageFrame.cpp    8 Mar 2005 03:44:27 -0000    1.32
+++ gfxImageFrame.cpp    18 Aug 2007 23:51:51 -0000
@@ -39,6 +39,8 @@
 
 #include "gfxImageFrame.h"
 #include "nsIServiceManager.h"
+#include "emmintrin.h"
+#include "xmmintrin.h"
 
 NS_IMPL_ISUPPORTS2(gfxImageFrame, gfxIImageFrame, nsIInterfaceRequestor)
 
@@ -279,9 +281,15 @@
   return NS_OK;
 }
 
+unsigned char color_mask_1b[16] __attribute__ ((aligned(16))) = {255,255,255,0,0,0,0,0,255,255,255,0,0,0,0,0};
+unsigned char color_mask_2b[16] __attribute__ ((aligned(16))) = {0,0,0,255,255,255,0,0,0,0,0,255,255,255,0,0};
+
 /* void setImageData ([array, size_is (length), const] in PRUint8 data, in unsigned long length, in long offset); */
 NS_IMETHODIMP gfxImageFrame::SetImageData(const PRUint8 *aData, PRUint32 aLength, PRInt32 aOffset)
 {
+  int express_flag = 0;
+  __m128i mask1, mask2, tmp1, tmp2, read1;
+
   if (!mInitalized)
     return NS_ERROR_NOT_INITIALIZED;
 
@@ -292,6 +300,11 @@
   if (aLength == 0)
     return NS_OK;
 
+  if ((int) aLength < 0) {
+    express_flag = - ((int) aLength);
+    aLength = (-4) * ((int) aLength);
+  }
+
   PRInt32 row_stride = mImage->GetLineStride();
 
   mImage->LockImagePixels(PR_FALSE);
@@ -313,7 +326,40 @@
   }
 
   if (aData)
-    memcpy(imgData + newOffset, aData, aLength);
+    if (express_flag == 0)
+      memcpy(imgData + newOffset, aData, aLength);
+    else {
+      const PRUint8 *j1 = aData;
+      PRUint8 *ptrOutputBuf = imgData + newOffset;
+      int i;
+
+      mask1 = *((__m128i *) color_mask_1b);
+      mask2 = *((__m128i *) color_mask_2b);
+      for (i=0; i < express_flag - 4; i += 4) {
+        tmp1 = mask1;
+        tmp2 = mask2;
+        read1 = _mm_loadl_epi64((__m128i *) j1);
+        read1 = (__m128i) _mm_loadh_pi((__m128) read1, (__m64 *)(j1+6));
+        tmp1 = _mm_and_si128(tmp1, read1);
+        tmp2 = _mm_and_si128(tmp2, read1);
+        tmp1 = _mm_slli_si128(tmp1, 1);
+        tmp2 = _mm_slli_si128(tmp2, 2);
+        tmp1 = _mm_or_si128(tmp1, tmp2);
+        _mm_storel_pi((__m64 *) ptrOutputBuf, (__m128) tmp1);
+        _mm_storeh_pi((__m64 *) (ptrOutputBuf + 8), (__m128) tmp1);
+        ptrOutputBuf += 16;
+        j1 += 12;
+      }
+
+      for (; i < express_flag; ++i) {
+        ptrOutputBuf[0] = 0;
+        ptrOutputBuf[1] = *j1++;
+        ptrOutputBuf[2] = *j1++;
+        ptrOutputBuf[3] = *j1++;
+        ptrOutputBuf += 4;
+      }
+
+    }
   else
     memset(imgData + newOffset, 0, aLength);
   mImage->UnlockImagePixels(PR_FALSE);


Index: nsJPEGDecoder.cpp
===================================================================
RCS file: /cvsroot/mozilla/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp,v
retrieving revision 1.62.18.1
diff -u -r1.62.18.1 nsJPEGDecoder.cpp
--- nsJPEGDecoder.cpp    3 Feb 2006 14:41:10 -0000    1.62.18.1
+++ nsJPEGDecoder.cpp    18 Aug 2007 17:33:20 -0000
@@ -49,6 +49,8 @@
 #include "ImageLogging.h"
 
 #include "jerror.h"
+#include "emmintrin.h"
+#include "xmmintrin.h"
 
 NS_IMPL_ISUPPORTS1(nsJPEGDecoder, imgIDecoder)
 
@@ -361,7 +363,7 @@
 
     mSamples = (*mInfo.mem->alloc_sarray)((j_common_ptr) &mInfo,
                                            JPOOL_IMAGE,
-                                           row_stride, 1);
+                                           mInfo.output_width * 3, 1);
 
 #if defined(XP_WIN) || defined(XP_OS2) || defined(XP_BEOS) || defined(XP_MAC) || defined(XP_MACOSX) || defined(MOZ_WIDGET_PHOTON)
     // allocate buffer to do byte flipping / padding
@@ -379,7 +381,7 @@
     /* FIXME -- Should reset dct_method and dither mode
      * for final pass of progressive JPEG
      */
-    mInfo.dct_method =  JDCT_ISLOW;
+    mInfo.dct_method =  JDCT_IFAST;
     mInfo.dither_mode = JDITHER_FS;
     mInfo.do_fancy_upsampling = TRUE;
     mInfo.enable_2pass_quant = FALSE;
@@ -499,12 +501,12 @@
   return NS_OK;
 }
 
-
 int
 nsJPEGDecoder::OutputScanlines()
 {
   PRUint32 top = mInfo.output_scanline;
   PRBool rv = PR_TRUE;
+  PRUint32 i;
 
   while ((mInfo.output_scanline < mInfo.output_height)) {
       JSAMPROW samples;
@@ -529,11 +531,29 @@
       }
 
       samples = mRGBRow;
-#elif defined(XP_MAC) || defined(XP_MACOSX)
+#elif defined(XP_MAC1) || defined(XP_MACOSX1)
       PRUint8 *ptrOutputBuf = mRGBRow;
 
       JSAMPLE *j1 = mSamples[0];
-      for (PRUint32 i=0;i<mInfo.output_width;++i) {
+      mask1 = *((__m128i *) color_mask_1a);
+      mask2 = *((__m128i *) color_mask_2a);
+      for (i=0; i < mInfo.output_width - 4; i += 4) {
+        tmp1 = mask1;
+        tmp2 = mask2;
+        read1 = _mm_loadl_epi64((__m128i *) j1);
+        read1 = (__m128i) _mm_loadh_pi((__m128) read1, (__m64 *)(j1+6));
+        tmp1 = _mm_and_si128(tmp1, read1);
+        tmp2 = _mm_and_si128(tmp2, read1);
+        tmp1 = _mm_slli_si128(tmp1, 1);
+        tmp2 = _mm_slli_si128(tmp2, 2);
+        tmp1 = _mm_or_si128(tmp1, tmp2);
+        _mm_storel_pi((__m64 *) ptrOutputBuf, (__m128) tmp1);
+        _mm_storeh_pi((__m64 *) (ptrOutputBuf + 8), (__m128) tmp1);
+        ptrOutputBuf += 16;
+        j1 += 12;
+      }
+
+      for (; i < mInfo.output_width; ++i) {
         ptrOutputBuf[0] = 0;
         ptrOutputBuf[1] = *j1++;
         ptrOutputBuf[2] = *j1++;
@@ -557,8 +577,8 @@
       PRUint32 bpr;
       mFrame->GetImageBytesPerRow(&bpr);
       mFrame->SetImageData(
-        samples,             // data
-        row_stride,          // length
+        samples,                          // data
+        -mInfo.output_width,              // length
         (mInfo.output_scanline-1) * bpr); // offset
   }
 



Updated by Elliott on August 18, 2007.