Experimental SSE2 changes to the inverse-DCT manager (jpeg/jddctmgr.c).

What the hunks below do:
  * When QUEK15 is defined, declare jpeg_idct_ifast_sse2 instead of
    jpeg_idct_islow_sse2 and select it, together with JDCT_IFAST, in both
    SSE2 branches of the IDCT method selection.
  * In the first selection hunk, change the runtime non-SSE2 fallback from
    jpeg_idct_islow/JDCT_ISLOW to jpeg_idct_ifast/JDCT_IFAST.
    NOTE(review): this applies whether or not QUEK15 is defined -- confirm
    that this is intended.
  * Add an SSE fast path that copies the quantization values into the ISLOW
    multiplier table; the original element-by-element loop is kept as the
    fallback.
  * When SSE2Available == 1, allocate each dct_table 16 bytes larger and
    round the pointer up to a 16-byte boundary.

Review notes on this revision of the patch:
  * "#endif ! QUEK15" had stray tokens after #endif; the label is now a
    comment.
  * The second selection hunk paired jpeg_idct_ifast_sse2 with JDCT_ISLOW,
    disagreeing with the first hunk.  Both now use JDCT_IFAST (presumably
    the ifast routine needs the IFAST-scaled multiplier table -- confirm
    against jpeg_idct_ifast_sse2).
  * The inline-assembly table copy is now compiled only under
    HAVE_SSE2_INTEL_MNEMONICS and executed only when SSE2Available == 1 and
    both element types are 2 bytes wide (it moves exactly 128 bytes for the
    64 entries).
  * The alignment arithmetic uses size_t and an explicit cast back to a
    pointer instead of assigning a masked "unsigned int" to the pointer.
  * Indentation of the context lines was reconstructed by hand; apply with
    "patch -l" so that whitespace differences are ignored.

Index: jddctmgr.c
===================================================================
RCS file: /cvsroot/mozilla/jpeg/jddctmgr.c,v
retrieving revision 3.3
diff -u -r3.3 jddctmgr.c
--- jddctmgr.c	29 Apr 2002 23:27:33 -0000	3.3
+++ jddctmgr.c	31 Mar 2005 00:02:24 -0000
@@ -78,6 +78,7 @@
 #endif
 #endif
 
+#ifndef QUEK15
 GLOBAL(void)
 jpeg_idct_islow_sse2 (
 	j_decompress_ptr cinfo,
@@ -85,7 +86,15 @@
 	JCOEFPTR coef_block,
 	JSAMPARRAY output_buf,
 	JDIMENSION output_col);
-
+#else
+GLOBAL(void)
+jpeg_idct_ifast_sse2 (
+	j_decompress_ptr cinfo,
+	jpeg_component_info * compptr,
+	JCOEFPTR coef_block,
+	JSAMPARRAY output_buf,
+	JDIMENSION output_col);
+#endif /* ! QUEK15 */
 
 /*
  * Prepare for an output pass.
@@ -128,13 +137,18 @@
 #ifdef HAVE_SSE2_INTEL_MNEMONICS
 		if(SSE2Available == 1)
 		{
+#ifndef QUEK15
 			method_ptr = jpeg_idct_islow_sse2;
 			method = JDCT_ISLOW;
+#else
+			method_ptr = jpeg_idct_ifast_sse2;
+			method = JDCT_IFAST;
+#endif /* ! QUEK15 */
 		}
 		else
 		{
-			method_ptr = jpeg_idct_islow;
-			method = JDCT_ISLOW;
+			method_ptr = jpeg_idct_ifast;
+			method = JDCT_IFAST;
 		}
 #else
 	method_ptr = jpeg_idct_islow;
@@ -148,8 +162,13 @@
 #ifdef HAVE_SSE2_INTEL_MNEMONICS
 		if (SSE2Available==1)
 		{
+#ifndef QUEK15
 			method_ptr = jpeg_idct_islow_sse2;
 			method = JDCT_ISLOW;
+#else
+			method_ptr = jpeg_idct_ifast_sse2;
+			method = JDCT_IFAST;
+#endif /* ! QUEK15 */
 		}
 		else
 		{
@@ -200,6 +219,57 @@
 	 * coefficients, but are stored as ints to ensure access efficiency.
 	 */
 	ISLOW_MULT_TYPE * ismtbl = (ISLOW_MULT_TYPE *) compptr->dct_table;
+
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
+	/* We're moving 64 words (128 bytes).  Use something fast when the  */
+	/* CPU supports it and both tables hold 2-byte elements; otherwise  */
+	/* fall through to the portable loop below.  MGM Summer 2004.       */
+	/* If we were really smart, we'd just point to the original         */
+	/* tables if that's possible.  We might want to see if these        */
+	/* numbers change or if they're always the same.                    */
+	if (SSE2Available == 1 &&
+	    SIZEOF(ISLOW_MULT_TYPE) == 2 && SIZEOF(qtbl->quantval[0]) == 2) {
+	  void * input = (void *) qtbl->quantval;
+
+	  __asm{
+		mov eax, ismtbl
+		mov ecx, input
+		movlps xmm0, QWORD PTR [ecx]
+		movhps xmm0, QWORD PTR [ecx+8]
+		movlps xmm1, QWORD PTR [ecx+16]
+		movhps xmm1, QWORD PTR [ecx+24]
+		movlps xmm2, QWORD PTR [ecx+32]
+		movhps xmm2, QWORD PTR [ecx+40]
+		movlps xmm3, QWORD PTR [ecx+48]
+		movhps xmm3, QWORD PTR [ecx+56]
+		movlps QWORD PTR [eax], xmm0
+		movhps QWORD PTR [eax+8], xmm0
+		movlps QWORD PTR [eax+16], xmm1
+		movhps QWORD PTR [eax+24], xmm1
+		movlps QWORD PTR [eax+32], xmm2
+		movhps QWORD PTR [eax+40], xmm2
+		movlps QWORD PTR [eax+48], xmm3
+		movhps QWORD PTR [eax+56], xmm3
+		movlps xmm0, QWORD PTR [ecx+64]
+		movhps xmm0, QWORD PTR [ecx+72]
+		movlps xmm1, QWORD PTR [ecx+80]
+		movhps xmm1, QWORD PTR [ecx+88]
+		movlps xmm2, QWORD PTR [ecx+96]
+		movhps xmm2, QWORD PTR [ecx+104]
+		movlps xmm3, QWORD PTR [ecx+112]
+		movhps xmm3, QWORD PTR [ecx+120]
+		movlps QWORD PTR [eax+64], xmm0
+		movhps QWORD PTR [eax+72], xmm0
+		movlps QWORD PTR [eax+80], xmm1
+		movhps QWORD PTR [eax+88], xmm1
+		movlps QWORD PTR [eax+96], xmm2
+		movhps QWORD PTR [eax+104], xmm2
+		movlps QWORD PTR [eax+112], xmm3
+		movhps QWORD PTR [eax+120], xmm3
+	  }
+	}
+	else
+#endif
 	for (i = 0; i < DCTSIZE2; i++) {
 	  ismtbl[i] = (ISLOW_MULT_TYPE) qtbl->quantval[i];
 	}
@@ -216,6 +286,9 @@
 	 * For integer operation, the multiplier table is to be scaled by
 	 * IFAST_SCALE_BITS.
 	 */
+
+	/* TODO: optimize this table setup as well. */
+
 	IFAST_MULT_TYPE * ifmtbl = (IFAST_MULT_TYPE *) compptr->dct_table;
 #define CONST_BITS 14
 	static const INT16 aanscales[DCTSIZE2] = {
@@ -295,8 +368,22 @@
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Allocate and pre-zero a multiplier table for each component */
-    compptr->dct_table =
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
+    if (SSE2Available == 1) {
+
+      /* For SSE2, over-allocate by 16 bytes and round the pointer up to */
+      /* a 16-byte boundary for the parallel multiply in jidctint.c.     */
+
+      compptr->dct_table =
+        (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				  SIZEOF(multiplier_table)+16);
+      compptr->dct_table = (void *) ((((size_t) compptr->dct_table) + 15) & ~(size_t) 15);
+    }
+    else
+#endif
+    compptr->dct_table =
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				  SIZEOF(multiplier_table));
     MEMZERO(compptr->dct_table, SIZEOF(multiplier_table));
     /* Mark multiplier table not yet set up for any method */