preproc.c 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693
  1. /****************************************************************************
  2. *
  3. * Module Title : preproc.c
  4. *
  5. * Description : Simple pre-processor.
  6. *
  7. ****************************************************************************/
  8. /****************************************************************************
  9. * Header Files
  10. ****************************************************************************/
  #include <stddef.h>
  #include "memory.h"
  #include "preproc.h"
  13. /****************************************************************************
  14. * Macros
  15. ****************************************************************************/
  /* Number of frames of history kept per pixel by the temporal filters. */
  #define FRAMECOUNT 7
  /* Round X (an address or integer) up to the next multiple of 32.
   * The arithmetic is done in size_t with a mask of the same width, so the
   * upper bits of a 64-bit address are preserved. */
  #define ROUNDUP32(X) ( ( ( (size_t) (X) ) + 31 ) & ~( (size_t) 31 ) )
  18. /****************************************************************************
  19. * Imports
  20. ****************************************************************************/
  21. extern void GetProcessorFlags (int *MmxEnabled, int *XmmEnabled, int *WmtEnabled );
  22. /****************************************************************************
  23. * Exported Global Variables
  24. ****************************************************************************/
  25. void (*tempFilter)( PreProcInstance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength );
  26. #ifndef MAPCA
  27. /****************************************************************************
  28. *
  29. * ROUTINE : spatialFilter_wmt
  30. *
  31. * INPUTS : PreProcInstance *ppi : Pointer to pre-processor instance.
  32. * unsigned char *s : Pointer to source frame.
  33. * unsigned char *d : Pointer to destination frame.
  34. * int width : WIdth of images.
  35. * int height : Height of images.
  36. * int pitch : Stride of images.
  37. * int strength : Strength of filter to apply.
  38. *
  39. * OUTPUTS : None.
  40. *
  41. * RETURNS : void
  42. *
  43. * FUNCTION : Performs a closesness adjusted temporarl blur
  44. *
  45. * SPECIAL NOTES : Destination frame can be same as source frame.
  46. *
  47. ****************************************************************************/
  48. void spatialFilter_wmt
  49. (
  50. PreProcInstance *ppi,
  51. unsigned char *s,
  52. unsigned char *d,
  53. int width,
  54. int height,
  55. int pitch,
  56. int strength
  57. )
  58. {
  59. int i;
  60. int row = 1;
  61. int PixelOffsets[] =
  62. {
  63. -pitch-1, -pitch, -pitch+1,
  64. -1, 0, +1,
  65. pitch-1, pitch, pitch+1
  66. };
  67. unsigned char *frameptr = ppi->frameBuffer;
  68. __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3, 3, 3, 3, 3};
  69. __declspec(align(16)) unsigned short sixteens[]= {16,16,16,16,16,16,16,16};
  70. memcpy ( d, s, width );
  71. d += pitch;
  72. s += pitch;
  73. do
  74. {
  75. // NOTE: By doing it this way I am ensuring that pixels will always be unaligned!!!
  76. int col = 1;
  77. d[0] = s[0];
  78. d[width - 1] = s[width - 1];
  79. do
  80. {
  81. __declspec(align(16)) unsigned short counts[8];
  82. __declspec(align(16)) unsigned short sums[8];
  83. _asm
  84. {
  85. mov esi, s // get the source line
  86. add esi, col // add the column offset
  87. pxor xmm1,xmm1 // accumulator
  88. pxor xmm2,xmm2 // count
  89. pxor xmm7,xmm7 // 0s for use with unpack
  90. movq xmm3, QWORD PTR [esi] // get 8 pixels
  91. punpcklbw xmm3, xmm7 // unpack to shorts
  92. xor eax, eax // neighbor iterator
  93. NextNeighbor:
  94. mov ecx, [PixelOffsets+eax*4] // get eax index pixel neighbor offset
  95. movq xmm4, QWORD PTR [esi + ecx] // get ecx index neighbor values
  96. punpcklbw xmm4, xmm7 // xmm4 unpacked neighbor values
  97. movdqa xmm6, xmm4 // save the pixel values
  98. psubsw xmm4, xmm3 // subtracted pixel values
  99. pmullw xmm4, xmm4 // square xmm4
  100. movd xmm5, strength
  101. psrlw xmm4, xmm5 // should be strength
  102. pmullw xmm4, threes // 3 * modifier
  103. movdqa xmm5, sixteens // 16s
  104. psubusw xmm5, xmm4 // 16 - modifiers
  105. movdqa xmm4, xmm5 // save the modifiers
  106. pmullw xmm4, xmm6 // multiplier values
  107. paddusw xmm1, xmm4 // accumulator
  108. paddusw xmm2, xmm5 // count
  109. inc eax // next neighbor
  110. cmp eax,9 // there are nine neigbors
  111. jne NextNeighbor
  112. movdqa counts, xmm2
  113. psrlw xmm2,1 // divide count by 2 for rounding
  114. paddusw xmm1,xmm2 // rounding added in
  115. mov frameptr,esi
  116. movdqa sums, xmm1
  117. }
  118. for ( i=0; i<8; i++ )
  119. {
  120. int blurvalue = sums[i] * ppi->fixedDivide[counts[i]];
  121. blurvalue >>= 16;
  122. d[col+i] = blurvalue;
  123. }
  124. col += 8;
  125. } while ( col<width-1 );
  126. d += pitch;
  127. s += pitch;
  128. ++row;
  129. } while ( row<height-1 );
  130. memcpy ( d, s, width );
  131. __asm emms
  132. }
  133. #endif
  134. /****************************************************************************
  135. *
  136. * ROUTINE : tempFilter_c
  137. *
  138. * INPUTS : PreProcInstance *ppi : Pointer to pre-processor instance.
  139. * unsigned char *s : Pointer to source frame.
  140. * unsigned char *d : Pointer to destination frame.
  141. * int bytes : Number of bytes to filter.
  142. * int strength : Strength of filter to apply.
  143. *
  144. * OUTPUTS : None.
  145. *
  146. * RETURNS : void
  147. *
  148. * FUNCTION : Performs a closesness adjusted temporarl blur
  149. *
  150. * SPECIAL NOTES : Destination frame can be same as source frame.
  151. *
  152. ****************************************************************************/
  153. void tempFilter_c
  154. (
  155. PreProcInstance *ppi,
  156. unsigned char *s,
  157. unsigned char *d,
  158. int bytes,
  159. int strength
  160. )
  161. {
  162. int byte = 0;
  163. unsigned char *frameptr = ppi->frameBuffer;
  164. if ( ppi->frame == 0 )
  165. {
  166. do
  167. {
  168. int frame = 0;
  169. do
  170. {
  171. *frameptr = s[byte];
  172. ++frameptr;
  173. ++frame;
  174. } while ( frame < FRAMECOUNT );
  175. d[byte] = s[byte];
  176. ++byte;
  177. } while ( byte < bytes );
  178. }
  179. else
  180. {
  181. int modifier;
  182. int offset = (ppi->frame % FRAMECOUNT);
  183. do
  184. {
  185. int accumulator = 0;
  186. int count = 0;
  187. int frame = 0;
  188. frameptr[offset] = s[byte];
  189. do
  190. {
  191. int pixelValue = *frameptr;
  192. modifier = s[byte];
  193. modifier -= pixelValue;
  194. modifier *= modifier;
  195. modifier >>= strength;
  196. modifier *= 3;
  197. if(modifier > 16)
  198. modifier = 16;
  199. modifier = 16 - modifier;
  200. accumulator += modifier * pixelValue;
  201. count += modifier;
  202. frameptr++;
  203. ++frame;
  204. } while ( frame < FRAMECOUNT );
  205. accumulator += (count >> 1);
  206. accumulator *= ppi->fixedDivide[count];
  207. accumulator >>= 16;
  208. d[byte] = accumulator;
  209. ++byte;
  210. } while ( byte < bytes );
  211. }
  212. ++ppi->frame;
  213. }
  214. #ifndef MAPCA
  215. /****************************************************************************
  216. *
  217. * ROUTINE : tempFilter_wmt
  218. *
  219. * INPUTS : PreProcInstance *ppi : Pointer to pre-processor instance.
  220. * unsigned char *s : Pointer to source frame.
  221. * unsigned char *d : Pointer to destination frame.
  222. * int bytes : Number of bytes to filter.
  223. * int strength : Strength of filter to apply.
  224. *
  225. * OUTPUTS : None.
  226. *
  227. * RETURNS : void
  228. *
  229. * FUNCTION : Performs a closesness adjusted temporarl blur
  230. *
  231. * SPECIAL NOTES : Destination frame can be same as source frame.
  232. *
  233. ****************************************************************************/
  234. void tempFilter_wmt
  235. (
  236. PreProcInstance *ppi,
  237. unsigned char *s,
  238. unsigned char *d,
  239. int bytes,
  240. int strength
  241. )
  242. {
  243. int byte = 0;
  244. unsigned char * frameptr = ppi->frameBuffer;
  245. __declspec(align(16)) unsigned short threes[] ={ 3, 3, 3, 3, 3, 3, 3, 3};
  246. __declspec(align(16)) unsigned short sixteens[]={16,16,16,16,16,16,16,16};
  247. if ( ppi->frame == 0 )
  248. {
  249. do
  250. {
  251. int i;
  252. int frame = 0;
  253. do
  254. {
  255. for ( i=0; i<8; i++ )
  256. {
  257. *frameptr = s[byte+i];
  258. ++frameptr;
  259. }
  260. ++frame;
  261. } while ( frame < FRAMECOUNT );
  262. for ( i=0; i<8; i++ )
  263. d[byte+i] = s[byte+i];
  264. byte += 8;
  265. } while ( byte < bytes );
  266. }
  267. else
  268. {
  269. int i;
  270. int offset2 = (ppi->frame % FRAMECOUNT);
  271. do
  272. {
  273. __declspec(align(16)) unsigned short counts[8];
  274. __declspec(align(16)) unsigned short sums[8];
  275. int accumulator = 0;
  276. int count = 0;
  277. int frame = 0;
  278. _asm
  279. {
  280. mov eax,offset2
  281. mov edi,s // source pixels
  282. pxor xmm1,xmm1 // accumulator
  283. pxor xmm7,xmm7
  284. mov esi,frameptr // accumulator
  285. pxor xmm2,xmm2 // count
  286. movq xmm3, QWORD PTR [edi]
  287. movq QWORD PTR [esi+8*eax],xmm3
  288. punpcklbw xmm3, xmm2 // xmm3 source pixels
  289. mov ecx, FRAMECOUNT
  290. NextFrame:
  291. movq xmm4, QWORD PTR [esi] // get frame buffer values
  292. punpcklbw xmm4, xmm7 // xmm4 frame buffer pixels
  293. movdqa xmm6, xmm4 // save the pixel values
  294. psubsw xmm4, xmm3 // subtracted pixel values
  295. pmullw xmm4, xmm4 // square xmm4
  296. movd xmm5, strength
  297. psrlw xmm4, xmm5 // should be strength
  298. pmullw xmm4, threes // 3 * modifier
  299. movdqa xmm5, sixteens // 16s
  300. psubusw xmm5, xmm4 // 16 - modifiers
  301. movdqa xmm4, xmm5 // save the modifiers
  302. pmullw xmm4, xmm6 // multiplier values
  303. paddusw xmm1, xmm4 // accumulator
  304. paddusw xmm2, xmm5 // count
  305. add esi, 8 // next frame
  306. dec ecx // next set of eight pixels
  307. jnz NextFrame
  308. movdqa counts, xmm2
  309. psrlw xmm2,1 // divide count by 2 for rounding
  310. paddusw xmm1,xmm2 // rounding added in
  311. mov frameptr,esi
  312. movdqa sums, xmm1
  313. }
  314. for ( i=0; i<8; i++ )
  315. {
  316. int blurvalue = sums[i] * ppi->fixedDivide[counts[i]];
  317. blurvalue >>= 16;
  318. d[i] = blurvalue;
  319. }
  320. s += 8;
  321. d += 8;
  322. byte += 8;
  323. } while ( byte < bytes );
  324. }
  325. ++ppi->frame;
  326. __asm emms
  327. }
  328. /****************************************************************************
  329. *
  330. * ROUTINE : tempFilter_mmx
  331. *
  332. * INPUTS : PreProcInstance *ppi : Pointer to pre-processor instance.
  333. * unsigned char *s : Pointer to source frame.
  334. * unsigned char *d : Pointer to destination frame.
  335. * int bytes : Number of bytes to filter.
  336. * int strength : Strength of filter to apply.
  337. *
  338. * OUTPUTS : None.
  339. *
  340. * RETURNS : void
  341. *
  342. * FUNCTION : Performs a closesness adjusted temporarl blur
  343. *
  344. * SPECIAL NOTES : Destination frame can be same as source frame.
  345. *
  346. ****************************************************************************/
  347. void tempFilter_mmx
  348. (
  349. PreProcInstance *ppi,
  350. unsigned char *s,
  351. unsigned char *d,
  352. int bytes,
  353. int strength
  354. )
  355. {
  356. int byte = 0;
  357. unsigned char *frameptr = ppi->frameBuffer;
  358. __declspec(align(16)) unsigned short threes[] ={ 3, 3, 3, 3};
  359. __declspec(align(16)) unsigned short sixteens[]={16,16,16,16};
  360. if ( ppi->frame == 0 )
  361. {
  362. do
  363. {
  364. int i;
  365. int frame = 0;
  366. do
  367. {
  368. for ( i=0; i<4; i++ )
  369. {
  370. *frameptr = s[byte+i];
  371. ++frameptr;
  372. }
  373. ++frame;
  374. } while ( frame < FRAMECOUNT );
  375. for ( i=0; i<4; i++ )
  376. d[byte+i] = s[byte+i];
  377. byte += 4;
  378. } while ( byte < bytes );
  379. }
  380. else
  381. {
  382. int i;
  383. int offset2 = (ppi->frame % FRAMECOUNT);
  384. do
  385. {
  386. __declspec(align(16)) unsigned short counts[8];
  387. __declspec(align(16)) unsigned short sums[8];
  388. int accumulator = 0;
  389. int count = 0;
  390. int frame = 0;
  391. _asm
  392. {
  393. mov eax,offset2
  394. mov edi,s // source pixels
  395. pxor mm1,mm1 // accumulator
  396. pxor mm7,mm7
  397. mov esi,frameptr // accumulator
  398. pxor mm2,mm2 // count
  399. movd mm3, DWORD PTR [edi]
  400. movd DWORD PTR [esi+4*eax],mm3
  401. punpcklbw mm3, mm2 // mm3 source pixels
  402. mov ecx, FRAMECOUNT
  403. NextFrame:
  404. movd mm4, DWORD PTR [esi] // get frame buffer values
  405. punpcklbw mm4, mm7 // mm4 frame buffer pixels
  406. movq mm6, mm4 // save the pixel values
  407. psubsw mm4, mm3 // subtracted pixel values
  408. pmullw mm4, mm4 // square mm4
  409. movd mm5, strength
  410. psrlw mm4, mm5 // should be strength
  411. pmullw mm4, threes // 3 * modifier
  412. movq mm5, sixteens // 16s
  413. psubusw mm5, mm4 // 16 - modifiers
  414. movq mm4, mm5 // save the modifiers
  415. pmullw mm4, mm6 // multiplier values
  416. paddusw mm1, mm4 // accumulator
  417. paddusw mm2, mm5 // count
  418. add esi, 4 // next frame
  419. dec ecx // next set of eight pixels
  420. jnz NextFrame
  421. movq counts, mm2
  422. psrlw mm2,1 // divide count by 2 for rounding
  423. paddusw mm1,mm2 // rounding added in
  424. mov frameptr,esi
  425. movq sums, mm1
  426. }
  427. for ( i=0; i<4; i++ )
  428. {
  429. int blurvalue = sums[i] * ppi->fixedDivide[counts[i]];
  430. blurvalue >>= 16;
  431. d[i] = blurvalue;
  432. }
  433. s += 4;
  434. d += 4;
  435. byte += 4;
  436. } while ( byte < bytes );
  437. }
  438. ++ppi->frame;
  439. __asm emms
  440. }
  441. #endif
  442. /****************************************************************************
  443. *
  444. * ROUTINE : DeletePreProc
  445. *
  446. * INPUTS : PreProcInstance *ppi : Pointer to pre-processor instance.
  447. *
  448. * OUTPUTS : None.
  449. *
  450. * RETURNS : void
  451. *
  452. * FUNCTION : Deletes a pre-processing instance.
  453. *
  454. * SPECIAL NOTES : None.
  455. *
  456. ****************************************************************************/
  457. void DeletePreProc ( PreProcInstance *ppi )
  458. {
  459. if ( ppi->frameBufferAlloc )
  460. duck_free ( ppi->frameBufferAlloc );
  461. ppi->frameBufferAlloc = 0;
  462. ppi->frameBuffer = 0;
  463. if( ppi->fixedDivideAlloc )
  464. duck_free ( ppi->fixedDivideAlloc );
  465. ppi->fixedDivideAlloc = 0;
  466. ppi->fixedDivide = 0;
  467. }
  468. /****************************************************************************
  469. *
  470. * ROUTINE : InitPreProc
  471. *
  472. * INPUTS : PreProcInstance *ppi : Pointer to pre-processor instance.
  473. * int FrameSize : Number of bytes in one frame.
  474. *
  475. * OUTPUTS : None.
  476. *
  477. * RETURNS : int: 1 if successful, 0 if failed.
  478. *
  479. * FUNCTION : Initializes prepprocessor instance.
  480. *
  481. * SPECIAL NOTES : None.
  482. *
  483. ****************************************************************************/
  484. int InitPreProc ( PreProcInstance *ppi, int FrameSize )
  485. {
  486. int i;
  487. int MmxEnabled;
  488. int XmmEnabled;
  489. int WmtEnabled;
  490. #ifndef MAPCA
  491. GetProcessorFlags ( &MmxEnabled, &XmmEnabled, &WmtEnabled );
  492. if ( WmtEnabled )
  493. tempFilter = tempFilter_wmt;
  494. else if ( MmxEnabled )
  495. tempFilter = tempFilter_mmx;
  496. else
  497. #endif
  498. tempFilter = tempFilter_c;
  499. DeletePreProc ( ppi );
  500. ppi->frameBufferAlloc = duck_malloc ( 32+FrameSize*7*sizeof(unsigned char), DMEM_GENERAL );
  501. if ( !ppi->frameBufferAlloc ) { DeletePreProc( ppi ); return 0; }
  502. ppi->frameBuffer = (unsigned char *) ROUNDUP32( ppi->frameBufferAlloc );
  503. ppi->fixedDivideAlloc = duck_malloc ( 32+255*sizeof(unsigned int), DMEM_GENERAL );
  504. if ( !ppi->fixedDivideAlloc ) { DeletePreProc( ppi ); return 0; }
  505. ppi->fixedDivide = (unsigned int *) ROUNDUP32( ppi->fixedDivideAlloc );
  506. for ( i=1; i<255; i++ )
  507. ppi->fixedDivide[i] = 0x10000 / i;
  508. return 1;
  509. }
  510. /****************************************************************************
  511. *
  512. * ROUTINE : spatialFilter_c
  513. *
  514. * INPUTS : PreProcInstance *ppi : Pointer to pre-processor instance.
  515. * unsigned char *s : Pointer to source frame.
  516. * unsigned char *d : Pointer to destination frame.
  517. * int width : Width of images.
  518. * int height : Height of images.
  519. * int pitch : Stride of images.
  520. * int strength : Strength of filter to apply.
  521. *
  522. * OUTPUTS : None.
  523. *
  524. * RETURNS : void
  525. *
  526. * FUNCTION : Performs a closesness adjusted temporal blur.
  527. *
  528. * SPECIAL NOTES : None.
  529. *
  530. ****************************************************************************/
  531. void spatialFilter_c
  532. (
  533. PreProcInstance *ppi,
  534. unsigned char *s,
  535. unsigned char *d,
  536. int width,
  537. int height,
  538. int pitch,
  539. int strength
  540. )
  541. {
  542. int modifier;
  543. int byte = 0;
  544. int row = 1;
  545. int PixelOffsets[9];
  546. PixelOffsets[0] = -pitch - 1;
  547. PixelOffsets[1] = -pitch;
  548. PixelOffsets[2] = -pitch + 1;
  549. PixelOffsets[3] = - 1;
  550. PixelOffsets[4] = 0;
  551. PixelOffsets[5] = + 1;
  552. PixelOffsets[6] = pitch - 1;
  553. PixelOffsets[7] = pitch ;
  554. PixelOffsets[8] = pitch + 1;
  555. memcpy ( d, s, width );
  556. d += pitch;
  557. s += pitch;
  558. do
  559. {
  560. int col = 1;
  561. d[0] = s[0];
  562. d[width - 1] = s[width - 1];
  563. do
  564. {
  565. int accumulator = 0;
  566. int count = 0;
  567. int neighbor = 0;
  568. do
  569. {
  570. int pixelValue = s[ col + PixelOffsets[neighbor] ];
  571. modifier = s[col];
  572. modifier -= pixelValue;
  573. modifier *= modifier;
  574. modifier >>= strength;
  575. modifier *= 3;
  576. if(modifier > 16)
  577. modifier = 16;
  578. modifier = 16 - modifier;
  579. accumulator += modifier * pixelValue;
  580. count += modifier;
  581. neighbor++;
  582. } while ( neighbor < sizeof(PixelOffsets)/sizeof(int) );
  583. accumulator += (count >> 1);
  584. accumulator *= ppi->fixedDivide[count];
  585. accumulator >>= 16;
  586. d[col] = accumulator;
  587. ++col;
  588. } while ( col < width-1 );
  589. d += pitch;
  590. s += pitch;
  591. ++row;
  592. } while ( row < height-1 );
  593. memcpy ( d, s, width );
  594. }