- /*!
- ***************************************************************************
- * \file transform8x8.c
- *
- * \brief
- * 8x8 transform functions
- *
- * \author
- * Main contributors (see contributors.h for copyright, address and affiliation details)
- * - Yuri Vatis
- * - Jan Muenster
- *
- * \date
- * 12. October 2003
- **************************************************************************
- */
- #include "global.h"
- #include "image.h"
- #include "mb_access.h"
- #include "elements.h"
- #include "transform8x8.h"
- #include "transform.h"
- #include "quant.h"
- #include <emmintrin.h>
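- /*
- * SSE2 implementation of the 8x8 inverse integer transform, operating in
- * place on 16-bit coefficients: transpose the block so that each SIMD lane
- * carries one row, apply the 8-point butterfly across the registers
- * (horizontal pass), transpose back, then apply the butterfly again
- * (vertical pass). Each row must be 16-byte aligned for _mm_load_si128.
- */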
- static void inverse8x8_sse2(h264_short_8x8block_row_t *block)
- {
- __m128i a0, a1, a2, a3;
- __m128i p0, p1, p2, p3, p4, p5, p6, p7;
- __m128i b0, b1, b2, b3, b4, b5, b6, b7;
- __m128i r0, r1, r2, r3, r4, r5, r6, r7;
- // Horizontal
- b0 = _mm_load_si128((__m128i *)(block[0]));
- b1 = _mm_load_si128((__m128i *)(block[1]));
- b2 = _mm_load_si128((__m128i *)(block[2]));
- b3 = _mm_load_si128((__m128i *)(block[3]));
- b4 = _mm_load_si128((__m128i *)(block[4]));
- b5 = _mm_load_si128((__m128i *)(block[5]));
- b6 = _mm_load_si128((__m128i *)(block[6]));
- b7 = _mm_load_si128((__m128i *)(block[7]));
- /* rotate 8x8 (ugh) */
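- /* three rounds of unpacks implement the 8x8 transpose: 16-bit interleaves
- of row pairs, 16-bit interleaves of those results, then 64-bit interleaves
- assembling the transposed rows into p0..p7 */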
- r0 = _mm_unpacklo_epi16(b0, b2);
- r1 = _mm_unpacklo_epi16(b1, b3);
- r2 = _mm_unpackhi_epi16(b0, b2);
- r3 = _mm_unpackhi_epi16(b1, b3);
- r4 = _mm_unpacklo_epi16(b4, b6);
- r5 = _mm_unpacklo_epi16(b5, b7);
- r6 = _mm_unpackhi_epi16(b4, b6);
- r7 = _mm_unpackhi_epi16(b5, b7);
- b0 = _mm_unpacklo_epi16(r0, r1);
- b1 = _mm_unpackhi_epi16(r0, r1);
- b2 = _mm_unpacklo_epi16(r2, r3);
- b3 = _mm_unpackhi_epi16(r2, r3);
- b4 = _mm_unpacklo_epi16(r4, r5);
- b5 = _mm_unpackhi_epi16(r4, r5);
- b6 = _mm_unpacklo_epi16(r6, r7);
- b7 = _mm_unpackhi_epi16(r6, r7);
- p0 = _mm_unpacklo_epi64(b0, b4);
- p1 = _mm_unpackhi_epi64(b0, b4);
- p2 = _mm_unpacklo_epi64(b1, b5);
- p3 = _mm_unpackhi_epi64(b1, b5);
- p4 = _mm_unpacklo_epi64(b2, b6);
- p5 = _mm_unpackhi_epi64(b2, b6);
- p6 = _mm_unpacklo_epi64(b3, b7);
- p7 = _mm_unpackhi_epi64(b3, b7);
- /* apply the 8-point inverse-transform butterfly (horizontal pass) */
- a0 = _mm_add_epi16(p0, p4); // p0 + p4
- a1 = _mm_sub_epi16(p0, p4); // p0 - p4
- r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
- a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
- r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
- a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
- b0 = _mm_add_epi16(a0, a3); // a0 + a3;
- b2 = _mm_sub_epi16(a1, a2); // a1 - a2;
- b4 = _mm_add_epi16(a1, a2); // a1 + a2;
- b6 = _mm_sub_epi16(a0, a3); // a0 - a3;
- //-p3 + p5 - p7 - (p7 >> 1);
- r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
- a0 = _mm_sub_epi16(p5, p3); // p5 - p3
- a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
- a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
- //p1 + p7 - p3 - (p3 >> 1);
- r0 = _mm_srai_epi16(p3, 1); // (p3 >> 1)
- a1 = _mm_add_epi16(p1, p7); // p1 + p7
- a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
- a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
- // -p1 + p7 + p5 + (p5 >> 1);
- r0 = _mm_srai_epi16(p5, 1); // (p5 >> 1)
- a2 = _mm_sub_epi16(p7, p1); // p7 - p1
- a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
- a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
- // p3 + p5 + p1 + (p1 >> 1);
- a3 = _mm_add_epi16(p3, p5); // p3+p5
- a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
- p1 = _mm_srai_epi16(p1, 1); // p1 >> 1
- a3 = _mm_add_epi16(a3, p1); //p3 + p5 + p1 + (p1 >> 1)
- r0 = _mm_srai_epi16(a3, 2); // a3>>2
- b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
- r0 = _mm_srai_epi16(a2, 2); // a2>>2
- b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
- a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
- b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
- a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
- b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
- p0 = _mm_add_epi16(b0, b7); // b0 + b7;
- p1 = _mm_sub_epi16(b2, b5); // b2 - b5;
- p2 = _mm_add_epi16(b4, b3); // b4 + b3;
- p3 = _mm_add_epi16(b6, b1); // b6 + b1;
- p4 = _mm_sub_epi16(b6, b1); // b6 - b1;
- p5 = _mm_sub_epi16(b4, b3); // b4 - b3;
- p6 = _mm_add_epi16(b2, b5); // b2 + b5;
- p7 = _mm_sub_epi16(b0, b7); // b0 - b7;
- /* rotate 8x8 (ugh) */
- r0 = _mm_unpacklo_epi16(p0, p2);
- r1 = _mm_unpacklo_epi16(p1, p3);
- r2 = _mm_unpackhi_epi16(p0, p2);
- r3 = _mm_unpackhi_epi16(p1, p3);
- r4 = _mm_unpacklo_epi16(p4, p6);
- r5 = _mm_unpacklo_epi16(p5, p7);
- r6 = _mm_unpackhi_epi16(p4, p6);
- r7 = _mm_unpackhi_epi16(p5, p7);
- b0 = _mm_unpacklo_epi16(r0, r1);
- b1 = _mm_unpackhi_epi16(r0, r1);
- b2 = _mm_unpacklo_epi16(r2, r3);
- b3 = _mm_unpackhi_epi16(r2, r3);
- b4 = _mm_unpacklo_epi16(r4, r5);
- b5 = _mm_unpackhi_epi16(r4, r5);
- b6 = _mm_unpacklo_epi16(r6, r7);
- b7 = _mm_unpackhi_epi16(r6, r7);
- p0 = _mm_unpacklo_epi64(b0, b4);
- p1 = _mm_unpackhi_epi64(b0, b4);
- p2 = _mm_unpacklo_epi64(b1, b5);
- p3 = _mm_unpackhi_epi64(b1, b5);
- p4 = _mm_unpacklo_epi64(b2, b6);
- p5 = _mm_unpackhi_epi64(b2, b6);
- p6 = _mm_unpacklo_epi64(b3, b7);
- p7 = _mm_unpackhi_epi64(b3, b7);
- /* Vertical */
- a0 = _mm_add_epi16(p0, p4); // p0 + p4
- a1 = _mm_sub_epi16(p0, p4); // p0 - p4
- r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
- a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
- r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
- a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
- b0 = _mm_add_epi16(a0, a3); // a0 + a3;
- b2 = _mm_sub_epi16(a1, a2); // a1 - a2;
- b4 = _mm_add_epi16(a1, a2); // a1 + a2;
- b6 = _mm_sub_epi16(a0, a3); // a0 - a3;
- //-p3 + p5 - p7 - (p7 >> 1);
- r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
- a0 = _mm_sub_epi16(p5, p3); // p5 - p3
- a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
- a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
- //p1 + p7 - p3 - (p3 >> 1);
- r0 = _mm_srai_epi16(p3, 1); // (p3 >> 1)
- a1 = _mm_add_epi16(p1, p7); // p1 + p7
- a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
- a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
- // -p1 + p7 + p5 + (p5 >> 1);
- r0 = _mm_srai_epi16(p5, 1); // (p5 >> 1)
- a2 = _mm_sub_epi16(p7, p1); // p7 - p1
- a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
- a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
- // p3 + p5 + p1 + (p1 >> 1);
- r0 = _mm_srai_epi16(p1, 1); // p1 >> 1
- a3 = _mm_add_epi16(p3, p5); // p3+p5
- a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
- a3 = _mm_add_epi16(a3, r0); //p3 + p5 + p1 + (p1 >> 1)
- r0 = _mm_srai_epi16(a3, 2); // a3>>2
- b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
- r0 = _mm_srai_epi16(a2, 2); // a2>>2
- b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
- a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
- b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
- a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
- b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
- r0 = _mm_add_epi16(b0, b7); // b0 + b7;
- _mm_store_si128((__m128i *)(block[0]), r0);
- r1 = _mm_sub_epi16(b2, b5); // b2 - b5;
- _mm_store_si128((__m128i *)(block[1]), r1);
- r2 = _mm_add_epi16(b4, b3); // b4 + b3;
- _mm_store_si128((__m128i *)(block[2]), r2);
- r3 = _mm_add_epi16(b6, b1); // b6 + b1;
- _mm_store_si128((__m128i *)(block[3]), r3);
- r4 = _mm_sub_epi16(b6, b1); // b6 - b1;
- _mm_store_si128((__m128i *)(block[4]), r4);
- r5 = _mm_sub_epi16(b4, b3); // b4 - b3;
- _mm_store_si128((__m128i *)(block[5]), r5);
- r6 = _mm_add_epi16(b2, b5); // b2 + b5;
- _mm_store_si128((__m128i *)(block[6]), r6);
- r7 = _mm_sub_epi16(b0, b7); // b0 - b7;
- _mm_store_si128((__m128i *)(block[7]), r7);
- }
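- /*
- * Plain C version of the 8x8 inverse transform: the same butterfly applied
- * along each row (horizontal pass) and then along each column (vertical
- * pass). Used by the MMX and C reconstruction paths below.
- */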
- static void inverse8x8(h264_short_8x8block_row_t *block)
- {
- int i;
- //int tmp[64];
- //int *pTmp = tmp;
- int a0, a1, a2, a3;
- int p0, p1, p2, p3, p4, p5, p6, p7;
- int b0, b1, b2, b3, b4, b5, b6, b7;
- // Horizontal
- for (i=0; i < BLOCK_SIZE_8x8; i++)
- {
- p0 = block[i][0];
- p1 = block[i][1];
- p2 = block[i][2];
- p3 = block[i][3];
- p4 = block[i][4];
- p5 = block[i][5];
- p6 = block[i][6];
- p7 = block[i][7];
- a0 = p0 + p4;
- a1 = p0 - p4;
- a2 = p6 - (p2 >> 1);
- a3 = p2 + (p6 >> 1);
- b0 = a0 + a3;
- b2 = a1 - a2;
- b4 = a1 + a2;
- b6 = a0 - a3;
- a0 = p5 - p3 - p7 - (p7 >> 1);
- a1 = p1 + p7 - p3 - (p3 >> 1);
- a2 = p7 - p1 + p5 + (p5 >> 1);
- a3 = p3 + p5 + p1 + (p1 >> 1);
- b1 = a0 + (a3>>2);
- b3 = a1 + (a2>>2);
- b5 = a2 - (a1>>2);
- b7 = a3 - (a0>>2);
- block[i][0] = b0 + b7;
- block[i][1] = b2 - b5;
- block[i][2] = b4 + b3;
- block[i][3] = b6 + b1;
- block[i][4] = b6 - b1;
- block[i][5] = b4 - b3;
- block[i][6] = b2 + b5;
- block[i][7] = b0 - b7;
- }
- // Vertical
- for (i=0; i < BLOCK_SIZE_8x8; i++)
- {
- // pTmp = tmp + i;
- p0 = block[0][i];
- p1 = block[1][i];
- p2 = block[2][i];
- p3 = block[3][i];
- p4 = block[4][i];
- p5 = block[5][i];
- p6 = block[6][i];
- p7 = block[7][i];
- a0 = p0 + p4;
- a1 = p0 - p4;
- a2 = p6 - (p2>>1);
- a3 = p2 + (p6>>1);
- b0 = a0 + a3;
- b2 = a1 - a2;
- b4 = a1 + a2;
- b6 = a0 - a3;
- a0 = -p3 + p5 - p7 - (p7 >> 1);
- a1 = p1 + p7 - p3 - (p3 >> 1);
- a2 = -p1 + p7 + p5 + (p5 >> 1);
- a3 = p3 + p5 + p1 + (p1 >> 1);
- b1 = a0 + (a3 >> 2);
- b7 = a3 - (a0 >> 2);
- b3 = a1 + (a2 >> 2);
- b5 = a2 - (a1 >> 2);
- block[0][i] = b0 + b7;
- block[1][i] = b2 - b5;
- block[2][i] = b4 + b3;
- block[3][i] = b6 + b1;
- block[4][i] = b6 - b1;
- block[5][i] = b4 - b3;
- block[6][i] = b2 + b5;
- block[7][i] = b0 - b7;
- }
-
- }
- #if defined(_DEBUG) || defined(_M_IX64)
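- /*
- * Inverse transform combined with reconstruction: the same transpose and
- * butterfly steps as inverse8x8_sse2, but the final rows are rounded with
- * (x + 32) >> 6, added to the prediction (zero-extended from unsigned char
- * to short), clipped to [0,255] by the unsigned pack, and written out two
- * rows at a time.
- */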
- void itrans8x8_sse2(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x)
- {
- __m128i a0, a1, a2, a3;
- __m128i p0, p1, p2, p3, p4, p5, p6, p7;
- __m128i b0, b1, b2, b3, b4, b5, b6, b7;
- __m128i r0, r1, r2, r3, r4, r5, r6, r7;
- __m128i const32, zero;
- __declspec(align(32)) static const int16_t c32[8] = {32, 32, 32, 32, 32, 32, 32, 32};
- __m128i pred0, pred1;
- const32 = _mm_load_si128((const __m128i *)c32);
- zero = _mm_setzero_si128();
- // Horizontal
- b0 = _mm_load_si128((__m128i *)(block[0]));
- b1 = _mm_load_si128((__m128i *)(block[1]));
- b2 = _mm_load_si128((__m128i *)(block[2]));
- b3 = _mm_load_si128((__m128i *)(block[3]));
- b4 = _mm_load_si128((__m128i *)(block[4]));
- b5 = _mm_load_si128((__m128i *)(block[5]));
- b6 = _mm_load_si128((__m128i *)(block[6]));
- b7 = _mm_load_si128((__m128i *)(block[7]));
- /* rotate 8x8 (ugh) */
- r0 = _mm_unpacklo_epi16(b0, b2);
- r1 = _mm_unpacklo_epi16(b1, b3);
- r2 = _mm_unpackhi_epi16(b0, b2);
- r3 = _mm_unpackhi_epi16(b1, b3);
- r4 = _mm_unpacklo_epi16(b4, b6);
- r5 = _mm_unpacklo_epi16(b5, b7);
- r6 = _mm_unpackhi_epi16(b4, b6);
- r7 = _mm_unpackhi_epi16(b5, b7);
- b0 = _mm_unpacklo_epi16(r0, r1);
- b1 = _mm_unpackhi_epi16(r0, r1);
- b2 = _mm_unpacklo_epi16(r2, r3);
- b3 = _mm_unpackhi_epi16(r2, r3);
- b4 = _mm_unpacklo_epi16(r4, r5);
- b5 = _mm_unpackhi_epi16(r4, r5);
- b6 = _mm_unpacklo_epi16(r6, r7);
- b7 = _mm_unpackhi_epi16(r6, r7);
- p0 = _mm_unpacklo_epi64(b0, b4);
- p1 = _mm_unpackhi_epi64(b0, b4);
- p2 = _mm_unpacklo_epi64(b1, b5);
- p3 = _mm_unpackhi_epi64(b1, b5);
- p4 = _mm_unpacklo_epi64(b2, b6);
- p5 = _mm_unpackhi_epi64(b2, b6);
- p6 = _mm_unpacklo_epi64(b3, b7);
- p7 = _mm_unpackhi_epi64(b3, b7);
- /* apply the 8-point inverse-transform butterfly (horizontal pass) */
- a0 = _mm_add_epi16(p0, p4); // p0 + p4
- a1 = _mm_sub_epi16(p0, p4); // p0 - p4
- r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
- a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
- r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
- a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
- b0 = _mm_add_epi16(a0, a3); // a0 + a3;
- b2 = _mm_sub_epi16(a1, a2); // a1 - a2;
- b4 = _mm_add_epi16(a1, a2); // a1 + a2;
- b6 = _mm_sub_epi16(a0, a3); // a0 - a3;
- //-p3 + p5 - p7 - (p7 >> 1);
- r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
- a0 = _mm_sub_epi16(p5, p3); // p5 - p3
- a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
- a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
- //p1 + p7 - p3 - (p3 >> 1);
- r0 = _mm_srai_epi16(p3, 1); // (p3 >> 1)
- a1 = _mm_add_epi16(p1, p7); // p1 + p7
- a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
- a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
- // -p1 + p7 + p5 + (p5 >> 1);
- r0 = _mm_srai_epi16(p5, 1); // (p5 >> 1)
- a2 = _mm_sub_epi16(p7, p1); // p7 - p1
- a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
- a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
- // p3 + p5 + p1 + (p1 >> 1);
- a3 = _mm_add_epi16(p3, p5); // p3+p5
- a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
- p1 = _mm_srai_epi16(p1, 1); // p1 >> 1
- a3 = _mm_add_epi16(a3, p1); //p3 + p5 + p1 + (p1 >> 1)
- r0 = _mm_srai_epi16(a3, 2); // a3>>2
- b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
- r0 = _mm_srai_epi16(a2, 2); // a2>>2
- b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
- a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
- b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
- a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
- b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
- p0 = _mm_add_epi16(b0, b7); // b0 + b7;
- p1 = _mm_sub_epi16(b2, b5); // b2 - b5;
- p2 = _mm_add_epi16(b4, b3); // b4 + b3;
- p3 = _mm_add_epi16(b6, b1); // b6 + b1;
- p4 = _mm_sub_epi16(b6, b1); // b6 - b1;
- p5 = _mm_sub_epi16(b4, b3); // b4 - b3;
- p6 = _mm_add_epi16(b2, b5); // b2 + b5;
- p7 = _mm_sub_epi16(b0, b7); // b0 - b7;
- /* rotate 8x8 (ugh) */
- r0 = _mm_unpacklo_epi16(p0, p2);
- r1 = _mm_unpacklo_epi16(p1, p3);
- r2 = _mm_unpackhi_epi16(p0, p2);
- r3 = _mm_unpackhi_epi16(p1, p3);
- r4 = _mm_unpacklo_epi16(p4, p6);
- r5 = _mm_unpacklo_epi16(p5, p7);
- r6 = _mm_unpackhi_epi16(p4, p6);
- r7 = _mm_unpackhi_epi16(p5, p7);
- b0 = _mm_unpacklo_epi16(r0, r1);
- b1 = _mm_unpackhi_epi16(r0, r1);
- b2 = _mm_unpacklo_epi16(r2, r3);
- b3 = _mm_unpackhi_epi16(r2, r3);
- b4 = _mm_unpacklo_epi16(r4, r5);
- b5 = _mm_unpackhi_epi16(r4, r5);
- b6 = _mm_unpacklo_epi16(r6, r7);
- b7 = _mm_unpackhi_epi16(r6, r7);
- p0 = _mm_unpacklo_epi64(b0, b4);
- p1 = _mm_unpackhi_epi64(b0, b4);
- p2 = _mm_unpacklo_epi64(b1, b5);
- p3 = _mm_unpackhi_epi64(b1, b5);
- p4 = _mm_unpacklo_epi64(b2, b6);
- p5 = _mm_unpackhi_epi64(b2, b6);
- p6 = _mm_unpacklo_epi64(b3, b7);
- p7 = _mm_unpackhi_epi64(b3, b7);
- /* Vertical */
- a0 = _mm_add_epi16(p0, p4); // p0 + p4
- a1 = _mm_sub_epi16(p0, p4); // p0 - p4
- r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
- a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
- r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
- a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
- b0 = _mm_add_epi16(a0, a3); // a0 + a3;
- b2 = _mm_sub_epi16(a1, a2); // a1 - a2;
- b4 = _mm_add_epi16(a1, a2); // a1 + a2;
- b6 = _mm_sub_epi16(a0, a3); // a0 - a3;
- //-p3 + p5 - p7 - (p7 >> 1);
- r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
- a0 = _mm_sub_epi16(p5, p3); // p5 - p3
- a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
- a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
- //p1 + p7 - p3 - (p3 >> 1);
- r0 = _mm_srai_epi16(p3, 1); // (p3 >> 1)
- a1 = _mm_add_epi16(p1, p7); // p1 + p7
- a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
- a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
- // -p1 + p7 + p5 + (p5 >> 1);
- r0 = _mm_srai_epi16(p5, 1); // (p5 >> 1)
- a2 = _mm_sub_epi16(p7, p1); // p7 - p1
- a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
- a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
- // p3 + p5 + p1 + (p1 >> 1);
- r0 = _mm_srai_epi16(p1, 1); // p1 >> 1
- a3 = _mm_add_epi16(p3, p5); // p3+p5
- a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
- a3 = _mm_add_epi16(a3, r0); //p3 + p5 + p1 + (p1 >> 1)
- r0 = _mm_srai_epi16(a3, 2); // a3>>2
- b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
- r0 = _mm_srai_epi16(a2, 2); // a2>>2
- b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
- a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
- b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
- a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
- b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
- r0 = _mm_add_epi16(b0, b7); // b0 + b7;
- r1 = _mm_sub_epi16(b2, b5); // b2 - b5;
- r2 = _mm_add_epi16(b4, b3); // b4 + b3;
- r3 = _mm_add_epi16(b6, b1); // b6 + b1;
- r4 = _mm_sub_epi16(b6, b1); // b6 - b1;
- r5 = _mm_sub_epi16(b4, b3); // b4 - b3;
- r6 = _mm_add_epi16(b2, b5); // b2 + b5;
- r7 = _mm_sub_epi16(b0, b7); // b0 - b7;
- // add in prediction values
- pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[0][pos_x]));
- pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[1][pos_x]));
- // (x + 32) >> 6
- r0 = _mm_adds_epi16(r0, const32);
- r0 = _mm_srai_epi16(r0, 6);
- r1 = _mm_adds_epi16(r1, const32);
- r1 = _mm_srai_epi16(r1, 6);
- pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
- pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
- pred0 = _mm_adds_epi16(pred0, r0);
- pred1 = _mm_adds_epi16(pred1, r1);
- pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
- // store
- _mm_storel_epi64((__m128i *)(&mb_rec[0][pos_x]), pred0);
- // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
- pred0 = _mm_srli_si128(pred0, 8);
- _mm_storel_epi64((__m128i *)(&mb_rec[1][pos_x]), pred0);
- /* --- */
- pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[2][pos_x]));
- pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[3][pos_x]));
- // (x + 32) >> 6
- r2 = _mm_adds_epi16(r2, const32);
- r2 = _mm_srai_epi16(r2, 6);
- r3 = _mm_adds_epi16(r3, const32);
- r3 = _mm_srai_epi16(r3, 6);
- pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
- pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
- pred0 = _mm_adds_epi16(pred0, r2);
- pred1 = _mm_adds_epi16(pred1, r3);
- pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
- // store
- _mm_storel_epi64((__m128i *)(&mb_rec[2][pos_x]), pred0);
- // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
- pred0 = _mm_srli_si128(pred0, 8);
- _mm_storel_epi64((__m128i *)(&mb_rec[3][pos_x]), pred0);
- /* --- */
- pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[4][pos_x]));
- pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[5][pos_x]));
- // (x + 32) >> 6
- r4 = _mm_adds_epi16(r4, const32);
- r4 = _mm_srai_epi16(r4, 6);
- r5 = _mm_adds_epi16(r5, const32);
- r5 = _mm_srai_epi16(r5, 6);
- pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
- pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
- pred0 = _mm_adds_epi16(pred0, r4);
- pred1 = _mm_adds_epi16(pred1, r5);
- pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
- // store
- _mm_storel_epi64((__m128i *)(&mb_rec[4][pos_x]), pred0);
- // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
- pred0 = _mm_srli_si128(pred0, 8);
- _mm_storel_epi64((__m128i *)(&mb_rec[5][pos_x]), pred0);
- /* --- */
- pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[6][pos_x]));
- pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[7][pos_x]));
- // (x + 32) >> 6
- r6 = _mm_adds_epi16(r6, const32);
- r6 = _mm_srai_epi16(r6, 6);
- r7 = _mm_adds_epi16(r7, const32);
- r7 = _mm_srai_epi16(r7, 6);
- pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
- pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
- pred0 = _mm_adds_epi16(pred0, r6);
- pred1 = _mm_adds_epi16(pred1, r7);
- pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
- // store
- _mm_storel_epi64((__m128i *)&mb_rec[6][pos_x], pred0);
- // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
- pred0 = _mm_srli_si128(pred0, 8);
- _mm_storel_epi64((__m128i *)&mb_rec[7][pos_x], pred0);
- }
- #endif
- #ifdef _M_IX86
- // reconstruct one 8x8 block from 16-bit residual coefficients: rec = clip(pred + ((rres + 32) >> 6))
- static void sample_reconstruct8x8_mmx(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *mb_rres8, int pos_x)
- {
- __asm
- {
- mov esi, 8 // loop 8 times
- mov eax, mb_rec
- add eax, pos_x
- mov ebx, mb_pred
- add ebx, pos_x
- mov ecx, mb_rres8
- // mm0 : constant value 32
- mov edx, 0x00200020
- movd mm0, edx
- punpckldq mm0, mm0
- // mm7: zero, used to zero-extend prediction bytes to 16-bit
- pxor mm7, mm7
- loop8:
- movq mm1, MMWORD PTR 0[ecx] // first 4 residual coefficients (16-bit)
- paddw mm1, mm0 // rres + 32
- psraw mm1, 6 // (rres + 32) >> 6
- movq mm3, MMWORD PTR 8[ecx] // last 4 residual coefficients (16-bit)
- paddw mm3, mm0 // rres + 32
- psraw mm3, 6 // (rres + 32) >> 6
- movq mm2, MMWORD PTR 0[ebx] // 8 prediction pels (unsigned char)
- movq mm4, mm2
- punpcklbw mm2, mm7 // low four pred pels -> short
- punpckhbw mm4, mm7 // high four pred pels -> short
- paddsw mm2, mm1 // pred_row + rres_row (pels 0..3)
- paddsw mm4, mm3 // pred_row + rres_row (pels 4..7)
- packuswb mm2, mm4 // clip to [0,255] and repack all 8 pels
- movq MMWORD PTR 0[eax], mm2
- add eax, 16
- add ebx, 16
- add ecx, 16
- sub esi, 1
- jne loop8
- emms
- }
- }
- #endif
- // benski> unused, left in place for unit testing and in case the decoder is ever ported to a non-Intel platform
- static void sample_reconstruct8x8(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *mb_rres8, int pos_x, int max_imgpel_value)
- {
- int i,j;
- for( j = 0; j < 8; j++)
- {
- imgpel *rec_row = mb_rec[j] + pos_x;
- const short *rres_row = mb_rres8[j];
- const imgpel *pred_row = mb_pred[j] + pos_x;
- for( i = 0; i < 8; i++)
- rec_row[i] = (imgpel) iClip1(max_imgpel_value, pred_row[i] + rshift_rnd_sf(rres_row[i], DQ_BITS_8));
- }
- }
- /*!
- ***********************************************************************
- * \brief
- * Inverse 8x8 transformation
- ***********************************************************************
- */
- #ifdef _M_IX86
- void itrans8x8_mmx(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x)
- {
- inverse8x8((h264_short_8x8block_row_t *)block);
- sample_reconstruct8x8_mmx(mb_rec, mb_pred, block, pos_x);
- }
- #endif
- void itrans8x8_c(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x)
- {
- inverse8x8((h264_short_8x8block_row_t *)block);
- sample_reconstruct8x8(mb_rec, mb_pred, block, pos_x, 255);
- }
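- /*
- * Lossless (transform-bypass) reconstruction: the residual is added
- * directly to the prediction with no inverse transform and no rounding
- * shift, then clipped to [0,255].
- */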
- void itrans8x8_lossless(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x)
- {
- int i,j;
- for( j = 0; j < 8; j++)
- {
- imgpel *rec_row = mb_rec[j] + pos_x;
- const short *rres_row = block[j];
- const imgpel *pred_row = mb_pred[j] + pos_x;
- for( i = 0; i < 8; i++)
- rec_row[i] = (imgpel) iClip1(255, (rres_row[i] + (long)pred_row[i]));
- }
- }
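- #if 0
- /*
- * Minimal self-check sketch, illustrative only: the plain C routines are
- * kept in this file partly for unit testing (see the note above
- * sample_reconstruct8x8), so the SSE2 path can be compared against the
- * scalar one like this. check_inverse8x8 is a hypothetical helper, not
- * part of the decoder; the explicit __declspec(align(16)) is a precaution
- * in case the h264_short_8x8block_row_t typedef does not already guarantee
- * the 16-byte alignment required by the SSE2 loads.
- */
- static int check_inverse8x8(void)
- {
- __declspec(align(16)) h264_short_8x8block_row_t a[8], b[8];
- int i, j;
- for (i = 0; i < 8; i++)
- for (j = 0; j < 8; j++)
- a[i][j] = b[i][j] = (short)((((i * 8 + j) * 37) % 512) - 256);
- inverse8x8(a);
- inverse8x8_sse2(b);
- for (i = 0; i < 8; i++)
- for (j = 0; j < 8; j++)
- if (a[i][j] != b[i][j])
- return 0; /* mismatch between scalar and SSE2 results */
- return 1; /* scalar and SSE2 outputs agree */
- }
- #endif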