rs16.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421
  1. #include "rar.hpp"
  2. // We used "Screaming Fast Galois Field Arithmetic Using Intel SIMD
  3. // Instructions" paper by James S. Plank, Kevin M. Greenan
  4. // and Ethan L. Miller for fast SSE based multiplication.
  5. // Also we are grateful to Artem Drobanov and Bulat Ziganshin
  6. // for samples and ideas allowed to make Reed-Solomon codec more efficient.
  7. RSCoder16::RSCoder16()
  8. {
  9. Decoding=false;
  10. ND=NR=NE=0;
  11. ValidFlags=NULL;
  12. MX=NULL;
  13. DataLog=NULL;
  14. DataLogSize=0;
  15. gfInit();
  16. }
  17. RSCoder16::~RSCoder16()
  18. {
  19. delete[] gfExp;
  20. delete[] gfLog;
  21. delete[] DataLog;
  22. delete[] MX;
  23. delete[] ValidFlags;
  24. }
  25. // Initialize logarithms and exponents Galois field tables.
  26. void RSCoder16::gfInit()
  27. {
  28. gfExp=new uint[4*gfSize+1];
  29. gfLog=new uint[gfSize+1];
  30. for (uint L=0,E=1;L<gfSize;L++)
  31. {
  32. gfLog[E]=L;
  33. gfExp[L]=E;
  34. gfExp[L+gfSize]=E; // Duplicate the table to avoid gfExp overflow check.
  35. E<<=1;
  36. if (E>gfSize)
  37. E^=0x1100B; // Irreducible field-generator polynomial.
  38. }
  39. // log(0)+log(x) must be outside of usual log table, so we can set it
  40. // to 0 and avoid check for 0 in multiplication parameters.
  41. gfLog[0]= 2*gfSize;
  42. for (uint I=2*gfSize;I<=4*gfSize;I++) // Results for log(0)+log(x).
  43. gfExp[I]=0;
  44. }
  45. uint RSCoder16::gfAdd(uint a,uint b) // Addition in Galois field.
  46. {
  47. return a^b;
  48. }
  49. uint RSCoder16::gfMul(uint a,uint b) // Multiplication in Galois field.
  50. {
  51. return gfExp[gfLog[a]+gfLog[b]];
  52. }
  53. uint RSCoder16::gfInv(uint a) // Inverse element in Galois field.
  54. {
  55. return a==0 ? 0:gfExp[gfSize-gfLog[a]];
  56. }
  57. bool RSCoder16::Init(uint DataCount, uint RecCount, bool *ValidityFlags)
  58. {
  59. ND = DataCount;
  60. NR = RecCount;
  61. NE = 0;
  62. Decoding=ValidityFlags!=NULL;
  63. if (Decoding)
  64. {
  65. delete[] ValidFlags;
  66. ValidFlags=new bool[ND + NR];
  67. for (uint I = 0; I < ND + NR; I++)
  68. ValidFlags[I]=ValidityFlags[I];
  69. for (uint I = 0; I < ND; I++)
  70. if (!ValidFlags[I])
  71. NE++;
  72. uint ValidECC=0;
  73. for (uint I = ND; I < ND + NR; I++)
  74. if (ValidFlags[I])
  75. ValidECC++;
  76. if (NE > ValidECC || NE == 0 || ValidECC == 0)
  77. return false;
  78. }
  79. // 2021.09.01 - we allowed RR and REV >100%, so no more NR > ND check.
  80. if (ND + NR > gfSize || /*NR > ND ||*/ ND == 0 || NR == 0)
  81. return false;
  82. delete[] MX;
  83. if (Decoding)
  84. {
  85. MX=new uint[NE * ND];
  86. MakeDecoderMatrix();
  87. InvertDecoderMatrix();
  88. }
  89. else
  90. {
  91. MX=new uint[NR * ND];
  92. MakeEncoderMatrix();
  93. }
  94. return true;
  95. }
  96. void RSCoder16::MakeEncoderMatrix()
  97. {
  98. // Create Cauchy encoder generator matrix. Skip trivial "1" diagonal rows,
  99. // which would just copy source data to destination.
  100. for (uint I = 0; I < NR; I++)
  101. for (uint J = 0; J < ND; J++)
  102. MX[I * ND + J] = gfInv( gfAdd( (I+ND), J) );
  103. }
  104. void RSCoder16::MakeDecoderMatrix()
  105. {
  106. // Create Cauchy decoder matrix. Skip trivial rows matching valid data
  107. // units and containing "1" on main diagonal. Such rows would just copy
  108. // source data to destination and they have no real value for us.
  109. // Include rows only for broken data units and replace them by first
  110. // available valid recovery code rows.
  111. for (uint Flag=0, R=ND, Dest=0; Flag < ND; Flag++)
  112. if (!ValidFlags[Flag]) // For every broken data unit.
  113. {
  114. while (!ValidFlags[R]) // Find a valid recovery unit.
  115. R++;
  116. for (uint J = 0; J < ND; J++) // And place its row to matrix.
  117. MX[Dest*ND + J] = gfInv( gfAdd(R,J) );
  118. Dest++;
  119. R++;
  120. }
  121. }
  122. // Apply Gauss–Jordan elimination to find inverse of decoder matrix.
  123. // We have the square NDxND matrix, but we do not store its trivial
  124. // diagonal "1" rows matching valid data, so we work with NExND matrix.
  125. // Our original Cauchy matrix does not contain 0, so we skip search
  126. // for non-zero pivot.
  127. void RSCoder16::InvertDecoderMatrix()
  128. {
  129. uint *MI=new uint[NE * ND]; // We'll create inverse matrix here.
  130. memset(MI, 0, ND * NE * sizeof(*MI)); // Initialize to identity matrix.
  131. for (uint Kr = 0, Kf = 0; Kr < NE; Kr++, Kf++)
  132. {
  133. while (ValidFlags[Kf]) // Skip trivial rows.
  134. Kf++;
  135. MI[Kr * ND + Kf] = 1; // Set diagonal 1.
  136. }
  137. // Kr is the number of row in our actual reduced NE x ND matrix,
  138. // which does not contain trivial diagonal 1 rows.
  139. // Kf is the number of row in full ND x ND matrix with all trivial rows
  140. // included.
  141. for (uint Kr = 0, Kf = 0; Kf < ND; Kr++, Kf++) // Select pivot row.
  142. {
  143. while (ValidFlags[Kf] && Kf < ND)
  144. {
  145. // Here we process trivial diagonal 1 rows matching valid data units.
  146. // Their processing can be simplified comparing to usual rows.
  147. // In full version of elimination we would set MX[I * ND + Kf] to zero
  148. // after MI[..]^=, but we do not need it for matrix inversion.
  149. for (uint I = 0; I < NE; I++)
  150. MI[I * ND + Kf] ^= MX[I * ND + Kf];
  151. Kf++;
  152. }
  153. if (Kf == ND)
  154. break;
  155. uint *MXk = MX + Kr * ND; // k-th row of main matrix.
  156. uint *MIk = MI + Kr * ND; // k-th row of inversion matrix.
  157. uint PInv = gfInv( MXk[Kf] ); // Pivot inverse.
  158. // Divide the pivot row by pivot, so pivot cell contains 1.
  159. for (uint I = 0; I < ND; I++)
  160. {
  161. MXk[I] = gfMul( MXk[I], PInv );
  162. MIk[I] = gfMul( MIk[I], PInv );
  163. }
  164. for (uint I = 0; I < NE; I++)
  165. if (I != Kr) // For all rows except containing the pivot cell.
  166. {
  167. // Apply Gaussian elimination Mij -= Mkj * Mik / pivot.
  168. // Since pivot is already 1, it is reduced to Mij -= Mkj * Mik.
  169. uint *MXi = MX + I * ND; // i-th row of main matrix.
  170. uint *MIi = MI + I * ND; // i-th row of inversion matrix.
  171. uint Mik = MXi[Kf]; // Cell in pivot position.
  172. for (uint J = 0; J < ND; J++)
  173. {
  174. MXi[J] ^= gfMul(MXk[J] , Mik);
  175. MIi[J] ^= gfMul(MIk[J] , Mik);
  176. }
  177. }
  178. }
  179. // Copy data to main matrix.
  180. for (uint I = 0; I < NE * ND; I++)
  181. MX[I] = MI[I];
  182. delete[] MI;
  183. }
  184. #if 0
  185. // Multiply matrix to data vector. When encoding, it contains data in Data
  186. // and stores error correction codes in Out. When decoding it contains
  187. // broken data followed by ECC in Data and stores recovered data to Out.
  188. // We do not use this function now, everything is moved to UpdateECC.
  189. void RSCoder16::Process(const uint *Data, uint *Out)
  190. {
  191. uint ProcData[gfSize];
  192. for (uint I = 0; I < ND; I++)
  193. ProcData[I]=Data[I];
  194. if (Decoding)
  195. {
  196. // Replace broken data units with first available valid recovery codes.
  197. // 'Data' array must contain recovery codes after data.
  198. for (uint I=0, R=ND, Dest=0; I < ND; I++)
  199. if (!ValidFlags[I]) // For every broken data unit.
  200. {
  201. while (!ValidFlags[R]) // Find a valid recovery unit.
  202. R++;
  203. ProcData[I]=Data[R];
  204. R++;
  205. }
  206. }
  207. uint H=Decoding ? NE : NR;
  208. for (uint I = 0; I < H; I++)
  209. {
  210. uint R = 0; // Result of matrix row multiplication to data.
  211. uint *MXi=MX + I * ND;
  212. for (uint J = 0; J < ND; J++)
  213. R ^= gfMul(MXi[J], ProcData[J]);
  214. Out[I] = R;
  215. }
  216. }
  217. #endif
  218. // We update ECC in blocks by applying every data block to all ECC blocks.
  219. // This function applies one data block to one ECC block.
  220. void RSCoder16::UpdateECC(uint DataNum, uint ECCNum, const byte *Data, byte *ECC, size_t BlockSize)
  221. {
  222. if (DataNum==0) // Init ECC data.
  223. memset(ECC, 0, BlockSize);
  224. bool DirectAccess;
  225. #ifdef LITTLE_ENDIAN
  226. // We can access data and ECC directly if we have little endian 16 bit uint.
  227. DirectAccess=sizeof(ushort)==2;
  228. #else
  229. DirectAccess=false;
  230. #endif
  231. #ifdef USE_SSE
  232. if (DirectAccess && SSE_UpdateECC(DataNum,ECCNum,Data,ECC,BlockSize))
  233. return;
  234. #endif
  235. if (ECCNum==0)
  236. {
  237. if (DataLogSize!=BlockSize)
  238. {
  239. delete[] DataLog;
  240. DataLog=new uint[BlockSize];
  241. DataLogSize=BlockSize;
  242. }
  243. if (DirectAccess)
  244. for (size_t I=0; I<BlockSize; I+=2)
  245. DataLog[I] = gfLog[ *(ushort*)(Data+I) ];
  246. else
  247. for (size_t I=0; I<BlockSize; I+=2)
  248. {
  249. uint D=Data[I]+Data[I+1]*256;
  250. DataLog[I] = gfLog[ D ];
  251. }
  252. }
  253. uint ML = gfLog[ MX[ECCNum * ND + DataNum] ];
  254. if (DirectAccess)
  255. for (size_t I=0; I<BlockSize; I+=2)
  256. *(ushort*)(ECC+I) ^= gfExp[ ML + DataLog[I] ];
  257. else
  258. for (size_t I=0; I<BlockSize; I+=2)
  259. {
  260. uint R=gfExp[ ML + DataLog[I] ];
  261. ECC[I]^=byte(R);
  262. ECC[I+1]^=byte(R/256);
  263. }
  264. }
  265. #ifdef USE_SSE
  266. // Data and ECC addresses must be properly aligned for SSE.
  267. // AVX2 did not provide a noticeable speed gain on i7-6700K here.
  268. bool RSCoder16::SSE_UpdateECC(uint DataNum, uint ECCNum, const byte *Data, byte *ECC, size_t BlockSize)
  269. {
  270. // Check data alignment and SSSE3 support.
  271. if ((size_t(Data) & (SSE_ALIGNMENT-1))!=0 || (size_t(ECC) & (SSE_ALIGNMENT-1))!=0 ||
  272. _SSE_Version<SSE_SSSE3)
  273. return false;
  274. uint M=MX[ECCNum * ND + DataNum];
  275. // Prepare tables containing products of M and 4, 8, 12, 16 bit length
  276. // numbers, which have 4 high bits in 0..15 range and other bits set to 0.
  277. // Store high and low bytes of resulting 16 bit product in separate tables.
  278. __m128i T0L,T1L,T2L,T3L; // Low byte tables.
  279. __m128i T0H,T1H,T2H,T3H; // High byte tables.
  280. for (uint I=0;I<16;I++)
  281. {
  282. ((byte *)&T0L)[I]=gfMul(I,M);
  283. ((byte *)&T0H)[I]=gfMul(I,M)>>8;
  284. ((byte *)&T1L)[I]=gfMul(I<<4,M);
  285. ((byte *)&T1H)[I]=gfMul(I<<4,M)>>8;
  286. ((byte *)&T2L)[I]=gfMul(I<<8,M);
  287. ((byte *)&T2H)[I]=gfMul(I<<8,M)>>8;
  288. ((byte *)&T3L)[I]=gfMul(I<<12,M);
  289. ((byte *)&T3H)[I]=gfMul(I<<12,M)>>8;
  290. }
  291. size_t Pos=0;
  292. __m128i LowByteMask=_mm_set1_epi16(0xff); // 00ff00ff...00ff
  293. __m128i Low4Mask=_mm_set1_epi8(0xf); // 0f0f0f0f...0f0f
  294. __m128i High4Mask=_mm_slli_epi16(Low4Mask,4); // f0f0f0f0...f0f0
  295. for (; Pos+2*sizeof(__m128i)<=BlockSize; Pos+=2*sizeof(__m128i))
  296. {
  297. // We process two 128 bit chunks of source data at once.
  298. __m128i *D=(__m128i *)(Data+Pos);
  299. // Place high bytes of both chunks to one variable and low bytes to
  300. // another, so we can use the table lookup multiplication for 16 values
  301. // 4 bit length each at once.
  302. __m128i HighBytes0=_mm_srli_epi16(D[0],8);
  303. __m128i LowBytes0=_mm_and_si128(D[0],LowByteMask);
  304. __m128i HighBytes1=_mm_srli_epi16(D[1],8);
  305. __m128i LowBytes1=_mm_and_si128(D[1],LowByteMask);
  306. __m128i HighBytes=_mm_packus_epi16(HighBytes0,HighBytes1);
  307. __m128i LowBytes=_mm_packus_epi16(LowBytes0,LowBytes1);
  308. // Multiply bits 0..3 of low bytes. Store low and high product bytes
  309. // separately in cumulative sum variables.
  310. __m128i LowBytesLow4=_mm_and_si128(LowBytes,Low4Mask);
  311. __m128i LowBytesMultSum=_mm_shuffle_epi8(T0L,LowBytesLow4);
  312. __m128i HighBytesMultSum=_mm_shuffle_epi8(T0H,LowBytesLow4);
  313. // Multiply bits 4..7 of low bytes. Store low and high product bytes separately.
  314. __m128i LowBytesHigh4=_mm_and_si128(LowBytes,High4Mask);
  315. LowBytesHigh4=_mm_srli_epi16(LowBytesHigh4,4);
  316. __m128i LowBytesHigh4MultLow=_mm_shuffle_epi8(T1L,LowBytesHigh4);
  317. __m128i LowBytesHigh4MultHigh=_mm_shuffle_epi8(T1H,LowBytesHigh4);
  318. // Add new product to existing sum, low and high bytes separately.
  319. LowBytesMultSum=_mm_xor_si128(LowBytesMultSum,LowBytesHigh4MultLow);
  320. HighBytesMultSum=_mm_xor_si128(HighBytesMultSum,LowBytesHigh4MultHigh);
  321. // Multiply bits 0..3 of high bytes. Store low and high product bytes separately.
  322. __m128i HighBytesLow4=_mm_and_si128(HighBytes,Low4Mask);
  323. __m128i HighBytesLow4MultLow=_mm_shuffle_epi8(T2L,HighBytesLow4);
  324. __m128i HighBytesLow4MultHigh=_mm_shuffle_epi8(T2H,HighBytesLow4);
  325. // Add new product to existing sum, low and high bytes separately.
  326. LowBytesMultSum=_mm_xor_si128(LowBytesMultSum,HighBytesLow4MultLow);
  327. HighBytesMultSum=_mm_xor_si128(HighBytesMultSum,HighBytesLow4MultHigh);
  328. // Multiply bits 4..7 of high bytes. Store low and high product bytes separately.
  329. __m128i HighBytesHigh4=_mm_and_si128(HighBytes,High4Mask);
  330. HighBytesHigh4=_mm_srli_epi16(HighBytesHigh4,4);
  331. __m128i HighBytesHigh4MultLow=_mm_shuffle_epi8(T3L,HighBytesHigh4);
  332. __m128i HighBytesHigh4MultHigh=_mm_shuffle_epi8(T3H,HighBytesHigh4);
  333. // Add new product to existing sum, low and high bytes separately.
  334. LowBytesMultSum=_mm_xor_si128(LowBytesMultSum,HighBytesHigh4MultLow);
  335. HighBytesMultSum=_mm_xor_si128(HighBytesMultSum,HighBytesHigh4MultHigh);
  336. // Combine separate low and high cumulative sum bytes to 16-bit words.
  337. __m128i HighBytesHigh4Mult0=_mm_unpacklo_epi8(LowBytesMultSum,HighBytesMultSum);
  338. __m128i HighBytesHigh4Mult1=_mm_unpackhi_epi8(LowBytesMultSum,HighBytesMultSum);
  339. // Add result to ECC.
  340. __m128i *StoreECC=(__m128i *)(ECC+Pos);
  341. StoreECC[0]=_mm_xor_si128(StoreECC[0],HighBytesHigh4Mult0);
  342. StoreECC[1]=_mm_xor_si128(StoreECC[1],HighBytesHigh4Mult1);
  343. }
  344. // If we have non 128 bit aligned data in the end of block, process them
  345. // in a usual way. We cannot do the same in the beginning of block,
  346. // because Data and ECC can have different alignment offsets.
  347. for (; Pos<BlockSize; Pos+=2)
  348. *(ushort*)(ECC+Pos) ^= gfMul( M, *(ushort*)(Data+Pos) );
  349. return true;
  350. }
  351. #endif