// CDSPHBUpsampler.inc
// Auto-generated by `genhbc`, do not edit!
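//
// Convolver bodies for the half-band upsampler, generated for tap counts
// 1..14. The R8BHBC1( name ) / R8BHBC2 macros are presumably defined by the
// header that includes this file and are assumed to open and close each
// convolver function; `flt`, `rp` and `op` are assumed to be the filter-tap,
// input and output pointers visible inside that function body. As the scalar
// branch at the end of this file shows, every convolveN evaluates the
// symmetric FIR sum
//
//     op[ 1 ] = sum over i = 0 .. N - 1 of flt[ i ] * ( rp[ i + 1 ] + rp[ -i ] )
//
// i.e. each tap is applied once to a mirrored pair of input samples.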
#if defined( R8B_SSE2 )
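
// SSE2 variants: the mirrored sample pairs are processed two taps at a time.
// Each step loads the pair above the centre ( rp + k ) and below it
// ( rp - k ), reverses the lower pair with _mm_shuffle_pd( v, v, 1 ) so the
// mirrored samples line up, adds the pairs, and multiplies by two taps loaded
// with _mm_load_pd(). Two accumulators ( s1, s3 ) are interleaved, apparently
// to shorten the dependency chain; _mm_storel_pd() writes the low lane of the
// horizontal sum to op[ 1 ]. Odd tap counts finish with one scalar term.
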
R8BHBC1( convolve1 )
    op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
R8BHBC2
R8BHBC1( convolve2 )
    __m128d v1, v2, m1, s1;
    v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = m1;
    _mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2
R8BHBC1( convolve3 )
    __m128d v1, v2, m1, s1;
    v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = m1;
    _mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
    op[ 1 ] += flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
R8BHBC2
R8BHBC1( convolve4 )
    __m128d v1, v2, m1, s1;
    v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = m1;
    __m128d v3, v4, m3, s3;
    v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = m3;
    s1 = _mm_add_pd( s1, s3 );
    _mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2
R8BHBC1( convolve5 )
    __m128d v1, v2, m1, s1;
    v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = m1;
    __m128d v3, v4, m3, s3;
    v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = m3;
    s1 = _mm_add_pd( s1, s3 );
    _mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
    op[ 1 ] += flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
R8BHBC2
R8BHBC1( convolve6 )
    __m128d v1, v2, m1, s1;
    v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = m1;
    __m128d v3, v4, m3, s3;
    v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = m3;
    v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = _mm_add_pd( s1, m1 );
    s1 = _mm_add_pd( s1, s3 );
    _mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2
R8BHBC1( convolve7 )
    __m128d v1, v2, m1, s1;
    v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = m1;
    __m128d v3, v4, m3, s3;
    v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = m3;
    v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = _mm_add_pd( s1, m1 );
    s1 = _mm_add_pd( s1, s3 );
    _mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
    op[ 1 ] += flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
R8BHBC2
R8BHBC1( convolve8 )
    __m128d v1, v2, m1, s1;
    v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = m1;
    __m128d v3, v4, m3, s3;
    v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = m3;
    v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = _mm_add_pd( s1, m1 );
    v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = _mm_add_pd( s3, m3 );
    s1 = _mm_add_pd( s1, s3 );
    _mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2
R8BHBC1( convolve9 )
    __m128d v1, v2, m1, s1;
    v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = m1;
    __m128d v3, v4, m3, s3;
    v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = m3;
    v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = _mm_add_pd( s1, m1 );
    v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = _mm_add_pd( s3, m3 );
    s1 = _mm_add_pd( s1, s3 );
    _mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
    op[ 1 ] += flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
R8BHBC2
R8BHBC1( convolve10 )
    __m128d v1, v2, m1, s1;
    v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = m1;
    __m128d v3, v4, m3, s3;
    v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = m3;
    v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = _mm_add_pd( s1, m1 );
    v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = _mm_add_pd( s3, m3 );
    v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = _mm_add_pd( s1, m1 );
    s1 = _mm_add_pd( s1, s3 );
    _mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2
R8BHBC1( convolve11 )
    __m128d v1, v2, m1, s1;
    v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = m1;
    __m128d v3, v4, m3, s3;
    v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = m3;
    v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = _mm_add_pd( s1, m1 );
    v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = _mm_add_pd( s3, m3 );
    v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = _mm_add_pd( s1, m1 );
    s1 = _mm_add_pd( s1, s3 );
    _mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
    op[ 1 ] += flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
R8BHBC2
R8BHBC1( convolve12 )
    __m128d v1, v2, m1, s1;
    v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = m1;
    __m128d v3, v4, m3, s3;
    v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = m3;
    v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = _mm_add_pd( s1, m1 );
    v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = _mm_add_pd( s3, m3 );
    v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = _mm_add_pd( s1, m1 );
    v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = _mm_add_pd( s3, m3 );
    s1 = _mm_add_pd( s1, s3 );
    _mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2
R8BHBC1( convolve13 )
    __m128d v1, v2, m1, s1;
    v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = m1;
    __m128d v3, v4, m3, s3;
    v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = m3;
    v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = _mm_add_pd( s1, m1 );
    v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = _mm_add_pd( s3, m3 );
    v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = _mm_add_pd( s1, m1 );
    v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = _mm_add_pd( s3, m3 );
    s1 = _mm_add_pd( s1, s3 );
    _mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
    op[ 1 ] += flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
R8BHBC2
R8BHBC1( convolve14 )
    __m128d v1, v2, m1, s1;
    v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = m1;
    __m128d v3, v4, m3, s3;
    v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = m3;
    v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = _mm_add_pd( s1, m1 );
    v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = _mm_add_pd( s3, m3 );
    v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = _mm_add_pd( s1, m1 );
    v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
    m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ),
        _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
    s3 = _mm_add_pd( s3, m3 );
    v2 = _mm_loadu_pd( rp - 13 ); v1 = _mm_loadu_pd( rp + 13 );
    m1 = _mm_mul_pd( _mm_load_pd( flt + 12 ),
        _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
    s1 = _mm_add_pd( s1, m1 );
    s1 = _mm_add_pd( s1, s3 );
    _mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2
#elif defined( R8B_NEON )
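
// AArch64 NEON variants, mirroring the SSE2 code: vld1q_f64() loads the
// mirrored sample pairs, vextq_f64( v, v, 1 ) swaps the two lanes of the
// lower pair so the mirrored samples line up, vmlaq_f64() multiply-accumulates
// the pair sums against two taps, and vaddvq_f64() reduces the accumulator to
// the scalar stored in op[ 1 ]. Odd tap counts again finish with one scalar
// term.
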
R8BHBC1( convolve1 )
    op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
R8BHBC2
R8BHBC1( convolve2 )
    float64x2_t v1, v2, s1;
    s1 = vdupq_n_f64( 0.0 );
    v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2
R8BHBC1( convolve3 )
    float64x2_t v1, v2, s1;
    s1 = vdupq_n_f64( 0.0 );
    v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    op[ 1 ] = vaddvq_f64( s1 ) + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
R8BHBC2
R8BHBC1( convolve4 )
    float64x2_t v1, v2, s1;
    s1 = vdupq_n_f64( 0.0 );
    v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    float64x2_t v3, v4, s3;
    s3 = vdupq_n_f64( 0.0 );
    v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    s1 = vaddq_f64( s1, s3 );
    op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2
R8BHBC1( convolve5 )
    float64x2_t v1, v2, s1;
    s1 = vdupq_n_f64( 0.0 );
    v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    float64x2_t v3, v4, s3;
    s3 = vdupq_n_f64( 0.0 );
    v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    s1 = vaddq_f64( s1, s3 );
    op[ 1 ] = vaddvq_f64( s1 ) + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
R8BHBC2
R8BHBC1( convolve6 )
    float64x2_t v1, v2, s1;
    s1 = vdupq_n_f64( 0.0 );
    v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    float64x2_t v3, v4, s3;
    s3 = vdupq_n_f64( 0.0 );
    v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    s1 = vaddq_f64( s1, s3 );
    op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2
R8BHBC1( convolve7 )
    float64x2_t v1, v2, s1;
    s1 = vdupq_n_f64( 0.0 );
    v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    float64x2_t v3, v4, s3;
    s3 = vdupq_n_f64( 0.0 );
    v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    s1 = vaddq_f64( s1, s3 );
    op[ 1 ] = vaddvq_f64( s1 ) + flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
R8BHBC2
R8BHBC1( convolve8 )
    float64x2_t v1, v2, s1;
    s1 = vdupq_n_f64( 0.0 );
    v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    float64x2_t v3, v4, s3;
    s3 = vdupq_n_f64( 0.0 );
    v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    s1 = vaddq_f64( s1, s3 );
    op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2
R8BHBC1( convolve9 )
    float64x2_t v1, v2, s1;
    s1 = vdupq_n_f64( 0.0 );
    v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    float64x2_t v3, v4, s3;
    s3 = vdupq_n_f64( 0.0 );
    v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    s1 = vaddq_f64( s1, s3 );
    op[ 1 ] = vaddvq_f64( s1 ) + flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
R8BHBC2
R8BHBC1( convolve10 )
    float64x2_t v1, v2, s1;
    s1 = vdupq_n_f64( 0.0 );
    v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    float64x2_t v3, v4, s3;
    s3 = vdupq_n_f64( 0.0 );
    v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    s1 = vaddq_f64( s1, s3 );
    op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2
R8BHBC1( convolve11 )
    float64x2_t v1, v2, s1;
    s1 = vdupq_n_f64( 0.0 );
    v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    float64x2_t v3, v4, s3;
    s3 = vdupq_n_f64( 0.0 );
    v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    s1 = vaddq_f64( s1, s3 );
    op[ 1 ] = vaddvq_f64( s1 ) + flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
R8BHBC2
R8BHBC1( convolve12 )
    float64x2_t v1, v2, s1;
    s1 = vdupq_n_f64( 0.0 );
    v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    float64x2_t v3, v4, s3;
    s3 = vdupq_n_f64( 0.0 );
    v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    v4 = vld1q_f64( rp - 11 ); v3 = vld1q_f64( rp + 11 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 10 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    s1 = vaddq_f64( s1, s3 );
    op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2
R8BHBC1( convolve13 )
    float64x2_t v1, v2, s1;
    s1 = vdupq_n_f64( 0.0 );
    v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    float64x2_t v3, v4, s3;
    s3 = vdupq_n_f64( 0.0 );
    v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    v4 = vld1q_f64( rp - 11 ); v3 = vld1q_f64( rp + 11 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 10 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    s1 = vaddq_f64( s1, s3 );
    op[ 1 ] = vaddvq_f64( s1 ) + flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
R8BHBC2
R8BHBC1( convolve14 )
    float64x2_t v1, v2, s1;
    s1 = vdupq_n_f64( 0.0 );
    v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    float64x2_t v3, v4, s3;
    s3 = vdupq_n_f64( 0.0 );
    v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    v4 = vld1q_f64( rp - 11 ); v3 = vld1q_f64( rp + 11 );
    s3 = vmlaq_f64( s3, vld1q_f64( flt + 10 ),
        vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
    v2 = vld1q_f64( rp - 13 ); v1 = vld1q_f64( rp + 13 );
    s1 = vmlaq_f64( s1, vld1q_f64( flt + 12 ),
        vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
    s1 = vaddq_f64( s1, s3 );
    op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2
#else // SIMD
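
// Plain C++ fallback: the same symmetric sum written out directly, one
// mirrored pair of input samples per filter tap. This branch doubles as the
// reference formula for the SIMD variants above.
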
R8BHBC1( convolve1 )
    op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
R8BHBC2
R8BHBC1( convolve2 )
    op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
        + flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ]);
R8BHBC2
R8BHBC1( convolve3 )
    op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
        + flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
        + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
R8BHBC2
R8BHBC1( convolve4 )
    op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
        + flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
        + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
        + flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ]);
R8BHBC2
R8BHBC1( convolve5 )
    op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
        + flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
        + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
        + flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
        + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
R8BHBC2
R8BHBC1( convolve6 )
    op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
        + flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
        + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
        + flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
        + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
        + flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ]);
R8BHBC2
R8BHBC1( convolve7 )
    op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
        + flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
        + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
        + flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
        + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
        + flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
        + flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
R8BHBC2
R8BHBC1( convolve8 )
    op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
        + flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
        + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
        + flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
        + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
        + flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
        + flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
        + flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ]);
R8BHBC2
R8BHBC1( convolve9 )
    op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
        + flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
        + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
        + flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
        + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
        + flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
        + flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
        + flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
        + flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
R8BHBC2
R8BHBC1( convolve10 )
    op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
        + flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
        + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
        + flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
        + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
        + flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
        + flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
        + flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
        + flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
        + flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ]);
R8BHBC2
R8BHBC1( convolve11 )
    op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
        + flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
        + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
        + flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
        + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
        + flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
        + flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
        + flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
        + flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
        + flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
        + flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
R8BHBC2
R8BHBC1( convolve12 )
    op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
        + flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
        + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
        + flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
        + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
        + flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
        + flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
        + flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
        + flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
        + flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
        + flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
        + flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ]);
R8BHBC2
R8BHBC1( convolve13 )
    op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
        + flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
        + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
        + flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
        + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
        + flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
        + flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
        + flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
        + flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
        + flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
        + flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
        + flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ])
        + flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
R8BHBC2
R8BHBC1( convolve14 )
    op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
        + flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
        + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
        + flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
        + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
        + flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
        + flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
        + flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
        + flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
        + flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
        + flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
        + flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ])
        + flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ])
        + flt[ 13 ] * ( rp[ 14 ] + rp[ -13 ]);
R8BHBC2
#endif // SIMD