// Auto-generated by `genhbc`, do not edit!

#if defined( R8B_SSE2 )
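
// SSE2 half-band convolvers. Each convolveN applies N symmetric filter taps:
// a pair of taps is handled as one __m128d vector, with _mm_shuffle_pd()
// reversing the negative-side sample pair so that symmetric samples line up
// before the add and the multiply. Two independent accumulators (s1, s3) are
// interleaved, an odd tap count is finished with a scalar tail, and
// _mm_storel_pd() writes the horizontal sum of the accumulator to op[ 1 ].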

R8BHBC1( convolve1 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
R8BHBC2

R8BHBC1( convolve2 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve3 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
R8BHBC2

R8BHBC1( convolve4 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve5 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
R8BHBC2

R8BHBC1( convolve6 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve7 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
R8BHBC2

R8BHBC1( convolve8 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve9 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
R8BHBC2

R8BHBC1( convolve10 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve11 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
R8BHBC2

R8BHBC1( convolve12 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve13 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
R8BHBC2

R8BHBC1( convolve14 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ), _mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 13 ); v1 = _mm_loadu_pd( rp + 13 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 12 ), _mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

#elif defined( R8B_NEON )
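
// AArch64 NEON variants of the same functions: vmlaq_f64() performs the
// per-pair multiply-accumulate, vextq_f64() reverses the negative-side
// sample pair, and vaddvq_f64() reduces the two accumulator lanes to the
// scalar written to op[ 1 ].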

R8BHBC1( convolve1 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
R8BHBC2

R8BHBC1( convolve2 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve3 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
R8BHBC2

R8BHBC1( convolve4 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve5 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
R8BHBC2

R8BHBC1( convolve6 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve7 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
R8BHBC2

R8BHBC1( convolve8 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve9 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
R8BHBC2

R8BHBC1( convolve10 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve11 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
R8BHBC2

R8BHBC1( convolve12 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 11 ); v3 = vld1q_f64( rp + 11 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 10 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve13 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 11 ); v3 = vld1q_f64( rp + 11 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 10 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
R8BHBC2

R8BHBC1( convolve14 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 11 ); v3 = vld1q_f64( rp + 11 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 10 ), vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 13 ); v1 = vld1q_f64( rp + 13 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 12 ), vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

#else // SIMD
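
// Generic fallback: each convolveN spells out the symmetric half-band tap
// sum directly in scalar code.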

R8BHBC1( convolve1 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
R8BHBC2

R8BHBC1( convolve2 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ]);
R8BHBC2

R8BHBC1( convolve3 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
R8BHBC2

R8BHBC1( convolve4 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ]);
R8BHBC2

R8BHBC1( convolve5 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
R8BHBC2

R8BHBC1( convolve6 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ]);
R8BHBC2

R8BHBC1( convolve7 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
R8BHBC2

R8BHBC1( convolve8 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ]);
R8BHBC2

R8BHBC1( convolve9 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
R8BHBC2

R8BHBC1( convolve10 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
		+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ]);
R8BHBC2

R8BHBC1( convolve11 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
		+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
		+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
R8BHBC2

R8BHBC1( convolve12 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
		+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
		+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
		+ flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ]);
R8BHBC2

R8BHBC1( convolve13 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
		+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
		+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
		+ flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ])
		+ flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
R8BHBC2

R8BHBC1( convolve14 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
		+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
		+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
		+ flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ])
		+ flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ])
		+ flt[ 13 ] * ( rp[ 14 ] + rp[ -13 ]);
R8BHBC2

#endif // SIMD
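
// Reference model (illustrative only, not part of the generated output): the
// R8BHBC1/R8BHBC2 macros are assumed to be defined by the file that includes
// this one, supplying the surrounding function definition and the `op`, `rp`
// and `flt` pointers used above. Under that assumption, each convolveN body,
// in every branch, computes the same symmetric half-band sum that the scalar
// branch spells out explicitly; a plain-C sketch of convolveN is:
//
//	double s = 0.0;
//	int i;
//
//	for( i = 0; i < N; i++ )
//	{
//		s += flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]);
//	}
//
//	op[ 1 ] = s;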