diff --git a/lib_com/basop_util.c b/lib_com/basop_util.c index 1dc21c5f1129c6dd2ced14614acff4d7e3e2e543..6ecc23bee1debab7ccdd69defbf5e716030d802a 100644 --- a/lib_com/basop_util.c +++ b/lib_com/basop_util.c @@ -988,7 +988,6 @@ Word32 div_w( Word32 L_num, Word32 L_den ) } -static Word32 div_w_newton( Word32 num, Word32 den ); /* Table of 256 precalculated estimates to be used by the "div_w_newton" function using the Newton/Raphson method. @@ -1283,7 +1282,7 @@ static Word32 L_dmult( Word32 L_var1, Word32 L_var2 ) * BASOP weights: 24 (incl. L_dmult) */ -static Word32 div_w_newton( Word32 num, Word32 den ) +Word32 div_w_newton( Word32 num, Word32 den ) { Word32 x0, x1, x2, x3, diff, result; diff --git a/lib_com/basop_util.h b/lib_com/basop_util.h index cbd7838655e95b97380107cd4144c756ad1f2b3e..b016690d6c6ed3d8f1fecd87596f37b53f05552b 100644 --- a/lib_com/basop_util.h +++ b/lib_com/basop_util.h @@ -328,6 +328,9 @@ Word16 BASOP_Util_Divide3232_Scale( Word32 x, /*!< i : Numerator*/ Word32 y, /*!< i : Denominator*/ Word16 *s ); /*!< o : Additional scalefactor difference*/ +Word32 div_w_newton( Word32 num, /*!< i : Numerator*/ + Word32 den ); /*!< i : Denominator*/ + Word32 BASOP_Util_Divide3232_Scale_newton( Word32 x, /*!< i : Numerator*/ Word32 y, /*!< i : Denominator*/ Word16 *s ); /*!< o : Additional scalefactor difference*/ diff --git a/lib_com/options.h b/lib_com/options.h index 79b51a0572ba05320040640b300360a88cd4ca83..b73b5ee8a07ae4d0e864f861de938e225384bca1 100644 --- a/lib_com/options.h +++ b/lib_com/options.h @@ -109,6 +109,7 @@ #define OPT_2182_MATRIX_SCALE_OPS /* Dolby: Issue 2181, move matrix scale operations outside mul operations. */ #define OPT_2185_MATRIX_OUT_SCALING /* Dolby: Issue 2185, optimize matrix-mul output-format. */ #define OPT_2239_IVAS_FILTER_PROCESS /* Dolby: Issue 2239, optimize ivas_filter_process_fx. */ +#define NONBE_OPT_2193_EIG2X2 /* Dolby: Issue 2193, optimize eig2x2_fx. 
*/ /* #################### End BASOP optimization switches ############################ */ diff --git a/lib_rend/ivas_dirac_dec_binaural_functions_fx.c b/lib_rend/ivas_dirac_dec_binaural_functions_fx.c index 6571237797ca9cd45d68600130f85b8fce606ab4..a10f9d5ea5ff6d7734561d85e1add5850b334db3 100644 --- a/lib_rend/ivas_dirac_dec_binaural_functions_fx.c +++ b/lib_rend/ivas_dirac_dec_binaural_functions_fx.c @@ -3482,6 +3482,18 @@ static void ivas_dirac_dec_binaural_check_and_switch_transports_headtracked_fx( return; } +#ifdef NONBE_OPT_2193_EIG2X2 +static Word32 eig2x2_div_fx( Word32 num, Word32 den ); +/* Returns num unchanged when den == 0x40000000, otherwise the result of div_w_newton( num, den ). */ +static Word32 eig2x2_div_fx( Word32 num, Word32 den ) +{ + IF( EQ_32( den, 0x40000000 ) ) + { + return num; + } + return div_w_newton( num, den ); +} +#endif /* NONBE_OPT_2193_EIG2X2 */ static void eig2x2_fx( const Word32 E1_fx, /*q_E*/ @@ -3496,6 +3508,319 @@ static void eig2x2_fx( Word32 D_fx[BINAURAL_CHANNELS], /*q_D*/ Word16 *q_D ) { +#ifdef NONBE_OPT_2193_EIG2X2 /* Issue 2193 code path */ + Word32 pm_fx, add_fx; + Word32 tmp1, tmp2, e1, e2, c_re, c_im, c0_im, c1_im; + Word32 s0_fx, s1_fx, nval0_fx, nval1_fx; + Word64 crossSquare_fx, tmp3, tmp4; + Word16 q_crossSquare, q_min, q_diff, q_tmp1, q_tmp2, exp, q_e, q_c; + Word16 nval0_q, nval1_q; + Word32 i01, i00, i11, i10; + Word64 eps_fx = ( (Word64) EPSILON_MANT ) << 32; + Word16 eps_q = 63 - EPSILON_EXP; + move64(); /* eps_fx is a Word64 initialisation */ + move16(); + + set32_fx( (Word32 *) Ure_fx, 0, BINAURAL_CHANNELS * BINAURAL_CHANNELS ); + set32_fx( (Word32 *) Uim_fx, 0, BINAURAL_CHANNELS * BINAURAL_CHANNELS ); + + exp = sub( get_min_scalefactor( Cre_fx, Cim_fx ), 2 ); + c_re = L_shl( Cre_fx, exp ); + c_im = L_shl( Cim_fx, exp ); + q_c = add( q_C, exp ); + + exp = sub( get_min_scalefactor( E1_fx, E2_fx ), 2 ); + e1 = L_shl( E1_fx, exp ); + e2 = L_shl( E2_fx, exp ); + q_e = add( q_E, exp ); + + // crossSquare_fx = (c_re * c_re) + (c_im * c_im) + // a_fx = (e1 + e2) * (e1 + e2) - 4.0f * ((e1 * e2) - crossSquare_fx) = (e1 - e2)^2 + 4 * crossSquare_fx + // pm_fx = 0.5f * sqrtf(max(0.0f, a_fx)) + // add_fx = 
0.5f * (e1 + e2) + + tmp1 = L_sub( e1, e2 ); + tmp3 = W_mult_32_32( tmp1, tmp1 ); + q_tmp1 = add( add( q_e, q_e ), 1 ); + if ( !tmp3 ) + { + q_tmp1 = 63; + move16(); + } + + crossSquare_fx = W_mac_32_32( W_mult_32_32( c_re, c_re ), c_im, c_im ); + q_crossSquare = add( add( q_c, q_c ), 1 ); + if ( !crossSquare_fx ) + { + q_crossSquare = 63; + move16(); + } + + tmp4 = crossSquare_fx; + move64(); + q_tmp2 = sub( q_crossSquare, 2 ); + if ( !tmp4 ) + { + q_tmp2 = 63; + move16(); + } + + q_diff = sub( q_tmp1, q_tmp2 ); + q_tmp1 = s_min( q_tmp1, q_tmp2 ); + if ( q_diff > 0 ) + { + tmp3 = W_shr( tmp3, q_diff ); + } + if ( q_diff < 0 ) + { + tmp4 = W_shl( tmp4, q_diff ); + } + tmp3 = W_add( tmp3, tmp4 ); + q_diff = W_norm( tmp3 ); + tmp3 = W_shl( tmp3, q_diff ); + q_tmp1 = add( q_tmp1, q_diff ); + + // pm_fx = 0.5f * sqrtf(max(0.0f, a_fx)) + exp = sub( 63, q_tmp1 ); + pm_fx = Sqrt32( L_max( 0, W_extract_h( tmp3 ) ), &exp ); + pm_fx = L_shr( pm_fx, 1 ); + q_tmp2 = sub( 31, exp ); + + // add_fx = 0.5 * (e1 + e2) + add_fx = L_shr( L_add( e1, e2 ), 1 ); + q_tmp1 = q_e; + move16(); + + // D[0] = add + pm; + // D[1] = max( 0.0f, add - pm ); + + q_diff = sub( q_tmp1, q_tmp2 ); + + tmp1 = add_fx; + move32(); + if ( q_diff > 0 ) + { + tmp1 = L_shr( tmp1, q_diff ); + } + + tmp2 = pm_fx; + move32(); + if ( q_diff < 0 ) + { + tmp2 = L_shl( tmp2, q_diff ); + } + + D_fx[0] = L_add( tmp1, tmp2 ); + move32(); + D_fx[1] = L_max( L_sub( tmp1, tmp2 ), 0 ); + move32(); + *q_D = s_min( q_tmp1, q_tmp2 ); + move16(); /* *q_D is a Word16 store */ + + // Numeric case, when input is practically zeros + // if ( D_fx[0] < EPSILON_FX ) + + IF( LT_32( L_shl_sat( D_fx[0], sub( 31 - EPSILON_EXP, *q_D ) ), EPSILON_MANT ) ) + { + Ure_fx[0][0] = ONE_IN_Q30; + move32(); + Ure_fx[1][1] = ONE_IN_Q30; + move32(); + *q_U = Q30; + move16(); + return; + } + + // Numeric case, when input is near an identity matrix with a gain + tmp1 = Mpy_32_32( INV_1000_Q31, add_fx ); + if ( q_diff > 0 ) + { + tmp1 = L_shr( tmp1, q_diff ); + } + + IF( LT_32( 
tmp2, tmp1 ) ) + { + Ure_fx[0][0] = ONE_IN_Q30; + move32(); + Ure_fx[1][1] = ONE_IN_Q30; + move32(); + *q_U = Q30; + move16(); + return; + } + + // Eigenvectors + + q_diff = sub( q_e, *q_D ); + q_tmp1 = s_min( q_e, *q_D ); + + tmp1 = D_fx[0]; + move32(); + if ( q_diff > 0 ) + { + tmp1 = L_shr( tmp1, q_diff ); + } + + tmp2 = D_fx[1]; + move32(); + if ( q_diff > 0 ) + { + tmp2 = L_shr( tmp2, q_diff ); + } + + if ( q_diff < 0 ) + { + e1 = L_shl( e1, q_diff ); + } + + if ( q_diff < 0 ) + { + e2 = L_shl( e2, q_diff ); + } + + s0_fx = L_sub( tmp1, e1 ); // D_fx[0] - e1 + tmp1 = L_sub( tmp1, e2 ); // D_fx[0] - e2 + s1_fx = L_sub( tmp2, e1 ); // D_fx[1] - e1 + tmp2 = L_sub( tmp2, e2 ); // D_fx[1] - e2 + + i01 = GT_32( L_abs( tmp1 ), L_abs( s0_fx ) ); // fabsf( D_fx[0] - e2 ) > fabsf( D_fx[0] - e1 ) + i11 = GT_32( L_abs( tmp2 ), L_abs( s1_fx ) ); // fabsf( D_fx[1] - e2 ) > fabsf( D_fx[1] - e1 ) + + if ( i01 ) + { + s0_fx = tmp1; + move32(); + } + + if ( i11 ) + { + s1_fx = tmp2; + move32(); + } + + // normVal = sqrtf( 1.0f / ( 1e-12f + crossSquare + s * s ) ); + + q_tmp2 = shl( q_tmp1, 1 ); + q_min = s_min( q_tmp2, q_crossSquare ); + q_min = s_min( q_min, eps_q ); + + q_diff = sub( q_tmp2, q_min ); + tmp3 = W_shr( W_mult0_32_32( s0_fx, s0_fx ), q_diff ); + tmp4 = W_shr( W_mult0_32_32( s1_fx, s1_fx ), q_diff ); + + q_diff = sub( q_crossSquare, q_min ); + crossSquare_fx = W_shr( crossSquare_fx, q_diff ); + tmp3 = W_add( tmp3, crossSquare_fx ); + tmp4 = W_add( tmp4, crossSquare_fx ); + + q_diff = sub( eps_q, q_min ); + eps_fx = W_shr( eps_fx, q_diff ); + tmp3 = W_add( tmp3, eps_fx ); + tmp4 = W_add( tmp4, eps_fx ); + + q_diff = W_norm( tmp3 ); + tmp3 = W_shl( tmp3, q_diff ); + nval0_q = add( q_min, q_diff ); + + q_diff = W_norm( tmp4 ); + tmp4 = W_shl( tmp4, q_diff ); + nval1_q = add( q_min, q_diff ); + + // nval0_fx = BASOP_Util_Divide3232_Scale_newton( ONE_IN_Q30, W_extract_h( tmp3 ), &exp ); + // exp = sub( exp, sub( 62, nval0_q ) ); + // + // is equivalent to: + // + // 
nval0_fx = div_w_newton( ONE_IN_Q30, W_extract_h( tmp3 ) ); + // exp = sub( nval0_q, 61 ); + + nval0_fx = eig2x2_div_fx( ONE_IN_Q30, W_extract_h( tmp3 ) ); + exp = sub( nval0_q, 61 ); + nval0_fx = Sqrt32( nval0_fx, &exp ); + nval0_q = sub( 31, exp ); + + // nval1_fx = BASOP_Util_Divide3232_Scale_newton( ONE_IN_Q30, W_extract_h( tmp4 ), &exp ); + // exp = sub( exp, sub( 62, nval1_q ) ); + // + // is equivalent to: + // + // nval1_fx = div_w_newton( ONE_IN_Q30, W_extract_h( tmp4 ) ); + // exp = sub( nval1_q, 61 ); + + nval1_fx = eig2x2_div_fx( ONE_IN_Q30, W_extract_h( tmp4 ) ); + exp = sub( nval1_q, 61 ); + nval1_fx = Sqrt32( nval1_fx, &exp ); + nval1_q = sub( 31, exp ); + + q_diff = sub( q_c, q_tmp1 ); + q_tmp1 = s_min( q_tmp1, q_c ); + + if ( q_diff > 0 ) + { + c_re = L_shr( c_re, q_diff ); + } + + if ( q_diff > 0 ) + { + c_im = L_shr( c_im, q_diff ); + } + + if ( q_diff < 0 ) + { + s0_fx = L_shl( s0_fx, q_diff ); + } + + if ( q_diff < 0 ) + { + s1_fx = L_shl( s1_fx, q_diff ); + } + + q_diff = sub( nval0_q, nval1_q ); + q_tmp2 = s_min( nval0_q, nval1_q ); + + if ( q_diff > 0 ) + { + nval0_fx = L_shr( nval0_fx, q_diff ); + } + + if ( q_diff < 0 ) + { + nval1_fx = L_shl( nval1_fx, q_diff ); + } + + *q_U = sub( add( q_tmp1, q_tmp2 ), 31 ); + + i00 = L_sub( 1, i01 ); + i10 = L_sub( 1, i11 ); + + c0_im = c_im; + move32(); + if ( i00 > 0 ) + { + c0_im = L_negate( c0_im ); + } + + c1_im = c_im; + move32(); + if ( i10 > 0 ) + { + c1_im = L_negate( c1_im ); + } + + Ure_fx[i00][0] = Mpy_32_32( s0_fx, nval0_fx ); + move32(); + Ure_fx[i01][0] = Mpy_32_32( c_re, nval0_fx ); + move32(); + Uim_fx[i01][0] = Mpy_32_32( c0_im, nval0_fx ); + move32(); + + Ure_fx[i10][1] = Mpy_32_32( s1_fx, nval1_fx ); + move32(); + Ure_fx[i11][1] = Mpy_32_32( c_re, nval1_fx ); + move32(); + Uim_fx[i11][1] = Mpy_32_32( c1_im, nval1_fx ); + move32(); +#else /* !NONBE_OPT_2193_EIG2X2 */ Word16 chA, chB, ch; Word32 s_fx, normVal_fx, crossSquare_fx, a_fx, pm_fx, add_fx; Word32 tmp1, tmp2, tmp3, e1, e2, c_re, c_im; @@ -3862,7 +4187,7 
@@ static void eig2x2_fx( *q_U = q_U_2; move16(); } - +#endif /* NONBE_OPT_2193_EIG2X2 */ return; }