* ======================================================================= *
*  TEXAS INSTRUMENTS, INC.
*
*  NAME
*      DSPF_sp_dotprod -- Dot Product of 2 Single Precision float vectors
*
*  USAGE
*
*      This routine is C Callable and can be called as:
*
*      float DSPF_sp_dotprod(const float *x, const float *y,
*                            const int nx);
*
*      x  : Pointer to array holding the first floating point vector
*      y  : Pointer to array holding the second floating point vector
*      nx : Number of values in the x & y vectors
*
*  DESCRIPTION
*
*      This routine calculates the dot product of 2 single precision
*      float vectors.
*
*  TECHNIQUES
*
*      1. LDDW instructions are used to load two SP floating point
*         values at a time for the x and y arrays.
*      2. The loop is unrolled once and software pipelined.
*         However, by conditionally adding to the dot product
*         odd numbered array sizes are also permitted.
*      3. Since the ADDSP and MPYSP instructions take 4 cycles,
*         A8, B8, A0, and B0 multiplex different variables to save
*         on register usage.
*         This multiple assignment is possible since the variables
*         are always read just once on the first cycle that they
*         are available.
*      4. The loop is primed to reduce the prolog by 4 cycles
*         (14 words) with no increase in cycle time.
*      5. The load counter is used as the loop counter which
*         requires a 3 cycle (6 word) epilog to finish the
*         calculations. This does not increase the cycle time.
*
*  ASSUMPTIONS
*
*      1. The x and y arrays must be double word aligned.
*      2. A memory pad of 4 bytes is required at the end of each
*         array if the number of inputs is odd.
*      3. The value of nx must be > 0.
*
*  C CODE
*      This is the C equivalent for the assembly code. Note that
*      the assembly code is hand optimized and restrictions may
*      apply.
*
*      float DSPF_sp_dotprod(const float *x, const float *y, const int nx)
*      {
*          int i;
*          float sum = 0;
*
*          for (i=0; i < nx; i++)
*          {
*              sum += x[i]* y[i];
*          }
*          return sum;
*      }
*
*  NOTES
*
*      1. Endian: This code is LITTLE ENDIAN.
*      2. Interruptibility: This code is interrupt tolerant but not
*         interruptible.
*
*  CYCLES
*
*      nx/2 + 25
*      eg. for nx = 512, cycles = 281
*
*  CODESIZE
*      256 bytes
* ----------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.
*                            All Rights Reserved.
* ======================================================================= *

        .global _DSPF_sp_dotprod
_DSPF_sp_dotprod:

* =============== SYMBOLIC REGISTER ASSIGNMENT ============================*
* Several physical registers carry more than one symbolic name below
* (A6, A5, B5, A4). Each alias is only valid for the part of the routine
* in which that name is used.
        .asg    A6,     A_cntarg        ; element count as passed in; read only
                                        ; by the first execute packet
        .asg    A1,     A_cnt           ; working counter; predicates the loads,
                                        ; branches and its own decrement
        .asg    B2,     B_cntodd        ; A_cntarg & 1: nonzero for an odd count
        .asg    A4,     A_x             ; post-incremented pointer for .D1 loads
        .asg    B4,     B_x             ; post-incremented pointer for .D2 loads
        .asg    A7,     A_x1            ; high word of the .D1 LDDW pair
        .asg    A6,     A_x0            ; low word of the .D1 LDDW pair
                                        ; (same register as A_cntarg)
        .asg    B7,     B_x1            ; high word of the .D2 LDDW pair
        .asg    B6,     B_x0            ; low word of the .D2 LDDW pair
        .asg    A8,     sum0            ; A-side accumulator
        .asg    B8,     sum1            ; B-side accumulator
        .asg    A5,     prod0           ; A-side MPYSP result
        .asg    B5,     prod1           ; B-side MPYSP result
        .asg    A0,     A_sum10         ; A-side sum0 + sum1 in the final reduction
        .asg    B0,     B_sum10         ; B-side sum0 + sum1 in the final reduction
        .asg    A5,     A_finalh        ; A-side half of the final sum
                                        ; (same register as prod0)
        .asg    B5,     B_finalh        ; B-side half of the final sum
                                        ; (same register as prod1)
        .asg    A4,     A_return        ; return value (same register as A_x)

**************************** Prolog Begins**********************************
* Copy the count into A_cnt and extract its low bit. This packet comes
* before any LDDW is issued, and the .D1 loads target A_x1:A_x0, where
* A_x0 is the same register as A_cntarg.
        MV      A_cntarg, A_cnt
||      AND     A_cntarg, 1, B_cntodd               ; is cnt even or odd?

* Five packets follow. Each issues one LDDW pair, one branch to LOOP and
* one decrement of A_cnt. All are predicated on A_cnt, except the
* decrement in the first packet, which is selected by B_cntodd:
*     odd count  : A_cnt -= 1
*     even count : A_cnt -= 2
* so A_cnt is even from here on and is stepped by 2.
* NOTE(review): with a count of 0 the even-count SUB would leave A_cnt at
* -2, not 0; this matches ASSUMPTIONS item 3 (nx must be > 0).
        ZERO .L2 prod1                              ; prod1 = 0
||[A_cnt] LDDW .D1 *A_x++, A_x1:A_x0                ; load x1:x0 from memory
||[A_cnt] LDDW .D2 *B_x++, B_x1:B_x0                ; load y1:y0 from memory
||[A_cnt] B .S2 LOOP                                ; branch to loop
||[B_cntodd] SUB .S1 A_cnt, 1, A_cnt                ; odd count:  A_cnt -= 1
||[!B_cntodd] SUB .L1 A_cnt, 2, A_cnt               ; even count: A_cnt -= 2

        ZERO .L1 sum0                               ; sum0 = 0
|| ZERO .L2 sum1                                    ; sum1 = 0
||[A_cnt] LDDW .D1 *A_x++, A_x1:A_x0                ; load x1:x0 from memory
||[A_cnt] LDDW .D2 *B_x++, B_x1:B_x0                ; load y1:y0 from memory
||[A_cnt] B .S2 LOOP                                ; branch to loop
||[A_cnt] SUB .S1 A_cnt, 2, A_cnt                   ; if (A_cnt) A_cnt -= 2

        [A_cnt] LDDW .D1 *A_x++, A_x1:A_x0          ; load x1:x0 from memory
||[A_cnt] LDDW .D2 *B_x++, B_x1:B_x0                ; load y1:y0 from memory
||[A_cnt] B .S2 LOOP                                ; branch to loop
||[A_cnt] SUB .S1 A_cnt, 2, A_cnt                   ; if (A_cnt) A_cnt -= 2
|| ZERO .L1 prod0                                   ; prod0 = 0

        [A_cnt] LDDW .D1 *A_x++, A_x1:A_x0          ; load x1:x0 from memory
||[A_cnt] LDDW .D2 *B_x++, B_x1:B_x0                ; load y1:y0 from memory
||[A_cnt] B .S1 LOOP                                ; branch to loop
||[A_cnt] SUB .L1 A_cnt, 2, A_cnt                   ; if (A_cnt) A_cnt -= 2

        [A_cnt] LDDW .D1 *A_x++, A_x1:A_x0          ; load x1:x0 from memory
||[A_cnt] LDDW .D2 *B_x++, B_x1:B_x0                ; load y1:y0 from memory
||[A_cnt] B .S2 LOOP                                ; branch to loop
||[A_cnt] SUB .S1 A_cnt, 2, A_cnt                   ; if (A_cnt) A_cnt -= 2

****** Loop Begins *****************************
* The loop body is a single execute packet. While A_cnt is nonzero it
* issues the next LDDW pair, another branch to LOOP and the decrement.
* The two MPYSP and two ADDSP are unconditional. Each multiply takes one
* operand from the other register file (.M1X / .M2X).
LOOP:
        [A_cnt] LDDW .D1 *A_x++, A_x1:A_x0          ; if (A_cnt) load x1:x0 from memory
||[A_cnt] LDDW .D2 *B_x++, B_x1:B_x0                ; if (A_cnt) load y1:y0 from memory
|| MPYSP .M1X A_x0, B_x0, prod0                     ; prod0 = x0 * y0
|| MPYSP .M2X A_x1, B_x1, prod1                     ; prod1 = x1 * y1
|| ADDSP .L1 prod0, sum0, sum0                      ; sum0 = prod0 + sum0
|| ADDSP .L2 prod1, sum1, sum1                      ; sum1 = prod1 + sum1
||[A_cnt] B .S2 LOOP                                ; if (A_cnt) branch to loop
||[A_cnt] SUB .S1 A_cnt, 2, A_cnt                   ; if (A_cnt) A_cnt -= 2

********************** Epilog Begins ****************
* Three further packets of ADDSP only (no loads or multiplies).
        ADDSP .L1 prod0, sum0, sum0                 ; sum0 = prod0 + sum0
|| ADDSP .L2 prod1, sum1, sum1                      ; sum1 = prod1 + sum1

        ADDSP .L1 prod0, sum0, sum0                 ; sum0 = prod0 + sum0
|| ADDSP .L2 prod1, sum1, sum1                      ; sum1 = prod1 + sum1

* The last B-side step writes B9, not sum1. For an even count the final
* prod1 is added. For an odd count sum1 is copied unchanged, so the prod1
* term is not added in this packet (see TECHNIQUES item 2).
        ADDSP .L1 prod0, sum0, sum0                 ; sum0 = prod0 + sum0
||[!B_cntodd]ADDSP .L2 prod1, sum1, B9              ; even count: B9 = prod1 + sum1
||[B_cntodd] MV .D2 sum1, B9                        ; odd count:  B9 = sum1
*********************** Epilog Ends ****************

* Final reduction. sum0 and sum1 are read on four successive cycles,
* alternating between the A-side and B-side adders; the fourth read uses
* B9 in place of sum1.
* NOTE(review): presumably each read picks up a different in-flight ADDSP
* result, per the multiple-assignment scheme in TECHNIQUES item 3 --
* confirm against the ADDSP latency in the CPU reference.
* NOTE(review): the branch to NO_INTS lands on the label that follows its
* own delay-slot packets; presumably this relies on interrupts being held
* off while a branch is pending -- confirm against the CPU reference.
        ADDSP .L1X sum0, sum1, A_sum10              ; Asum10=sum0 + sum1
|| B .S1 NO_INTS                                    ; To block interrupts

        ADDSP .L2X sum0, sum1, B_sum10              ; Bsum10=sum0 + sum1

        ADDSP .L1X sum0, sum1, A_sum10              ; Asum10=sum0 + sum1

        ADDSP .L2X sum0, B9, B_sum10                ; Bsum10=sum0 + B9

        NOP                                         ; wait for Bsum10

        ADDSP .L1X A_sum10, B_sum10, A_finalh       ; A_finalh=Asum10+Bsum10

NO_INTS:
        NOP                                         ; wait for next Bsum10

        ADDSP .L2X A_sum10, B_sum10, B_finalh       ; B_finalh=Asum10+Bsum10

        NOP                                         ; wait for B_finalh

        B .S2 B3                                    ; return from function

        NOP                                         ; wait for B_finalh

        ADDSP .L1X A_finalh, B_finalh, A_return     ; A_return = A_finalh + B_finalh

        NOP 3                                       ; A_return and branch

        .end

* ======================================================================== *
*  End of file: sp_dotprod.asm
* ------------------------------------------------------------------------ *
*          Copyright (C) 2003 Texas Instruments, Incorporated.
*                          All Rights Reserved.
* ======================================================================== *