/* -------------------------------------------------------------- */ /* (C)Copyright 2001,2007, */ /* International Business Machines Corporation, */ /* Sony Computer Entertainment, Incorporated, */ /* Toshiba Corporation, */ /* */ /* All Rights Reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the */ /* following conditions are met: */ /* */ /* - Redistributions of source code must retain the above copyright*/ /* notice, this list of conditions and the following disclaimer. */ /* */ /* - Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* - Neither the name of IBM Corporation nor the names of its */ /* contributors may be used to endorse or promote products */ /* derived from this software without specific prior written */ /* permission. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */ /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR */ /* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT */ /* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; */ /* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */ /* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN */ /* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR */ /* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, */ /* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* -------------------------------------------------------------- */ /* PROLOG END TAG zYx */ #ifndef _FFT_1D_R2_H_ #define _FFT_1D_R2_H_ 1 #include "fft_1d.h" /* fft_1d_r2 * --------- * Performs a single precision, complex Fast Fourier Transform using * the DFT (Discrete Fourier Transform) with radix-2 decimation in time. * The input is an array of complex numbers of length (1<. Note: This routine can support an in-place transformation * by specifying and to be the same array. * * This implementation utilizes the Cooley-Tukey algorithm consisting * of stages. The basic operation is the butterfly. * * p --------------------------> P = p + q*Wi * \ / * \ / * \ / * \/ * /\ * / \ * / \ * ____ / \ * q --| Wi |-----------------> Q = p - q*Wi * ---- * * This routine also requires pre-computed twiddle values, W. W is an * array of single precision complex numbers of length 1<<(log2_size-2) * and is computed as follows: * * for (i=0; i> 1; n_4 = n >> 2; n_8 = n >> 3; n_16 = n >> 4; n_3_16 = n_8 + n_16; /* Compute a byte reverse shuffle pattern to be used to produce * an address bit swap. */ reverse = spu_or(spu_slqwbyte(spu_splats((unsigned char)0x80), log2_size), spu_rlmaskqwbyte(((vec_uchar16){15,14,13,12, 11,10,9,8, 7, 6, 5, 4, 3, 2,1,0}), log2_size-16)); /* Perform the first 3 stages of the FFT. These stages differs from * other stages in that the inputs are unscrambled and the data is * reformated into parallel arrays (ie, seperate real and imaginary * arrays). The term "unscramble" means the bit address reverse the * data array. In addition, the first three stages have simple twiddle * weighting factors. * stage 1: (1, 0) * stage 2: (1, 0) and (0, -1) * stage 3: (1, 0), (0.707, -0.707), (0, -1), (-0.707, -0.707) * * The arrays are processed as two halves, simultaneously. The lo (first * half) and hi (second half). This is done because the scramble * shares source value between each half of the output arrays. */ i = 0; i_rev = 0; in0 = in; in1 = in + n_8; in2 = in + n_16; in3 = in + n_3_16; in4 = in + n_4; in5 = in1 + n_4; in6 = in2 + n_4; in7 = in3 + n_4; re0 = re; re1 = re + n_8; im0 = im; im1 = im + n_8; w0_re = (vector float) { 1.0f, INV_SQRT_2, 0.0f, -INV_SQRT_2}; w0_im = (vector float) { 0.0f, -INV_SQRT_2, -1.0f, -INV_SQRT_2}; do { src_lo0 = in0[i_rev]; src_lo1 = in1[i_rev]; src_lo2 = in2[i_rev]; src_lo3 = in3[i_rev]; src_hi0 = in4[i_rev]; src_hi1 = in5[i_rev]; src_hi2 = in6[i_rev]; src_hi3 = in7[i_rev]; /* Perform scramble. */ dst_lo0 = spu_shuffle(src_lo0, src_hi0, shuf_lo); dst_hi0 = spu_shuffle(src_lo0, src_hi0, shuf_hi); dst_lo1 = spu_shuffle(src_lo1, src_hi1, shuf_lo); dst_hi1 = spu_shuffle(src_lo1, src_hi1, shuf_hi); dst_lo2 = spu_shuffle(src_lo2, src_hi2, shuf_lo); dst_hi2 = spu_shuffle(src_lo2, src_hi2, shuf_hi); dst_lo3 = spu_shuffle(src_lo3, src_hi3, shuf_lo); dst_hi3 = spu_shuffle(src_lo3, src_hi3, shuf_hi); /* Perform the stage 1 butterfly. The multiplier constant, ppmm, * is used to control the sign of the operands since a single * quadword contains both of P and Q valule of the butterfly. */ pq_lo0 = spu_madd(ppmm, dst_lo0, spu_rlqwbyte(dst_lo0, 8)); pq_hi0 = spu_madd(ppmm, dst_hi0, spu_rlqwbyte(dst_hi0, 8)); pq_lo1 = spu_madd(ppmm, dst_lo1, spu_rlqwbyte(dst_lo1, 8)); pq_hi1 = spu_madd(ppmm, dst_hi1, spu_rlqwbyte(dst_hi1, 8)); pq_lo2 = spu_madd(ppmm, dst_lo2, spu_rlqwbyte(dst_lo2, 8)); pq_hi2 = spu_madd(ppmm, dst_hi2, spu_rlqwbyte(dst_hi2, 8)); pq_lo3 = spu_madd(ppmm, dst_lo3, spu_rlqwbyte(dst_lo3, 8)); pq_hi3 = spu_madd(ppmm, dst_hi3, spu_rlqwbyte(dst_hi3, 8)); /* Perfrom the stage 2 butterfly. For this stage, the * inputs pq are still interleaved (p.real, p.imag, q.real, * q.imag), so we must first re-order the data into * parallel arrays as well as perform the reorder * associated with the twiddle W[n/4], which equals * (0, -1). * * ie. (A, B) * (0, -1) => (B, -A) */ re_lo0 = spu_madd(ppmm, spu_shuffle(pq_lo1, pq_lo1, shuf_0303), spu_shuffle(pq_lo0, pq_lo0, shuf_0202)); im_lo0 = spu_madd(pmmp, spu_shuffle(pq_lo1, pq_lo1, shuf_1212), spu_shuffle(pq_lo0, pq_lo0, shuf_1313)); re_lo1 = spu_madd(ppmm, spu_shuffle(pq_lo3, pq_lo3, shuf_0303), spu_shuffle(pq_lo2, pq_lo2, shuf_0202)); im_lo1 = spu_madd(pmmp, spu_shuffle(pq_lo3, pq_lo3, shuf_1212), spu_shuffle(pq_lo2, pq_lo2, shuf_1313)); re_hi0 = spu_madd(ppmm, spu_shuffle(pq_hi1, pq_hi1, shuf_0303), spu_shuffle(pq_hi0, pq_hi0, shuf_0202)); im_hi0 = spu_madd(pmmp, spu_shuffle(pq_hi1, pq_hi1, shuf_1212), spu_shuffle(pq_hi0, pq_hi0, shuf_1313)); re_hi1 = spu_madd(ppmm, spu_shuffle(pq_hi3, pq_hi3, shuf_0303), spu_shuffle(pq_hi2, pq_hi2, shuf_0202)); im_hi1 = spu_madd(pmmp, spu_shuffle(pq_hi3, pq_hi3, shuf_1212), spu_shuffle(pq_hi2, pq_hi2, shuf_1313)); /* Perform stage 3 butterfly. */ FFT_1D_BUTTERFLY(re0[0], im0[0], re0[1], im0[1], re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im); FFT_1D_BUTTERFLY(re1[0], im1[0], re1[1], im1[1], re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im); re0 += 2; re1 += 2; im0 += 2; im1 += 2; i += 8; i_rev = BIT_SWAP(i, reverse) / 2; } while (i < n_2); /* Process stages 4 to log2_size-2 */ for (stage=4, stride=4; stage> stage; w_2stride = n >> stage; w_3stride = w_stride + w_2stride; w_4stride = w_2stride + w_2stride; W0 = W; W1 = W + w_stride; W2 = W + w_2stride; W3 = W + w_3stride; stride_2 = stride >> 1; stride_4 = stride >> 2; stride_3_4 = stride_2 + stride_4; re0 = re; im0 = im; re1 = re + stride_2; im1 = im + stride_2; re2 = re + stride_4; im2 = im + stride_4; re3 = re + stride_3_4; im3 = im + stride_3_4; for (i=0, offset=0; i> stage; w_2stride = n >> stage; w_3stride = w_stride + w_2stride; w_4stride = w_2stride + w_2stride; stride_2 = stride >> 1; stride_4 = stride >> 2; stride_3_4 = stride_2 + stride_4; re0 = re; im0 = im; re1 = re + stride_2; im1 = im + stride_2; re2 = re + stride_4; im2 = im + stride_4; re3 = re + stride_3_4; im3 = im + stride_3_4; for (i=0, offset=0; i. * * This loop has been manually unrolled by 2 to improve * dual issue rates and reduce stalls. This unrolling * forces a minimum FFT size of 32. */ re0 = re; re1 = re + n_8; re2 = re + n_16; re3 = re + n_3_16; im0 = im; im1 = im + n_8; im2 = im + n_16; im3 = im + n_3_16; out0 = out; out1 = out + n_4; out2 = out + n_8; out3 = out1 + n_8; i = n_16; do { /* Fetch the twiddle factors */ w0 = W[0]; w1 = W[1]; w2 = W[2]; w3 = W[3]; W += 4; w0_re = spu_shuffle(w0, w1, shuf_0246); w0_im = spu_shuffle(w0, w1, shuf_1357); w1_re = spu_shuffle(w2, w3, shuf_0246); w1_im = spu_shuffle(w2, w3, shuf_1357); /* Fetch the butterfly inputs, reals and imaginaries */ re_lo0 = re0[0]; im_lo0 = im0[0]; re_lo1 = re1[0]; im_lo1 = im1[0]; re_lo2 = re0[1]; im_lo2 = im0[1]; re_lo3 = re1[1]; im_lo3 = im1[1]; re_hi0 = re2[0]; im_hi0 = im2[0]; re_hi1 = re3[0]; im_hi1 = im3[0]; re_hi2 = re2[1]; im_hi2 = im2[1]; re_hi3 = re3[1]; im_hi3 = im3[1]; re0 += 2; im0 += 2; re1 += 2; im1 += 2; re2 += 2; im2 += 2; re3 += 2; im3 += 2; /* Perform the butterflys */ FFT_1D_BUTTERFLY (out_re_lo0, out_im_lo0, out_re_lo1, out_im_lo1, re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im); FFT_1D_BUTTERFLY (out_re_lo2, out_im_lo2, out_re_lo3, out_im_lo3, re_lo2, im_lo2, re_lo3, im_lo3, w1_re, w1_im); FFT_1D_BUTTERFLY_HI(out_re_hi0, out_im_hi0, out_re_hi1, out_im_hi1, re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im); FFT_1D_BUTTERFLY_HI(out_re_hi2, out_im_hi2, out_re_hi3, out_im_hi3, re_hi2, im_hi2, re_hi3, im_hi3, w1_re, w1_im); /* Interleave the results and store them into the output buffers (ie, * the original input buffers. */ out0[0] = spu_shuffle(out_re_lo0, out_im_lo0, shuf_0415); out0[1] = spu_shuffle(out_re_lo0, out_im_lo0, shuf_2637); out0[2] = spu_shuffle(out_re_lo2, out_im_lo2, shuf_0415); out0[3] = spu_shuffle(out_re_lo2, out_im_lo2, shuf_2637); out1[0] = spu_shuffle(out_re_lo1, out_im_lo1, shuf_0415); out1[1] = spu_shuffle(out_re_lo1, out_im_lo1, shuf_2637); out1[2] = spu_shuffle(out_re_lo3, out_im_lo3, shuf_0415); out1[3] = spu_shuffle(out_re_lo3, out_im_lo3, shuf_2637); out2[0] = spu_shuffle(out_re_hi0, out_im_hi0, shuf_0415); out2[1] = spu_shuffle(out_re_hi0, out_im_hi0, shuf_2637); out2[2] = spu_shuffle(out_re_hi2, out_im_hi2, shuf_0415); out2[3] = spu_shuffle(out_re_hi2, out_im_hi2, shuf_2637); out3[0] = spu_shuffle(out_re_hi1, out_im_hi1, shuf_0415); out3[1] = spu_shuffle(out_re_hi1, out_im_hi1, shuf_2637); out3[2] = spu_shuffle(out_re_hi3, out_im_hi3, shuf_0415); out3[3] = spu_shuffle(out_re_hi3, out_im_hi3, shuf_2637); out0 += 4; out1 += 4; out2 += 4; out3 += 4; i -= 2; } while (i); } #endif /* _FFT_1D_R2_H_ */