2D( 2k, 4k, 8k ) FFT (IFFT) source code in C99


#include <math.h>
#include <omp.h>         // For CPUs with many cores like Ryzen 9 3900X and later
////////  omp.h =>OpenMP // In distributed computers, you should use OpenMPI.
#include <complex.h>     // If use complex.h performance will spoil 10% or more,
                         // but readability is dramatically improved.

#define INT       register int
#define DOUBLE    register double
#ifndef M_PI
#define M_PI      3.14159265358979323846
#endif
#define Rotor(a)  (double _Complex)( cos(a) + I*sin(a) )

enum { FFT=DFT, IFFT=IDFT };

void FFT_TwoDimensionalC99( DOUBLE _Complex **d, INT elements, INT flag ) {
INT     x, y,  lx=elements, ly=elements;
DOUBLE  unit = (flag==IFFT) ? 2*M_PI/elements : -2*M_PI/elements;
double _Complex rotor[elements]; for(x=0;x<elements;rotor[x]=Rotor(unit*x),x++);
//////  If the flag is FFT, unit=-2*M_PI/elements, if IFFT, unit=2*M_PI/elements.

    #pragma omp parallel for       // gcc -fopenmp -O3  (Strongly recommendation)
    for( y=0; y<ly; y++ ) {        // X軸方向の1次元FFT(IFFT)をy行分行う  <=並列化
        INT     i, j, k, m, mh, ir;// ir: irev
        DOUBLE _Complex  W, temp;  //  W: Rotation factor
        for( i=0, j=1; j<elements-1; j++ ) {  // scrambler, for initial placement
            for( k=elements>>1; k>(i^=k); k>>=1 );
            if( j<i ){ temp=d[y][j], d[y][j]=d[y][i], d[y][i]=temp; } // exchange
        }// Because in this pare (i,j), there are two, (a,b) or (b,a), cases.
        for( mh=1; (m=mh<<1)<=elements; mh=m ) { // Operations
            for( ir=0, i=0; i<elements; i+=m ) { // ir=0: Rotation angle is 0 rad.
                for( W=rotor[ir], k=elements>>2; k>(ir^=k); k>>=1 ); //next ir is
                for( j=i; j<mh+i; j++ ) { INT a = j,  b = j + mh;
                      temp  = d[y][a] - d[y][b]; // For butterfly operation
                    d[y][a] = d[y][a] + d[y][b]; // a ->- (a + b) ->- a
                    d[y][b] = temp * W;          // b ->- (a-b)*W ->- b
        }   }   }//WELL: DOUBLE reciprocal=1./elements; =>d[y][i++]*=reciprocal);
        if(flag==IFFT)for( i=0; i<elements; d[y][i++]/=elements); //Normalization
    } // X_END   <= このブロック ( { ... } の部分 ) は、スレッドセーフ(thread-safe)
    #pragma omp parallel for       // gcc -fopenmp -O3  (Strongly recommendation)
    for( x=0; x<lx; x++ ) {        // Y軸方向の1次元FFT(IFFT)をx列分行う  <=並列化
        INT     i, j, k, m, mh, ir;// ir: irev
        DOUBLE _Complex  W, temp;  //  W: Rotation factor
        for( i=0, j=1; j<elements-1; j++ ) {  // scrambler, for initial placement
            for( k=elements>>1; k>(i^=k); k>>=1 );
            if( j<i ){ temp=d[j][x], d[j][x]=d[i][x], d[i][x]=temp; } // exchange
        }// Because in this pare (i,j), there are two, (a,b) or (b,a), cases.
        for( mh=1; (m=mh<<1)<=elements; mh=m ) { // Operations
            for( ir=0, i=0; i<elements; i+=m ) { // ir=0: Rotation angle is 0 rad.
                for( W=rotor[ir], k=elements>>2; k>(ir^=k); k>>=1 ); //next ir is
                for( j=i; j<mh+i; j++ ) { INT a = j,  b = j + mh;
                      temp  = d[a][x] - d[b][x]; // For butterfly operation
                    d[a][x] = d[a][x] + d[b][x]; // a ->- (a + b) ->- a
                    d[b][x] = temp * W;          // b ->- (a-b)*W ->- b
        }   }   }//WELL: DOUBLE reciprocal=1./elements; =>d[i++][x]*=reciprocal);
        if(flag==IFFT)for( i=0; i<elements; d[i++][x]/=elements); //Normalization
    } // Y_END   <= このブロック ( { ... } の部分 ) は、スレッドセーフ(thread-safe)
}
// Ex. (i,j): (32,16), (16,32), (48,48), (64,8), (8,64), (72,72), (80,40), .....
////////////////////////////////////////////////////////////////////////
// How to use FFT_TwoDimensionalC99(...)
//
//extern int fft_check( int );
//void FFT_TwoDimensionalC99( double _Complex **d, int elements, int flag ) {
//if(!fft_check(elements)) return DFT_TwoDimensionalC99(d,elements,elements,flag);
//
int fft_check( int n ) {
    if( n<1000 || 20000<n ) return 0;
    for(;n>1;n>>=1) if(n&1) return 0;
    return 1;
}
int fft_neo_check( int lx, int ly ) {// 2K, 4K, 8K サイズ対応用
    if( lx!=ly || lx<2000 || 10000<lx ) return 0;
    return fft_check(lx);
}

void dft_TwoDimensional( COMPLEX **s, int lx, int ly, int flag ) {
int     w = lx>ly ? lx : ly;
COMPLEX **spreadsheet = NULL;

    double _Complex **ss = (double _Complex **)s;
    if(fft_neo_check(lx,ly)) return FFT_TwoDimensionalC99(ss,lx,flag);
    if( (spreadsheet=(COMPLEX**)alloc2Darray(sizeof(COMPLEX),w,w)) ) {
       dft_TwoDimensionalWithTable( spreadsheet, w, s, lx, ly, flag );
       free(spreadsheet);
    } else dft_TwoDimensionalStandard( s, lx, ly, flag );
}