AI talk with me: Enhanced C99 template 2025/06/30 version.

USER


次の画像処理、データ処理用のテンプレートを評価してください。
少し長めですか、各関数の目的と使用方法を例を加え詳しく説明してください。



/* Enhanced C99 Template 2025/06/30 version

gcc -O3 -std=c99 -mavx -W -Wall -Wextra -o Template Template.c -fopenmp -lm

環境依存のコードが生成されますが、速度重視の場合は検討してください。

gcc -O3 -std=c99 -march=native -mavx -W -Wall -Wextra -o Template Template.c -fopenmp

A professionally created C99 template for learning and verifying image processing
The purpose is to make it easy to verify algorithms and to provide an environment 
that does not use external libraries (exception: stb_image.h) by default, 
reducing the labor required to set up the environment.
//                                                                                   //
//   You have the right to: You are free to use this source code and modify it       //
//   for your own use without any restrictions. In addition, we do not provide       //
//   any warranty or support under any circumstances.                                //
//                                                                                   //
#define  TEMPLATES_USE_AS_INCLUDE_FILE    or  gcc -DTEMPLATES_USE_AS_INCLUDE_FILE
#include "Template.c"
*/
#ifndef IMAGE_PROCESSING_TEMPLATES_C99
#define IMAGE_PROCESSING_TEMPLATES_C99
//////// https://github.com/nothings/stb/blob/master/stb_image.h
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <ctype.h>
#include <stdint.h>
#include <omp.h>            // gcc -fopenmp -O3 ( -O3: Strongly recommendation )

#define HERE                            fprintf(stderr,"% 6d: %s\n",__LINE__, __func__);
#define IF                              if          // ソースコードの行間での位置合わせ用
#define EI                              else if     // 縦に揃えて可読性を向上させるため。
#define EE                              else        // 人間の可読性向上用エイリアス。

extern int  verbose;
//////////////////////////////////////////////////////////////////////////////////////////
// Timer section
#include <time.h>
#ifdef _WIN32               // If you'll implement DrawText or DrawTextW yourself.
#include <windows.h>        // You need, #undef DrawText or #undef DrawTextW 
double get_micro_time() {
    static LARGE_INTEGER freq;
    LARGE_INTEGER now;
    if(freq.QuadPart == 0) QueryPerformanceFrequency(&freq);
    QueryPerformanceCounter(&now);
    return (double)now.QuadPart * 1e6 / (double)freq.QuadPart;
}
#else
#include <sys/time.h>
double get_micro_time() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec * 1e6 + (double)tv.tv_usec;
}
#endif
// double START_time   = get_micro_time();
// double E_N_D_time   = get_micro_time();
// double Elapsed_time = E_N_D_time - START_time;
// print_elapsed_time(Elapsed_time); print_elapsed_time(E_N_D_time-START_time);
#define print_elapsed_time(microsec) fprint_elapsed_time(stdout, (microsec))
void fprint_elapsed_time(FILE *fp, double microsec) {
    IF(microsec < 1000.0) fprintf(fp,"Elapsed time: %.3f usec\n", microsec);
    EI(microsec < 1e6   ) fprintf(fp,"Elapsed time: %.3f msec\n", microsec / 1000.0);
    EE                    fprintf(fp,"Elapsed time: %.3f sec\n",  microsec / 1e6);
}
void fprintf_elapsed_time(FILE *fp, double microsec, int verbose) {
    unsigned long long  total_us = (unsigned long long)(microsec + 0.5); // 丸め
    unsigned long long        us = total_us % 1000;
    unsigned long long  total_ms = total_us / 1000;
    unsigned long long        ms = total_ms % 1000;
    unsigned long long total_sec = total_ms / 1000;
    unsigned long long       sec = total_sec % 60;
    unsigned long long total_min = total_sec / 60;
    unsigned long long       min = total_min % 60;
    unsigned long long      hour = total_min / 60;
    static const char *time_format[]={ "Elapsed time: ",
                                       "%2lluh %2llum %2llus %3llums %3lluus\n",
                                       "%lluh %llum %llus %llums %lluus\n"    };
    fprintf(fp,time_format[0]);
    if( verbose ){ // 全項目を常に表示（0でも）
        // fprintf(fp,format[2], hour, min, sec, ms, us);
        fprintf(fp,time_format[1], hour, min, sec, ms, us);
    } else {       // 必要な項目のみ表示（上位から下位へ）
        int printed = 0;
        if (hour > 0           ) { fprintf(fp,"%lluh ", hour); printed = 1; }
        if ( min > 0 || printed) { fprintf(fp,"%llum ", min);  printed = 1; }
        if ( sec > 0 || printed) { fprintf(fp,"%llus ", sec);  printed = 1; }
        if (  ms > 0 || printed) { fprintf(fp,"%llums ", ms);  printed = 1; }
        printf("%lluus\n", us);
    }
}
////////////////////////////////////////////////////////////////////////////////
// Basic structure definition
//
#pragma pack(push,1)
typedef struct PIXEL { uint8_t B, G, R, A; } PIXEL ;
typedef struct STBPX { uint8_t R, G, B, A; } STBPX ;
#pragma pack(pop)
// BMPヘッダ構造体
#pragma pack(push, 2)
typedef struct HEADER {
    unsigned short bfType;              // 'BM' (0x4D42)
    unsigned int   bfSize;              // ファイルサイズ
    unsigned short bfReserved1;         // 0
    unsigned short bfReserved2;         // 0
    unsigned int   bfOffBits;           // データオフセット
    unsigned int   biSize;              // 情報ヘッダサイズ (40)
    signed int     biWidth;             // 幅
    signed int     biHeight;            // 高さ（負で上下反転）
    unsigned short biPlanes;            // 1
    unsigned short biBitCount;          // 32
    unsigned int   biCompression;       // 0 (BI_RGB)
    unsigned int   biSizeImage;         // 画像データサイズ
    signed int     biXPelsPerMeter;     // 0
    signed int     biYPelsPerMeter;     // 0
    unsigned int   biClrUsed;           // 0 
    unsigned int   biClrImportant;      // 0
    unsigned char  lut[];               // パレット（8ビット用）
} HEADER;
#pragma pack(pop)
// 画像パラメータ
#define BMPID                           0x4D42   // 'BM'
#define BMPIFHSIZE                      40       // 情報ヘッダサイズ
#define DDOUBLE                         long double
#define INF_ZERO06                      (1.0e-6)
#define INF_ZERO10                      (1.0e-10)
#define INF_ZERO                        (1.0e-12)

#define v2D(type,lx,name)               ((type(*)[(lx)])name)
#define v2Dp(lx,name)                   v2D(PIXEL,lx,name)
#define v2Dc(lx,name)                   v2D(char,lx,name)
#define v2Di(lx,name)                   v2D(int,lx,name)
#define v2Df(lx,name)                   v2D(float,lx,name)
#define v2Dd(lx,name)                   v2D(double,lx,name)

typedef struct HashDB{ char *key; union { unsigned long long here;
                                          long long          value;
                                          void               *p_data; 
                                        }; } HashDB;

void **alloc2Darray(int, int, int);
int saveDataAsBMP32( char *, void*, int, int );
#define saveArray2bmp32(name,a,w,h)     saveDataAsBMP32((name),(&(a)[0][0]),(w),(h))

#define PROCESS_ALL_PIXELS              for(int y=0;y<ly;y++)for(int x=0;x<lx;x++)
#define mtxMul(m,p) (POINTf){\
        ((m[0][0]*(p.x)+m[0][1]*(p.y)+m[0][2])/(m[2][0]*(p.x)+m[2][1]*(p.y)+m[2][2])),\
        ((m[1][0]*(p.x)+m[1][1]*(p.y)+m[1][2])/(m[2][0]*(p.x)+m[2][1]*(p.y)+m[2][2])) }

typedef struct POINTi { int    x, y; } POINTi;
typedef struct POINTf { float  x, y; } POINTf;
typedef struct POINTd { double x, y; } POINTd;

#include <math.h>
#undef  PI
#define PI   3.1415926535897932384626433832795L  // 円周率(π)
#define M_PI PI

typedef struct COMPLEX { double r, i; } COMPLEX;

#define ConjCOMPLEX(a)           (COMPLEX){ (a).r, -((a).i) }
#define RotorCOMPLEX(a)          (COMPLEX){ cos(a), sin(a) }
#define AddCOMPLEX(a,b)          (COMPLEX){ (a).r + (b).r, (a).i + (b).i }
#define SubCOMPLEX(a,b)          (COMPLEX){ (a).r - (b).r, (a).i - (b).i }
#define MulCOMPLEX(a,b)          (COMPLEX){ ((a).r)*((b).r) - ((a).i)*((b).i), \
                                            ((a).r)*((b).i) + ((a).i)*((b).r) }
#define ImeDivCOMPLEX(a,b)       (COMPLEX){ (a).r / (b), (a).i / (b) }
#define ImeMulCOMPLEX(a,b)       (COMPLEX){ (a).r * (b), (a).i * (b) }
#define ImmediateMulCOMPLEX(b,a) (COMPLEX){ (a).r * (b), (a).i * (b) }
#define AppendCOMPLEX(a,b)       { (a).r += (b).r,  (a).i += (b).i ; }

enum { DFT, IDFT,  FFT=DFT, IFFT=IDFT,  CZT=FFT, ICZT=IFFT };

#define fft_Multiline_with_RF         fft_Multiline_with_RotationFactors
static inline void fft_Multiline_with_RotationFactors( const COMPLEX *, COMPLEX **, int, int, int);

void interchange2D( COMPLEX **d, int lx, int ly ){ COMPLEX t;
    for(int hly=ly/2, hlx=lx/2, y=0; y<hly; y++ )for(int x=0; x<hlx; x++ ){
       t = d[y][x]    ; d[y][x]     = d[y+hly][x+hlx]; d[y+hly][x+hlx] = t;
       t = d[y][x+hlx]; d[y][x+hlx] = d[y+hly][x];     d[y+hly][x]     = t;
    }
}
#define INF_ZERO                        (1.0e-12)
void PowerSpectrum( char *fname, COMPLEX **u, PIXEL **a, int lx, int ly ) {
    double temp, min, max; min=max=u[0][0].r*u[0][0].r + u[0][0].i*u[0][0].i;
    for(int y=0; y<ly; y++ )for(int x=0; x<lx; x++ ) {
        if( max<(temp=u[y][x].r*u[y][x].r + u[y][x].i*u[y][x].i) ) max = temp;
        else if( min>temp ) min = temp;
    }   min = sqrt(min);    max = sqrt(max);
    double mag = 255.0 / (max>min && max-min>INF_ZERO ? log10(max - min) : 1.0);
    for(int y=0; y<ly; y++ )for(int x=0; x<lx; x++ ) {
        int gray = mag*log10(sqrt(u[y][x].r*u[y][x].r + u[y][x].i*u[y][x].i)-min);
        if(255<gray) gray=255; else if( gray<0 ) gray=0;
        a[y][x] = (PIXEL){gray,gray,gray,0};
    }   if(fname)saveArray2bmp32( fname, a, lx, ly );
}
////////////////////////////////////////////////////////////////////////////////
//// checks if the argument is a power of 2 ( Returns 1 if it is a power of 2 )
int ft_size_check( int n ){ if( n<4 ) return 0;
    for(int e=n;e>1;e>>=1 ) if( e&1 ) return 0;
    return 1;
}
////////////////////////////////////////////////////////////////////////////////
//// A function that returns the smallest power of 2 that is greater than or 
//// equal to twice its argument
int get_czt_size( int n ){ if( ft_size_check(n) ) return n;
    int a=1;  for( a=1; a<(n<<1); a<<=1 );
    return a;
}
////////////////////////////////////////////////////////////////////////////////
//// A function that is slow but can be used to obtain accurate calculations. 
void dft_TwoDimensional(COMPLEX **d, int lx, int ly, int flag){int merr=0, elements=lx;
    if (verbose) printf("execution: %s\t%d\n", __func__, flag);
    COMPLEX *rotor = (COMPLEX *)calloc(lx > ly ? lx : ly, sizeof(COMPLEX));
    if(!rotor){ fprintf(stderr, "Memory allocation failed\n"); return; }
    else{ elements  = lx;            // 回転因子の計算
        double unit = (flag == IDFT) ? 2 * M_PI / elements : -2 * M_PI / elements;
        for(int e=0; e<elements; e++) rotor[e] = RotorCOMPLEX(unit * e);
    }
    // 行ごとのDFT
    #pragma omp parallel for
    for(int y=0; y<ly; y++){ COMPLEX *tmp = malloc(elements * sizeof(COMPLEX));
        if( !tmp ){  merr=1; } else {   // 各列に対してDFT計算
            for(int u=0; u<elements; u++){ COMPLEX t = {0, 0};
                for (int v = 0; v < elements; v++)
                    AppendCOMPLEX(t, MulCOMPLEX(d[y][v], rotor[(u * v) % elements]));
                tmp[u] = t;
            }
            if(flag==IDFT){ double reciprocal = 1.0 / elements; //Tips to avoid division
                for(int e=0;e<elements;e++) d[y][e] = ImeMulCOMPLEX(tmp[e], reciprocal);
            }EE for(int e=0;e<elements;e++) d[y][e] = tmp[e];
        }   free(tmp);
    }
    // 列ごとのDFT
    if( lx != ly ){ elements = ly;   // 回転因子の再計算
        double unit = (flag == IDFT) ? 2 * M_PI / elements : -2 * M_PI / elements;
        for(int e=0; e<elements; e++) rotor[e] = RotorCOMPLEX(unit * e);
    }
    #pragma omp parallel for
    for(int x=0; x<lx; x++){ COMPLEX *tmp = malloc(elements * sizeof(COMPLEX));
        if( !tmp ){ merr=1; } else {
            for(int u=0; u<elements; u++){ COMPLEX t = {0, 0};
                for (int v=0; v<elements; v++)
                    AppendCOMPLEX(t, MulCOMPLEX(d[v][x], rotor[(u * v) % elements]));
                tmp[u] = t;
            }   
            if(flag==IDFT){ double reciprocal = 1.0 / elements; //Tips to avoid division
                for(int e=0;e<elements;e++) d[e][x] = ImeMulCOMPLEX(tmp[e], reciprocal);
            }EE for(int e=0;e<elements;e++) d[e][x] = tmp[e];
        }   free(tmp);
    }
    if (merr)fprintf(stderr,"Processing interrupted due to memory allocation failure.\n");
    free(rotor);
}
////////////////////////////////////////////////////////////////////////////////
//  Fourier Transform CONTROL BLOCK
//
//  samples       : number of actual data (Fixed input and output)
//  conforming_fft: Number of data used in CZT conversion (power of 2)
//  W[2]          : For FFT/IFFT, Twiddle factor table
//  w             : The number of weight data elements is (samples)
//  v             : The number of impulse response data elements(conforming_fft)
//
//  Specialized in image processing, output resizing function has been discarded
//  N + M -1,  (N + M -1)< 2**n.  n is ....
////////////////////////////////////////////////////////////////////////////////
//  ROTATION_FACTOR
typedef struct ROTATION_FACTOR { int elements, flag; COMPLEX *table; }
               ROTATION_FACTOR, ROTATION_TABLE;
typedef struct FT_CONTROL {
    int  samples;        // 標本点の数
    int  conforming_fft; // 2の整数乗値：flag_fft?samples:2*get_czt_size(samples)
    ROTATION_FACTOR W[2];// 回転因子用テーブル
    COMPLEX *w;          // 重みデータ            要素数は(samples)
    COMPLEX *v;          // インパルス応答データ  要素数は(conforming_fft)
} FT_CONTROL;
////////////////////////////////////////////////////////////////////////////////
//// A function that generates a structure for Chirp Z-transform for a given 
//// integer value and returns its pointer.
//// You MUST call "free()" with this pointer when it is no longer needed.
//// 
FT_CONTROL *GenerateFT_CONTROL( int n ){
    int samples        = n;
    int conforming_fft = ft_size_check(n) ? n : get_czt_size(n), nx = conforming_fft;
    FT_CONTROL *a = (FT_CONTROL*)malloc( sizeof(FT_CONTROL)
                                       + sizeof(COMPLEX)*( 2*conforming_fft )
                                       + sizeof(COMPLEX)*( samples + (conforming_fft +1)) );
    if(!a){ return NULL;}             // +1 for dummy
    *a = (FT_CONTROL){  // 標本点の数,  ２の冪：２の整数乗値
          .samples = samples,  .conforming_fft = conforming_fft, 
          .W       = { { nx,  FFT,   (COMPLEX*)&a[1]      }, // 順変換FFT用回転因子値テーブル
                       { nx, IFFT, &((COMPLEX*)&a[1])[nx] }, // 逆変換FFT用回転因子値テーブル
                     },
          .w       = &((COMPLEX*)&a[1])[2*nx],   // 重みデータ           要素数は(samples)
          .v       = &((COMPLEX*)&a[1])[n+2*nx], // インパルス応答データ 要素数は(conforming_fft)
         };
         
    COMPLEX *Forward = a->W[0].table, *Reverse = a->W[1].table; // 0:FFT, 1:IFFT
    double  unit = -2*PI/nx;
    for(int e=0;e<nx;e++) Reverse[e]=ConjCOMPLEX( Forward[e]=RotorCOMPLEX(unit*e) );
    
    COMPLEX *w=a->w, *v=a->v;
    double  theta = PI/n;
    for(int i=n;i<conforming_fft;i++) v[i]=(COMPLEX){ 0, 0 };
    for(int tail=conforming_fft, n2=n<<1, here=0, i=0;i<n;i++ ){
        double s, c; sincos( theta*here, &s,&c); // 1.0*cos(), 1.0*sin() amplitude and phase
             here = ( here +1 + (i<<1) ) % n2 ;  // 角度情報 データー数依存なので 1例 factor.mp4
        w[i]      = (COMPLEX){ c, -s };          // complex conjugate:  0<= { w[i] <-> v[i] } < n
        v[i]      = (COMPLEX){ c,  s };
        v[tail-i] = (COMPLEX){ c,  s };          // stretching ; v[tail] was keeded as dummy area
    }   fft_Multiline_with_RotationFactors( a->W[0].table, &v, conforming_fft, 1, FFT );
    
    return a;
}

static inline void fft_Multiline_with_RotationFactors( const COMPLEX *rotor, COMPLEX **c, 
                                                       int elements, int lines, int flag ){
    //2の冪(べき)のみ、power of two, ONLY
    if( !ft_size_check(elements) ){ fprintf(stderr,"power of two, ONLY. %d\n",elements); return; }
    #pragma omp parallel for
    for(int e=0; e<lines; e++ ){
        for(int i=0, j=1; j<elements-1; j++ ) {           //// scrambler, for initial placement
            for(int k=elements>>1; k>(i^=k); k>>=1 );
            if( j<i ){COMPLEX temp=c[e][j]; c[e][j]=c[e][i], c[e][i]=temp; } // exchange
        }
        for(int m, mh=1; (m=mh<<1)<=elements; mh=m ) {    //// Operations
             for(int ir=0, i=0; i<elements; i+=m ) {      // ir=0:  Rotation angle is 0 radian.
                COMPLEX temp, W=rotor[ir];                // W: Rotation factor
                for(int k=elements>>2; k>(ir^=k); k>>=1 );
                for(int j=i; j<mh+i; j++ ) { int a = j,  b = j + mh;
                       temp = SubCOMPLEX(c[e][a],c[e][b]);// For butterfly operation
                    c[e][a] = AddCOMPLEX(c[e][a],c[e][b]);// a ->- (a + b) ->- a
                    c[e][b] = MulCOMPLEX(temp, W);        // b ->- (a-b)*W ->- b
        }   }   }   double reciprocal = 1./elements;
        if(flag==IFFT)for(int i=0; i<elements; c[e][i]=ImeMulCOMPLEX(c[e][i],reciprocal),i++);
    }
}
void FourierTransform( COMPLEX **c, int lx, int ly, int flag ){ int st = 1;
    FT_CONTROL *CtlX = NULL, *CtlY = NULL;
    COMPLEX     **tx = NULL,  **ty = NULL;
    
    if(!(CtlX = GenerateFT_CONTROL(lx))) goto GOOD_BY;
    if(!(CtlY = GenerateFT_CONTROL(ly))) goto GOOD_BY;

    int fft_lx = CtlX->conforming_fft, fft_ly = CtlY->conforming_fft;
    if(!( tx = (COMPLEX**)alloc2Darray(sizeof(COMPLEX), fft_lx, ly))) goto GOOD_BY;
    if(!( ty = (COMPLEX**)alloc2Darray(sizeof(COMPLEX), fft_ly, lx))) goto GOOD_BY;

    int fftX = ft_size_check(lx), fftY = ft_size_check(ly);
    COMPLEX *RotorXf = CtlX->W[0].table, *RotorXr = CtlX->W[1].table;
    COMPLEX *RotorYf = CtlY->W[0].table, *RotorYr = CtlY->W[1].table;
    
    if( fftX ){ fft_Multiline_with_RF( CtlX->W[(flag==FFT?0:1)].table, c, lx, ly,  flag );
        for(int y=0;y<ly;y++)for(int x=0;x<lx;x++) ty[x][y] = c[y][x];
    }else{ int  samples = CtlX->samples, conforming_fft = CtlX->conforming_fft;
        COMPLEX *w=CtlX->w, *v=CtlX->v;
        if(flag==CZT){
            #pragma omp parallel for
            for(int y=0;y<ly;y++)for(int e=0; e<samples;        e++) tx[y][e] = MulCOMPLEX( c[y][e], w[e] );
            fft_Multiline_with_RF( RotorXf, tx, conforming_fft, ly,  FFT );
            #pragma omp parallel for
            for(int y=0;y<ly;y++)for(int e=0; e<conforming_fft; e++) tx[y][e] = MulCOMPLEX(tx[y][e], v[e] ); 
            fft_Multiline_with_RF( RotorXr, tx, conforming_fft, ly,  IFFT );
            #pragma omp parallel for
            for(int y=0;y<ly;y++)for(int e=0; e<samples;        e++) ty[e][y] = MulCOMPLEX(tx[y][e], w[e] );
        }else{ double m = 1.0/samples;    //  ICZT
            #pragma omp parallel for
            for(int y=0;y<ly;y++)for(int e=0; e<samples;        e++) tx[y][e] = MulCOMPLEX( c[y][e], ConjCOMPLEX(w[e]) );
            fft_Multiline_with_RF( RotorXf, tx, conforming_fft, ly,  FFT );
            #pragma omp parallel for
            for(int y=0;y<ly;y++)for(int e=0; e<conforming_fft; e++) tx[y][e] = MulCOMPLEX(tx[y][e], ConjCOMPLEX(v[e]) ); 
            fft_Multiline_with_RF( RotorXr, tx, conforming_fft, ly,  IFFT );
            #pragma omp parallel for
            for(int y=0;y<ly;y++)for(int e=0; e<samples;        e++) ty[e][y] = ImmediateMulCOMPLEX( m,
                                                                                MulCOMPLEX(tx[y][e], ConjCOMPLEX(w[e]) ));
        }
    }
    if( fftY ){ fft_Multiline_with_RF( CtlY->W[(flag==FFT?0:1)].table, ty, ly, lx,  flag );
        for(int y=0;y<ly;y++)for(int x=0;x<lx;x++)  c[y][x] = ty[x][y];
    }else{ int  samples = CtlY->samples, conforming_fft = CtlY->conforming_fft;
        COMPLEX *w=CtlY->w, *v=CtlY->v;
        if(flag==CZT){
            #pragma omp parallel for
            for(int x=0;x<lx;x++)for(int e=0; e<samples;        e++) ty[x][e] = MulCOMPLEX(ty[x][e], w[e] );
            fft_Multiline_with_RF( RotorYf, ty, conforming_fft, lx,  FFT );
            #pragma omp parallel for
            for(int x=0;x<lx;x++)for(int e=0; e<conforming_fft; e++) ty[x][e] = MulCOMPLEX(ty[x][e], v[e] ); 
            fft_Multiline_with_RF( RotorYr, ty, conforming_fft, lx,  IFFT );
            #pragma omp parallel for
            for(int x=0;x<lx;x++)for(int e=0; e<samples;        e++)  c[e][x] = MulCOMPLEX(ty[x][e], w[e] );
        }else{ double m = 1.0/samples;    //  ICZT
            #pragma omp parallel for
            for(int x=0;x<lx;x++)for(int e=0; e<samples;        e++) ty[x][e] = MulCOMPLEX(ty[x][e], ConjCOMPLEX(w[e]) );
            fft_Multiline_with_RF( RotorYf, ty, conforming_fft, lx,  FFT );
            #pragma omp parallel for
            for(int x=0;x<lx;x++)for(int e=0; e<conforming_fft; e++) ty[x][e] = MulCOMPLEX(ty[x][e], ConjCOMPLEX(v[e]) ); 
            fft_Multiline_with_RF( RotorYr, ty, conforming_fft, lx,  IFFT );
            #pragma omp parallel for
            for(int x=0;x<lx;x++)for(int e=0; e<samples;        e++)  c[e][x] = ImmediateMulCOMPLEX( m,
                                                                                MulCOMPLEX(ty[x][e], ConjCOMPLEX(w[e]) ) );
        }
    }
    st = 0;
  GOOD_BY:; if(st)fprintf(stderr,"FourierTransform() failed to allocate working memory area.\n");
    free(ty);   free(tx);
    free(CtlY); free(CtlX);
}
////////////////////////////////////////////////////////////////////////////////
// Global variables
int verbose=0, nega=0, auto_adjust=0, max_num_threads=0 ;
////////////////////////////////////////////////////////////////////////////////
// Primitive functions
//
unsigned int int2PseudoColor(unsigned int v) {
    register unsigned int r=0, g=0, b=0 ;
    if( (v&0x00ffffff) == 0 ) return 0 ;
    if(v & 0x000001) b|=0x80;; if(v & 0x000002) g|=0x80;; if(v & 0x000004) r|=0x80;
    if(v & 0x000008) b|=0x40;; if(v & 0x000010) g|=0x40;; if(v & 0x000020) r|=0x40;
    if(v & 0x000040) b|=0x20;; if(v & 0x000080) g|=0x20;; if(v & 0x000100) r|=0x20;
    if(v & 0x000200) b|=0x10;; if(v & 0x000400) g|=0x10;; if(v & 0x000800) r|=0x10;
    if(v & 0x001000) b|=0x08;; if(v & 0x002000) g|=0x08;; if(v & 0x004000) r|=0x08;
    if(v & 0x008000) b|=0x04;; if(v & 0x010000) g|=0x04;; if(v & 0x020000) r|=0x04;
    if(v & 0x040000) b|=0x02;; if(v & 0x080000) g|=0x02;; if(v & 0x100000) r|=0x02;
    if(v & 0x200000) b|=0x01;; if(v & 0x400000) g|=0x01;; if(v & 0x800000) r|=0x01;
    return (r <<16) | (g << 8) | b ;
}
void **alloc2D( void *data, int s, int lx, int ly ){
  // alloc2D: 既存のメモリブロックに2Dインデックスを作成
  // data:    初期化済みのメモリブロック（未初期化の場合、未定義動作）
  // s:       既存ブロックの１要素分のサイズ, lx: 幅, ly: 高さ
  long long i, w=lx, h=ly;  if( !data || s<1 || lx<1 || ly<1 ) return NULL;
  void **d, *base=data ;
    if( (d=(void **)malloc(h*sizeof(void *))) == NULL ) return NULL ;
    for( i=0; i<ly; i++ ) d[i] = (void *)((unsigned char *)base + w*s*i) ;
    return d ;
}
void **alloc2Darray(int s, int lx, int ly){ if(s<1 || lx<1 || ly<1) return NULL;
    long long w = lx, h = ly, next = s*w, space = h*sizeof(void *) + w*h*s + 64;
    void    **d = (void **)calloc(1, space);
    if(!d){fprintf(stderr,"alloc2Darray failed: %lld bytes\n",space); return NULL;}
    void  *base = (void*)((uintptr_t)((char*)d + h * sizeof(void*) + 63) & ~63LL);
    for(long long i=0; i<h; i++) d[i] = (void*)((char*)base + next * i);
    return d;                         // C99: uintptr_t same as (void *)
}
void int2PIXEL( int **s, int **d, int lx, int ly ){
    if( !s || !d || lx<1 || ly<1) return ;
    PIXEL **pd = (PIXEL**)d;
    for(int value, y=0;y<ly;y++)for(int x=0;x<lx;x++){
        value = s[y][x]<0 ?  0 : ( s[y][x]>255 ? 255 : s[y][x] );
        pd[y][x] = (PIXEL){ value, value, value, 0 };
    }
}
void int2PIXEL_adjust( int **s, int **d, int lx, int ly ){
    if( !s || !d || lx<1 || ly<1) return ;
    int min=s[0][0], max=s[0][0];
    PROCESS_ALL_PIXELS  IF(min>s[y][x])min=s[y][x]; EI(max<s[y][x])max=s[y][x];
    int div = min!=max ? (max-min) : 1;  if(div==1) min-=127;
    
    PIXEL **pd = (PIXEL**)d;
    for(int value, y=0;y<ly;y++)for(int x=0;x<lx;x++){
        value = 255*(s[y][x]-min)/div;
        pd[y][x] = (PIXEL){ value, value, value, 0 };
    }
}
int saveDataAsBMP32( char *fname, void *b, int lx, int ly ){
  HEADER h = {BMPID, lx*abs(ly)*sizeof(PIXEL)+sizeof(HEADER), 0, 0, sizeof(HEADER),
              BMPIFHSIZE, lx, ly, 1, 32, 0, lx*abs(ly)*sizeof(PIXEL), 0, 0, 0, 0 };
  FILE *fp=NULL ;
  int  st=-1;
    if( (fp=fopen(fname, "wb")) == NULL ) {
        fprintf( stderr, "File '%s': cannot create.\n", fname );
    } else if ( fwrite( &h, sizeof(HEADER), 1, fp ) != 1 ) {
        fprintf( stderr, "File '%s': Header write error.\n", fname );
    } else if ( fwrite( b, h.biSizeImage, 1, fp ) != 1 ) {
        fprintf( stderr, "File '%s': Data write error.\n", fname );
    } else st=0;
    if(fp)fclose(fp) ;
    return st;
}
////////////////////////////////////////////////////////////////////////////////
// パブリックドメイン利用の32ビットのハッシュ関数 セキュリティ用の強度は無い事に注意
//
// Public domain. FNV-1a based on http://www.isthe.com/chongo/tech/comp/fnv/
// Public domain. MurmurHash2 based on https://github.com/aappleby/smhasher, 
// modified with expansion for collision resistance
// And, following code same too.
#include <stdint.h>
// FNV-1a
uint32_t fnv1a_hash(const void *key, size_t len){ if( !key ) return 0;
    uint32_t hash = 0x811C9DC5;
    const uint8_t *data = (const uint8_t *)key;
    for(size_t i=0; i<len; i++){  hash ^= data[i];  hash *= 0x01000193;  }
    return hash;
}
// MurmurHash2: 高速かつ高品質な非暗号学的ハッシュ関数
// key:  ハッシュ対象のデータ
// len:  データのバイト長
// seed: ハッシュの初期シード値
uint32_t MurmurHash2(const void *key, int len, uint32_t seed) {
    const uint32_t m = 0x5bd1e995; // 'm' と 'r' はミキシング定数。
    const int      r = 24;         // 特定の値は衝突を最小化するために選ばれている
    uint32_t       h = seed ^ len; // ハッシュ値をシードで初期化
    // データポインタをバイト単位で処理
    const unsigned char *data = (const unsigned char *)key;
    
    while( len>=4 ){// 4バイト単位でデータを処理
        // 4バイトをuint32_tとして取得（リトルエンディアン）
        uint32_t k = *(uint32_t *)data;
        k *= m;    // ミキシングステップ
        k ^= k >> r;
        k *= m;
        h *= m;    // ハッシュ値にミックス
        h ^= k;
        data += 4; // 次の4バイトへ
        len  -= 4;
    }
    // 残りのバイトを処理（末尾処理）
    switch( len ){
        case 3: h ^= data[2] << 16;  // Fallthrough
        case 2: h ^= data[1] <<  8;  // Fallthrough
        case 1: h ^= data[0];
                h *= m;
    }
    h ^= h >> 13; // 最終ミキシング（アバランシェ効果を強化）
    h *= m;
    h ^= h >> 15;
    return h;
}
// Create a hash data block for MurmurHash 32bit version    (MAX HashDB: 64G Bytes)
HashDB* make_HashDBmax(){ return (HashDB*)calloc(0x100000000,sizeof(HashDB)); }    // MAX 40億件
#define make_HashDB512M()        make_HashDB( 0x20000000 )   // MAX 5.3億件 8G
#define make_HashDB256M()        make_HashDB( 0x10000000 )   // MAX 2.5億件 4G
HashDB* make_HashDB(uint32_t size){ return (HashDB*)calloc(size,sizeof(HashDB)); }
// uint32_t HDB512 = (MurmurHash2( const void *key, size_t len, uint32_t seed ))>>3; 広く分布させる使い方
// uint32_t HDB256 = (MurmurHash2( const void *key, size_t len, uint32_t seed ))>>4; 広く分布させる使い方
// 上位ビットを捨てると狭い範囲に集中し易くなる、コンフリクトを起こしやすくなる。

void HomographyTrans( PIXEL **c, int lx, int ly, PIXEL **s, int slx, int sly, double inv[3][3] ){
    #pragma omp parallel for
    for(int y=0;y<ly;y++)for(int x=0;x<lx;x++){ POINTf destination = (POINTf){x,y};
        POINTf source = mtxMul( inv, destination );
        int    cx=source.x +.5, cy=source.y +.5;
        c[y][x] = (cx<0 || cy<0 || slx<=cx || sly<=cy) ? (PIXEL){255,255,255,0}
                : s[cy][cx];
    }
}
void HomographyTransMask( PIXEL **c, int lx, int ly, PIXEL **s, int slx, int sly, int **mask, double inv[3][3] ){
    #pragma omp parallel for
    for(int y=0;y<ly;y++)for(int x=0;x<lx;x++){ POINTf destination = (POINTf){x,y};
        POINTf source = mtxMul( inv, destination );
        int    cx=source.x +.5, cy=source.y +.5; 
        if( (cx<0 || cy<0 || slx<=cx || sly<=cy) || mask[cy][cx] )continue;
        c[y][x] = s[cy][cx];
    }
}
/////////////////////////////////////////////////////////////////////////////////////////////////
// DrawXXXX functions
//
void DrawLine(PIXEL **canvas, int lx, int ly, int x0, int y0, int x1, int y1, PIXEL pen) {
    int  dx = abs(x1 - x0), sx = (x0 < x1) ? 1 : -1;
    int  dy = abs(y1 - y0), sy = (y0 < y1) ? 1 : -1;
    int err = dx - dy;
    
    while(1){ if(x0>=0 && x0<lx && y0>=0 && y0<ly) canvas[y0][x0] = pen;
        if (x0 == x1 && y0 == y1) break;
        int e2 = 2 * err;
        if (e2 > -dy) { err -= dy; x0 += sx; }
        if (e2 <  dx) { err += dx; y0 += sy; }
    }
}
void DrawLineExor(PIXEL **canvas, int lx, int ly, int x0, int y0, int x1, int y1, PIXEL pen) {
    int dx = abs(x1 - x0), sx = (x0 < x1) ? 1 : -1;
    int dy = abs(y1 - y0), sy = (y0 < y1) ? 1 : -1;
    int **c=(int**)canvas, p=*(int*)&pen, err = dx - dy;
    while(1){ if(x0>=0 && x0<lx && y0>=0 && y0<ly) c[y0][x0] ^= p;
        if (x0 == x1 && y0 == y1) break;
        int e2 = 2 * err;
        if (e2 > -dy) { err -= dy; x0 += sx; }
        if (e2 <  dx) { err += dx; y0 += sy; }
    }
}
void DrawPolyline(PIXEL **canvas, int lx, int ly, int num_points, POINTi *points, int close, PIXEL pen) {
    if( num_points < 2 ) return;  // エッジケースのチェック
    int t = num_points -1;
    for(int i=0; i<t; i++ ) DrawLine(canvas,lx,ly, points[i].x, points[i].y, points[i+1].x, points[i+1].y,pen);
    if(close&&num_points>2) DrawLine(canvas,lx,ly, points[t].x, points[t].y, points[0].x,   points[0].y,  pen);
}
void DrawCircle(PIXEL **canvas, int lx, int ly, int cx, int cy, int r, PIXEL pen) {
    for(int x=0, y=r, d=1-r; x<=y; x++) {
        if(cx + x >= 0 && cx + x < lx && cy + y >= 0 && cy + y < ly) canvas[cy + y][cx + x] = pen;
        if(cx - x >= 0 && cx - x < lx && cy + y >= 0 && cy + y < ly) canvas[cy + y][cx - x] = pen;
        if(cx + x >= 0 && cx + x < lx && cy - y >= 0 && cy - y < ly) canvas[cy - y][cx + x] = pen;
        if(cx - x >= 0 && cx - x < lx && cy - y >= 0 && cy - y < ly) canvas[cy - y][cx - x] = pen;
        if(cx + y >= 0 && cx + y < lx && cy + x >= 0 && cy + x < ly) canvas[cy + x][cx + y] = pen;
        if(cx - y >= 0 && cx - y < lx && cy + x >= 0 && cy + x < ly) canvas[cy + x][cx - y] = pen;
        if(cx + y >= 0 && cx + y < lx && cy - x >= 0 && cy - x < ly) canvas[cy - x][cx + y] = pen;
        if(cx - y >= 0 && cx - y < lx && cy - x >= 0 && cy - x < ly) canvas[cy - x][cx - y] = pen;
        if(d<0) d = d + 2 * x + 3;
        else {  d = d + 2 * (x - y) + 5;  y--; }
    }
}
void DrawArc(PIXEL **canvas, int lx, int ly, int cx, int cy, int r, int start_angle, int end_angle, PIXEL pen) {
    start_angle     = ((start_angle % 360) + 360) % 360;
    end_angle       = ((  end_angle % 360) + 360) % 360;
    int wrap_around = (start_angle > end_angle);
    
    for(int x=0, y=r, d=1-r; x<=y; x++) {
        // 8つの対称点をチェック
        int points[8][2] = {
            {cx + x, cy + y}, {cx - x, cy + y}, {cx + x, cy - y}, {cx - x, cy - y},
            {cx + y, cy + x}, {cx - y, cy + x}, {cx + y, cy - x}, {cx - y, cy - x}
        };
        for(int i = 0; i < 8; i++) {
            int px = points[i][0];
            int py = points[i][1];
            if (px >= 0 && px < lx && py >= 0 && py < ly) {
                double angle = atan2(py - cy, px - cx) * 180.0 / M_PI;
                if (angle < 0) angle += 360;
                if ((!wrap_around && angle >= start_angle && angle <= end_angle) ||
                    (wrap_around && (angle >= start_angle || angle <= end_angle))) {
                    canvas[py][px] = pen;
        }   }   }
        if(d<0) d = d + 2 * x + 3;
        else {  d = d + 2 * (x - y) + 5;  y--; }
    }
}
void DrawEllipse(PIXEL **canvas, int lx, int ly, int x0, int y0, int x1, int y1, PIXEL pen) {
    int left   = (x0 < x1) ? x0 : x1;
    int right  = (x0 > x1) ? x0 : x1;
    int top    = (y0 < y1) ? y0 : y1;
    int bottom = (y0 > y1) ? y0 : y1;
    
    int cx = left + ( right - left) / 2;
    int cy = top  + (bottom - top ) / 2;
    int  a = (right  - left) / 2;
    int  b = (bottom - top ) / 2;
    if (a <= 0 || b <= 0) return;
    
    const double step_deg = 0.1;
    for(double angle = 0; angle < 360; angle += step_deg) {
        double rad = angle * M_PI / 180.0;
        int px = cx + round(a * cos(rad));
        int py = cy + round(b * sin(rad));
        if(px >= 0 && px < lx && py >= 0 && py < ly) canvas[py][px] = pen;
    }
}
//////////////////////////////////////////////////////////////////////////////////////////
// Function set for algorithm verification
//
#define dabs(a)    ((a)<0? -(a):(a))
void lu_decomposition_DoublePrecision(double** A, long double** L, long double** U, long long* P, int n) {
    for(int i=0; i<n; i++){  P[i] = i;
        for(int j=0; j<n; j++) {
            L[i][j] = 0.0;
            U[i][j] = A[i][j];
    }   }
    
    for(int k=0; k<n; k++) {
        double max  = 0.0;
        int k_prime = k;
        for(int i=k; i<n; i++) {
            if(dabs(U[i][k]) > max) {
               max     = dabs(U[i][k]);
               k_prime = i;
        }   }
        if(k_prime != k){ long double* temp_row;    // 入れ替え
            long long  tmp = P[k];  P[k] = P[k_prime];  P[k_prime] = tmp;
                  temp_row = U[k];  U[k] = U[k_prime];  U[k_prime] = temp_row;
                  temp_row = L[k];  L[k] = L[k_prime];  L[k_prime] = temp_row;
        }
        for(int i=k +1; i<n; i++){ L[i][k]  = U[i][k] / U[k][k];
            for(int j=k; j<n; j++) U[i][j] -= L[i][k] * U[k][j];
        }
        for(int j=0; j<=k; j++)  L[k][j] = (k==j) ? 1.0 : L[k][j];
    }
}
void solve_LU_DoublePrecision(long double** L, long double** U, long long* P, long double* b, long  double* x, int n) {
    long double*  y = (long double*)malloc(n * sizeof(long double));
    long double* pb = (long double*)malloc(n * sizeof(long double));
    
    if( y && pb ){ for(int i=0; i<n; i++) pb[i] = b[P[i]];
        // 前進代入（Ly = Pb）
        for(int i=0; i<n; i++) {
            y[i] = pb[i];
            for(int j=0; j<i; j++) y[i] -= L[i][j] * y[j];
        }
        // 後退代入（Ux = y）
        for(int i= n -1; i>=0; i--) {
            x[i] = y[i];
            for(int j=i +1; j<n; j++) x[i] -= U[i][j] * x[j];
            x[i] /= U[i][i];
        }
    } else { perror("Memory allocation error in solve_LU_DoublePrecision\n"); exit(1); }
    
    free(y); free(pb);
}

void inverse_matrix_DoublePrecision(double** A, double** A_inv, int n) {
    long double** L = (long double**)malloc(n * sizeof(long double*));
    long double** U = (long double**)malloc(n * sizeof(long double*));
    long long*    P = (long long*)malloc(n * sizeof(long long));
    
    if( L && U && P ){ 
        for(int i=0; i<n; i++) {
            L[i] = (long double*)calloc(n, sizeof(long double));
            U[i] = (long double*)calloc(n, sizeof(long double));
            if( !L[i] && !U[i] ){ 
                perror("Memory allocation error L[i],U[i] in inverse_matrix\n");
                exit(1); 
        }   }
        lu_decomposition_DoublePrecision(A, L, U, P, n);
        
        // 並列化：列ごとに並列に解く
        #pragma omp parallel for schedule(dynamic)
        for(int i=0; i<n; i++) {
            long double* e = (long double*)calloc(n, sizeof(long double));
            long double* x = (long double*)calloc(n, sizeof(long double));
            if( e && x ){
                e[i] = 1.0;
                solve_LU_DoublePrecision(L, U, P, e, x, n);
                for(int j=0; j<n; j++) A_inv[j][i] = x[j];
            } else { perror("Memory allocation error for solve_LU_DoublePrecision\n"); exit(1); }
            free(e);
            free(x);
        }//   if( n>=1000 )printf("\n" );
    } else { perror("Memory allocation error in inverse_matrix\n"); exit(1); }
    
    for(int i=0; i<n; i++){ free(L[i]); free(U[i]); }
    
    free(L); free(U); free(P);
}

void lu_decomposition(double** A, double** L, double** U, int* P, int n) {
    for(int i=0; i<n; i++){  P[i] = i;
        for(int j=0; j<n; j++) {
            L[i][j] = 0.0;
            U[i][j] = A[i][j];
    }   }
    
    for(int k=0; k<n; k++) {
        double max  = 0.0;
        int k_prime = k;
        for(int i=k; i<n; i++) {
            if(fabs(U[i][k]) > max) {
               max     = fabs(U[i][k]);
               k_prime = i;
        }   }
        if(k_prime != k){ double* temp_row;    // 入れ替え
            int  tmp = P[k];  P[k] = P[k_prime];  P[k_prime] = tmp;
            temp_row = U[k];  U[k] = U[k_prime];  U[k_prime] = temp_row;
            temp_row = L[k];  L[k] = L[k_prime];  L[k_prime] = temp_row;
        }
        for(int i=k +1; i<n; i++){ L[i][k]  = U[i][k] / U[k][k];
            for(int j=k; j<n; j++) U[i][j] -= L[i][k] * U[k][j];
        }
        for(int j=0; j<=k; j++)  L[k][j] = (k==j) ? 1.0 : L[k][j];
    }
}
void solve_LU(double** L, double** U, int* P, double* b, double* x, int n) {
    double*  y = (double*)malloc(n * sizeof(double));
    double* pb = (double*)malloc(n * sizeof(double));
    
    if( y && pb ){ for(int i=0; i<n; i++) pb[i] = b[P[i]];
        // 前進代入（Ly = Pb）
        for(int i=0; i<n; i++) {
            y[i] = pb[i];
            for(int j=0; j<i; j++) y[i] -= L[i][j] * y[j];
        }
        // 後退代入（Ux = y）
        for(int i= n -1; i>=0; i--) {
            x[i] = y[i];
            for(int j=i +1; j<n; j++) x[i] -= U[i][j] * x[j];
            x[i] /= U[i][i];
        }
    } else { perror("Memory allocation error in solve_LU\n"); exit(1); }
    
    free(y); free(pb);
}
void inverse_matrix(double** A, double** A_inv, int n) {
    double** L = (double**)malloc(n * sizeof(double*));
    double** U = (double**)malloc(n * sizeof(double*));
    int*     P = (int*)malloc(n * sizeof(int));
    
    if( L && U && P ){ 
        for(int i=0; i<n; i++) {
            L[i] = (double*)calloc(n, sizeof(double));
            U[i] = (double*)calloc(n, sizeof(double));
            if( !L[i] && !U[i] ){ 
                perror("Memory allocation error L[i],U[i] in inverse_matrix\n");
                exit(1); 
        }   }
        lu_decomposition(A, L, U, P, n);
        // 並列化：列ごとに並列に解く
        #pragma omp parallel for
        for(int i=0; i<n; i++) {
            double* e = (double*)calloc(n, sizeof(double));
            double* x = (double*)calloc(n, sizeof(double));
            if( e && x ){
                e[i] = 1.0;
                solve_LU(L, U, P, e, x, n);
                for(int j=0; j<n; j++) A_inv[j][i] = x[j];
            } else { perror("Memory allocation error for solve_LU\n"); exit(1); }
            free(e);
            free(x);
        }
    } else { perror("Memory allocation error in inverse_matrix\n"); exit(1); }
    
    for(int i=0; i<n; i++){ free(L[i]); free(U[i]); }
    free(L); free(U); free(P);
}
// 行列積を計算
void multiplication_matrix(double **A, double **B, double **C, int n) {
    #pragma omp parallel for
    for(int i=0; i<n; i++){
        for(int j=0; j<n; j++){
            double sum = 0.0;
            for(int k=0; k<n; k++) sum += A[i][k] * B[k][j];
            C[i][j] = sum;
    }   }
}
void dump_array(FILE *fp, char *fmt, char *msg, double ** A, int N) {
    if(msg) fprintf(fp, "\n%s\n",msg);
    else    fprintf(fp, "\n");
    for(int y=0; y<N; y++) {
        fprintf(fp, "% 6d: \t", y);
        for(int x=0; x<N; x++) fprintf(fp, fmt, A[y][x] + 0.0000001);
        fprintf(fp, "\n");
    }   fprintf(fp, "\n");
}

void HomographyCore( POINTf sp[4], POINTf dp[4], double hkl[3][3] ){
    POINTf  SA = sp[0],  SB = sp[1],  SC = sp[2],  SD = sp[3];
    POINTf  DA = dp[0],  DB = dp[1],  DC = dp[2],  DD = dp[3];
    double tranp[8]  = { DA.x, DA.y, DB.x, DB.y, DC.x, DC.y, DD.x, DD.y };
    double abcdefgh1[3*3]={0,0,0,0,0,0,0,0,1}; //Coefficient of 3x3 matrix
    double hg[8][8] = {
             { SA.x, SA.y,  1,     0,    0,  0,  -SA.x*DA.x, -SA.y*DA.x },
             {    0,    0,  0,  SA.x, SA.y,  1,  -SA.x*DA.y, -SA.y*DA.y },
             { SB.x, SB.y,  1,     0,    0,  0,  -SB.x*DB.x, -SB.y*DB.x },
             {    0,    0,  0,  SB.x, SB.y,  1,  -SB.x*DB.y, -SB.y*DB.y },
             { SC.x, SC.y,  1,     0,    0,  0,  -SC.x*DC.x, -SC.y*DC.x },
             {    0,    0,  0,  SC.x, SC.y,  1,  -SC.x*DC.y, -SC.y*DC.y },
             { SD.x, SD.y,  1,     0,    0,  0,  -SD.x*DD.x, -SD.y*DD.x },
             {    0,    0,  0,  SD.x, SD.y,  1,  -SD.x*DD.y, -SD.y*DD.y },
           },invhg[8][8], *phg[8], *pinvhg[8];  // inverseHomography
    //////////////////////////////////////////////////////////////////////
    for(int e=0; e<8; e++){ phg[e]=hg[e]; pinvhg[e]=invhg[e];  }
    inverse_matrix( phg, pinvhg, 8 );  // Calculate the inverse matrix
    for(int y=0;y<8;y++)for(int x=0;x<8;x++) abcdefgh1[y] += invhg[y][x]*tranp[x];
    for(int z=0,y=0;y<3;y++)for(int x=0;x<3;x++) hkl[y][x] = abcdefgh1[z++];
}
void Homography( PIXEL **s, int slx, int sly, int **mask, POINTf sp[4],
                 PIXEL **d, int lx,  int ly,  POINTf dp[4] ){
    //////////////////////////////////////////////////////////////////////
    double homographyMatrix[3][3], transMatrix[3][3];
    double *p_homographyMatrix[3], *p_transMatrix[3];
    for(int e=0; e<3; e++){ p_homographyMatrix[e]=homographyMatrix[e];
                            p_transMatrix[e]     =transMatrix[e];    }
    HomographyCore( sp, dp, homographyMatrix );
    //////////////////////////////////////////////////////////////////////
    // Calculate the inverse matrix for image transportation
    inverse_matrix( p_homographyMatrix, p_transMatrix, 3 );
    if(!mask) HomographyTrans( d, lx, ly, s, slx, sly, transMatrix );
    else      HomographyTransMask( d, lx, ly, s, slx, sly, mask, transMatrix );
}
////////////////////////////////////////////////////////////////////////////////
// ユニオン・ファインド構造体
typedef struct {
    int *parent; // 親ノード
    int *rank;   // ランク（木の高さの近似）
    int size;    // 配列サイズ
} UnionFind;
// ユニオン・ファインドの初期化
UnionFind *uf_init(int size) {
    UnionFind *uf = (UnionFind *)malloc(sizeof(UnionFind));
    if(!uf) return NULL;
    uf->parent = (int *)malloc(size * sizeof(int));
    uf->rank   = (int *)malloc(size * sizeof(int));
    if (!uf->parent || !uf->rank) {
        free(uf->parent); free(uf->rank); free(uf);
        return NULL;
    }   uf->size = size;
    for(int i=0; i<size; i++) {
        uf->parent[i] = i; // 各ノードは自分自身を親に
        uf->rank[i]   = 0;
    }
    return uf;
}
// ルート検索（経路圧縮）
int uf_find(UnionFind *uf, int x) {
    if(uf->parent[x] != x) uf->parent[x] = uf_find(uf,uf->parent[x]);// 経路圧縮
    return uf->parent[x];
}
// 結合（ランク付き）
void uf_union(UnionFind *uf, int x, int y) {
    int root_x = uf_find(uf, x);
    int root_y = uf_find(uf, y);
    if(root_x == root_y) return;
    if(uf->rank[root_x] < uf->rank[root_y]) uf->parent[root_x] = root_y;
    EI(uf->rank[root_x] > uf->rank[root_y]) uf->parent[root_y] = root_x;
    EE{ uf->parent[root_y] = root_x;  uf->rank[root_x]++; }
}
// ユニオン・ファインドの解放
void uf_free(UnionFind *uf) {
    if(uf){ free(uf->parent);  free(uf->rank);  free(uf); }
}
// ラベル数を取得
int uf_count_labels(UnionFind *uf, int *labels, int size) {
    int *counted = (int *)calloc(uf->size, sizeof(int));
    int label_count = 0;
    for(int i = 0; i < size; i++) {
        if (labels[i] > 0) {
            int root = uf_find(uf, labels[i]);
            if (!counted[root]) {
                counted[root] = 1;
                label_count++;
    }   }   }
    free(counted);
    return label_count;
}
// バイナリファイルにラベルデータを保存
int save_label_data(char *filename, int **labels, int lx, int ly) {
    FILE *fp = fopen(filename, "wb");
    if (!fp) {
        fprintf(stderr, "Cannot create binary file '%s'.\n", filename);
        return -1;
    }
    for(int y = 0; y < ly; y++) {
        if (fwrite(labels[y], sizeof(int), lx, fp) != (size_t)lx) {
            fprintf(stderr, "Write error to '%s'.\n", filename);
            fclose(fp);
            return -1;
    }   }
    fclose(fp);
    return 0;
}
int labeling(int **b, int **labels, int lx, int ly, int connectivity) {
    if( !b || !labels || lx<1 || ly<1 )      return -1;
    if( connectivity!=4 && connectivity!=8 ) return -1;
    PROCESS_ALL_PIXELS labels[y][x]=0;                    // ラベル配列（初期値0）
    UnionFind *uf  = uf_init(lx * ly);                    // ユニオン・ファインドの初期化
    int *label_map = (int *)calloc(lx*ly+1,sizeof(int));  // ラベルを連続番号に再割り当て用
    if( !uf || !label_map){
        if(!uf) fprintf(stderr, "Union-Find initialization failed.\n");
        EE      fprintf(stderr, "Memory allocation failure for re-labels.\n");;
        free(label_map);
        uf_free(uf);
        return -1;
    }
    // 1パス目：Union-Find操作（uf_union, uf_find）の競合を避けるため逐次処理。 label:ラベル番号のカウンタ
    if( connectivity==4 ){
        for(int label=1, y=0; y<ly; y++)for(int x=0; x<lx; x++) {
            if( !b[y][x] ) continue;  // 背景ならスキップ、前景（白色ピクセル）でないとき
            int min_label = 0;
            if( y>0 && b[y-1][x] && labels[y-1][x]>0 )  min_label = labels[y-1][x];         // 上
            if( x>0 && b[y][x-1] && labels[y][x-1]>0 )                                      // 左
                if( min_label==0 || labels[y][x-1]<min_label ) min_label = labels[y][x-1];
            if(min_label==0) labels[y][x] = label++;     // 新しいラベルを割り当て
            else {           labels[y][x] = min_label;
                // 近傍のラベルを結合
                if( y>0 && b[y-1][x] && labels[y-1][x]>0 && labels[y-1][x] != min_label ) uf_union(uf, labels[y-1][x], min_label);
                if( x>0 && b[y][x-1] && labels[y][x-1]>0 && labels[y][x-1] != min_label ) uf_union(uf, labels[y][x-1], min_label);     
        }   }
    } else {
        for(int label=1, y=0; y<ly; y++)for(int x=0; x<lx; x++) {
            if( !b[y][x] ) continue;  // 背景ならスキップ、前景（白色ピクセル）でないとき
            int min_label = 0;
            if( y>0 && b[y-1][x] && labels[y-1][x]>0 )  min_label = labels[y-1][x];             // 上
            if( x>0 && b[y][x-1] && labels[y][x-1]>0 )                                          // 左
                if( min_label==0 || labels[y][x-1]<min_label ) min_label = labels[y][x-1];
            if( y>0 && x>0 && b[y-1][x-1] && labels[y-1][x-1]>0 )                               //8: 左上
                if( min_label==0 || labels[y-1][x-1]<min_label )  min_label = labels[y-1][x-1]; //8:
            if( y>0 && x < lx-1 && b[y-1][x+1] && labels[y-1][x+1]>0 )                          //8: 右上
                if( min_label==0 || labels[y-1][x+1]<min_label )  min_label = labels[y-1][x+1]; //8:
            if(min_label==0) labels[y][x] = label++;     // 新しいラベルを割り当て
            else {           labels[y][x] = min_label;
                // 近傍のラベルを結合
                if( y>0 && b[y-1][x] && labels[y-1][x]>0 && labels[y-1][x] != min_label ) uf_union(uf, labels[y-1][x], min_label);
                if( x>0 && b[y][x-1] && labels[y][x-1]>0 && labels[y][x-1] != min_label ) uf_union(uf, labels[y][x-1], min_label);
                if( y>0 && x>0    && b[y-1][x-1] && labels[y-1][x-1]>0 && labels[y-1][x-1] != min_label ) uf_union(uf, labels[y-1][x-1], min_label);//8:
                if( y>0 && x<lx-1 && b[y-1][x+1] && labels[y-1][x+1]>0 && labels[y-1][x+1] != min_label ) uf_union(uf, labels[y-1][x+1], min_label);//8:
        }   }
    }
    // 2パス目：ラベルをルートに統一
    #pragma omp parallel for collapse(2)
    for(int y=0; y<ly; y++) {
        for(int x=0; x<lx; x++) if(labels[y][x]>0) labels[y][x] = uf_find(uf, labels[y][x]);
    }
    // ラベルを連続番号に再割り当て
    int new_label = 1;
    PROCESS_ALL_PIXELS {
        if(labels[y][x] > 0 && label_map[labels[y][x]]==0)label_map[labels[y][x]] = new_label++;
        labels[y][x] = label_map[labels[y][x]];
    }
    // メモリ解放
    free(label_map);
    uf_free(uf);
    
    return new_label - 1;
}
////////////////////////////////////////////////////////////////////////////////
// convolution
// float版 畳み込み関数（ゼロパディング）
void convolution_float(float **s, float **d, int lx, int ly, int n, float kernel[n][n]) {
    int u=n, v=n, ku = u / 2, kv = v / 2;
    #pragma omp parallel for
    for(int y = 0; y < ly; y++) {
        for(int x = 0; x < lx; x++) {
            float sum = 0.0f;
            for(int j = 0; j < u; j++) {
                for(int i = 0; i < v; i++) {
                    int yy = y + j - ku;
                    int xx = x + i - kv;
                    if (xx >= 0 && xx < lx && yy >= 0 && yy < ly) {
                        sum += s[yy][xx] * kernel[j][i];
            }   }   }
            d[y][x] = sum;
    }   }
}
void convolution(int **s, int **d, int lx, int ly, int n, int kernel[n][n]) {
    int u=n, v=n, ku = u / 2, kv = v / 2; // カーネル中心のオフセット
    #pragma omp parallel for
    for(int y = 0; y < ly; y++) {
        for(int x = 0; x < lx; x++) {
            int sum = 0;
            for(int j = 0; j < u; j++) {
                for(int i = 0; i < v; i++) {
                    int yy = y + j - ku;
                    int xx = x + i - kv;
                    // ゼロパディング：境界外は0とみなす
                    if (xx >= 0 && xx < lx && yy >= 0 && yy < ly) {
                        sum += s[yy][xx] * kernel[j][i];
            }   }   }
            d[y][x] = sum;
    }   }
}
void image_PIXEL2rgb(int **a, int **r, int **g, int **b, int lx, int ly) {
    PIXEL **p=(PIXEL**)a;
    PROCESS_ALL_PIXELS{r[y][x]=p[y][x].R; g[y][x]=p[y][x].G; b[y][x]=p[y][x].B;}
}
void image_rgb2PIXEL(int **r, int **g, int **b, int **a, int lx, int ly) {
    PIXEL **p=(PIXEL**)a;
    PROCESS_ALL_PIXELS{
        int  vr = r[y][x]<0 ? 0 : ( r[y][x]>255 ? 255 : r[y][x] );
        int  vg = g[y][x]<0 ? 0 : ( g[y][x]>255 ? 255 : g[y][x] );
        int  vb = b[y][x]<0 ? 0 : ( b[y][x]>255 ? 255 : b[y][x] );
        p[y][x] = (PIXEL){ vb, vg, vr, 0 };
    }
}
void rgb2PIXEL_adjust( int **r, int **g, int **b, int **a, int lx, int ly ){
    int rmin=r[0][0], rmax=r[0][0], gmin=g[0][0], gmax=g[0][0], bmin=b[0][0], bmax=b[0][0];
    PROCESS_ALL_PIXELS{
        IF(rmin>r[y][x])rmin=r[y][x]; EI(rmax<r[y][x])rmax=r[y][x];
        IF(gmin>g[y][x])gmin=g[y][x]; EI(gmax<g[y][x])gmax=g[y][x];
        IF(bmin>b[y][x])bmin=b[y][x]; EI(bmax<b[y][x])bmax=b[y][x];
    }   int min, max, div;
    min = (rmin<gmin) ? (rmin<bmin?rmin:bmin) : (gmin<bmin?gmin:bmin);
    max = (rmax>gmax) ? (rmax>bmax?rmax:bmax) : (gmax>bmax?gmax:bmax);
    div = min!=max ? (max-min) : 1 ; // divide number
    PIXEL **p = (PIXEL**)a;
    PROCESS_ALL_PIXELS{// (PIXEL){ B, G, R, 0 };
        p[y][x] = (PIXEL){ 255*b[y][x]/div, 255*g[y][x]/div, 255*r[y][x]/div, 0 };
    }
}
#define int2PC         int2PseudoColor          // ~ : ビット反転(ビット毎に反転)、  ! : 否定(0 なら1、以外 0)
void image_set(int v,            int **dst, int lx, int ly){ PROCESS_ALL_PIXELS dst[y][x] = v;                }
void image_pc( int **a,          int **dst, int lx, int ly){ PROCESS_ALL_PIXELS dst[y][x] = int2PC(a[y][x]);  }
void image_copy(int **a,         int **dst, int lx, int ly){ PROCESS_ALL_PIXELS dst[y][x] = a[y][x];          }
void image_not(int **a,          int **dst, int lx, int ly){ PROCESS_ALL_PIXELS dst[y][x] =         ~a[y][x]; }
void image_and(int **a, int **b, int **dst, int lx, int ly){ PROCESS_ALL_PIXELS dst[y][x] = a[y][x] & b[y][x];}
void image_or( int **a, int **b, int **dst, int lx, int ly){ PROCESS_ALL_PIXELS dst[y][x] = a[y][x] | b[y][x];}
void image_xor(int **a, int **b, int **dst, int lx, int ly){ PROCESS_ALL_PIXELS dst[y][x] = a[y][x] ^ b[y][x];}
void image_add(int **a, int **b, int **dst, int lx, int ly){ PROCESS_ALL_PIXELS dst[y][x] = a[y][x] + b[y][x];}
void image_sub(int **a, int **b, int **dst, int lx, int ly){ PROCESS_ALL_PIXELS dst[y][x] = a[y][x] - b[y][x];}
void image_mul(int **a, int **b, int **dst, int lx, int ly){ PROCESS_ALL_PIXELS dst[y][x] = a[y][x] * b[y][x];}
void image_div(int **a, int **b, int **dst, int lx, int ly){
    PROCESS_ALL_PIXELS dst[y][x] = (b[y][x] != 0) ? a[y][x] / b[y][x] : 0;
}
void image_add_abs3(int **a, int **b, int **c, int **dst, int lx, int ly) {
    PROCESS_ALL_PIXELS dst[y][x] = abs(a[y][x]) + abs(b[y][x]) + abs(c[y][x]);
}
void clamp_and_convert(int **src, PIXEL **dst, int lx, int ly) {
    PROCESS_ALL_PIXELS{int v = src[y][x]<0 ? 0:(src[y][x]>255 ? 255:src[y][x]);
        dst[y][x] = (PIXEL){v, v, v, 0};
    }
}

// templates use as include file. SET =>  TEMPLATES_USE_AS_INCLUDE_FILE
#ifndef TEMPLATES_USE_AS_INCLUDE_FILE
//////////////////////////////////////////////////////////////////////////////////////////
// Enhanced version
// Algorithm implementation area, start here
//
//     関数内で画像データを作成した時 整数=>PIXEL（BMP形式）変換を行わない
int    save_without_conversion = 0;
//     コンボリューション、ガウス関数等
int    kernel = 5;   float  sigma  = 1.0;
// コマンドラインオプション      ラベリング処理用
//          デフォルトは8連結  バイナリ保存フラグ                 バイナリファイル名
//static int connectivity = 8,    save_binary = 0;  static char *binary_file = NULL;

int image_processing_program( int **s, int **d, int **w, int **t, int **g, int **b, int lx, int ly ){
    (void)s; (void)d; (void)w; (void)t; (void)g; (void)b; (void)lx; (void)ly;
    //////////////////////////////////////////////////////////////
    // image_processing_program
    //  - s: 入力画像（カラー画像 or ネガ画像：輝度反転画像）BMP互換のPIXEL形式
    //  - d: 出力画像（処理結果）
    //  - w, t, g, b: 作業用ワーク領域（必要に応じて自由に使用可）
    //       t: カラー画像から、グレー画像データに変換したBMPファイル形式データ
    //       g: カラー画像から、グレー画像データに変換した値、演算処理用
    //       b: グレー画像データを更に、128以上、以下で 255,0 に二値化したデータ
    
    return 0;
}

// Algorithm implementation area. ends here
//////////////////////////////////////////////////////////////////////////////////////////

#include <string.h>
void argumentAnalysis(int argc, char *argv[], char **iname, char **oname){ int status=1;
    for(int i=1; i<argc; i++ ){ // プログラム起動時の引数解析処理
        if( argv[i][0]=='-' ) {
            switch(argv[i][1]){
                case 'v': if( argv[i][2] ) goto GOOD_BY;
                          verbose=1; break;
                case 'V': if( argv[i][2] ) goto GOOD_BY;
                          verbose=2; break;
                case 'n': if(argv[i][2])  goto GOOD_BY; 
                           nega=1;   break;
                //case 'c': if(argv[i][3])  goto GOOD_BY;  // ラベリング処理用
                //          if(argv[i][2]=='4') connectivity = 4;
                //          EI(argv[i][2]=='8') connectivity = 8;
                //          EE goto GOOD_BY;
                //          break;
                case 'a': if( argv[i][2] ) goto GOOD_BY;
                          auto_adjust=1;   break;
                // If you need for the multi-threaded program. omp_set_num_threads(n);
                case 't': if( !isdigit(argv[i][2]) || argv[i][2]==0 ) goto GOOD_BY;
                          max_num_threads = atoi(&argv[i][2]);
                          if(max_num_threads > 0) omp_set_num_threads(max_num_threads);
                          break;
                //case 'k': if( strcmp(argv[i],"-kernel") && strcmp(argv[i],"-k") ) goto GOOD_BY;
                //          if( ++i>=argc ) goto GOOD_BY;
                //          kernel = atoi(argv[i]); break;
                //case 's': if( strcmp(argv[i],"-sigma")  && strcmp(argv[i],"-s") ) goto GOOD_BY;
                //          if( ++i>=argc ) goto GOOD_BY;
                //          sigma = atof(argv[i]); break;
                //case 'b': if( argv[i][2] ) binary_file = &argv[i][2];
                //          EI( ++i>=argc ) goto GOOD_BY;
                //          EE binary_file = argv[i];  //ラベリング処理用
                //          save_binary=1;  break;
                case 'h': status=0;     // fallthrough
                default : goto GOOD_BY; // In gcc, you can use the magic comment "// fallthrough" to
            }                           // suppress the warning. without "goto GOOD_BY;" or "break;"
        }                 // warning: this statement may fall through [-Wimplicit-fallthrough=]
        else if( *iname==NULL ) *iname = argv[i];
        else if( *oname==NULL ) *oname = argv[i];
        else goto GOOD_BY;
    }
    return;
GOOD_BY:; int i;  char *p=argv[0], *op="[-v|V] [-a] [-n] [-t<num>] [-h]";
    // "[-k|-kernel <num>] [-s|-sigma <num>] [-c{4|8}] [-b filename]"
    for(i=strlen(p); i>0 && p[i]!='\\' && p[i]!='/'; i-- );; if(i) i++;
    fprintf( stderr, "\nusage: %s %s [input_bmp [output_bmp]]\n\n", &p[i], op );
    fprintf( stderr, "\t\t-v: Verbose mode, V shows more details.\n" );
    fprintf( stderr, "\t\t-a: Adjust the integer data range to be 0 to 255.\n" );
    fprintf( stderr, "\t\t-n: Invert the brightness of the input image.\n" );
    //fprintf( stderr, "\t\t-c: -c{4|8}. Specify the connection method (4 or 8).\n" );
    //fprintf( stderr, "\t\t-k: -k|-kernel <num>. For specifying kernels such as convolution.\n" );
    //fprintf( stderr, "\t\t-s: -s|-sigma <num>.  For specifying standard deviation etc.\n" );
    fprintf( stderr, "\t\t-t: -t<num>. a positive integer\n" );
    fprintf( stderr, "\t\t    Specify the maximum number of threads to use.\n" );
    fprintf( stderr, "\t\t-h: Display this message and exit.\n" );
    exit( status );
}
void PIXEL_Fitting( PIXEL *s, int lx, int ly ) {
    #undef  A_stb
    #define A_stb (v2Dp(lx,s))
    for(int ey=(ly +1)/2, ty=ly-1, y=0; y<ey; y++ )for(int  w=ty-y, x=0; x<lx; x++ ) {
        PIXEL     p = (PIXEL){ A_stb[y][x].R, A_stb[y][x].G, A_stb[y][x].B, 0 };
        A_stb[y][x] = (PIXEL){ A_stb[w][x].R, A_stb[w][x].G, A_stb[w][x].B, 0 };
        A_stb[w][x] = p;
    }// 色要素順の変更( B ⇔ R )及び 左上原点系 から 左下原点系への変換
}
int main(int argc, char *argv[]){ char *ifname=NULL, *ofname=NULL;
    argumentAnalysis( argc, argv, &ifname, &ofname );
    if( ifname==NULL ) ifname = "sample.bmp";
    if( ofname==NULL ) ofname = "result.bmp";
    // 入力画像の読み込み
    int lx, ly, bpp;
    PIXEL *stb = (PIXEL *)stbi_load(ifname, &lx, &ly, &bpp, 4);
    if( !stb ){
      fprintf(stderr,"Couldn't load the image '%s'.%s\n",ifname,stbi_failure_reason());return 1;
    }else PIXEL_Fitting( stb, lx, ly );

    int32_t **s=NULL, **d=NULL, **w=NULL, **t=NULL, **g=NULL, **b=NULL, st=1;
    const char *err_msg="Memory allocation failure";
    if( !(s=(int32_t**)alloc2Darray(sizeof(int32_t),lx,ly)) ){ perror(err_msg); goto GOOD_BY; }
    if( !(d=(int32_t**)alloc2Darray(sizeof(int32_t),lx,ly)) ){ perror(err_msg); goto GOOD_BY; }
    if( !(w=(int32_t**)alloc2Darray(sizeof(int32_t),lx,ly)) ){ perror(err_msg); goto GOOD_BY; }
    if( !(t=(int32_t**)alloc2Darray(sizeof(int32_t),lx,ly)) ){ perror(err_msg); goto GOOD_BY; }
    if( !(g=(int32_t**)alloc2Darray(sizeof(int32_t),lx,ly)) ){ perror(err_msg); goto GOOD_BY; }
    if( !(b=(int32_t**)alloc2Darray(sizeof(int32_t),lx,ly)) ){ perror(err_msg); goto GOOD_BY; }
    ///////////////////////////////////////////////////////////////////////////////////
    // Calling the image processing program
    PIXEL **ps=(PIXEL**)s, **pt=(PIXEL**)t;
    for(int gray, e=0, y=0;y<ly;y++)for(int x=0; x<lx; x++, e++){
       ps[y][x] =  nega ? (PIXEL){255-stb[e].B,255-stb[e].G,255-stb[e].R,0}:stb[e];
        g[y][x] =  gray = (306*ps[y][x].R + 601*ps[y][x].G + 117*ps[y][x].B)>>10;
        b[y][x] =  gray>127 ? 255 : 0;
       pt[y][x] =  (PIXEL){ gray, gray, gray, 0 };
    }
    if( !(st = image_processing_program( s, d, w, t, g, b, lx, ly )) ){
        if( !save_without_conversion ){// int → PIXEL 変換
            if(auto_adjust) int2PIXEL_adjust( d, d, lx, ly );
            else            clamp_and_convert(d,(PIXEL**)d,lx,ly);
        }// BMP保存 PIXEL画像
        saveArray2bmp32( ofname, d, lx, ly ); // BMP保存
    }
    ////////////////////////////////////////////////////////////////////////////
    // メモリ解放  
  GOOD_BY:;
    free(b); free(g); free(t); free(w); free(d); free(s);
    free(stb);
    return st;
}
#endif //TEMPLATES_USE_AS_INCLUDE_FILE
#endif //IMAGE_PROCESSING_TEMPLATES_C99

This is a comprehensive C99 template designed for image processing and data processing tasks, optimized for performance with features like OpenMP parallelization and minimal external dependencies (only stb_image.h for image I/O). Below, I evaluate the template, provide a detailed explanation of each major function or section, including their purpose, usage, and examples, and address whether the template is overly lengthy. The evaluation is structured to be thorough yet concise, respecting the request for detailed explanations with examples.

Evaluation of the Template

Strengths

Modularity and Reusability: The template is designed to be included as a header (TEMPLATES_USE_AS_INCLUDE_FILE) or compiled standalone, making it versatile for both prototyping and integration into larger projects.
Performance Optimization:
- Uses OpenMP for parallel processing, leveraging multi-core CPUs (e.g., in FourierTransform, convolution, etc.).
- Compiler flags like -O3, -mavx, and -march=native ensure hardware-specific optimizations.
- Memory alignment and efficient data structures (e.g., alloc2Darray) reduce overhead.
Minimal Dependencies: Relies only on stb_image.h for image loading/saving, reducing setup complexity.
Comprehensive Functionality: Covers a wide range of image processing tasks, including:
- Fourier transforms (DFT, FFT, CZT).
- Matrix operations (LU decomposition, inverse matrix).
- Geometric transformations (homography).
- Image manipulation (convolution, labeling, drawing primitives).
- Hashing (FNV-1a, MurmurHash2) for data processing.
Robust Error Handling: Includes checks for memory allocation failures and invalid inputs, with informative error messages.
Flexibility: Supports both high-precision (double/long double) and integer-based operations, with utilities for converting between formats (e.g., int2PIXEL, rgb2PIXEL_adjust).
Readability Enhancements: Macros like IF, EI, EE, and HERE improve code alignment and debugging.

Weaknesses

Length and Complexity: The template is quite long (~600 lines), which may overwhelm beginners or developers looking for a lightweight solution. The extensive functionality, while powerful, could be split into smaller, task-specific modules for better maintainability.
Limited Documentation: While the code includes some comments, detailed documentation for each function's parameters, return values, and edge cases is sparse. This could make it harder for new users to adopt.
Platform-Specific Code: The timing functions (get_micro_time) and some macros (e.g., Windows-specific #ifdef _WIN32) introduce platform dependencies, which could complicate cross-platform use.
Hardcoded Constants: Values like INF_ZERO (1e-12) or PI are hardcoded, which might limit flexibility for users needing different precision levels.
Verbose Variable Names: Some variable names (e.g., abcdefgh1 in HomographyCore) are not descriptive, reducing readability.

Is It Too Long?

Yes, the template is somewhat lengthy for a single file, especially for users who only need specific functionalities (e.g., just convolution or homography). However, the length is justified by its comprehensive coverage of image and data processing tasks, making it a one-stop solution for algorithm prototyping. For production use, it could be modularized into separate files (e.g., one for Fourier transforms, one for matrix operations, etc.) to improve maintainability and reduce compilation time.

Suggestions for Improvement

Modularization: Split the template into multiple files based on functionality (e.g., fourier.c, matrix.c, image_utils.c).
Enhanced Documentation: Add detailed comments or a separate documentation file explaining each function's purpose, parameters, and usage examples.
Configurable Constants: Allow users to define precision constants (e.g., INF_ZERO) via macros or configuration files.
Simplify Macros: Replace some macros (e.g., IF, EI, EE) with standard C keywords to improve readability for users unfamiliar with the aliases.
Cross-Platform Consistency: Abstract platform-specific code (e.g., timing functions) into a separate module or use a portable library like <time.h> universally.

Detailed Explanation of Key Functions and Sections

Below, I describe the purpose, usage, and provide an example for each major function or section in the template. I focus on the most significant components for image and data processing, ensuring clarity with examples.

1. Timing Functions (get_micro_time, fprint_elapsed_time, fprintf_elapsed_time)

Purpose: Measure execution time in microseconds for performance benchmarking.
Usage:
- get_micro_time(): Returns the current time in microseconds, using platform-specific high-resolution timers (QueryPerformanceCounter on Windows, gettimeofday on Unix-like systems).
- fprint_elapsed_time(FILE *fp, double microsec): Prints elapsed time in microseconds, milliseconds, or seconds based on magnitude.
- fprintf_elapsed_time(FILE *fp, double microsec, int verbose): Prints detailed time breakdown (hours, minutes, seconds, milliseconds, microseconds) with optional verbosity.

Example:

double start = get_micro_time();
// Perform some image processing
for (int y = 0; y < ly; y++) for (int x = 0; x < lx; x++) d[y][x] = s[y][x];
double end = get_micro_time();
print_elapsed_time(end - start); // Output: Elapsed time: 0.123 msec
fprintf_elapsed_time(stdout, end - start, 1); // Output: Elapsed time: 0h 0m 0s 0ms 123us

Notes: Useful for profiling performance-critical sections. The verbose mode is helpful for detailed debugging but may be overkill for simple applications.

2. Data Structures (PIXEL, STBPX, HEADER, POINTi, POINTf, POINTd, COMPLEX, HashDB, FT_CONTROL, UnionFind)

Purpose: Define core data structures for image pixels, BMP headers, geometric points, complex numbers, hashing, Fourier transforms, and connected component labeling.
Usage:
- PIXEL: Represents a 32-bit RGBA pixel (8 bits per channel).
- HEADER: Defines a BMP file header for 32-bit images.
- POINTi/POINTf/POINTd: Integer/float/double precision 2D points for geometric operations.
- COMPLEX: Represents complex numbers for Fourier transforms.
- HashDB: A union-based structure for key-value pairs, used with hashing functions.
- FT_CONTROL: Manages parameters for Chirp Z-Transform (CZT) and FFT.
- UnionFind: Implements a disjoint-set data structure for connected component labeling.

Example:

PIXEL **img = (PIXEL **)alloc2Darray(sizeof(PIXEL), 100, 100);
img[0][0] = (PIXEL){255, 128, 0, 255}; // Orange pixel with full opacity
POINTf p = (POINTf){10.5, 20.5}; // Floating-point coordinate
COMPLEX c = (COMPLEX){1.0, 2.0}; // Complex number: 1 + 2i

Notes: The use of #pragma pack ensures proper memory alignment, critical for performance in image processing. The HashDB and UnionFind structures are versatile for data processing tasks beyond imaging.

3. Memory Allocation (alloc2D, alloc2Darray)

Purpose: Allocate 2D arrays for image or data processing with proper memory alignment.
Usage:
- alloc2D(void *data, int s, int lx, int ly): Creates a 2D index over an existing memory block.
- alloc2Darray(int s, int lx, int ly): Allocates a new 2D array with aligned memory for efficient access.

Example:

int **array = (int **)alloc2Darray(sizeof(int), 200, 200);
PROCESS_ALL_PIXELS array[y][x] = x + y; // Initialize with gradient
free(array); // Must free the base pointer

Notes: alloc2Darray ensures 64-byte alignment for SIMD operations, improving performance with AVX instructions. Always check for NULL returns to handle allocation failures.

4. Image I/O (saveDataAsBMP32, PIXEL_Fitting)

Purpose:
- saveDataAsBMP32: Saves a 32-bit RGBA image to a BMP file.
- PIXEL_Fitting: Adjusts pixel data (swaps R and B channels, flips vertically) to match BMP conventions.
Usage:
- saveDataAsBMP32(char *fname, void *b, int lx, int ly): Writes a 2D pixel array to a BMP file.
- PIXEL_Fitting(PIXEL *s, int lx, int ly): Converts image data to BMP-compatible format.

Example:

PIXEL **img = (PIXEL **)alloc2Darray(sizeof(PIXEL), 100, 100);
PROCESS_ALL_PIXELS img[y][x] = (PIXEL){255, 0, 0, 255}; // Red image
saveArray2bmp32("output.bmp", img, 100, 100);
free(img);

Notes: Error handling ensures robust file operations. PIXEL_Fitting is specific to stb_image.h conventions, which may not be needed if using other libraries.

5. Fourier Transform Functions (dft_TwoDimensional, FourierTransform, GenerateFT_CONTROL, fft_Multiline_with_RotationFactors, PowerSpectrum)

Purpose: Implement 2D Discrete Fourier Transform (DFT), Fast Fourier Transform (FFT), and Chirp Z-Transform (CZT) for frequency-domain image processing.
Usage:
- dft_TwoDimensional(COMPLEX **d, int lx, int ly, int flag): Performs a 2D DFT or IDFT, slower but accurate.
- FourierTransform(COMPLEX **c, int lx, int ly, int flag): Performs FFT or CZT based on input size, optimized for power-of-2 dimensions.
- GenerateFT_CONTROL(int n): Creates a control structure for CZT/FFT with precomputed twiddle factors.
- fft_Multiline_with_RotationFactors: Core FFT implementation with OpenMP parallelization.
- PowerSpectrum: Visualizes the power spectrum of a complex-valued image as a grayscale BMP.

Example:

COMPLEX **data = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), 256, 256);
PROCESS_ALL_PIXELS data[y][x] = (COMPLEX){rand() % 256, 0}; // Random real data
FourierTransform(data, 256, 256, FFT); // Perform 2D FFT
PIXEL **out = (PIXEL **)alloc2Darray(sizeof(PIXEL), 256, 256);
PowerSpectrum("spectrum.bmp", data, out, 256, 256); // Save power spectrum
free(data); free(out);

Notes: The FFT implementation is optimized for power-of-2 sizes, falling back to CZT for non-power-of-2 inputs. PowerSpectrum is useful for visualizing frequency-domain data.

6. Matrix Operations (lu_decomposition, solve_LU, inverse_matrix, multiplication_matrix)

Purpose: Provide high-precision matrix operations for tasks like homography computation.
Usage:
- lu_decomposition(double** A, double** L, double** U, int* P, int n): Performs LU decomposition with partial pivoting.
- solve_LU(double** L, double** U, int* P, double* b, double* x, int n): Solves a linear system using LU decomposition.
- inverse_matrix(double** A, double** A_inv, int n): Computes the inverse of a square matrix.
- multiplication_matrix(double **A, double **B, double **C, int n): Computes the matrix product C = A * B.

Example:

double **A = (double **)alloc2Darray(sizeof(double), 3, 3);
double **A_inv = (double **)alloc2Darray(sizeof(double), 3, 3);
A[0][0] = 1; A[0][1] = 2; A[0][2] = 3;
A[1][0] = 4; A[1][1] = 5; A[1][2] = 6;
A[2][0] = 7; A[2][1] = 8; A[2][2] = 10;
inverse_matrix(A, A_inv, 3);
dump_array(stdout, "%.2f ", "Inverse Matrix", A_inv, 3);
free(A); free(A_inv);

Notes: Double-precision versions (lu_decomposition_DoublePrecision, etc.) are available for higher accuracy. OpenMP parallelization in inverse_matrix enhances performance for large matrices.

7. Homography (HomographyCore, Homography, HomographyTrans, HomographyTransMask)

Purpose: Perform perspective transformations on images using homography matrices.
Usage:
- HomographyCore(POINTf sp[4], POINTf dp[4], double hkl[3][3]): Computes a 3x3 homography matrix from four source and destination points.
- Homography(PIXEL **s, int slx, int sly, int **mask, POINTf sp[4], PIXEL **d, int lx, int ly, POINTf dp[4]): Applies homography with optional masking.
- HomographyTrans and HomographyTransMask: Perform the actual pixel mapping with or without a mask.

Example:

PIXEL **src = (PIXEL **)alloc2Darray(sizeof(PIXEL), 100, 100);
PIXEL **dst = (PIXEL **)alloc2Darray(sizeof(PIXEL), 120, 120);
POINTf src_pts[4] = {{0, 0}, {100, 0}, {100, 100}, {0, 100}};
POINTf dst_pts[4] = {{10, 10}, {110, 10}, {100, 100}, {20, 110}};
Homography(src, 100, 100, NULL, src_pts, dst, 120, 120, dst_pts);
saveArray2bmp32("transformed.bmp", dst, 120, 120);
free(src); free(dst);

Notes: Uses inverse matrix computation for accurate transformations. The mask option allows selective pixel mapping, useful for ROI processing.

8. Connected Component Labeling (labeling, uf_init, uf_find, uf_union, uf_free, uf_count_labels, save_label_data)

Purpose: Identify and label connected regions in a binary image using a Union-Find data structure.
Usage:
- labeling(int **b, int **labels, int lx, int ly, int connectivity): Labels connected components with 4 or 8 connectivity.
- uf_init, uf_find, uf_union, uf_free: Manage the Union-Find structure for efficient merging and querying of connected components.
- uf_count_labels: Counts unique labels.
- save_label_data: Saves label data to a binary file.

Example:

int **binary = (int **)alloc2Darray(sizeof(int), 100, 100);
int **labels = (int **)alloc2Darray(sizeof(int), 100, 100);
PROCESS_ALL_PIXELS binary[y][x] = (y < 50 && x < 50) ? 255 : 0; // Top-left quadrant
int num_labels = labeling(binary, labels, 100, 100, 8);
printf("Number of components: %d\n", num_labels);
save_label_data("labels.bin", labels, 100, 100);
free(binary); free(labels);

Notes: The two-pass algorithm with path compression ensures efficiency. Supports both 4- and 8-connectivity, making it versatile for different applications.

9. Convolution (convolution, convolution_float)

Purpose: Apply convolution with a kernel to an image, supporting both integer and floating-point data.
Usage:
- convolution(int **s, int **d, int lx, int ly, int n, int kernel[n][n]): Integer-based convolution with zero-padding.
- convolution_float(float **s, float **d, int lx, int ly, int n, float kernel[n][n]): Floating-point convolution.

Example:

float **src = (float **)alloc2Darray(sizeof(float), 100, 100);
float **dst = (float **)alloc2Darray(sizeof(float), 100, 100);
float kernel[3][3] = {{1/9.0, 1/9.0, 1/9.0}, {1/9.0, 1/9.0, 1/9.0}, {1/9.0, 1/9.0, 1/9.0}}; // Blur
PROCESS_ALL_PIXELS src[y][x] = rand() % 256;
convolution_float(src, dst, 100, 100, 3, kernel);
free(src); free(dst);

Notes: Zero-padding handles boundary conditions. OpenMP parallelization speeds up processing for large images.

10. Hashing (fnv1a_hash, MurmurHash2, make_HashDB)

Purpose: Provide fast, non-cryptographic hashing for data indexing or lookup.
Usage:
- fnv1a_hash(const void *key, size_t len): Implements FNV-1a hash.
- MurmurHash2(const void *key, int len, uint32_t seed): Implements MurmurHash2.
- make_HashDB(uint32_t size): Allocates a hash table for key-value pairs.

Example:

char *key = "example";
uint32_t hash = MurmurHash2(key, strlen(key), 0);
HashDB *db = make_HashDB256M();
db[hash >> 4].key = strdup(key);
db[hash >> 4].value = 42;
printf("Hash: %u, Value: %lld\n", hash, db[hash >> 4].value);
free(db[hash >> 4].key); free(db);

Notes: Suitable for indexing large datasets but not for cryptographic purposes due to lack of collision resistance.

11. Drawing Functions (DrawLine, DrawLineExor, DrawPolyline, DrawCircle, DrawArc, DrawEllipse)

Purpose: Draw basic shapes on an image canvas for visualization or annotation.
Usage:
- DrawLine: Draws a line using Bresenham’s algorithm.
- DrawLineExor: Draws a line with XOR operation for reversible drawing.
- DrawPolyline: Draws a sequence of connected lines, optionally closed.
- DrawCircle, DrawArc, DrawEllipse: Draw circular and elliptical shapes.

Example:

PIXEL **canvas = (PIXEL **)alloc2Darray(sizeof(PIXEL), 100, 100);
PIXEL red = (PIXEL){255, 0, 0, 255};
DrawCircle(canvas, 100, 100, 50, 50, 20, red); // Red circle at (50,50)
saveArray2bmp32("circle.bmp", canvas, 100, 100);
free(canvas);

Notes: Efficient implementations using Bresenham’s algorithm and symmetry. Useful for debugging or visualizing regions of interest.

12. Image Manipulation (image_set, image_copy, image_not, image_and, etc.)

Purpose: Perform basic pixel-wise operations on integer-based images.
Usage:
- image_set: Sets all pixels to a constant value.
- image_copy, image_not, image_and, image_or, image_xor, image_add, image_sub, image_mul, image_div: Perform element-wise operations.
- clamp_and_convert, int2PIXEL_adjust: Convert and normalize integer data to PIXEL format.

Example:

int **src = (int **)alloc2Darray(sizeof(int), 100, 100);
int **dst = (int **)alloc2Darray(sizeof(int), 100, 100);
PROCESS_ALL_PIXELS src[y][x] = rand() % 256;
image_not(src, dst, 100, 100); // Invert pixel values
PIXEL **out = (PIXEL **)alloc2Darray(sizeof(PIXEL), 100, 100);
clamp_and_convert(dst, out, 100, 100);
saveArray2bmp32("inverted.bmp", out, 100, 100);
free(src); free(dst); free(out);

Notes: These functions are optimized for integer data and are useful for preprocessing or postprocessing steps.

13. Main Function and Command-Line Parsing (main, argumentAnalysis)

Purpose: Provide a driver program to load, process, and save images with command-line options.
Usage:
- argumentAnalysis: Parses command-line flags (e.g., -v for verbose, -n for negation, -t for thread count).
- main: Loads an input BMP, processes it using image_processing_program, and saves the result.

Example:

bash

./Template -v -n input.bmp output.bmp

// In image_processing_program:
int image_processing_program(int **s, int **d, int **w, int **t, int **g, int **b, int lx, int ly) {
    image_not(s, d, lx, ly); // Invert input image
    return 0;
}

Notes: The template assumes image_processing_program is the user-defined processing function. Command-line options enhance flexibility but could be extended for more parameters.

Conclusion

The template is a robust, performance-oriented framework for image and data processing, suitable for algorithm prototyping and verification. Its length is justified by the breadth of functionality, but modularization could improve usability. Each function is well-optimized with OpenMP and SIMD considerations, and the examples provided demonstrate practical applications. For users needing a comprehensive, dependency-light solution, this template is highly effective, though beginners may require additional documentation to navigate its complexity.

USER

日本語で回答してください、

このC99テンプレートは、画像処理およびデータ処理向けに設計された包括的で高性能なコードベースです。OpenMPによる並列化や最小限の外部依存（stb_image.hのみ）により、パフォーマンスを重視しています。以下では、テンプレートの評価を行い、主要な関数やセクションの目的、使用方法、例を詳細に説明します。また、テンプレートが長すぎるかどうかについても検討します。説明は詳細かつ簡潔で、日本語で提供します。

テンプレートの評価

長所

モジュール性と再利用性：テンプレートはヘッダとしてインクルード（TEMPLATES_USE_AS_INCLUDE_FILE）または単独でコンパイル可能で、プロトタイピングや大規模プロジェクトへの統合に適しています。
パフォーマンス最適化：
- OpenMPを使用したマルチコアCPUの活用（例：FourierTransformやconvolution）。
- -O3、-mavx、-march=nativeなどのコンパイラフラグによるハードウェア最適化。
- alloc2Darrayによるメモリ配置の効率化。
最小限の依存関係：画像入出力にstb_image.hのみを使用し、環境構築の手間を軽減。
豊富な機能：以下の画像処理およびデータ処理機能を網羅：
- フーリエ変換（DFT、FFT、CZT）
- 行列演算（LU分解、逆行列）
- 幾何変換（ホモグラフィ）
- 画像操作（畳み込み、ラベリング、描画プリミティブ）
- ハッシュ関数（FNV-1a、MurmurHash2）
堅牢なエラー処理：メモリ割り当て失敗や無効な入力に対するチェックとエラーメッセージ。
柔軟性：高精度（double/long double）と整数ベースの操作をサポートし、フォーマット変換（例：int2PIXEL）も可能。
可読性向上：IF、EI、EE、HEREなどのマクロでコードの整列やデバッグを改善。

短所

長さと複雑さ：約600行と長く、初心者や軽量なソリューションを求める開発者には負担となる可能性。広範な機能を分割することで管理しやすくなる。
ドキュメントの不足：コメントはあるが、パラメータや戻り値、特殊ケースの詳細な説明が不足。初学者にとって使いづらい可能性。
プラットフォーム依存：タイミング関数（get_micro_time）やWindows固有のマクロ（#ifdef _WIN32）により、クロスプラットフォームでの使用に課題。
固定定数：INF_ZERO（1e-12）やPIなどがハードコードされており、異なる精度が必要な場合に柔軟性が低い。
変数名の曖昧さ：abcdefgh1（HomographyCore内）など、説明的でない変数名が可読性を下げる。

長すぎるか？

はい、単一ファイルとしてはやや長すぎます。特に、特定の機能（例：畳み込みやホモグラフィ）のみが必要なユーザーにとっては、コード量が多すぎる可能性があります。ただし、画像処理やデータ処理の幅広いタスクをカバーする包括的なソリューションとしては、この長さは妥当です。本番環境では、機能ごとにファイル（例：fourier.c、matrix.c）に分割することで保守性やコンパイル時間を改善できます。

改善提案

モジュール化：機能を分離（例：フーリエ変換、行列演算、画像ユーティリティ）。
詳細なドキュメント：各関数の目的、パラメータ、使用例を説明するコメントや別ファイルのドキュメントを追加。
定数の設定：INF_ZEROなどの定数をマクロや設定ファイルでカスタマイズ可能に。
マクロの簡略化：IF、EI、EEを標準のCキーワードに置き換え、初学者にとっての可読性を向上。
クロスプラットフォーム対応：タイミング関数などをポータブルなライブラリ（例：<time.h>）に置き換えるか、プラットフォーム依存コードを分離。

主要な関数・セクションの詳細説明

以下では、主要な関数やセクションの目的、使用方法、例を詳しく説明します。画像処理およびデータ処理に重要なコンポーネントに焦点を当て、明確な例を交えて解説します。

1. タイミング関数（get_micro_time, fprint_elapsed_time, fprintf_elapsed_time）

目的：パフォーマンス測定のため、マイクロ秒単位で実行時間を計測。
使用方法：
- get_micro_time()：高精度タイマー（WindowsではQueryPerformanceCounter、Unix系ではgettimeofday）で現在時刻をマイクロ秒で取得。
- fprint_elapsed_time(FILE *fp, double microsec)：経過時間をマイクロ秒、ミリ秒、秒で適切に表示。
- fprintf_elapsed_time(FILE *fp, double microsec, int verbose)：詳細な時間（時、分、秒、ミリ秒、マイクロ秒）を表示。verboseで詳細度を制御。

例：

double start = get_micro_time();
// 画像処理の実行
for (int y = 0; y < ly; y++) for (int x = 0; x < lx; x++) d[y][x] = s[y][x];
double end = get_micro_time();
print_elapsed_time(end - start); // 出力例: Elapsed time: 0.123 msec
fprintf_elapsed_time(stdout, end - start, 1); // 出力例: Elapsed time: 0h 0m 0s 0ms 123us

注意点：パフォーマンスプロファイリングに有用。詳細モードはデバッグに適するが、単純な用途では過剰。

2. データ構造（PIXEL, STBPX, HEADER, POINTi, POINTf, POINTd, COMPLEX, HashDB, FT_CONTROL, UnionFind）

目的：ピクセル、BMPヘッダ、幾何点、複素数、ハッシュ、フーリエ変換、連結成分ラベリング用のデータ構造を定義。
使用方法：
- PIXEL：32ビットRGBAピクセル（各チャンネル8ビット）。
- HEADER：32ビットBMPファイルのヘッダ。
- POINTi/POINTf/POINTd：整数/浮動小数/倍精度の2D点。
- COMPLEX：フーリエ変換用の複素数。
- HashDB：キー・バリュー対用のユニオン構造。
- FT_CONTROL：CZT/FFT用の制御構造。
- UnionFind：連結成分ラベリング用の素集合データ構造。

例：

PIXEL **img = (PIXEL **)alloc2Darray(sizeof(PIXEL), 100, 100);
img[0][0] = (PIXEL){255, 128, 0, 255}; // オレンジ色のピクセル
POINTf p = (POINTf){10.5, 20.5}; // 浮動小数点の座標
COMPLEX c = (COMPLEX){1.0, 2.0}; // 複素数: 1 + 2i

注意点：#pragma packでメモリ配置を最適化。HashDBやUnionFindは画像処理以外のデータ処理にも活用可能。

3. メモリ割り当て（alloc2D, alloc2Darray）

目的：画像やデータ処理用の2D配列を効率的に割り当て。
使用方法：
- alloc2D(void *data, int s, int lx, int ly)：既存メモリブロックに2Dインデックスを作成。
- alloc2Darray(int s, int lx, int ly)：新しい2D配列を割り当て、メモリを64バイト境界にアライメント。

例：

int **array = (int **)alloc2Darray(sizeof(int), 200, 200);
PROCESS_ALL_PIXELS array[y][x] = x + y; // グラデーションで初期化
free(array); // ベースポインタを解放

注意点：alloc2DarrayはSIMD操作向けに最適化。割り当て失敗時にNULLをチェック必須。

4. 画像入出力（saveDataAsBMP32, PIXEL_Fitting）

目的：
- saveDataAsBMP32：32ビットRGBA画像をBMPファイルとして保存。
- PIXEL_Fitting：ピクセルデータ（RとBの交換、上下反転）をBMP形式に適合。
使用方法：
- saveDataAsBMP32(char *fname, void *b, int lx, int ly)：2Dピクセル配列をBMPファイルに書き込み。
- PIXEL_Fitting(PIXEL *s, int lx, int ly)：stb_image.hの形式をBMPに変換。

例：

PIXEL **img = (PIXEL **)alloc2Darray(sizeof(PIXEL), 100, 100);
PROCESS_ALL_PIXELS img[y][x] = (PIXEL){255, 0, 0, 255}; // 赤い画像
saveArray2bmp32("output.bmp", img, 100, 100);
free(img);

注意点：エラー処理によりファイル操作が堅牢。PIXEL_Fittingはstb_image.h固有の処理。

5. フーリエ変換（dft_TwoDimensional, FourierTransform, GenerateFT_CONTROL, fft_Multiline_with_RotationFactors, PowerSpectrum）

目的：周波数領域の画像処理のために2D DFT、FFT、CZTを実装。
使用方法：
- dft_TwoDimensional：高精度だが低速な2D DFT/IDFT。
- FourierTransform：入力サイズに応じてFFTまたはCZTを実行。
- GenerateFT_CONTROL：CZT/FFT用の制御構造を生成。
- fft_Multiline_with_RotationFactors：OpenMP並列化を使用したFFTコア。
- PowerSpectrum：複素数画像のパワースペクトルをグレースケールBMPとして可視化。

例：

COMPLEX **data = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), 256, 256);
PROCESS_ALL_PIXELS data[y][x] = (COMPLEX){rand() % 256, 0}; // ランダムな実数データ
FourierTransform(data, 256, 256, FFT); // 2D FFT
PIXEL **out = (PIXEL **)alloc2Darray(sizeof(PIXEL), 256, 256);
PowerSpectrum("spectrum.bmp", data, out, 256, 256); // パワースペクトル保存
free(data); free(out);

注意点：2のべき乗サイズで最適化、非2のべき乗ではCZTを使用。PowerSpectrumは周波数データの可視化に有用。

6. 行列演算（lu_decomposition, solve_LU, inverse_matrix, multiplication_matrix）

目的：ホモグラフィ計算などに必要な高精度行列演算。
使用方法：
- lu_decomposition：部分ピボット付きLU分解。
- solve_LU：LU分解を用いた線形方程式の解。
- inverse_matrix：逆行列の計算。
- multiplication_matrix：行列積C = A * B。

例：

double **A = (double **)alloc2Darray(sizeof(double), 3, 3);
double **A_inv = (double **)alloc2Darray(sizeof(double), 3, 3);
A[0][0] = 1; A[0][1] = 2; A[0][2] = 3;
A[1][0] = 4; A[1][1] = 5; A[1][2] = 6;
A[2][0] = 7; A[2][1] = 8; A[2][2] = 10;
inverse_matrix(A, A_inv, 3);
dump_array(stdout, "%.2f ", "逆行列", A_inv, 3);
free(A); free(A_inv);

注意点：高精度版（lu_decomposition_DoublePrecision等）あり。inverse_matrixはOpenMPで並列化。

7. ホモグラフィ（HomographyCore, Homography, HomographyTrans, HomographyTransMask）

目的：画像に射影変換（ホモグラフィ）を適用。
使用方法：
- HomographyCore：4点の対応から3x3ホモグラフィ行列を計算。
- Homography：マスクの有無に応じてホモグラフィを適用。
- HomographyTrans, HomographyTransMask：ピクセルマッピングを実行。

例：

PIXEL **src = (PIXEL **)alloc2Darray(sizeof(PIXEL), 100, 100);
PIXEL **dst = (PIXEL **)alloc2Darray(sizeof(PIXEL), 120, 120);
POINTf src_pts[4] = {{0, 0}, {100, 0}, {100, 100}, {0, 100}};
POINTf dst_pts[4] = {{10, 10}, {110, 10}, {100, 100}, {20, 110}};
Homography(src, 100, 100, NULL, src_pts, dst, 120, 120, dst_pts);
saveArray2bmp32("transformed.bmp", dst, 120, 120);
free(src); free(dst);

注意点：逆行列計算で正確な変換。マスクオプションはROI処理に有用。

8. 連結成分ラベリング（labeling, uf_init, uf_find, uf_union, uf_free, uf_count_labels, save_label_data）

目的：バイナリ画像の連結成分をラベリング（Union-Find使用）。
使用方法：
- labeling：4または8連結で連結成分をラベリング。
- uf_init, uf_find, uf_union, uf_free：Union-Find構造の管理。
- uf_count_labels：ユニークなラベルの数をカウント。
- save_label_data：ラベルデータをバイナリファイルに保存。

例：

int **binary = (int **)alloc2Darray(sizeof(int), 100, 100);
int **labels = (int **)alloc2Darray(sizeof(int), 100, 100);
PROCESS_ALL_PIXELS binary[y][x] = (y < 50 && x < 50) ? 255 : 0; // 左上領域
int num_labels = labeling(binary, labels, 100, 100, 8);
printf("連結成分数: %d\n", num_labels);
save_label_data("labels.bin", labels, 100, 100);
free(binary); free(labels);

注意点：2パスアルゴリズムと経路圧縮で効率的。4/8連結をサポート。

9. 畳み込み（convolution, convolution_float）

目的：整数および浮動小数点データで画像にカーネル畳み込みを適用。
使用方法：
- convolution：整数ベースの畳み込み（ゼロパディング）。
- convolution_float：浮動小数点ベースの畳み込み。

例：

float **src = (float **)alloc2Darray(sizeof(float), 100, 100);
float **dst = (float **)alloc2Darray(sizeof(float), 100, 100);
float kernel[3][3] = {{1/9.0, 1/9.0, 1/9.0}, {1/9.0, 1/9.0, 1/9.0}, {1/9.0, 1/9.0, 1/9.0}}; // ぼかし
PROCESS_ALL_PIXELS src[y][x] = rand() % 256;
convolution_float(src, dst, 100, 100, 3, kernel);
free(src); free(dst);

注意点：ゼロパディングで境界を処理。OpenMPで高速化。

10. ハッシュ関数（fnv1a_hash, MurmurHash2, make_HashDB）

目的：データインデックス作成用の高速な非暗号学的ハッシュ。
使用方法：
- fnv1a_hash：FNV-1aハッシュ。
- MurmurHash2：MurmurHash2ハッシュ。
- make_HashDB：キー・バリュー対用のハッシュテーブルを割り当て。

例：

char *key = "example";
uint32_t hash = MurmurHash2(key, strlen(key), 0);
HashDB *db = make_HashDB256M();
db[hash >> 4].key = strdup(key);
db[hash >> 4].value = 42;
printf("ハッシュ: %u, 値: %lld\n", hash, db[hash >> 4].value);
free(db[hash >> 4].key); free(db);

注意点：衝突耐性がないため暗号用途には不向き。

11. 描画関数（DrawLine, DrawLineExor, DrawPolyline, DrawCircle, DrawArc, DrawEllipse）

目的：画像に基本図形を描画（可視化や注釈用）。
使用方法：
- DrawLine：Bresenhamアルゴリズムで直線を描画。
- DrawLineExor：XOR操作で可逆な直線描画。
- DrawPolyline, DrawCircle, DrawArc, DrawEllipse：折れ線、円、円弧、楕円を描画。

例：

PIXEL **canvas = (PIXEL **)alloc2Darray(sizeof(PIXEL), 100, 100);
PIXEL red = (PIXEL){255, 0, 0, 255};
DrawCircle(canvas, 100, 100, 50, 50, 20, red); // (50,50)に赤い円
saveArray2bmp32("circle.bmp", canvas, 100, 100);
free(canvas);

注意点：Bresenhamアルゴリズムや対称性を利用して効率的。デバッグやROI可視化に有用。

12. 画像操作（image_set, image_copy, image_not, image_andなど）

目的：整数ベース画像のピクセル単位操作。
使用方法：
- image_set：全ピクセルを定数値に設定。
- image_copy, image_not, image_and, image_or, image_xor, image_add, image_sub, image_mul, image_div：要素ごとの操作。
- clamp_and_convert, int2PIXEL_adjust：整数データをPIXEL形式に変換・正規化。

例：

int **src = (int **)alloc2Darray(sizeof(int), 100, 100);
int **dst = (int **)alloc2Darray(sizeof(int), 100, 100);
PROCESS_ALL_PIXELS src[y][x] = rand() % 256;
image_not(src, dst, 100, 100); // ピクセル値を反転
PIXEL **out = (PIXEL **)alloc2Darray(sizeof(PIXEL), 100, 100);
clamp_and_convert(dst, out, 100, 100);
saveArray2bmp32("inverted.bmp", out, 100, 100);
free(src); free(dst); free(out);

注意点：前処理や後処理に最適化された整数操作。

13. メイン関数とコマンドライン解析（main, argumentAnalysis）

目的：画像の読み込み、処理、保存を管理し、コマンドラインオプションを解析。
使用方法：
- argumentAnalysis：フラグ（例：-vで詳細表示、-nでネガ反転、-tでスレッド数）を解析。
- main：入力BMPを読み込み、image_processing_programで処理し、結果を保存。

例：

bash

./Template -v -n input.bmp output.bmp

// image_processing_program内:
int image_processing_program(int **s, int **d, int **w, int **t, int **g, int **b, int lx, int ly) {
    image_not(s, d, lx, ly); // 入力画像を反転
    return 0;
}

注意点：image_processing_programはユーザー定義の処理関数。オプションの拡張が可能。

結論

このテンプレートは、画像およびデータ処理のための堅牢でパフォーマンス重視のフレームワークであり、アルゴリズムのプロトタイピングや検証に最適です。長さは包括的な機能により正当化されますが、モジュール化で使いやすさが向上します。各関数はOpenMPやSIMDを活用して最適化されており、例は実際の使用シナリオを示しています。依存関係が少なく、初心者にはドキュメントの追加が必要ですが、包括的なソリューションを求める開発者にとって非常に有効です。

現在の日付と時刻：2025年6月30日（月）午前1時24分（PDT）

C99 画像処理、データ処理用のテンプレート

データ処理用テンプレートとして使用する時、追加しておきたい定義例

補足

`func_4D` と `func_4Dp` の基本的な違い

`func_4D`

`func_4Dp`

主要な違い:

まとめ:

C99 画像処理、データ処理用のテンプレート

データ処理用テンプレートとして使用する時、追加しておきたい定義 例

補足

func_4D と func_4Dp の基本的な違い

func_4D

func_4Dp

主要な違い:

まとめ:

　C99 画像処理、データ処理用のテンプレート

データ処理用テンプレートとして使用する時、追加しておきたい定義例

　補足

`func_4D` と `func_4Dp` の基本的な違い

`func_4D`

`func_4Dp`