cnn_CIFAR.c   C99でスクラッチ実装した、CIFAR-10、CIFAR-100 用畳み込みニューラルネット(学習+評価)
検証環境が存在しないので、雛形としての提示
#define OUTPUT_SIZE           CIFAR-10:10    CIFAR-100:100
 EX.
    CIFAR-10
      +project/                 
      ├── Learning/           
      │   ├── image1.0.png   
      │   ├── image2.5.png   
      │   └── ...            
      ├── Evaluation/         
      │   ├── test1.png      
      │   ├── test2.png      
      │   └── ...            
      ├── stb_image.h         
      └── cifar.c             
    CIFAR-100
      +project/                
      ├── Learning/
      │   ├── image1.5.0.png
      │   ├── image2.8.5.png
      │   └── ...
      ├── Evaluation/  (ラベルが有れば正解率の算出)
      │   ├── test1.5.2.png
      │   ├── test2.5.1.png
      │   └── ...
      ├── stb_image.h
      └── cifar.c
// https://github.com/nothings/stb/blob/master/stb_image.h
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <omp.h>
#include <time.h>
#include <dirent.h>
#include <stdint.h>
#include <ctype.h>
int verbose = 0, ActiveThread = 0;
#define PI   3.1415926535897932384626433832795L
#define M_PI PI
typedef struct COMPLEX { float r, i;} COMPLEX;
#define RotorCOMPLEX(a)    (COMPLEX){ cos(a), sin(a) }
#define ConjCOMPLEX(a)     (COMPLEX){ (a).r, -((a).i) }
#define AddCOMPLEX(a,b)    (COMPLEX){ (a).r + (b).r, (a).i + (b).i }
#define SubCOMPLEX(a,b)    (COMPLEX){ (a).r - (b).r, (a).i - (b).i }
#define MulCOMPLEX(a,b)    (COMPLEX){ (a).r*(b).r - (a).i*(b).i, (a).r*(b).i + (a).i*(b).r }
#define ImeMulCOMPLEX(a,b) (COMPLEX){ (a).r * (b), (a).i * (b) }
#define AppendCOMPLEX(a,b) { (a).r += (b).r, (a).i += (b).i ; } 
void FourierTransform2D(COMPLEX **c, int lx, int ly, int flag);
void  **alloc2Darray(int size, int lx, int ly);
void ***alloc3Darray(int size, int lx, int ly, int lz);
void extract_labels(const char *filename, int *label1, int *label2);
int  saveDataAsBMP32( char *fname, void *b, int lx, int ly );
void argumentAnalysis( int argc, char *argv[], int *skip_training,
                       int *save_flag, char **save_name,  int *load_flag, char **load_name );
#define DEFAULT_WEIGHT_FILE   "modelWeight.bin"
#define INPUT_CHANNELS     3
#define INPUT_HEIGHT      32
#define INPUT_WIDTH       32
#define CONV_FILTERS     128
#define KERNEL_SIZE        3
#define POOL_SIZE          2
#define FC_SIZE          512
#define OUTPUT_SIZE       10  // OUTPUT_SIZE  ( mode==CIFAR-10 ? 10 : ( mode==CIFAR-100 ? 100 : 0 )
#define BATCH_SIZE       256
#define EPOCHS            20  // The minimum number of learning sessions is 100.
#define LEARNING_RATE  (0.035*4)
#define MOMENTUM         0.9
#define FFT               -1
#define IFFT               1
#define DFT              FFT
#define IDFT            IFFT
typedef struct {
    char filename[256];
    int label2;
    int label;
    float input[INPUT_CHANNELS][INPUT_HEIGHT][INPUT_WIDTH];
} ImageData;
void show_image_data(ImageData *images, int here);
int  load_image(char *filepath, ImageData *image, int width, int height, int channels);
int  sample(const char *dirname, ImageData **images, int *num_samples);
typedef struct {
    int max_idx[CONV_FILTERS][INPUT_HEIGHT / POOL_SIZE][INPUT_WIDTH / POOL_SIZE][2];
} PoolIndices;
typedef struct {
    float conv1[CONV_FILTERS][INPUT_CHANNELS][KERNEL_SIZE][KERNEL_SIZE];
    float bias1[CONV_FILTERS];
    float m_conv1[CONV_FILTERS][INPUT_CHANNELS][KERNEL_SIZE][KERNEL_SIZE];
    float m_bias1[CONV_FILTERS];
    float conv2[CONV_FILTERS][CONV_FILTERS][KERNEL_SIZE][KERNEL_SIZE];
    float bias2[CONV_FILTERS];
    float m_conv2[CONV_FILTERS][CONV_FILTERS][KERNEL_SIZE][KERNEL_SIZE];
    float m_bias2[CONV_FILTERS];
    float fc1[CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE)][FC_SIZE];
    float bias_fc1[FC_SIZE];
    float m_fc1[CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE)][FC_SIZE];
    float m_bias_fc1[FC_SIZE];
    float fc2[FC_SIZE][OUTPUT_SIZE];
    float bias_fc2[OUTPUT_SIZE];
    float m_fc2[FC_SIZE][OUTPUT_SIZE];
    float m_bias_fc2[OUTPUT_SIZE];
    float bn1_mean[CONV_FILTERS], bn1_var[CONV_FILTERS];
    float bn1_gamma[CONV_FILTERS], bn1_beta[CONV_FILTERS];
    float bn2_mean[CONV_FILTERS], bn2_var[CONV_FILTERS];
    float bn2_gamma[CONV_FILTERS], bn2_beta[CONV_FILTERS];
    float m_bn1_gamma[CONV_FILTERS], m_bn1_beta[CONV_FILTERS];
    float m_bn2_gamma[CONV_FILTERS], m_bn2_beta[CONV_FILTERS];
} ConvolutionalNeuralNetwork;
void init_cnn(ConvolutionalNeuralNetwork *cnn) {
    float limit;
    limit = sqrtf(6.0f / (INPUT_CHANNELS * KERNEL_SIZE * KERNEL_SIZE));
    for(int f=0; f<CONV_FILTERS; f++) {
        for(int c=0; c<INPUT_CHANNELS; c++) {
            for(int y=0; y<KERNEL_SIZE; y++) {
                for(int x=0; x<KERNEL_SIZE; x++) {
                    cnn->conv1[f][c][y][x]   = ((float)rand() / RAND_MAX * 2.0f - 1.0f) * limit;
                    cnn->m_conv1[f][c][y][x] = 0.0f;
                }
            }
        }
        cnn->bias1[f]       = 0.0f;
        cnn->m_bias1[f]     = 0.0f;
        cnn->bn1_mean[f]    = 0.0f;
        cnn->bn1_var[f]     = 1.0f;
        cnn->bn1_gamma[f]   = 1.0f;
        cnn->bn1_beta[f]    = 0.0f;
        cnn->m_bn1_gamma[f] = 0.0f;
        cnn->m_bn1_beta[f]  = 0.0f;
    }
    limit = sqrtf(6.0f / (CONV_FILTERS * KERNEL_SIZE * KERNEL_SIZE));
    for(int f=0; f<CONV_FILTERS; f++) {
        for(int c=0; c<CONV_FILTERS; c++) {
            for(int y=0; y<KERNEL_SIZE; y++) {
                for(int x=0; x<KERNEL_SIZE; x++) {
                    cnn->conv2[f][c][y][x]   = ((float)rand() / RAND_MAX * 2.0f - 1.0f) * limit;
                    cnn->m_conv2[f][c][y][x] = 0.0f;
                }
            }
        }
        cnn->bias2[f]       = 0.0f;
        cnn->m_bias2[f]     = 0.0f;
        cnn->bn2_mean[f]    = 0.0f;
        cnn->bn2_var[f]     = 1.0f;
        cnn->bn2_gamma[f]   = 1.0f;
        cnn->bn2_beta[f]    = 0.0f;
        cnn->m_bn2_gamma[f] = 0.0f;
        cnn->m_bn2_beta[f]  = 0.0f;
    }
    limit = sqrtf(6.0f / (CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE)));
    for(int i=0; i<CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE); i++) {
        for(int j=0; j<FC_SIZE; j++) {
            cnn->fc1[i][j]   = ((float)rand() / RAND_MAX * 2.0f - 1.0f) * limit;
            cnn->m_fc1[i][j] = 0.0f;
        }
    }
    for(int i=0; i<FC_SIZE; i++) {
        cnn->bias_fc1[i]   = 0.0f;
        cnn->m_bias_fc1[i] = 0.0f;
    }
    limit = sqrtf(6.0f / FC_SIZE);
    for(int i=0; i<FC_SIZE; i++) {
        for(int j=0; j<OUTPUT_SIZE; j++) {
            cnn->fc2[i][j]   = ((float)rand() / RAND_MAX * 2.0f - 1.0f) * limit;
            cnn->m_fc2[i][j] = 0.0f;
        }
    }
    for(int i=0; i<OUTPUT_SIZE; i++) {
        cnn->bias_fc2[i]   = 0.0f;
        cnn->m_bias_fc2[i] = 0.0f;
    }
}
void batch_norm(float *input, float *mean, float *var, float *gamma, float *beta, float *output, int size) {
    #pragma omp parallel for
    for(int f=0; f<size; f++) {
        for(int y=0; y<INPUT_HEIGHT; y++)for(int x=0; x<INPUT_WIDTH; x++) {
            int    idx  = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x;
            output[idx] = gamma[f] * (input[idx] - mean[f]) / sqrtf(var[f] + 1e-5) + beta[f];
        }
    }
}
void softmax(float *input, float *output, int size) {
    float max_val = input[0];
    for(int i=1; i<size; i++){ if(input[i]>max_val) max_val = input[i]; }
    
    float sum = 0.0f;
    for(int i=0; i<size; i++) sum += ( output[i] = expf(input[i] - max_val) );
    for(int i=0; i<size; i++) output[i] /= sum;
}
void conv_layer_fft( int in_channels, int out_channels, float input[in_channels][INPUT_HEIGHT][INPUT_WIDTH], 
                     float weights[out_channels][in_channels][KERNEL_SIZE][KERNEL_SIZE], float biases[out_channels], float *output) {
    const int pad = KERNEL_SIZE / 2;
    const int fft_size = 64;
    #pragma omp parallel for
    for(int f=0; f<out_channels; f++) {
        COMPLEX **fft_input  = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size);
        COMPLEX **fft_weight = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size);
        if(!fft_input || !fft_weight){ perror("メモリ割り当て失敗"); free(fft_input); free(fft_weight); continue; }
        for(int c=0; c<in_channels; c++) {
            for(int y=0; y<fft_size; y++) {
                for(int x=0; x<fft_size; x++) {
                    fft_input[y][x].r  = 0.0;
                    fft_input[y][x].i  = 0.0;
                    fft_weight[y][x].r = 0.0;
                    fft_weight[y][x].i = 0.0;
                }
            }
            for(int y=0; y<INPUT_HEIGHT; y++) {
                for(int x=0; x<INPUT_WIDTH; x++) {
                    fft_input[y + pad][x + pad].r = input[c][y][x];
                }
            }
            for(int y=0; y<KERNEL_SIZE; y++) {
                for(int x=0; x<KERNEL_SIZE; x++) {
                    fft_weight[y][x].r = weights[f][c][y][x];
                }
            }
            FourierTransform2D(fft_input,  fft_size, fft_size, FFT);
            FourierTransform2D(fft_weight, fft_size, fft_size, FFT);
            for(int y=0; y<fft_size; y++) {
                for(int x=0; x<fft_size; x++) {
                    fft_input[y][x] = MulCOMPLEX(fft_input[y][x], fft_weight[y][x]);
                }
            }
            FourierTransform2D(fft_input, fft_size, fft_size, IFFT);
            for(int y=0; y<INPUT_HEIGHT; y++) {
                for(int x=0; x<INPUT_WIDTH; x++) {
                    output[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x] += fft_input[y + pad][x + pad].r;
                }
            }
        }
        for(int y=0; y<INPUT_HEIGHT; y++) {
            for(int x=0; x<INPUT_WIDTH; x++) {
                output[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x] += biases[f];
            }
        }
        free(fft_input); free(fft_weight);
    }
}
void fc_layer(int input_size, int output_size, float *input, float weights[input_size][output_size], float biases[output_size], float *output) {
    #pragma omp parallel for
    for(int j=0; j<output_size; j++) {  output[j]  = biases[j];
        for(int i=0; i<input_size; i++) output[j] += input[i] * weights[i][j];
        output[j] = output[j] > 0 ? output[j] : 0;
    }
}
#define  save_cnn save_weights
void save_weights(ConvolutionalNeuralNetwork *cnn, const char *filename) {
    FILE *fp = fopen(filename, "wb");
    if(!fp){ perror("重み保存失敗"); return; }
    fwrite(cnn->conv1,     sizeof(cnn->conv1),     1, fp);
    fwrite(cnn->bias1,     sizeof(cnn->bias1),     1, fp);
    fwrite(cnn->conv2,     sizeof(cnn->conv2),     1, fp);
    fwrite(cnn->bias2,     sizeof(cnn->bias2),     1, fp);
    fwrite(cnn->fc1,       sizeof(cnn->fc1),       1, fp);
    fwrite(cnn->bias_fc1,  sizeof(cnn->bias_fc1),  1, fp);
    fwrite(cnn->fc2,       sizeof(cnn->fc2),       1, fp);
    fwrite(cnn->bias_fc2,  sizeof(cnn->bias_fc2),  1, fp);
    fwrite(cnn->bn1_mean,  sizeof(cnn->bn1_mean),  1, fp);
    fwrite(cnn->bn1_var,   sizeof(cnn->bn1_var),   1, fp);
    fwrite(cnn->bn1_gamma, sizeof(cnn->bn1_gamma), 1, fp);
    fwrite(cnn->bn1_beta,  sizeof(cnn->bn1_beta),  1, fp);
    fwrite(cnn->bn2_mean,  sizeof(cnn->bn2_mean),  1, fp);
    fwrite(cnn->bn2_var,   sizeof(cnn->bn2_var),   1, fp);
    fwrite(cnn->bn2_gamma, sizeof(cnn->bn2_gamma), 1, fp);
    fwrite(cnn->bn2_beta,  sizeof(cnn->bn2_beta),  1, fp);
    fclose(fp);
}
#define  load_cnn load_weights
void load_weights(ConvolutionalNeuralNetwork *cnn, const char *filename) {
    FILE *fp = fopen(filename, "rb");
    if(!fp){ perror("重み読み込み失敗"); return; }
    fread(cnn->conv1,      sizeof(cnn->conv1),     1, fp);
    fread(cnn->bias1,      sizeof(cnn->bias1),     1, fp);
    fread(cnn->conv2,      sizeof(cnn->conv2),     1, fp);
    fread(cnn->bias2,      sizeof(cnn->bias2),     1, fp);
    fread(cnn->fc1,        sizeof(cnn->fc1),       1, fp);
    fread(cnn->bias_fc1,   sizeof(cnn->bias_fc1),  1, fp);
    fread(cnn->fc2,        sizeof(cnn->fc2),       1, fp);
    fread(cnn->bias_fc2,   sizeof(cnn->bias_fc2),  1, fp);
    fread(cnn->bn1_mean,   sizeof(cnn->bn1_mean),  1, fp);
    fread(cnn->bn1_var,    sizeof(cnn->bn1_var),   1, fp);
    fread(cnn->bn1_gamma,  sizeof(cnn->bn1_gamma), 1, fp);
    fread(cnn->bn1_beta,   sizeof(cnn->bn1_beta),  1, fp);
    fread(cnn->bn2_mean,   sizeof(cnn->bn2_mean),  1, fp);
    fread(cnn->bn2_var,    sizeof(cnn->bn2_var),   1, fp);
    fread(cnn->bn2_gamma,  sizeof(cnn->bn2_gamma), 1, fp);
    fread(cnn->bn2_beta,   sizeof(cnn->bn2_beta),  1, fp);
    fclose(fp);
}
int evaluate(ConvolutionalNeuralNetwork *cnn, ImageData *images, int num_samples, double *eval_time) {
    double start_time = omp_get_wtime();
    float *conv1_out  = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
    float *bn1_out    = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
    float *conv2_out  = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
    float *bn2_out    = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
    float *pool_out   = (float *)calloc(CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE), sizeof(float));
    float *fc1_out    = (float *)calloc(FC_SIZE,     sizeof(float));
    float *fc2_out    = (float *)calloc(OUTPUT_SIZE, sizeof(float));
    if(!conv1_out || !bn1_out || !conv2_out || !bn2_out || !pool_out || !fc1_out || !fc2_out) {
        perror("メモリ割り当て失敗");
        free(conv1_out); free(bn1_out); free(conv2_out); free(bn2_out);
        free(pool_out);  free(fc1_out); free(fc2_out);
        return 1;
    }
    int correct = 0;
    for(int i=0; i<num_samples; i++) {
        conv_layer_fft(INPUT_CHANNELS, CONV_FILTERS, images[i].input, cnn->conv1, cnn->bias1, conv1_out);
        batch_norm(conv1_out, cnn->bn1_mean, cnn->bn1_var, cnn->bn1_gamma, cnn->bn1_beta, bn1_out, CONV_FILTERS);
        #pragma omp parallel for
        for(int idx=0; idx < CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH; idx++) bn1_out[idx] = fmaxf(0.0f, bn1_out[idx]);
        conv_layer_fft(CONV_FILTERS, CONV_FILTERS, (float (*)[INPUT_HEIGHT][INPUT_WIDTH])bn1_out, cnn->conv2, cnn->bias2, conv2_out);
        #pragma omp parallel for
        for(int f=0; f<CONV_FILTERS; f++) {
            for(int y=0; y<INPUT_HEIGHT; y++) {
                for(int x=0; x<INPUT_WIDTH; x++) {
                    int       idx   = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x;
                    conv2_out[idx] += bn1_out[idx];
                }
            }
        }
        batch_norm(conv2_out, cnn->bn2_mean, cnn->bn2_var, cnn->bn2_gamma, cnn->bn2_beta, bn2_out, CONV_FILTERS);
        #pragma omp parallel for
        for(int idx=0; idx < CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH; idx++) bn2_out[idx] = fmaxf(0.0f, bn2_out[idx]);
        #pragma omp parallel for collapse(2)
        for(int f=0; f<CONV_FILTERS; f++) {
            for(int y=0; y<INPUT_HEIGHT / POOL_SIZE; y++) {
                for(int x=0; x<INPUT_WIDTH / POOL_SIZE; x++) {
                    float max_val = -INFINITY;
                    for(int py = 0; py < POOL_SIZE; py++) {
                        for(int px = 0; px < POOL_SIZE; px++) {
                            int    iy = y * POOL_SIZE + py;
                            int    ix = x * POOL_SIZE + px;
                            float val = bn2_out[f * INPUT_HEIGHT * INPUT_WIDTH + iy * INPUT_WIDTH + ix];
                            if(val > max_val) max_val = val;
                        }
                    }
                    pool_out[f * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE) + y * (INPUT_WIDTH / POOL_SIZE) + x] = max_val;
                }
            }
        }
        int pool_size = CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE);
        fc_layer(pool_size, FC_SIZE, pool_out, cnn->fc1, cnn->bias_fc1, fc1_out);
        fc_layer(FC_SIZE, OUTPUT_SIZE, fc1_out, cnn->fc2, cnn->bias_fc2, fc2_out);
        float output[OUTPUT_SIZE];
        softmax(fc2_out, output, OUTPUT_SIZE);
        int  pred = 0;
        for(int j=1; j<OUTPUT_SIZE; j++) {
            if(output[j] > output[pred]) pred = j;
        }
        if(pred == images[i].label) correct++;
    }
    *eval_time = omp_get_wtime() - start_time;
    printf("評価精度: %.2f%%\n", 100.0f * correct / num_samples);
    free(conv1_out); free(bn1_out); free(conv2_out); free(bn2_out);
    free(pool_out);  free(fc1_out); free(fc2_out);
    return 0;
}
int train(ConvolutionalNeuralNetwork *cnn, ImageData *images, int num_samples, int start_epoch, double *train_time) {
    double start_time   = omp_get_wtime();
    float *grad_fc2     = (float *)calloc(FC_SIZE * OUTPUT_SIZE,                        sizeof(float));
    float grad_bias_fc2[OUTPUT_SIZE] = {0};
    float *grad_fc1     = (float *)calloc(CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE)
                                                       * (INPUT_WIDTH  / POOL_SIZE)
                                                       *  FC_SIZE,                      sizeof(float));
    float grad_bias_fc1[FC_SIZE] = {0};
    float *grad_conv2   = (float *)calloc(CONV_FILTERS * CONV_FILTERS * KERNEL_SIZE 
                                                                      * KERNEL_SIZE,    sizeof(float));
    float grad_bias2[CONV_FILTERS] = {0};
    float *grad_conv1   = (float *)calloc(CONV_FILTERS * INPUT_CHANNELS * KERNEL_SIZE
                                                                        * KERNEL_SIZE,  sizeof(float));
    float grad_bias1[CONV_FILTERS] = {0};
    float grad_bn1_gamma[CONV_FILTERS] = {0}, grad_bn1_beta[CONV_FILTERS] = {0};
    float grad_bn2_gamma[CONV_FILTERS] = {0}, grad_bn2_beta[CONV_FILTERS] = {0};
    float *conv1_out    = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH,    sizeof(float));
    float *bn1_out      = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH,    sizeof(float));
    float *conv2_out    = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH,    sizeof(float));
    float *bn2_out      = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH,    sizeof(float));
    float *pool_out     = (float *)calloc(CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE)
                                                       * (INPUT_WIDTH  / POOL_SIZE),    sizeof(float));
    float *fc1_out      = (float *)calloc(FC_SIZE,                                      sizeof(float));
    float *fc2_out      = (float *)calloc(OUTPUT_SIZE,                                  sizeof(float));
    float *grad_pool    = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH,    sizeof(float));
    float *grad_bn2_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH,    sizeof(float));
    float *grad_bn1_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH,    sizeof(float));
    float *grad_add     = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH,    sizeof(float));
    float *grad_from_conv2 = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
    float *grad_conv1_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH,  sizeof(float));
    PoolIndices pool_indices = {0};
    if( !grad_fc2 || !grad_fc1  || !grad_conv2 || !grad_conv1 || !conv1_out || !bn1_out      || !conv2_out    ||
        !bn2_out  || !pool_out  || !fc1_out    || !fc2_out    || !grad_pool || !grad_bn2_out || !grad_bn1_out ||
        !grad_add || !grad_from_conv2 || !grad_conv1_out) {
        perror("メモリ割り当て失敗");    // free all
        free(grad_fc2);  free(grad_fc1); free(grad_conv2); free(grad_conv1);
        free(conv1_out); free(bn1_out);  free(conv2_out);  free(bn2_out);
        free(pool_out);  free(fc1_out);  free(fc2_out);
        free(grad_pool); free(grad_bn2_out);    free(grad_bn1_out);
        free(grad_add);  free(grad_from_conv2); free(grad_conv1_out);
        return 1;
    }
    float learning_rate = LEARNING_RATE;
    for(int epoch = start_epoch; epoch < EPOCHS; epoch++) {
        if(epoch % 10 == 0 && epoch > start_epoch) learning_rate *= 0.1;
        float loss  = 0.0f;
        int correct = 0;
        for(int b=0; b<num_samples; b+=BATCH_SIZE){ printf(" % 6d: % 6d (% 6d / %d )   % 4d / %d  elapsed time = %.2fsec\n",
                                                           b, num_samples/BATCH_SIZE, num_samples, BATCH_SIZE, epoch, EPOCHS, 
                                                           omp_get_wtime() - start_time );
            memset(grad_fc2,       0, sizeof(float) * FC_SIZE * OUTPUT_SIZE );
            memset(grad_bias_fc2,  0, sizeof(grad_bias_fc2));
            memset(grad_fc1,       0, sizeof(float) * CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE) * FC_SIZE);
            memset(grad_bias_fc1,  0, sizeof(grad_bias_fc1));
            memset(grad_conv2,     0, sizeof(float) * CONV_FILTERS * CONV_FILTERS   * KERNEL_SIZE * KERNEL_SIZE);
            memset(grad_bias2,     0, sizeof(grad_bias2));
            memset(grad_conv1,     0, sizeof(float) * CONV_FILTERS * INPUT_CHANNELS * KERNEL_SIZE * KERNEL_SIZE);
            memset(grad_bias1,     0, sizeof(grad_bias1));
            memset(grad_bn1_gamma, 0, sizeof(grad_bn1_gamma));
            memset(grad_bn1_beta,  0, sizeof(grad_bn1_beta));
            memset(grad_bn2_gamma, 0, sizeof(grad_bn2_gamma));
            memset(grad_bn2_beta,  0, sizeof(grad_bn2_beta));
            memset(grad_add,       0, sizeof(float) * CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH);
            memset(grad_from_conv2,0, sizeof(float) * CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH);
            memset(grad_conv1_out, 0, sizeof(float) * CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH);
            memset(&pool_indices,  0, sizeof(PoolIndices));
            
            for(int i = b; i < b + BATCH_SIZE && i < num_samples; i++) {
                conv_layer_fft(INPUT_CHANNELS, CONV_FILTERS, images[i].input, cnn->conv1, cnn->bias1, conv1_out);
                batch_norm(conv1_out, cnn->bn1_mean, cnn->bn1_var, cnn->bn1_gamma, cnn->bn1_beta, bn1_out, CONV_FILTERS);
                #pragma omp parallel for
                for(int idx=0; idx < CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH; idx++) bn1_out[idx] = fmaxf(0.0f, bn1_out[idx]);
                conv_layer_fft(CONV_FILTERS, CONV_FILTERS, (float (*)[INPUT_HEIGHT][INPUT_WIDTH])bn1_out, cnn->conv2, cnn->bias2, conv2_out);
                #pragma omp parallel for
                for(int f=0; f<CONV_FILTERS; f++) {
                    for(int y=0; y<INPUT_HEIGHT; y++) {
                        for(int x=0; x<INPUT_WIDTH; x++) {
                            int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x;
                            conv2_out[idx] += bn1_out[idx];
                        }
                    }
                }
                batch_norm(conv2_out, cnn->bn2_mean, cnn->bn2_var, cnn->bn2_gamma, cnn->bn2_beta, bn2_out, CONV_FILTERS);
                #pragma omp parallel for
                for(int idx=0; idx < CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH; idx++) bn2_out[idx] = fmaxf(0.0f, bn2_out[idx]);
                #pragma omp parallel for collapse(2)
                for(int f=0; f<CONV_FILTERS; f++) {
                    for(int y=0; y<INPUT_HEIGHT / POOL_SIZE; y++) {
                        for(int x=0; x<INPUT_WIDTH / POOL_SIZE; x++) {
                            float max_val = -INFINITY;
                            int max_y = y * POOL_SIZE, max_x = x * POOL_SIZE;
                            for(int py = 0; py < POOL_SIZE; py++) {
                                for(int px = 0; px < POOL_SIZE; px++) {
                                    int iy = y * POOL_SIZE + py;
                                    int ix = x * POOL_SIZE + px;
                                    float val = bn2_out[f * INPUT_HEIGHT * INPUT_WIDTH + iy * INPUT_WIDTH + ix];
                                    if(isnan(val) || isinf(val)) continue;
                                    if(val > max_val) {
                                        max_val = val;
                                        max_y   = iy;
                                        max_x   = ix;
                                    }
                                }
                            }
                            pool_out[f * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE) + y * (INPUT_WIDTH / POOL_SIZE) + x] = max_val;
                            pool_indices.max_idx[f][y][x][0] = max_y;
                            pool_indices.max_idx[f][y][x][1] = max_x;
                        }
                    }
                }
                int pool_size = CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE);
                fc_layer(pool_size, FC_SIZE, pool_out, cnn->fc1, cnn->bias_fc1, fc1_out);
                fc_layer(FC_SIZE, OUTPUT_SIZE, fc1_out, cnn->fc2, cnn->bias_fc2, fc2_out);
                float output[OUTPUT_SIZE];
                softmax(fc2_out, output, OUTPUT_SIZE);
                int label = images[i].label;
                if(label < 0 || label >= OUTPUT_SIZE) continue;
                loss -= logf(output[label] + 1e-10);
                int pred = 0;
                for(int j=1; j<OUTPUT_SIZE; j++) {
                    if(output[j] > output[pred]) pred = j;
                }
                correct += (pred == label) ? 1 : 0;
                float grad_fc2_in[OUTPUT_SIZE];
                for(int j=0; j<OUTPUT_SIZE; j++) {
                    grad_fc2_in[j] = output[j] - (j == label ? 1.0f : 0.0f);
                }
                #pragma omp parallel for
                for(int j=0; j<OUTPUT_SIZE; j++) {
                    grad_bias_fc2[j] += grad_fc2_in[j];
                    for(int k=0; k<FC_SIZE; k++) {
                        grad_fc2[k * OUTPUT_SIZE + j] += grad_fc2_in[j] * fc1_out[k];
                    }
                }
                float grad_fc1_in[FC_SIZE] = {0};
                #pragma omp parallel for
                for(int k=0; k<FC_SIZE; k++) {
                    grad_fc1_in[k] = 0.0f;
                    if(fc1_out[k] > 0) {
                        for(int j=0; j<OUTPUT_SIZE; j++) {
                            grad_fc1_in[k] += grad_fc2_in[j] * cnn->fc2[k][j];
                        }
                    }
                    grad_bias_fc1[k] += grad_fc1_in[k];
                    for(int m=0; m<pool_size; m++) {
                        grad_fc1[m * FC_SIZE + k] += grad_fc1_in[k] * pool_out[m];
                    }
                }
                #pragma omp parallel for
                for(int f=0; f<CONV_FILTERS; f++) {
                    for(int y=0; y<INPUT_HEIGHT / POOL_SIZE; y++) {
                        for(int x=0; x<INPUT_WIDTH / POOL_SIZE; x++) {
                            int max_y = pool_indices.max_idx[f][y][x][0];
                            int max_x = pool_indices.max_idx[f][y][x][1];
                            if(max_y >= 0 && max_y < INPUT_HEIGHT && max_x >= 0 && max_x < INPUT_WIDTH) {
                                grad_pool[f * INPUT_HEIGHT * INPUT_WIDTH + max_y * INPUT_WIDTH + max_x] =
                                    grad_fc1_in[f * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE) + y * (INPUT_WIDTH / POOL_SIZE) + x];
                            }
                        }
                    }
                }
                #pragma omp parallel for collapse(2)
                for(int f=0; f<CONV_FILTERS; f++) {
                    for(int y=0; y<INPUT_HEIGHT; y++) {
                        for(int x=0; x<INPUT_WIDTH; x++) {
                            int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x;
                            grad_bn2_out[idx] = grad_pool[idx] * (bn2_out[idx] > 0 ? 1.0f : 0.0f);
                        }
                    }
                }
                #pragma omp parallel for collapse(2)
                for(int f=0; f<CONV_FILTERS; f++) {
                    for(int y=0; y<INPUT_HEIGHT; y++) {
                        for(int x=0; x<INPUT_WIDTH; x++) {
                            int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x;
                            grad_bn2_gamma[f] += grad_bn2_out[idx] * (conv2_out[idx] - cnn->bn2_mean[f]) / sqrtf(cnn->bn2_var[f] + 1e-5);
                            grad_bn2_beta[f]  += grad_bn2_out[idx];
                        }
                    }
                }
                #pragma omp parallel for collapse(2)
                for(int f=0; f<CONV_FILTERS; f++) {
                    for(int y=0; y<INPUT_HEIGHT; y++) {
                        for(int x=0; x<INPUT_WIDTH; x++) {
                            int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x;
                            grad_add[idx] = grad_bn2_out[idx] * cnn->bn2_gamma[f] / sqrtf(cnn->bn2_var[f] + 1e-5);
                        }
                    }
                }
                const int pad = KERNEL_SIZE / 2;
                const int fft_size = 64;
                COMPLEX **fft_grad = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size);
                COMPLEX **fft_weight = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size);
                if(!fft_grad || !fft_weight) { perror("メモリ割り当て失敗"); free(fft_grad); free(fft_weight); continue; }
                #pragma omp parallel for
                for(int f=0; f<CONV_FILTERS; f++) {
                    for(int c=0; c<CONV_FILTERS; c++) {
                        for(int y=0; y<fft_size; y++) {
                            for(int x=0; x<fft_size; x++) {
                                fft_grad[y][x].r = 0.0f;
                                fft_grad[y][x].i = 0.0f;
                                fft_weight[y][x].r = 0.0f;
                                fft_weight[y][x].i = 0.0f;
                            }
                        }
                        for(int y=0; y<INPUT_HEIGHT; y++) {
                            for(int x=0; x<INPUT_WIDTH; x++) {
                                fft_grad[y + pad][x + pad].r = grad_add[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x];
                            }
                        }
                        for(int y=0; y<KERNEL_SIZE; y++) {
                            for(int x=0; x<KERNEL_SIZE; x++) {
                                fft_weight[y][x].r = cnn->conv2[f][c][y][x];
                            }
                        }
                        FourierTransform2D(fft_grad, fft_size, fft_size, FFT);
                        FourierTransform2D(fft_weight, fft_size, fft_size, FFT);
                        for(int y=0; y<fft_size; y++) {
                            for(int x=0; x<fft_size; x++) {
                                fft_grad[y][x] = MulCOMPLEX(fft_grad[y][x], ConjCOMPLEX(fft_weight[y][x]));
                            }
                        }
                        FourierTransform2D(fft_grad, fft_size, fft_size, IFFT);
                        for(int ky = 0; ky < KERNEL_SIZE; ky++) {
                            for(int kx = 0; kx < KERNEL_SIZE; kx++) {
                                grad_conv2[f * CONV_FILTERS * KERNEL_SIZE * KERNEL_SIZE + c * KERNEL_SIZE * KERNEL_SIZE + ky * KERNEL_SIZE + kx] += fft_grad[ky][kx].r;
                            }
                        }
                    }
                    for(int y=0; y<INPUT_HEIGHT; y++) {
                        for(int x=0; x<INPUT_WIDTH; x++) {
                            grad_bias2[f] += grad_add[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x];
                        }
                    }
                }
                free(fft_grad);
                free(fft_weight);
                // Add the missing grad_from_conv2 (backprop for input of conv2)
                fft_grad = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size);
                fft_weight = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size);
                if(!fft_grad || !fft_weight) { perror("メモリ割り当て失敗"); free(fft_grad); free(fft_weight); continue; }
                #pragma omp parallel for
                for(int c=0; c<CONV_FILTERS; c++) {
                    for(int f=0; f<CONV_FILTERS; f++) {
                        for(int y=0; y<fft_size; y++) {
                            for(int x=0; x<fft_size; x++) {
                                fft_grad[y][x].r = 0.0f;
                                fft_grad[y][x].i = 0.0f;
                                fft_weight[y][x].r = 0.0f;
                                fft_weight[y][x].i = 0.0f;
                            }
                        }
                        for(int y=0; y<INPUT_HEIGHT; y++) {
                            for(int x=0; x<INPUT_WIDTH; x++) {
                                fft_grad[y + pad][x + pad].r = grad_add[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x];
                            }
                        }
                        for(int y=0; y<KERNEL_SIZE; y++) {
                            for(int x=0; x<KERNEL_SIZE; x++) {
                                fft_weight[y][x].r = cnn->conv2[f][c][y][x];
                            }
                        }
                        FourierTransform2D(fft_grad, fft_size, fft_size, FFT);
                        FourierTransform2D(fft_weight, fft_size, fft_size, FFT);
                        for(int y=0; y<fft_size; y++) {
                            for(int x=0; x<fft_size; x++) {
                                fft_grad[y][x] = MulCOMPLEX(fft_grad[y][x], fft_weight[y][x]);
                            }
                        }
                        FourierTransform2D(fft_grad, fft_size, fft_size, IFFT);
                        for(int y=0; y<INPUT_HEIGHT; y++) {
                            for(int x=0; x<INPUT_WIDTH; x++) {
                                grad_from_conv2[c * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x] += fft_grad[y + pad][x + pad].r;
                            }
                        }
                    }
                }
                free(fft_grad);
                free(fft_weight);
                #pragma omp parallel for collapse(2)
                for(int f=0; f<CONV_FILTERS; f++) {
                    for(int y=0; y<INPUT_HEIGHT; y++) {
                        for(int x=0; x<INPUT_WIDTH; x++) {
                            int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x;
                            grad_bn1_out[idx] = grad_from_conv2[idx] * (bn1_out[idx] > 0.0f ? 1.0f : 0.0f) + grad_add[idx];
                            grad_bn1_gamma[f] += grad_bn1_out[idx] * (conv1_out[idx] - cnn->bn1_mean[f]) / sqrtf(cnn->bn1_var[f] + 1e-5);
                            grad_bn1_beta[f]  += grad_bn1_out[idx];
                        }
                    }
                }
                #pragma omp parallel for collapse(2)
                for(int f=0; f<CONV_FILTERS; f++) {
                    for(int y=0; y<INPUT_HEIGHT; y++) {
                        for(int x=0; x<INPUT_WIDTH; x++) {
                            int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x;
                            grad_conv1_out[idx] = grad_bn1_out[idx] * cnn->bn1_gamma[f] / sqrtf(cnn->bn1_var[f] + 1e-5);
                        }
                    }
                }
                fft_grad = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size);
                fft_weight = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size);
                if(!fft_grad || !fft_weight) { perror("メモリ割り当て失敗"); free(fft_grad); free(fft_weight); continue; }
                #pragma omp parallel for
                for(int f=0; f<CONV_FILTERS; f++) {
                    for(int c=0; c<INPUT_CHANNELS; c++) {
                        for(int y=0; y<fft_size; y++) {
                            for(int x=0; x<fft_size; x++) {
                                fft_grad[y][x].r = 0.0f;
                                fft_grad[y][x].i = 0.0f;
                                fft_weight[y][x].r = 0.0f;
                                fft_weight[y][x].i = 0.0f;
                            }
                        }
                        for(int y=0; y<INPUT_HEIGHT; y++) {
                            for(int x=0; x<INPUT_WIDTH; x++) {
                                fft_grad[y + pad][x + pad].r = grad_conv1_out[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x];
                                fft_weight[y + pad][x + pad].r = images[i].input[c][y][x];
                            }
                        }
                        FourierTransform2D(fft_grad, fft_size, fft_size, FFT);
                        FourierTransform2D(fft_weight, fft_size, fft_size, FFT);
                        for(int y=0; y<fft_size; y++) {
                            for(int x=0; x<fft_size; x++) {
                                fft_grad[y][x] = MulCOMPLEX(fft_grad[y][x], ConjCOMPLEX(fft_weight[y][x]));
                            }
                        }
                        FourierTransform2D(fft_grad, fft_size, fft_size, IFFT);
                        for(int ky = 0; ky < KERNEL_SIZE; ky++) {
                            for(int kx = 0; kx < KERNEL_SIZE; kx++) {
                                grad_conv1[f * INPUT_CHANNELS * KERNEL_SIZE * KERNEL_SIZE + c * KERNEL_SIZE * KERNEL_SIZE + ky * KERNEL_SIZE + kx] += fft_grad[ky][kx].r;
                            }
                        }
                    }
                    for(int y=0; y<INPUT_HEIGHT; y++) {
                        for(int x=0; x<INPUT_WIDTH; x++) {
                            grad_bias1[f] += grad_conv1_out[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x];
                        }
                    }
                }
                free(fft_grad);
                free(fft_weight);
            }
            // Update parameters
            #pragma omp parallel for
            for(int j=0; j<OUTPUT_SIZE; j++) {
                cnn->m_bias_fc2[j] = MOMENTUM * cnn->m_bias_fc2[j] - learning_rate * grad_bias_fc2[j] / BATCH_SIZE;
                cnn->bias_fc2[j]  += cnn->m_bias_fc2[j];
                for(int k = 0; k < FC_SIZE; k++) {
                    cnn->m_fc2[k][j] = MOMENTUM * cnn->m_fc2[k][j] - learning_rate * grad_fc2[k * OUTPUT_SIZE + j] / BATCH_SIZE;
                    cnn->fc2[k][j]  += cnn->m_fc2[k][j];
                }
            }
            int pool_size = CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE);
            #pragma omp parallel for
            for(int k = 0; k < FC_SIZE; k++) {
                cnn->m_bias_fc1[k] = MOMENTUM * cnn->m_bias_fc1[k] - learning_rate * grad_bias_fc1[k] / BATCH_SIZE;
                cnn->bias_fc1[k]  += cnn->m_bias_fc1[k];
                for(int m = 0; m < pool_size; m++) {
                    cnn->m_fc1[m][k] = MOMENTUM * cnn->m_fc1[m][k] - learning_rate * grad_fc1[m * FC_SIZE + k] / BATCH_SIZE;
                    cnn->fc1[m][k]  += cnn->m_fc1[m][k];
                }
            }
            
            #pragma omp parallel for
            for(int f=0; f<CONV_FILTERS; f++) {
                cnn->m_bias2[f] = MOMENTUM * cnn->m_bias2[f] - learning_rate * grad_bias2[f] / BATCH_SIZE;
                cnn->bias2[f] += cnn->m_bias2[f];
                for(int c=0; c<CONV_FILTERS; c++) {
                    for(int ky = 0; ky < KERNEL_SIZE; ky++) {
                        for(int kx = 0; kx < KERNEL_SIZE; kx++) {
                            int idx = f * CONV_FILTERS * KERNEL_SIZE * KERNEL_SIZE + c * KERNEL_SIZE * KERNEL_SIZE + ky * KERNEL_SIZE + kx;
                            cnn->m_conv2[f][c][ky][kx] = MOMENTUM * cnn->m_conv2[f][c][ky][kx] - learning_rate * grad_conv2[idx] / BATCH_SIZE;
                            cnn->conv2[f][c][ky][kx]  += cnn->m_conv2[f][c][ky][kx];
                        }
                    }
                }
                cnn->m_bn2_gamma[f] = MOMENTUM * cnn->m_bn2_gamma[f] - learning_rate * grad_bn2_gamma[f] / BATCH_SIZE;
                cnn->bn2_gamma[f]  += cnn->m_bn2_gamma[f];
                cnn->m_bn2_beta[f]  = MOMENTUM * cnn->m_bn2_beta[f] - learning_rate * grad_bn2_beta[f] / BATCH_SIZE;
                cnn->bn2_beta[f]   += cnn->m_bn2_beta[f];
            }
            #pragma omp parallel for
            for(int f=0; f<CONV_FILTERS; f++) {
                cnn->m_bias1[f] = MOMENTUM * cnn->m_bias1[f] - learning_rate * grad_bias1[f] / BATCH_SIZE;
                cnn->bias1[f] += cnn->m_bias1[f];
                for(int c=0; c<INPUT_CHANNELS; c++) {
                    for(int ky = 0; ky < KERNEL_SIZE; ky++) {
                        for(int kx = 0; kx < KERNEL_SIZE; kx++) {
                            int idx = f * INPUT_CHANNELS * KERNEL_SIZE * KERNEL_SIZE + c * KERNEL_SIZE * KERNEL_SIZE + ky * KERNEL_SIZE + kx;
                            cnn->m_conv1[f][c][ky][kx] = MOMENTUM * cnn->m_conv1[f][c][ky][kx] - learning_rate * grad_conv1[idx] / BATCH_SIZE;
                            cnn->conv1[f][c][ky][kx]  += cnn->m_conv1[f][c][ky][kx];
                        }
                    }
                }
                cnn->m_bn1_gamma[f] = MOMENTUM * cnn->m_bn1_gamma[f] - learning_rate * grad_bn1_gamma[f] / BATCH_SIZE;
                cnn->bn1_gamma[f]  += cnn->m_bn1_gamma[f];
                cnn->m_bn1_beta[f]  = MOMENTUM * cnn->m_bn1_beta[f] - learning_rate * grad_bn1_beta[f] / BATCH_SIZE;
                cnn->bn1_beta[f]   += cnn->m_bn1_beta[f];
            }
        }
        printf("エポック %d: 損失=%.4f, 精度=%.2f%%\n", epoch, loss / num_samples, 100.0f * correct / num_samples);
    }
    *train_time = omp_get_wtime() - start_time;
    // free all
    free(grad_fc2);  free(grad_fc1);        free(grad_conv2); free(grad_conv1);
    free(conv1_out); free(bn1_out);         free(conv2_out);  free(bn2_out);
    free(pool_out);  free(fc1_out);         free(fc2_out);
    free(grad_pool); free(grad_bn2_out);    free(grad_bn1_out);
    free(grad_add);  free(grad_from_conv2); free(grad_conv1_out);
    return 0;
}
int main(int argc, char *argv[]){ srand(time(NULL));
    double start_time = omp_get_wtime(), train_time = 0.0, eval_time = 0.0;
    
    int  skip_training = 0, save_flag = 0, load_flag = 0, status = 1;
    char *save_file = DEFAULT_WEIGHT_FILE;
    char *load_file = DEFAULT_WEIGHT_FILE;
    argumentAnalysis( argc, argv, &skip_training, &save_flag, &save_file, &load_flag, &load_file );
    
    ImageData *train_images = NULL;
    ImageData *test_images  = NULL;
    ConvolutionalNeuralNetwork *cnn = (ConvolutionalNeuralNetwork *)calloc(1,sizeof(ConvolutionalNeuralNetwork));
    if(!cnn){ fprintf(stderr, "Failed to allocate memory for ConvolutionalNeuralNetwork.\n");
              goto cleanup; }
    else if(load_flag) load_cnn(cnn, load_file);
    else               init_cnn(cnn);
    int num_train_samples = 0;
    if(!skip_training) goto SKIP_LOAD_TRAINING;
    if(!sample("Learning", &train_images, &num_train_samples)) {
        fprintf(stderr, "トレーニングデータ読み込み失敗\n");
        goto cleanup;;
    }  
    SKIP_LOAD_TRAINING:
    
  
    int num_test_samples = 0;
    if(!sample("Evaluation", &test_images, &num_test_samples)) {
        fprintf(stderr, "テストデータ読み込み失敗\n");
        goto cleanup;;
    } 
    
    if(!skip_training) goto SKIP_THIS_TRAINING;
    train(cnn, train_images, num_train_samples, 0, &train_time);
    printf("トレーニング時間: %.2f秒\n", train_time);
    if(save_flag) save_cnn(cnn, save_file);
    SKIP_THIS_TRAINING:
    
    evaluate(cnn, test_images, num_test_samples, &eval_time);
    printf("評価時間: %.2f秒\n", eval_time);
    
    double total_time = omp_get_wtime() - start_time;
    char msg[256];
    snprintf(msg, sizeof(msg), "Total processing time: %.2f seconds (Training: %.2f, Evaluation: %.2f)",
             total_time, skip_training ? 0.0 : train_time, eval_time);  printf("\n%s\n", msg);
    status = 0;
cleanup:
    free(train_images);
    free(test_images);
    free(cnn);
    return status;
}
////////////////////////////////////////////////////////////////////////////////////////////////
// これ以降の部分は CNN プログラム(上部のコード)を実行するのに必要な環境を提供する
// プログラム群 CNN とは、分離して考える事。
#include <string.h>
// コマンドライン引数解析
void argumentAnalysis( int argc, char *argv[], int *skip_training,
                       int *save_flag, char **save_name,  int *load_flag, char **load_name ){
    int  v_flg=0, s_flg=0, l_flg=0, r_flg=0, t_flg=0;
    for(int i=1; i<argc; i++ ){ // プログラム起動時の引数解析処理
        if( argv[i][0]=='-' ) {
            switch(argv[i][1]){
                case 's': if( s_flg++ ){
                              fprintf(stderr, "Error: '%c' is already specified.\n", argv[i][1] );
                              goto GOOD_BY;
                          } else if( r_flg ){
                              fprintf(stderr, "Error: -s and -r cannot be used together\n" );
                              goto GOOD_BY;
                          } else if( argv[i][2] ){ printf("'%c'\n",argv[i][2]); goto GOOD_BY;}
                          *save_flag  = 1;
                          *save_name  = ((i+1)<argc && argv[i+1][0]!='-') ? argv[++i]
                                      : DEFAULT_WEIGHT_FILE;
                          break;
                case 'l': if( l_flg++ ){
                              fprintf(stderr, "Error: '%c' is already specified.\n", argv[i][1] );
                              goto GOOD_BY;
                          } else if( r_flg ) {
                              fprintf(stderr, "Error: -l and -r cannot be used together\n" );
                              goto GOOD_BY;
                          } else if( argv[i][2] ){ printf("'%c'\n",argv[i][2]); goto GOOD_BY;}
                          *load_flag  = 1;
                          *load_name  = ((i+1)<argc && argv[i+1][0]!='-') ? argv[++i]
                                      : DEFAULT_WEIGHT_FILE;
                          break;
                case 'r': 
                          if( r_flg++ ){ printf( "++r_flg = %d\n", r_flg );
                              fprintf(stderr, "Error: '%c' is already specified.\n", argv[i][1] );
                              goto GOOD_BY;
                          } else if( l_flg ){
                              fprintf(stderr, "Error: -l and -r cannot be used together\n" );
                              goto GOOD_BY;
                          } else if( s_flg ){
                              fprintf(stderr, "Error: -s and -r cannot be used together\n" );
                              goto GOOD_BY;
                          } else if( argv[i][2] ){ printf("'%c'\n",argv[i][2]); goto GOOD_BY;}
                          *skip_training = 1;
                          *load_flag     = 1;
                          *load_name     = ((i+1)<argc && argv[i+1][0]!='-') ? argv[++i]
                                         : DEFAULT_WEIGHT_FILE;
                          break;
                case 't': if( t_flg++ ){
                              fprintf(stderr, "Error: '%c' is already specified.\n", argv[i][1] );
                              goto GOOD_BY;
                          } else if( (i+1)>=argc || argv[i+1][0]=='-' ){
                              fprintf(stderr, "Error: No maximum number specified\n" );
                              goto GOOD_BY;
                          } else if( argv[i][2] ) goto GOOD_BY;
                          if( (ActiveThread = atoi(argv[++i]))<0 ){
                              fprintf(stderr, "Error: The number of threads must be specified.\n");
                              goto GOOD_BY;
                          }
                          break;
                case 'V': //fallthrough
                case 'v': if( v_flg++ ){
                              fprintf(stderr, "Error: '%c' is already specified.\n", argv[i][1] );
                              goto GOOD_BY;
                          } else if( argv[i][2] ) goto GOOD_BY;
                          verbose = argv[i][1]=='v' ? 1 : 2; 
                          break;
                case 'h': // fallthrough
                default : printf( "'%c''%c' \n", argv[i][0], argv[i][1] );
                          goto GOOD_BY;
            }
        }
    }   return;
GOOD_BY:; int i; char *p=argv[0], *op="[-v] [-t number] [-s [save_file]] [-l [load_file]] [-r [load_file]]";
    for(i=strlen(p); i>0 && p[i]!='\\' && p[i]!='/'; i-- );; if(i) i++;
    fprintf( stderr, "\nusage: %s %s\n\n", &p[i], op );
    exit( 1 );
}
#pragma pack(push,1)
typedef struct PIXEL { uint8_t B, G, R, A; } PIXEL ;
typedef struct STBPX { uint8_t R, G, B, A; } STBPX ;
typedef struct ARRAY { uint8_t v[4];       } ARRAY ;
#pragma pack(pop)
#define v2D(type,lx,name)     ((type(*)[(lx)])name)
#define v2Dc(lx,name)         v2D(char, lx, name)
#define v2Di(lx,name)         v2D(int,lx,name)
#define v2Df(lx,name)         v2D(float,lx,name)
#define v2Dp(lx,name)         v2D(PIXEL,lx,name)
#define v2Da(lx,name)         v2D(ARRAY,lx,name)
#define v3D(type,lx,ly,name)  ((type(*)[(lx)][(ly)])name)
#define v3Df(lx,ly,name)      v3D(float, lx, ly, name)
#define HERE                            fprintf(stderr,"% 6d: %s\n",__LINE__, __func__);
#define PROCESS_ALL_PIXELS              for(int y=0;y<ly;y++)for(int x=0;x<lx;x++)
#define saveArray2bmp32(name,a,w,h)     saveDataAsBMP32((name),(&(a)[0][0]),(w),(h))
// BMPヘッダ構造体
#pragma pack(push, 2)
typedef struct HEADER {
    unsigned short bfType;              // 'BM' (0x4D42)
    unsigned int   bfSize;              // ファイルサイズ
    unsigned short bfReserved1;         // 0
    unsigned short bfReserved2;         // 0
    unsigned int   bfOffBits;           // データオフセット
    unsigned int   biSize;              // 情報ヘッダサイズ (40)
    signed int     biWidth;             // 幅
    signed int     biHeight;            // 高さ(負で上下反転)
    unsigned short biPlanes;            // 1
    unsigned short biBitCount;          // 32
    unsigned int   biCompression;       // 0 (BI_RGB)
    unsigned int   biSizeImage;         // 画像データサイズ
    signed int     biXPelsPerMeter;     // 0
    signed int     biYPelsPerMeter;     // 0
    unsigned int   biClrUsed;           // 0 
    unsigned int   biClrImportant;      // 0
    unsigned char  lut[];               // パレット(8ビット用)
} HEADER;
#pragma pack(pop)
// 画像パラメータ
#define BMPID                           0x4D42   // 'BM'
#define BMPIFHSIZE                      40       // 情報ヘッダサイズ
int saveDataAsBMP32( char *fname, void *b, int lx, int ly ){
  HEADER h = {BMPID, lx*abs(ly)*sizeof(PIXEL)+sizeof(HEADER), 0, 0, sizeof(HEADER),
              BMPIFHSIZE, lx, ly, 1, 32, 0, lx*abs(ly)*sizeof(PIXEL), 0, 0, 0, 0 };
  FILE *fp=NULL ;
  int  st=-1;
    if( (fp=fopen(fname, "wb")) == NULL ) {
        fprintf( stderr, "File '%s': cannot create.\n", fname );
    } else if( fwrite( &h, sizeof(HEADER), 1, fp ) != 1 ) {
        fprintf( stderr, "File '%s': Header write error.\n", fname );
    } else if( fwrite( b, h.biSizeImage, 1, fp ) != 1 ) {
        fprintf( stderr, "File '%s': Data write error.\n", fname );
    } else st=0;
    if(fp)fclose(fp) ;
    return st;
}
static inline int ft_size_check(int n){ return (n > 0) && !(n & (n - 1));}
void **alloc2Darray(int s, int lx, int ly){ if(s<1 || lx<1 || ly<1) return NULL;
    long long w = lx, h = ly, next = s*w, space = h*sizeof(void *) + w*h*s + 64;
    void    **d = (void **)calloc(1, space);
    if(!d){fprintf(stderr,"alloc2Darray failed: %lld bytes\n",space); return NULL;}
    void  *base = (void*)((uintptr_t)((char*)d + h * sizeof(void*) + 63) & ~63LL);
    for(long long i=0; i<h; i++) d[i] = (void*)((char*)base + next * i);
    return d;                         // C99: uintptr_t same as (void *)
}
void ***alloc3Darray(int size, int lx, int ly, int lz) {
    if(size < 1 || lx < 1 || ly < 1 || lz < 1) return NULL;
    long long s = size, llx = lx, lly = ly, llz = lz, lyz = lly * llz, line = s * llx;
    long long index = (llz * lly + llz) * sizeof(void *), body = llx * lly * llz * s, space;
    void ***d = (void ***)calloc(1, space = (index + body + 64));
    if(!d){ fprintf(stderr, "alloc3Darray failed: %lld bytes\n", space);
        return NULL;
    }else{ void **area=(void **)&d[llz], *base=(void*)(((uintptr_t)((char*)d+index+63))&~63LL);
        for(int i=0; i<llz; i++)    d[i] = &area[lly * i];
        for(int i=0; i<lyz; i++) area[i] = (void *)((char *)base + line * i);
    }
    return d;
}
void FourierTransform2D(COMPLEX **c, int lx, int ly, int flag) {
    int elements = lx;
    double  unit = (flag == IFFT) ? 2 * M_PI / lx : -2 * M_PI / lx;
    COMPLEX rotor[lx>ly?lx:ly]; for(int e=0;e<lx;rotor[e]=RotorCOMPLEX(unit*e),e++);
    if(ft_size_check(elements)) {
        #pragma omp parallel for
        for(int y=0; y<ly; y++){ COMPLEX W, temp;
            for(int i=0, j=1; j<elements-1; j++){// scrambler, for initial placement
                for(int k = elements>>1; k>(i^=k); k>>=1);
                if(j<i){temp=c[y][j]; c[y][j]=c[y][i]; c[y][i]=temp;} // exchange
            }
            for(int ir, i, k, m, mh=1; (m=mh<<1)<=elements; mh=m) {   // Operations
                for(ir=0, i=0; i<elements; i+=m) {                    // ir=0: Rotation angle is 0rad.
                    for( W=rotor[ir], k=elements>>2; k>(ir^=k); k>>=1 );
                    for(int j=i; j<mh+i; j++ ) { int a = j,  b = j + mh;
                           temp = SubCOMPLEX(c[y][a], c[y][b]);       // For butterfly operation
                        c[y][a] = AddCOMPLEX(c[y][a], c[y][b]);       // a ->- (a + b) ->- a
                        c[y][b] = MulCOMPLEX(temp, W);                // b ->- (a-b)*W ->- b
            }   }   }   double reciprocal = 1.0 / elements;
            if(flag==IFFT) for(int e=0; e<elements; c[y][e]=ImeMulCOMPLEX(c[y][e],reciprocal), e++);
        }
    } else {// DFT
        #pragma omp parallel for
        for(int y=0; y<ly; y++ ) { COMPLEX tmp[elements]; // 32xsizeof(float) is enough small
            for(int u=0; u<elements; u++ ){int v; COMPLEX t = (COMPLEX){0,0};
               for( v=0; v<elements; v++ ) AppendCOMPLEX( t, MulCOMPLEX(c[y][v],rotor[(u*v)%elements]) );
               tmp[u] = t;
            }  double reciprocal = 1./elements;           // tmp[u]/elements -> tmp[u]*reciprocal
            if(flag==IDFT) for(int e=0; e<elements; e++ ) c[y][e] = ImeMulCOMPLEX(tmp[e],reciprocal);
            else           for(int e=0; e<elements; e++ ) c[y][e] = tmp[e];
        }
    }
    if( lx!=(elements=ly )){ unit = (flag==IFFT) ? 2 * M_PI /elements : -2 * M_PI /elements;
        for(int e=0; e<elements; rotor[e]=RotorCOMPLEX(unit*e), e++);
    }
    if(ft_size_check(elements)){
        #pragma omp parallel for
        for(int x=0; x<lx; x++ ){ COMPLEX  W, temp;
            for(int i=0, j=1; j<elements-1; j++ ){// scrambler, for initial placement
                for(int k=elements>>1; k>(i^=k); k>>=1 );
                if(j<i){temp=c[j][x], c[j][x]=c[i][x], c[i][x]=temp;} // exchange
            }
            for(int ir, i, k, m, mh=1; (m=mh<<1)<=elements; mh=m) {   // Operations
                for(ir=0, i=0; i<elements; i+=m ) {                   // ir=0: Rotation angle is 0rad.
                    for( W=rotor[ir], k=elements>>2; k>(ir^=k); k>>=1 );
                    for(int j=i; j<mh+i; j++ ) { int a = j,  b = j + mh;
                          temp  = SubCOMPLEX(c[a][x], c[b][x]);       // For butterfly operation
                        c[a][x] = AddCOMPLEX(c[a][x], c[b][x]);       // a ->- (a + b) ->- a
                        c[b][x] = MulCOMPLEX(temp, W);                // b ->- (a-b)*W ->- b
            }   }   }   double reciprocal=1./elements;
            if(flag==IDFT) for(int e=0; e<elements; c[e][x]=ImeMulCOMPLEX(c[e][x],reciprocal), e++);
        }
    } else {// DFT
        #pragma omp parallel for
        for(int x=0; x<lx; x++ ) { COMPLEX tmp[elements]; // 32xsizeof(float) is enough small
            for(int u=0; u<elements; u++ ){int v; COMPLEX t = (COMPLEX){0,0};
               for( v=0; v<elements; v++ ) AppendCOMPLEX( t, MulCOMPLEX(c[v][x],rotor[(u*v)%elements]) );
               tmp[u] = t;
            }  double reciprocal = 1./elements;           // tmp[u]/elements -> tmp[u]*reciprocal
            if(flag==IDFT) for(int e=0; e<elements; e++ ) c[e][x] = ImeMulCOMPLEX(tmp[e],reciprocal);
            else           for(int e=0; e<elements; e++ ) c[e][x] = tmp[e];
        }
    }
}
void extract_labels(const char *filename, int *label1, int *label2) {
    int dots = 0, len = 0, start = 0;
    if(label1) *label1 = -1;
    else { fprintf(stderr, "invalid argument\n"); return; }
    if(label2) *label2 = -1;
    if(!filename || (len = strlen(filename)) < 1) {
        fprintf(stderr, "invalid argument\n");
        return;
    }
    for(int e = 0; e < len; e++) if(filename[e] == '.') dots++;
    if(dots <= 1) return;
    char string[256] = {0};
    for(int e = 0; e < len; e++) { if(filename[start = e] == '.') break; }
    snprintf(string, sizeof(string), "%s", &filename[start]);
    *strrchr(string, '.') = 0;
    int head, tail = strlen(string) - 1;
    for(head = tail; 0 <= head; head--) {
        if(string[head] != '.' && isdigit(string[head])) continue;
        if(string[head] != '.') {
            fprintf(stderr, "invalid label character 0x%2.2X\n", string[head]);
            return;
        } else {
            *label1 = atoi(&string[head + 1]);
            string[head--] = 0;
        }
        if(!label2 || dots < 3) break;
        for(; 0 <= head && isdigit(string[head]); head--);
        if(string[head] != '.') {
            fprintf(stderr, "invalid label character 0x%2.2X\n", string[head]);
            return;
        } else {
            *label2 = atoi(&string[head + 1]);
            return;
        }
    }
}
int load_image(char *filepath, ImageData *image, int width, int height, int channels) {
    int lx = 0, ly = 0, bpp = 0;
    unsigned char *data = stbi_load(filepath, &lx, &ly, &bpp, 4);
    if(!data || lx != width || ly != height || bpp != 4) {
        if(data) stbi_image_free(data);
        fprintf(stderr, "画像サイズ不一致または読み込み失敗: %s\n", image->filename);
        return 1;
    }
    for(int ey = (ly + 1) / 2, ty = ly - 1, y = 0; y < ey; y++) {
        for(int w = ty - y, x = 0; x < lx; x++) {
            PIXEL p = { v2Dp(lx, data)[y][x].R, v2Dp(lx, data)[y][x].G, v2Dp(lx, data)[y][x].B, 0 };
            v2Dp(lx, data)[y][x] = (PIXEL){ v2Dp(lx, data)[w][x].R, v2Dp(lx, data)[w][x].G, v2Dp(lx, data)[w][x].B, 0 };
            v2Dp(lx, data)[w][x] = p;
        }
    }
    for(int end = (4 < channels ? 4 : channels), c = 0; c < end; c++) {
        for(int y=0; y<ly; y++) {
            for(int x=0; x<lx; x++) {
                v3Df(lx, ly, image->input)[c][y][x] = v2Da(lx, data)[y][x].v[c] / 255.0f;
            }
        }
    }
    stbi_image_free(data);
    extract_labels(image->filename, &image->label, &image->label2);
    return 0;
}
int sample(const char *dirname, ImageData **images, int *num_samples) {
    DIR *dir = opendir(dirname);
    if(!dir) { fprintf(stderr, "ディレクトリ %s のオープンに失敗\n", dirname); return 0; }
    char *ext = ".bmp";
    struct dirent *entry;
    for(*num_samples = 0; (entry = readdir(dir)); ) if(strstr(entry->d_name, ext)) (*num_samples)++;
    *images = (ImageData *)calloc((*num_samples) + 2, sizeof(ImageData));
    if(!*images) { perror("メモリ割り当て失敗"); closedir(dir); return 0; }
    rewinddir(dir);
    int idx = 0;
    while ((entry = readdir(dir))) {
        if(!strstr(entry->d_name, ext)) continue;
        char filepath[256];
        snprintf(filepath, sizeof(filepath), "%s/%s", dirname, entry->d_name);
        snprintf((*images)[idx].filename, sizeof((*images)[idx].filename), "%s", entry->d_name);
        if(load_image(filepath, &(*images)[idx], INPUT_WIDTH, INPUT_HEIGHT, INPUT_CHANNELS)) continue;
        idx++;
        if(verbose && idx % 1000 == 0) printf("\r% 8d\t%8.2f%%", idx, 0.005+100.0f*(idx)/(*num_samples));
    }   if(verbose) printf("\n");
    closedir(dir);
    *num_samples = idx;
    return 1;
}
void show_image_data(ImageData *images, int here){
    char fname[256]; snprintf(fname,250,"debug_image_%4.4d.bmp",here);
    printf("  '%s' save as '%s'\n\n",images[here].filename, fname );
    PIXEL d[INPUT_HEIGHT][INPUT_WIDTH];
    int   lx=INPUT_WIDTH, ly=INPUT_HEIGHT ;
    PROCESS_ALL_PIXELS  d[y][x] = (PIXEL){ images[here].input[0][y][x]*255,
           images[here].input[1][y][x]*255,images[here].input[2][y][x]*255, 0 };
    saveDataAsBMP32(fname, &d[0][0],lx, ly );
}