A convolutional neural-network model for CIFAR, implemented from scratch
in C99 (plain C).

 
 
 The C99 source code is presented as-is; "evaluation has not been performed." It is far too heavy to run on a CPU alone.
 
   Even a full day of computation is not enough to finish a single epoch.
          cnn_CPU.exe
           50000          100.00%
           10000          100.00%
              0:    195 ( 50000 / 256 )      0 / 20
            256:    195 ( 50000 / 256 )      0 / 20
            512:    195 ( 50000 / 256 )      0 / 20
            768:    195 ( 50000 / 256 )      0 / 20
           1024:    195 ( 50000 / 256 )      0 / 20
           1280:    195 ( 50000 / 256 )      0 / 20
           1536:    195 ( 50000 / 256 )      0 / 20
           1792:    195 ( 50000 / 256 )      0 / 20
           2048:    195 ( 50000 / 256 )      0 / 20
           2304:    195 ( 50000 / 256 )      0 / 20
           2560:    195 ( 50000 / 256 )      0 / 20
           2816:    195 ( 50000 / 256 )      0 / 20
           3072:    195 ( 50000 / 256 )      0 / 20
           3328:    195 ( 50000 / 256 )      0 / 20
           3584:    195 ( 50000 / 256 )      0 / 20
           3840:    195 ( 50000 / 256 )      0 / 20
           4096:    195 ( 50000 / 256 )      0 / 20
           4352:    195 ( 50000 / 256 )      0 / 20
           4608:    195 ( 50000 / 256 )      0 / 20
           4864:    195 ( 50000 / 256 )      0 / 20
           5120:    195 ( 50000 / 256 )      0 / 20
 
 

How to download the CIFAR dataset (CIFAR-10 or CIFAR-100)

Manual download from the official website

  1. Go to the official site (https://www.cs.toronto.edu/~kriz/cifar.html)
  2. Download the dataset
    • CIFAR-10: click the "CIFAR-10 python version" link and download cifar-10-python.tar.gz.
    • CIFAR-100: click the "CIFAR-100 python version" link and download cifar-100-python.tar.gz.
  3. Extract and load
    • Extract the downloaded archive (e.g. tar -xvzf cifar-10-python.tar.gz).
    • To load the data in Python, use pickle as follows:
      import pickle

      def unpickle(file):
          # each batch file is a pickled dict with byte-string keys
          with open(file, 'rb') as fo:
              batch = pickle.load(fo, encoding='bytes')
          return batch

      # CIFAR-10 example (loading one training batch)
      data_batch = unpickle('cifar-10-batches-py/data_batch_1')
      images = data_batch[b'data']    # image data
      labels = data_batch[b'labels']  # class labels
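
    • Note: data is a (10000, 3072) uint8 array; each row stores one 32x32 image
      as 1024 red, then 1024 green, then 1024 blue values (row-major).
      labels is a list of 10000 integers (0-9 for CIFAR-10). For CIFAR-100 the
      corresponding keys are b'fine_labels' and b'coarse_labels'.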

Notes

 
 
 
   If some kind soul were willing to provide a test environment, I would be moved to tears.
   Expected system configuration (it would cost around 500,000 yen, which is beyond my means):
 
   Reference spec:
   X870E Taichi, Ryzen 9 9950X3D, two Crucial Pro 128GB kits (2x64GB) for 256GB total,
   GeForce RTX 5070 Ti 16GB
 
   Frankly, rather than buying an entry-level AI machine that costs millions of yen, an HEDT
   build would be more efficient. Clustering is the next key step, so with four to eight
   machines the next stage would also come easily within the scope of evaluation.
 
 



cnn_CIFAR.c   A convolutional neural network for CIFAR-10 / CIFAR-100 (training + evaluation), implemented from scratch in C99

No environment is available to verify it, so it is presented as a template.


#define OUTPUT_SIZE           10 for CIFAR-10, 100 for CIFAR-100 (edit this constant to match the dataset)
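
 One possible way to choose it at build time is a preprocessor switch; the CIFAR100 macro below is a hypothetical illustration, not part of the original source:

      /* hypothetical compile-time switch: build with -DCIFAR100 for CIFAR-100 */
      #ifdef CIFAR100
      #define OUTPUT_SIZE 100
      #else
      #define OUTPUT_SIZE 10
      #endif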

 EX.
    CIFAR-10
      +project/                 
      ├── Learning/           
      │   ├── image1.0.png   
      │   ├── image2.5.png   
      │   └── ...            
      ├── Evaluation/         
      │   ├── test1.png      
      │   ├── test2.png      
      │   └── ...            
      ├── stb_image.h         
      └── cifar.c             

    CIFAR-100
      +project/                
      ├── Learning/
      │   ├── image1.5.0.png
      │   ├── image2.8.5.png
      │   └── ...
      ├── Evaluation/  (accuracy is computed when labels are present)
      │   ├── test1.5.2.png
      │   ├── test2.5.1.png
      │   └── ...
      ├── stb_image.h
      └── cifar.c
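
The trailing numeric fields of each file name carry the labels: the loader strips the extension and uses the last numeric field as the class index, keeping a second field (as in the CIFAR-100 example) as an auxiliary label. As a reference, the sketch below expands one CIFAR-10 binary batch (cifar-10-batches-bin/data_batch_1.bin from cifar-10-binary.tar.gz) into the Learning/ layout shown above. The converter itself is an assumption, not part of the original project; it writes 24-bit BMP files because the loader in cifar.c scans for the ".bmp" extension, and it expects the Learning/ directory to already exist.

      /* cifar10_bin2bmp.c : reference sketch (assumed helper, not in the original project).
       * Binary CIFAR-10: each 3073-byte record is <1 x label><1024 x R><1024 x G><1024 x B>,
       * pixel rows stored top to bottom. Output: Learning/image<n>.<label>.bmp (24-bit, 32x32). */
      #include <stdio.h>
      #include <stdint.h>

      static void put_u16(FILE *fp, uint16_t v){ fputc(v & 255, fp); fputc(v >> 8, fp); }
      static void put_u32(FILE *fp, uint32_t v){ for(int i = 0; i < 4; i++) fputc((v >> (8 * i)) & 255, fp); }

      int main(void){
          FILE *in = fopen("cifar-10-batches-bin/data_batch_1.bin", "rb");
          if(!in){ perror("data_batch_1.bin"); return 1; }
          unsigned char rec[3073];
          for(int n = 0; fread(rec, sizeof rec, 1, in) == 1; n++){
              char name[64];
              snprintf(name, sizeof name, "Learning/image%d.%d.bmp", n, rec[0]);
              FILE *out = fopen(name, "wb");
              if(!out){ perror(name); break; }          /* Learning/ must already exist */
              /* 14-byte file header + 40-byte info header, then 32*32*3 = 3072 pixel bytes */
              put_u16(out, 0x4D42); put_u32(out, 54 + 3072); put_u32(out, 0); put_u32(out, 54);
              put_u32(out, 40); put_u32(out, 32); put_u32(out, 32); put_u16(out, 1); put_u16(out, 24);
              put_u32(out, 0); put_u32(out, 3072); put_u32(out, 0); put_u32(out, 0);
              put_u32(out, 0); put_u32(out, 0);
              for(int y = 31; y >= 0; y--)              /* BMP pixel rows run bottom-up */
                  for(int x = 0; x < 32; x++){
                      int i = 1 + y * 32 + x;           /* offset into the R plane */
                      fputc(rec[i + 2048], out);        /* B */
                      fputc(rec[i + 1024], out);        /* G */
                      fputc(rec[i], out);               /* R */
                  }
              fclose(out);
          }
          fclose(in);
          return 0;
      }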



// https://github.com/nothings/stb/blob/master/stb_image.h #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" #include <stdio.h> #include <stdlib.h> #include <string.h> #include <math.h> #include <omp.h> #include <time.h> #include <dirent.h> #include <stdint.h> #include <ctype.h> int verbose = 0, ActiveThread = 0; #define PI 3.1415926535897932384626433832795L #define M_PI PI typedef struct COMPLEX { float r, i;} COMPLEX; #define RotorCOMPLEX(a) (COMPLEX){ cos(a), sin(a) } #define ConjCOMPLEX(a) (COMPLEX){ (a).r, -((a).i) } #define AddCOMPLEX(a,b) (COMPLEX){ (a).r + (b).r, (a).i + (b).i } #define SubCOMPLEX(a,b) (COMPLEX){ (a).r - (b).r, (a).i - (b).i } #define MulCOMPLEX(a,b) (COMPLEX){ (a).r*(b).r - (a).i*(b).i, (a).r*(b).i + (a).i*(b).r } #define ImeMulCOMPLEX(a,b) (COMPLEX){ (a).r * (b), (a).i * (b) } #define AppendCOMPLEX(a,b) { (a).r += (b).r, (a).i += (b).i ; } void FourierTransform2D(COMPLEX **c, int lx, int ly, int flag); void **alloc2Darray(int size, int lx, int ly); void ***alloc3Darray(int size, int lx, int ly, int lz); void extract_labels(const char *filename, int *label1, int *label2); int saveDataAsBMP32( char *fname, void *b, int lx, int ly ); void argumentAnalysis( int argc, char *argv[], int *skip_training, int *save_flag, char **save_name, int *load_flag, char **load_name ); #define DEFAULT_WEIGHT_FILE "modelWeight.bin" #define INPUT_CHANNELS 3 #define INPUT_HEIGHT 32 #define INPUT_WIDTH 32 #define CONV_FILTERS 128 #define KERNEL_SIZE 3 #define POOL_SIZE 2 #define FC_SIZE 512 #define OUTPUT_SIZE 10 // OUTPUT_SIZE ( mode==CIFAR-10 ? 10 : ( mode==CIFAR-100 ? 100 : 0 ) #define BATCH_SIZE 256 #define EPOCHS 20 // The minimum number of learning sessions is 100. #define LEARNING_RATE (0.035*4) #define MOMENTUM 0.9 #define FFT -1 #define IFFT 1 #define DFT FFT #define IDFT IFFT typedef struct { char filename[256]; int label2; int label; float input[INPUT_CHANNELS][INPUT_HEIGHT][INPUT_WIDTH]; } ImageData; void show_image_data(ImageData *images, int here); int load_image(char *filepath, ImageData *image, int width, int height, int channels); int sample(const char *dirname, ImageData **images, int *num_samples); typedef struct { int max_idx[CONV_FILTERS][INPUT_HEIGHT / POOL_SIZE][INPUT_WIDTH / POOL_SIZE][2]; } PoolIndices; typedef struct { float conv1[CONV_FILTERS][INPUT_CHANNELS][KERNEL_SIZE][KERNEL_SIZE]; float bias1[CONV_FILTERS]; float m_conv1[CONV_FILTERS][INPUT_CHANNELS][KERNEL_SIZE][KERNEL_SIZE]; float m_bias1[CONV_FILTERS]; float conv2[CONV_FILTERS][CONV_FILTERS][KERNEL_SIZE][KERNEL_SIZE]; float bias2[CONV_FILTERS]; float m_conv2[CONV_FILTERS][CONV_FILTERS][KERNEL_SIZE][KERNEL_SIZE]; float m_bias2[CONV_FILTERS]; float fc1[CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE)][FC_SIZE]; float bias_fc1[FC_SIZE]; float m_fc1[CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE)][FC_SIZE]; float m_bias_fc1[FC_SIZE]; float fc2[FC_SIZE][OUTPUT_SIZE]; float bias_fc2[OUTPUT_SIZE]; float m_fc2[FC_SIZE][OUTPUT_SIZE]; float m_bias_fc2[OUTPUT_SIZE]; float bn1_mean[CONV_FILTERS], bn1_var[CONV_FILTERS]; float bn1_gamma[CONV_FILTERS], bn1_beta[CONV_FILTERS]; float bn2_mean[CONV_FILTERS], bn2_var[CONV_FILTERS]; float bn2_gamma[CONV_FILTERS], bn2_beta[CONV_FILTERS]; float m_bn1_gamma[CONV_FILTERS], m_bn1_beta[CONV_FILTERS]; float m_bn2_gamma[CONV_FILTERS], m_bn2_beta[CONV_FILTERS]; } ConvolutionalNeuralNetwork; void init_cnn(ConvolutionalNeuralNetwork *cnn) { float limit; limit = sqrtf(6.0f / (INPUT_CHANNELS * KERNEL_SIZE * KERNEL_SIZE)); for(int f=0; 
f<CONV_FILTERS; f++) { for(int c=0; c<INPUT_CHANNELS; c++) { for(int y=0; y<KERNEL_SIZE; y++) { for(int x=0; x<KERNEL_SIZE; x++) { cnn->conv1[f][c][y][x] = ((float)rand() / RAND_MAX * 2.0f - 1.0f) * limit; cnn->m_conv1[f][c][y][x] = 0.0f; } } } cnn->bias1[f] = 0.0f; cnn->m_bias1[f] = 0.0f; cnn->bn1_mean[f] = 0.0f; cnn->bn1_var[f] = 1.0f; cnn->bn1_gamma[f] = 1.0f; cnn->bn1_beta[f] = 0.0f; cnn->m_bn1_gamma[f] = 0.0f; cnn->m_bn1_beta[f] = 0.0f; } limit = sqrtf(6.0f / (CONV_FILTERS * KERNEL_SIZE * KERNEL_SIZE)); for(int f=0; f<CONV_FILTERS; f++) { for(int c=0; c<CONV_FILTERS; c++) { for(int y=0; y<KERNEL_SIZE; y++) { for(int x=0; x<KERNEL_SIZE; x++) { cnn->conv2[f][c][y][x] = ((float)rand() / RAND_MAX * 2.0f - 1.0f) * limit; cnn->m_conv2[f][c][y][x] = 0.0f; } } } cnn->bias2[f] = 0.0f; cnn->m_bias2[f] = 0.0f; cnn->bn2_mean[f] = 0.0f; cnn->bn2_var[f] = 1.0f; cnn->bn2_gamma[f] = 1.0f; cnn->bn2_beta[f] = 0.0f; cnn->m_bn2_gamma[f] = 0.0f; cnn->m_bn2_beta[f] = 0.0f; } limit = sqrtf(6.0f / (CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE))); for(int i=0; i<CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE); i++) { for(int j=0; j<FC_SIZE; j++) { cnn->fc1[i][j] = ((float)rand() / RAND_MAX * 2.0f - 1.0f) * limit; cnn->m_fc1[i][j] = 0.0f; } } for(int i=0; i<FC_SIZE; i++) { cnn->bias_fc1[i] = 0.0f; cnn->m_bias_fc1[i] = 0.0f; } limit = sqrtf(6.0f / FC_SIZE); for(int i=0; i<FC_SIZE; i++) { for(int j=0; j<OUTPUT_SIZE; j++) { cnn->fc2[i][j] = ((float)rand() / RAND_MAX * 2.0f - 1.0f) * limit; cnn->m_fc2[i][j] = 0.0f; } } for(int i=0; i<OUTPUT_SIZE; i++) { cnn->bias_fc2[i] = 0.0f; cnn->m_bias_fc2[i] = 0.0f; } } void batch_norm(float *input, float *mean, float *var, float *gamma, float *beta, float *output, int size) { #pragma omp parallel for for(int f=0; f<size; f++) { for(int y=0; y<INPUT_HEIGHT; y++)for(int x=0; x<INPUT_WIDTH; x++) { int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x; output[idx] = gamma[f] * (input[idx] - mean[f]) / sqrtf(var[f] + 1e-5) + beta[f]; } } } void softmax(float *input, float *output, int size) { float max_val = input[0]; for(int i=1; i<size; i++){ if(input[i]>max_val) max_val = input[i]; } float sum = 0.0f; for(int i=0; i<size; i++) sum += ( output[i] = expf(input[i] - max_val) ); for(int i=0; i<size; i++) output[i] /= sum; } void conv_layer_fft( int in_channels, int out_channels, float input[in_channels][INPUT_HEIGHT][INPUT_WIDTH], float weights[out_channels][in_channels][KERNEL_SIZE][KERNEL_SIZE], float biases[out_channels], float *output) { const int pad = KERNEL_SIZE / 2; const int fft_size = 64; #pragma omp parallel for for(int f=0; f<out_channels; f++) { COMPLEX **fft_input = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size); COMPLEX **fft_weight = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size); if(!fft_input || !fft_weight){ perror("メモリ割り当て失敗"); free(fft_input); free(fft_weight); continue; } for(int c=0; c<in_channels; c++) { for(int y=0; y<fft_size; y++) { for(int x=0; x<fft_size; x++) { fft_input[y][x].r = 0.0; fft_input[y][x].i = 0.0; fft_weight[y][x].r = 0.0; fft_weight[y][x].i = 0.0; } } for(int y=0; y<INPUT_HEIGHT; y++) { for(int x=0; x<INPUT_WIDTH; x++) { fft_input[y + pad][x + pad].r = input[c][y][x]; } } for(int y=0; y<KERNEL_SIZE; y++) { for(int x=0; x<KERNEL_SIZE; x++) { fft_weight[y][x].r = weights[f][c][y][x]; } } FourierTransform2D(fft_input, fft_size, fft_size, FFT); FourierTransform2D(fft_weight, fft_size, fft_size, FFT); for(int y=0; y<fft_size; y++) { for(int x=0; x<fft_size; 
x++) { fft_input[y][x] = MulCOMPLEX(fft_input[y][x], fft_weight[y][x]); } } FourierTransform2D(fft_input, fft_size, fft_size, IFFT); for(int y=0; y<INPUT_HEIGHT; y++) { for(int x=0; x<INPUT_WIDTH; x++) { output[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x] += fft_input[y + pad][x + pad].r; } } } for(int y=0; y<INPUT_HEIGHT; y++) { for(int x=0; x<INPUT_WIDTH; x++) { output[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x] += biases[f]; } } free(fft_input); free(fft_weight); } } void fc_layer(int input_size, int output_size, float *input, float weights[input_size][output_size], float biases[output_size], float *output) { #pragma omp parallel for for(int j=0; j<output_size; j++) { output[j] = biases[j]; for(int i=0; i<input_size; i++) output[j] += input[i] * weights[i][j]; output[j] = output[j] > 0 ? output[j] : 0; } } #define save_cnn save_weights void save_weights(ConvolutionalNeuralNetwork *cnn, const char *filename) { FILE *fp = fopen(filename, "wb"); if(!fp){ perror("重み保存失敗"); return; } fwrite(cnn->conv1, sizeof(cnn->conv1), 1, fp); fwrite(cnn->bias1, sizeof(cnn->bias1), 1, fp); fwrite(cnn->conv2, sizeof(cnn->conv2), 1, fp); fwrite(cnn->bias2, sizeof(cnn->bias2), 1, fp); fwrite(cnn->fc1, sizeof(cnn->fc1), 1, fp); fwrite(cnn->bias_fc1, sizeof(cnn->bias_fc1), 1, fp); fwrite(cnn->fc2, sizeof(cnn->fc2), 1, fp); fwrite(cnn->bias_fc2, sizeof(cnn->bias_fc2), 1, fp); fwrite(cnn->bn1_mean, sizeof(cnn->bn1_mean), 1, fp); fwrite(cnn->bn1_var, sizeof(cnn->bn1_var), 1, fp); fwrite(cnn->bn1_gamma, sizeof(cnn->bn1_gamma), 1, fp); fwrite(cnn->bn1_beta, sizeof(cnn->bn1_beta), 1, fp); fwrite(cnn->bn2_mean, sizeof(cnn->bn2_mean), 1, fp); fwrite(cnn->bn2_var, sizeof(cnn->bn2_var), 1, fp); fwrite(cnn->bn2_gamma, sizeof(cnn->bn2_gamma), 1, fp); fwrite(cnn->bn2_beta, sizeof(cnn->bn2_beta), 1, fp); fclose(fp); } #define load_cnn load_weights void load_weights(ConvolutionalNeuralNetwork *cnn, const char *filename) { FILE *fp = fopen(filename, "rb"); if(!fp){ perror("重み読み込み失敗"); return; } fread(cnn->conv1, sizeof(cnn->conv1), 1, fp); fread(cnn->bias1, sizeof(cnn->bias1), 1, fp); fread(cnn->conv2, sizeof(cnn->conv2), 1, fp); fread(cnn->bias2, sizeof(cnn->bias2), 1, fp); fread(cnn->fc1, sizeof(cnn->fc1), 1, fp); fread(cnn->bias_fc1, sizeof(cnn->bias_fc1), 1, fp); fread(cnn->fc2, sizeof(cnn->fc2), 1, fp); fread(cnn->bias_fc2, sizeof(cnn->bias_fc2), 1, fp); fread(cnn->bn1_mean, sizeof(cnn->bn1_mean), 1, fp); fread(cnn->bn1_var, sizeof(cnn->bn1_var), 1, fp); fread(cnn->bn1_gamma, sizeof(cnn->bn1_gamma), 1, fp); fread(cnn->bn1_beta, sizeof(cnn->bn1_beta), 1, fp); fread(cnn->bn2_mean, sizeof(cnn->bn2_mean), 1, fp); fread(cnn->bn2_var, sizeof(cnn->bn2_var), 1, fp); fread(cnn->bn2_gamma, sizeof(cnn->bn2_gamma), 1, fp); fread(cnn->bn2_beta, sizeof(cnn->bn2_beta), 1, fp); fclose(fp); } int evaluate(ConvolutionalNeuralNetwork *cnn, ImageData *images, int num_samples, double *eval_time) { double start_time = omp_get_wtime(); float *conv1_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float)); float *bn1_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float)); float *conv2_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float)); float *bn2_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float)); float *pool_out = (float *)calloc(CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE), sizeof(float)); float *fc1_out = (float *)calloc(FC_SIZE, sizeof(float)); float *fc2_out = (float *)calloc(OUTPUT_SIZE, 
sizeof(float)); if(!conv1_out || !bn1_out || !conv2_out || !bn2_out || !pool_out || !fc1_out || !fc2_out) { perror("メモリ割り当て失敗"); free(conv1_out); free(bn1_out); free(conv2_out); free(bn2_out); free(pool_out); free(fc1_out); free(fc2_out); return 1; } int correct = 0; for(int i=0; i<num_samples; i++) { conv_layer_fft(INPUT_CHANNELS, CONV_FILTERS, images[i].input, cnn->conv1, cnn->bias1, conv1_out); batch_norm(conv1_out, cnn->bn1_mean, cnn->bn1_var, cnn->bn1_gamma, cnn->bn1_beta, bn1_out, CONV_FILTERS); #pragma omp parallel for for(int idx=0; idx < CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH; idx++) bn1_out[idx] = fmaxf(0.0f, bn1_out[idx]); conv_layer_fft(CONV_FILTERS, CONV_FILTERS, (float (*)[INPUT_HEIGHT][INPUT_WIDTH])bn1_out, cnn->conv2, cnn->bias2, conv2_out); #pragma omp parallel for for(int f=0; f<CONV_FILTERS; f++) { for(int y=0; y<INPUT_HEIGHT; y++) { for(int x=0; x<INPUT_WIDTH; x++) { int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x; conv2_out[idx] += bn1_out[idx]; } } } batch_norm(conv2_out, cnn->bn2_mean, cnn->bn2_var, cnn->bn2_gamma, cnn->bn2_beta, bn2_out, CONV_FILTERS); #pragma omp parallel for for(int idx=0; idx < CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH; idx++) bn2_out[idx] = fmaxf(0.0f, bn2_out[idx]); #pragma omp parallel for collapse(2) for(int f=0; f<CONV_FILTERS; f++) { for(int y=0; y<INPUT_HEIGHT / POOL_SIZE; y++) { for(int x=0; x<INPUT_WIDTH / POOL_SIZE; x++) { float max_val = -INFINITY; for(int py = 0; py < POOL_SIZE; py++) { for(int px = 0; px < POOL_SIZE; px++) { int iy = y * POOL_SIZE + py; int ix = x * POOL_SIZE + px; float val = bn2_out[f * INPUT_HEIGHT * INPUT_WIDTH + iy * INPUT_WIDTH + ix]; if(val > max_val) max_val = val; } } pool_out[f * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE) + y * (INPUT_WIDTH / POOL_SIZE) + x] = max_val; } } } int pool_size = CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE); fc_layer(pool_size, FC_SIZE, pool_out, cnn->fc1, cnn->bias_fc1, fc1_out); fc_layer(FC_SIZE, OUTPUT_SIZE, fc1_out, cnn->fc2, cnn->bias_fc2, fc2_out); float output[OUTPUT_SIZE]; softmax(fc2_out, output, OUTPUT_SIZE); int pred = 0; for(int j=1; j<OUTPUT_SIZE; j++) { if(output[j] > output[pred]) pred = j; } if(pred == images[i].label) correct++; } *eval_time = omp_get_wtime() - start_time; printf("評価精度: %.2f%%\n", 100.0f * correct / num_samples); free(conv1_out); free(bn1_out); free(conv2_out); free(bn2_out); free(pool_out); free(fc1_out); free(fc2_out); return 0; } int train(ConvolutionalNeuralNetwork *cnn, ImageData *images, int num_samples, int start_epoch, double *train_time) { double start_time = omp_get_wtime(); float *grad_fc2 = (float *)calloc(FC_SIZE * OUTPUT_SIZE, sizeof(float)); float grad_bias_fc2[OUTPUT_SIZE] = {0}; float *grad_fc1 = (float *)calloc(CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE) * FC_SIZE, sizeof(float)); float grad_bias_fc1[FC_SIZE] = {0}; float *grad_conv2 = (float *)calloc(CONV_FILTERS * CONV_FILTERS * KERNEL_SIZE * KERNEL_SIZE, sizeof(float)); float grad_bias2[CONV_FILTERS] = {0}; float *grad_conv1 = (float *)calloc(CONV_FILTERS * INPUT_CHANNELS * KERNEL_SIZE * KERNEL_SIZE, sizeof(float)); float grad_bias1[CONV_FILTERS] = {0}; float grad_bn1_gamma[CONV_FILTERS] = {0}, grad_bn1_beta[CONV_FILTERS] = {0}; float grad_bn2_gamma[CONV_FILTERS] = {0}, grad_bn2_beta[CONV_FILTERS] = {0}; float *conv1_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float)); float *bn1_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float)); float 
*conv2_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float)); float *bn2_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float)); float *pool_out = (float *)calloc(CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE), sizeof(float)); float *fc1_out = (float *)calloc(FC_SIZE, sizeof(float)); float *fc2_out = (float *)calloc(OUTPUT_SIZE, sizeof(float)); float *grad_pool = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float)); float *grad_bn2_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float)); float *grad_bn1_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float)); float *grad_add = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float)); float *grad_from_conv2 = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float)); float *grad_conv1_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float)); PoolIndices pool_indices = {0}; if( !grad_fc2 || !grad_fc1 || !grad_conv2 || !grad_conv1 || !conv1_out || !bn1_out || !conv2_out || !bn2_out || !pool_out || !fc1_out || !fc2_out || !grad_pool || !grad_bn2_out || !grad_bn1_out || !grad_add || !grad_from_conv2 || !grad_conv1_out) { perror("メモリ割り当て失敗"); // free all free(grad_fc2); free(grad_fc1); free(grad_conv2); free(grad_conv1); free(conv1_out); free(bn1_out); free(conv2_out); free(bn2_out); free(pool_out); free(fc1_out); free(fc2_out); free(grad_pool); free(grad_bn2_out); free(grad_bn1_out); free(grad_add); free(grad_from_conv2); free(grad_conv1_out); return 1; } float learning_rate = LEARNING_RATE; for(int epoch = start_epoch; epoch < EPOCHS; epoch++) { if(epoch % 10 == 0 && epoch > start_epoch) learning_rate *= 0.1; float loss = 0.0f; int correct = 0; for(int b=0; b<num_samples; b+=BATCH_SIZE){ printf(" % 6d: % 6d (% 6d / %d ) % 4d / %d elapsed time = %.2fsec\n", b, num_samples/BATCH_SIZE, num_samples, BATCH_SIZE, epoch, EPOCHS, omp_get_wtime() - start_time ); memset(grad_fc2, 0, sizeof(float) * FC_SIZE * OUTPUT_SIZE ); memset(grad_bias_fc2, 0, sizeof(grad_bias_fc2)); memset(grad_fc1, 0, sizeof(float) * CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE) * FC_SIZE); memset(grad_bias_fc1, 0, sizeof(grad_bias_fc1)); memset(grad_conv2, 0, sizeof(float) * CONV_FILTERS * CONV_FILTERS * KERNEL_SIZE * KERNEL_SIZE); memset(grad_bias2, 0, sizeof(grad_bias2)); memset(grad_conv1, 0, sizeof(float) * CONV_FILTERS * INPUT_CHANNELS * KERNEL_SIZE * KERNEL_SIZE); memset(grad_bias1, 0, sizeof(grad_bias1)); memset(grad_bn1_gamma, 0, sizeof(grad_bn1_gamma)); memset(grad_bn1_beta, 0, sizeof(grad_bn1_beta)); memset(grad_bn2_gamma, 0, sizeof(grad_bn2_gamma)); memset(grad_bn2_beta, 0, sizeof(grad_bn2_beta)); memset(grad_add, 0, sizeof(float) * CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH); memset(grad_from_conv2,0, sizeof(float) * CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH); memset(grad_conv1_out, 0, sizeof(float) * CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH); memset(&pool_indices, 0, sizeof(PoolIndices)); for(int i = b; i < b + BATCH_SIZE && i < num_samples; i++) { conv_layer_fft(INPUT_CHANNELS, CONV_FILTERS, images[i].input, cnn->conv1, cnn->bias1, conv1_out); batch_norm(conv1_out, cnn->bn1_mean, cnn->bn1_var, cnn->bn1_gamma, cnn->bn1_beta, bn1_out, CONV_FILTERS); #pragma omp parallel for for(int idx=0; idx < CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH; idx++) bn1_out[idx] = fmaxf(0.0f, bn1_out[idx]); conv_layer_fft(CONV_FILTERS, CONV_FILTERS, (float 
(*)[INPUT_HEIGHT][INPUT_WIDTH])bn1_out, cnn->conv2, cnn->bias2, conv2_out); #pragma omp parallel for for(int f=0; f<CONV_FILTERS; f++) { for(int y=0; y<INPUT_HEIGHT; y++) { for(int x=0; x<INPUT_WIDTH; x++) { int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x; conv2_out[idx] += bn1_out[idx]; } } } batch_norm(conv2_out, cnn->bn2_mean, cnn->bn2_var, cnn->bn2_gamma, cnn->bn2_beta, bn2_out, CONV_FILTERS); #pragma omp parallel for for(int idx=0; idx < CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH; idx++) bn2_out[idx] = fmaxf(0.0f, bn2_out[idx]); #pragma omp parallel for collapse(2) for(int f=0; f<CONV_FILTERS; f++) { for(int y=0; y<INPUT_HEIGHT / POOL_SIZE; y++) { for(int x=0; x<INPUT_WIDTH / POOL_SIZE; x++) { float max_val = -INFINITY; int max_y = y * POOL_SIZE, max_x = x * POOL_SIZE; for(int py = 0; py < POOL_SIZE; py++) { for(int px = 0; px < POOL_SIZE; px++) { int iy = y * POOL_SIZE + py; int ix = x * POOL_SIZE + px; float val = bn2_out[f * INPUT_HEIGHT * INPUT_WIDTH + iy * INPUT_WIDTH + ix]; if(isnan(val) || isinf(val)) continue; if(val > max_val) { max_val = val; max_y = iy; max_x = ix; } } } pool_out[f * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE) + y * (INPUT_WIDTH / POOL_SIZE) + x] = max_val; pool_indices.max_idx[f][y][x][0] = max_y; pool_indices.max_idx[f][y][x][1] = max_x; } } } int pool_size = CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE); fc_layer(pool_size, FC_SIZE, pool_out, cnn->fc1, cnn->bias_fc1, fc1_out); fc_layer(FC_SIZE, OUTPUT_SIZE, fc1_out, cnn->fc2, cnn->bias_fc2, fc2_out); float output[OUTPUT_SIZE]; softmax(fc2_out, output, OUTPUT_SIZE); int label = images[i].label; if(label < 0 || label >= OUTPUT_SIZE) continue; loss -= logf(output[label] + 1e-10); int pred = 0; for(int j=1; j<OUTPUT_SIZE; j++) { if(output[j] > output[pred]) pred = j; } correct += (pred == label) ? 1 : 0; float grad_fc2_in[OUTPUT_SIZE]; for(int j=0; j<OUTPUT_SIZE; j++) { grad_fc2_in[j] = output[j] - (j == label ? 1.0f : 0.0f); } #pragma omp parallel for for(int j=0; j<OUTPUT_SIZE; j++) { grad_bias_fc2[j] += grad_fc2_in[j]; for(int k=0; k<FC_SIZE; k++) { grad_fc2[k * OUTPUT_SIZE + j] += grad_fc2_in[j] * fc1_out[k]; } } float grad_fc1_in[FC_SIZE] = {0}; #pragma omp parallel for for(int k=0; k<FC_SIZE; k++) { grad_fc1_in[k] = 0.0f; if(fc1_out[k] > 0) { for(int j=0; j<OUTPUT_SIZE; j++) { grad_fc1_in[k] += grad_fc2_in[j] * cnn->fc2[k][j]; } } grad_bias_fc1[k] += grad_fc1_in[k]; for(int m=0; m<pool_size; m++) { grad_fc1[m * FC_SIZE + k] += grad_fc1_in[k] * pool_out[m]; } } #pragma omp parallel for for(int f=0; f<CONV_FILTERS; f++) { for(int y=0; y<INPUT_HEIGHT / POOL_SIZE; y++) { for(int x=0; x<INPUT_WIDTH / POOL_SIZE; x++) { int max_y = pool_indices.max_idx[f][y][x][0]; int max_x = pool_indices.max_idx[f][y][x][1]; if(max_y >= 0 && max_y < INPUT_HEIGHT && max_x >= 0 && max_x < INPUT_WIDTH) { grad_pool[f * INPUT_HEIGHT * INPUT_WIDTH + max_y * INPUT_WIDTH + max_x] = grad_fc1_in[f * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE) + y * (INPUT_WIDTH / POOL_SIZE) + x]; } } } } #pragma omp parallel for collapse(2) for(int f=0; f<CONV_FILTERS; f++) { for(int y=0; y<INPUT_HEIGHT; y++) { for(int x=0; x<INPUT_WIDTH; x++) { int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x; grad_bn2_out[idx] = grad_pool[idx] * (bn2_out[idx] > 0 ? 
1.0f : 0.0f); } } } #pragma omp parallel for collapse(2) for(int f=0; f<CONV_FILTERS; f++) { for(int y=0; y<INPUT_HEIGHT; y++) { for(int x=0; x<INPUT_WIDTH; x++) { int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x; grad_bn2_gamma[f] += grad_bn2_out[idx] * (conv2_out[idx] - cnn->bn2_mean[f]) / sqrtf(cnn->bn2_var[f] + 1e-5); grad_bn2_beta[f] += grad_bn2_out[idx]; } } } #pragma omp parallel for collapse(2) for(int f=0; f<CONV_FILTERS; f++) { for(int y=0; y<INPUT_HEIGHT; y++) { for(int x=0; x<INPUT_WIDTH; x++) { int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x; grad_add[idx] = grad_bn2_out[idx] * cnn->bn2_gamma[f] / sqrtf(cnn->bn2_var[f] + 1e-5); } } } const int pad = KERNEL_SIZE / 2; const int fft_size = 64; COMPLEX **fft_grad = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size); COMPLEX **fft_weight = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size); if(!fft_grad || !fft_weight) { perror("メモリ割り当て失敗"); free(fft_grad); free(fft_weight); continue; } #pragma omp parallel for for(int f=0; f<CONV_FILTERS; f++) { for(int c=0; c<CONV_FILTERS; c++) { for(int y=0; y<fft_size; y++) { for(int x=0; x<fft_size; x++) { fft_grad[y][x].r = 0.0f; fft_grad[y][x].i = 0.0f; fft_weight[y][x].r = 0.0f; fft_weight[y][x].i = 0.0f; } } for(int y=0; y<INPUT_HEIGHT; y++) { for(int x=0; x<INPUT_WIDTH; x++) { fft_grad[y + pad][x + pad].r = grad_add[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x]; } } for(int y=0; y<KERNEL_SIZE; y++) { for(int x=0; x<KERNEL_SIZE; x++) { fft_weight[y][x].r = cnn->conv2[f][c][y][x]; } } FourierTransform2D(fft_grad, fft_size, fft_size, FFT); FourierTransform2D(fft_weight, fft_size, fft_size, FFT); for(int y=0; y<fft_size; y++) { for(int x=0; x<fft_size; x++) { fft_grad[y][x] = MulCOMPLEX(fft_grad[y][x], ConjCOMPLEX(fft_weight[y][x])); } } FourierTransform2D(fft_grad, fft_size, fft_size, IFFT); for(int ky = 0; ky < KERNEL_SIZE; ky++) { for(int kx = 0; kx < KERNEL_SIZE; kx++) { grad_conv2[f * CONV_FILTERS * KERNEL_SIZE * KERNEL_SIZE + c * KERNEL_SIZE * KERNEL_SIZE + ky * KERNEL_SIZE + kx] += fft_grad[ky][kx].r; } } } for(int y=0; y<INPUT_HEIGHT; y++) { for(int x=0; x<INPUT_WIDTH; x++) { grad_bias2[f] += grad_add[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x]; } } } free(fft_grad); free(fft_weight); // Add the missing grad_from_conv2 (backprop for input of conv2) fft_grad = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size); fft_weight = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size); if(!fft_grad || !fft_weight) { perror("メモリ割り当て失敗"); free(fft_grad); free(fft_weight); continue; } #pragma omp parallel for for(int c=0; c<CONV_FILTERS; c++) { for(int f=0; f<CONV_FILTERS; f++) { for(int y=0; y<fft_size; y++) { for(int x=0; x<fft_size; x++) { fft_grad[y][x].r = 0.0f; fft_grad[y][x].i = 0.0f; fft_weight[y][x].r = 0.0f; fft_weight[y][x].i = 0.0f; } } for(int y=0; y<INPUT_HEIGHT; y++) { for(int x=0; x<INPUT_WIDTH; x++) { fft_grad[y + pad][x + pad].r = grad_add[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x]; } } for(int y=0; y<KERNEL_SIZE; y++) { for(int x=0; x<KERNEL_SIZE; x++) { fft_weight[y][x].r = cnn->conv2[f][c][y][x]; } } FourierTransform2D(fft_grad, fft_size, fft_size, FFT); FourierTransform2D(fft_weight, fft_size, fft_size, FFT); for(int y=0; y<fft_size; y++) { for(int x=0; x<fft_size; x++) { fft_grad[y][x] = MulCOMPLEX(fft_grad[y][x], fft_weight[y][x]); } } FourierTransform2D(fft_grad, fft_size, fft_size, IFFT); for(int y=0; y<INPUT_HEIGHT; y++) { for(int x=0; x<INPUT_WIDTH; x++) { 
grad_from_conv2[c * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x] += fft_grad[y + pad][x + pad].r; } } } } free(fft_grad); free(fft_weight); #pragma omp parallel for collapse(2) for(int f=0; f<CONV_FILTERS; f++) { for(int y=0; y<INPUT_HEIGHT; y++) { for(int x=0; x<INPUT_WIDTH; x++) { int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x; grad_bn1_out[idx] = grad_from_conv2[idx] * (bn1_out[idx] > 0.0f ? 1.0f : 0.0f) + grad_add[idx]; grad_bn1_gamma[f] += grad_bn1_out[idx] * (conv1_out[idx] - cnn->bn1_mean[f]) / sqrtf(cnn->bn1_var[f] + 1e-5); grad_bn1_beta[f] += grad_bn1_out[idx]; } } } #pragma omp parallel for collapse(2) for(int f=0; f<CONV_FILTERS; f++) { for(int y=0; y<INPUT_HEIGHT; y++) { for(int x=0; x<INPUT_WIDTH; x++) { int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x; grad_conv1_out[idx] = grad_bn1_out[idx] * cnn->bn1_gamma[f] / sqrtf(cnn->bn1_var[f] + 1e-5); } } } fft_grad = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size); fft_weight = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size); if(!fft_grad || !fft_weight) { perror("メモリ割り当て失敗"); free(fft_grad); free(fft_weight); continue; } #pragma omp parallel for for(int f=0; f<CONV_FILTERS; f++) { for(int c=0; c<INPUT_CHANNELS; c++) { for(int y=0; y<fft_size; y++) { for(int x=0; x<fft_size; x++) { fft_grad[y][x].r = 0.0f; fft_grad[y][x].i = 0.0f; fft_weight[y][x].r = 0.0f; fft_weight[y][x].i = 0.0f; } } for(int y=0; y<INPUT_HEIGHT; y++) { for(int x=0; x<INPUT_WIDTH; x++) { fft_grad[y + pad][x + pad].r = grad_conv1_out[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x]; fft_weight[y + pad][x + pad].r = images[i].input[c][y][x]; } } FourierTransform2D(fft_grad, fft_size, fft_size, FFT); FourierTransform2D(fft_weight, fft_size, fft_size, FFT); for(int y=0; y<fft_size; y++) { for(int x=0; x<fft_size; x++) { fft_grad[y][x] = MulCOMPLEX(fft_grad[y][x], ConjCOMPLEX(fft_weight[y][x])); } } FourierTransform2D(fft_grad, fft_size, fft_size, IFFT); for(int ky = 0; ky < KERNEL_SIZE; ky++) { for(int kx = 0; kx < KERNEL_SIZE; kx++) { grad_conv1[f * INPUT_CHANNELS * KERNEL_SIZE * KERNEL_SIZE + c * KERNEL_SIZE * KERNEL_SIZE + ky * KERNEL_SIZE + kx] += fft_grad[ky][kx].r; } } } for(int y=0; y<INPUT_HEIGHT; y++) { for(int x=0; x<INPUT_WIDTH; x++) { grad_bias1[f] += grad_conv1_out[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x]; } } } free(fft_grad); free(fft_weight); } // Update parameters #pragma omp parallel for for(int j=0; j<OUTPUT_SIZE; j++) { cnn->m_bias_fc2[j] = MOMENTUM * cnn->m_bias_fc2[j] - learning_rate * grad_bias_fc2[j] / BATCH_SIZE; cnn->bias_fc2[j] += cnn->m_bias_fc2[j]; for(int k = 0; k < FC_SIZE; k++) { cnn->m_fc2[k][j] = MOMENTUM * cnn->m_fc2[k][j] - learning_rate * grad_fc2[k * OUTPUT_SIZE + j] / BATCH_SIZE; cnn->fc2[k][j] += cnn->m_fc2[k][j]; } } int pool_size = CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE); #pragma omp parallel for for(int k = 0; k < FC_SIZE; k++) { cnn->m_bias_fc1[k] = MOMENTUM * cnn->m_bias_fc1[k] - learning_rate * grad_bias_fc1[k] / BATCH_SIZE; cnn->bias_fc1[k] += cnn->m_bias_fc1[k]; for(int m = 0; m < pool_size; m++) { cnn->m_fc1[m][k] = MOMENTUM * cnn->m_fc1[m][k] - learning_rate * grad_fc1[m * FC_SIZE + k] / BATCH_SIZE; cnn->fc1[m][k] += cnn->m_fc1[m][k]; } } #pragma omp parallel for for(int f=0; f<CONV_FILTERS; f++) { cnn->m_bias2[f] = MOMENTUM * cnn->m_bias2[f] - learning_rate * grad_bias2[f] / BATCH_SIZE; cnn->bias2[f] += cnn->m_bias2[f]; for(int c=0; c<CONV_FILTERS; c++) { for(int ky = 0; ky < KERNEL_SIZE; ky++) { for(int kx 
= 0; kx < KERNEL_SIZE; kx++) { int idx = f * CONV_FILTERS * KERNEL_SIZE * KERNEL_SIZE + c * KERNEL_SIZE * KERNEL_SIZE + ky * KERNEL_SIZE + kx; cnn->m_conv2[f][c][ky][kx] = MOMENTUM * cnn->m_conv2[f][c][ky][kx] - learning_rate * grad_conv2[idx] / BATCH_SIZE; cnn->conv2[f][c][ky][kx] += cnn->m_conv2[f][c][ky][kx]; } } } cnn->m_bn2_gamma[f] = MOMENTUM * cnn->m_bn2_gamma[f] - learning_rate * grad_bn2_gamma[f] / BATCH_SIZE; cnn->bn2_gamma[f] += cnn->m_bn2_gamma[f]; cnn->m_bn2_beta[f] = MOMENTUM * cnn->m_bn2_beta[f] - learning_rate * grad_bn2_beta[f] / BATCH_SIZE; cnn->bn2_beta[f] += cnn->m_bn2_beta[f]; } #pragma omp parallel for for(int f=0; f<CONV_FILTERS; f++) { cnn->m_bias1[f] = MOMENTUM * cnn->m_bias1[f] - learning_rate * grad_bias1[f] / BATCH_SIZE; cnn->bias1[f] += cnn->m_bias1[f]; for(int c=0; c<INPUT_CHANNELS; c++) { for(int ky = 0; ky < KERNEL_SIZE; ky++) { for(int kx = 0; kx < KERNEL_SIZE; kx++) { int idx = f * INPUT_CHANNELS * KERNEL_SIZE * KERNEL_SIZE + c * KERNEL_SIZE * KERNEL_SIZE + ky * KERNEL_SIZE + kx; cnn->m_conv1[f][c][ky][kx] = MOMENTUM * cnn->m_conv1[f][c][ky][kx] - learning_rate * grad_conv1[idx] / BATCH_SIZE; cnn->conv1[f][c][ky][kx] += cnn->m_conv1[f][c][ky][kx]; } } } cnn->m_bn1_gamma[f] = MOMENTUM * cnn->m_bn1_gamma[f] - learning_rate * grad_bn1_gamma[f] / BATCH_SIZE; cnn->bn1_gamma[f] += cnn->m_bn1_gamma[f]; cnn->m_bn1_beta[f] = MOMENTUM * cnn->m_bn1_beta[f] - learning_rate * grad_bn1_beta[f] / BATCH_SIZE; cnn->bn1_beta[f] += cnn->m_bn1_beta[f]; } } printf("エポック %d: 損失=%.4f, 精度=%.2f%%\n", epoch, loss / num_samples, 100.0f * correct / num_samples); } *train_time = omp_get_wtime() - start_time; // free all free(grad_fc2); free(grad_fc1); free(grad_conv2); free(grad_conv1); free(conv1_out); free(bn1_out); free(conv2_out); free(bn2_out); free(pool_out); free(fc1_out); free(fc2_out); free(grad_pool); free(grad_bn2_out); free(grad_bn1_out); free(grad_add); free(grad_from_conv2); free(grad_conv1_out); return 0; } int main(int argc, char *argv[]){ srand(time(NULL)); double start_time = omp_get_wtime(), train_time = 0.0, eval_time = 0.0; int skip_training = 0, save_flag = 0, load_flag = 0, status = 1; char *save_file = DEFAULT_WEIGHT_FILE; char *load_file = DEFAULT_WEIGHT_FILE; argumentAnalysis( argc, argv, &skip_training, &save_flag, &save_file, &load_flag, &load_file ); ImageData *train_images = NULL; ImageData *test_images = NULL; ConvolutionalNeuralNetwork *cnn = (ConvolutionalNeuralNetwork *)calloc(1,sizeof(ConvolutionalNeuralNetwork)); if(!cnn){ fprintf(stderr, "Failed to allocate memory for ConvolutionalNeuralNetwork.\n"); goto cleanup; } else if(load_flag) load_cnn(cnn, load_file); else init_cnn(cnn); int num_train_samples = 0; if(!skip_training) goto SKIP_LOAD_TRAINING; if(!sample("Learning", &train_images, &num_train_samples)) { fprintf(stderr, "トレーニングデータ読み込み失敗\n"); goto cleanup;; } SKIP_LOAD_TRAINING: int num_test_samples = 0; if(!sample("Evaluation", &test_images, &num_test_samples)) { fprintf(stderr, "テストデータ読み込み失敗\n"); goto cleanup;; } if(!skip_training) goto SKIP_THIS_TRAINING; train(cnn, train_images, num_train_samples, 0, &train_time); printf("トレーニング時間: %.2f秒\n", train_time); if(save_flag) save_cnn(cnn, save_file); SKIP_THIS_TRAINING: evaluate(cnn, test_images, num_test_samples, &eval_time); printf("評価時間: %.2f秒\n", eval_time); double total_time = omp_get_wtime() - start_time; char msg[256]; snprintf(msg, sizeof(msg), "Total processing time: %.2f seconds (Training: %.2f, Evaluation: %.2f)", total_time, skip_training ? 
0.0 : train_time, eval_time); printf("\n%s\n", msg); status = 0; cleanup: free(train_images); free(test_images); free(cnn); return status; } //////////////////////////////////////////////////////////////////////////////////////////////// // これ以降の部分は CNN プログラム(上部のコード)を実行するのに必要な環境を提供する // プログラム群 CNN とは、分離して考える事。 #include <string.h> // コマンドライン引数解析 void argumentAnalysis( int argc, char *argv[], int *skip_training, int *save_flag, char **save_name, int *load_flag, char **load_name ){ int v_flg=0, s_flg=0, l_flg=0, r_flg=0, t_flg=0; for(int i=1; i<argc; i++ ){ // プログラム起動時の引数解析処理 if( argv[i][0]=='-' ) { switch(argv[i][1]){ case 's': if( s_flg++ ){ fprintf(stderr, "Error: '%c' is already specified.\n", argv[i][1] ); goto GOOD_BY; } else if( r_flg ){ fprintf(stderr, "Error: -s and -r cannot be used together\n" ); goto GOOD_BY; } else if( argv[i][2] ){ printf("'%c'\n",argv[i][2]); goto GOOD_BY;} *save_flag = 1; *save_name = ((i+1)<argc && argv[i+1][0]!='-') ? argv[++i] : DEFAULT_WEIGHT_FILE; break; case 'l': if( l_flg++ ){ fprintf(stderr, "Error: '%c' is already specified.\n", argv[i][1] ); goto GOOD_BY; } else if( r_flg ) { fprintf(stderr, "Error: -l and -r cannot be used together\n" ); goto GOOD_BY; } else if( argv[i][2] ){ printf("'%c'\n",argv[i][2]); goto GOOD_BY;} *load_flag = 1; *load_name = ((i+1)<argc && argv[i+1][0]!='-') ? argv[++i] : DEFAULT_WEIGHT_FILE; break; case 'r': if( r_flg++ ){ printf( "++r_flg = %d\n", r_flg ); fprintf(stderr, "Error: '%c' is already specified.\n", argv[i][1] ); goto GOOD_BY; } else if( l_flg ){ fprintf(stderr, "Error: -l and -r cannot be used together\n" ); goto GOOD_BY; } else if( s_flg ){ fprintf(stderr, "Error: -s and -r cannot be used together\n" ); goto GOOD_BY; } else if( argv[i][2] ){ printf("'%c'\n",argv[i][2]); goto GOOD_BY;} *skip_training = 1; *load_flag = 1; *load_name = ((i+1)<argc && argv[i+1][0]!='-') ? argv[++i] : DEFAULT_WEIGHT_FILE; break; case 't': if( t_flg++ ){ fprintf(stderr, "Error: '%c' is already specified.\n", argv[i][1] ); goto GOOD_BY; } else if( (i+1)>=argc || argv[i+1][0]=='-' ){ fprintf(stderr, "Error: No maximum number specified\n" ); goto GOOD_BY; } else if( argv[i][2] ) goto GOOD_BY; if( (ActiveThread = atoi(argv[++i]))<0 ){ fprintf(stderr, "Error: The number of threads must be specified.\n"); goto GOOD_BY; } break; case 'V': //fallthrough case 'v': if( v_flg++ ){ fprintf(stderr, "Error: '%c' is already specified.\n", argv[i][1] ); goto GOOD_BY; } else if( argv[i][2] ) goto GOOD_BY; verbose = argv[i][1]=='v' ? 
1 : 2; break; case 'h': // fallthrough default : printf( "'%c''%c' \n", argv[i][0], argv[i][1] ); goto GOOD_BY; } } } return; GOOD_BY:; int i; char *p=argv[0], *op="[-v] [-t number] [-s [save_file]] [-l [load_file]] [-r [load_file]]"; for(i=strlen(p); i>0 && p[i]!='\\' && p[i]!='/'; i-- );; if(i) i++; fprintf( stderr, "\nusage: %s %s\n\n", &p[i], op ); exit( 1 ); } #pragma pack(push,1) typedef struct PIXEL { uint8_t B, G, R, A; } PIXEL ; typedef struct STBPX { uint8_t R, G, B, A; } STBPX ; typedef struct ARRAY { uint8_t v[4]; } ARRAY ; #pragma pack(pop) #define v2D(type,lx,name) ((type(*)[(lx)])name) #define v2Dc(lx,name) v2D(char, lx, name) #define v2Di(lx,name) v2D(int,lx,name) #define v2Df(lx,name) v2D(float,lx,name) #define v2Dp(lx,name) v2D(PIXEL,lx,name) #define v2Da(lx,name) v2D(ARRAY,lx,name) #define v3D(type,lx,ly,name) ((type(*)[(lx)][(ly)])name) #define v3Df(lx,ly,name) v3D(float, lx, ly, name) #define HERE fprintf(stderr,"% 6d: %s\n",__LINE__, __func__); #define PROCESS_ALL_PIXELS for(int y=0;y<ly;y++)for(int x=0;x<lx;x++) #define saveArray2bmp32(name,a,w,h) saveDataAsBMP32((name),(&(a)[0][0]),(w),(h)) // BMPヘッダ構造体 #pragma pack(push, 2) typedef struct HEADER { unsigned short bfType; // 'BM' (0x4D42) unsigned int bfSize; // ファイルサイズ unsigned short bfReserved1; // 0 unsigned short bfReserved2; // 0 unsigned int bfOffBits; // データオフセット unsigned int biSize; // 情報ヘッダサイズ (40) signed int biWidth; // 幅 signed int biHeight; // 高さ(負で上下反転) unsigned short biPlanes; // 1 unsigned short biBitCount; // 32 unsigned int biCompression; // 0 (BI_RGB) unsigned int biSizeImage; // 画像データサイズ signed int biXPelsPerMeter; // 0 signed int biYPelsPerMeter; // 0 unsigned int biClrUsed; // 0 unsigned int biClrImportant; // 0 unsigned char lut[]; // パレット(8ビット用) } HEADER; #pragma pack(pop) // 画像パラメータ #define BMPID 0x4D42 // 'BM' #define BMPIFHSIZE 40 // 情報ヘッダサイズ int saveDataAsBMP32( char *fname, void *b, int lx, int ly ){ HEADER h = {BMPID, lx*abs(ly)*sizeof(PIXEL)+sizeof(HEADER), 0, 0, sizeof(HEADER), BMPIFHSIZE, lx, ly, 1, 32, 0, lx*abs(ly)*sizeof(PIXEL), 0, 0, 0, 0 }; FILE *fp=NULL ; int st=-1; if( (fp=fopen(fname, "wb")) == NULL ) { fprintf( stderr, "File '%s': cannot create.\n", fname ); } else if( fwrite( &h, sizeof(HEADER), 1, fp ) != 1 ) { fprintf( stderr, "File '%s': Header write error.\n", fname ); } else if( fwrite( b, h.biSizeImage, 1, fp ) != 1 ) { fprintf( stderr, "File '%s': Data write error.\n", fname ); } else st=0; if(fp)fclose(fp) ; return st; } static inline int ft_size_check(int n){ return (n > 0) && !(n & (n - 1));} void **alloc2Darray(int s, int lx, int ly){ if(s<1 || lx<1 || ly<1) return NULL; long long w = lx, h = ly, next = s*w, space = h*sizeof(void *) + w*h*s + 64; void **d = (void **)calloc(1, space); if(!d){fprintf(stderr,"alloc2Darray failed: %lld bytes\n",space); return NULL;} void *base = (void*)((uintptr_t)((char*)d + h * sizeof(void*) + 63) & ~63LL); for(long long i=0; i<h; i++) d[i] = (void*)((char*)base + next * i); return d; // C99: uintptr_t same as (void *) } void ***alloc3Darray(int size, int lx, int ly, int lz) { if(size < 1 || lx < 1 || ly < 1 || lz < 1) return NULL; long long s = size, llx = lx, lly = ly, llz = lz, lyz = lly * llz, line = s * llx; long long index = (llz * lly + llz) * sizeof(void *), body = llx * lly * llz * s, space; void ***d = (void ***)calloc(1, space = (index + body + 64)); if(!d){ fprintf(stderr, "alloc3Darray failed: %lld bytes\n", space); return NULL; }else{ void **area=(void **)&d[llz], *base=(void*)(((uintptr_t)((char*)d+index+63))&~63LL); 
for(int i=0; i<llz; i++) d[i] = &area[lly * i]; for(int i=0; i<lyz; i++) area[i] = (void *)((char *)base + line * i); } return d; } void FourierTransform2D(COMPLEX **c, int lx, int ly, int flag) { int elements = lx; double unit = (flag == IFFT) ? 2 * M_PI / lx : -2 * M_PI / lx; COMPLEX rotor[lx>ly?lx:ly]; for(int e=0;e<lx;rotor[e]=RotorCOMPLEX(unit*e),e++); if(ft_size_check(elements)) { #pragma omp parallel for for(int y=0; y<ly; y++){ COMPLEX W, temp; for(int i=0, j=1; j<elements-1; j++){// scrambler, for initial placement for(int k = elements>>1; k>(i^=k); k>>=1); if(j<i){temp=c[y][j]; c[y][j]=c[y][i]; c[y][i]=temp;} // exchange } for(int ir, i, k, m, mh=1; (m=mh<<1)<=elements; mh=m) { // Operations for(ir=0, i=0; i<elements; i+=m) { // ir=0: Rotation angle is 0rad. for( W=rotor[ir], k=elements>>2; k>(ir^=k); k>>=1 ); for(int j=i; j<mh+i; j++ ) { int a = j, b = j + mh; temp = SubCOMPLEX(c[y][a], c[y][b]); // For butterfly operation c[y][a] = AddCOMPLEX(c[y][a], c[y][b]); // a ->- (a + b) ->- a c[y][b] = MulCOMPLEX(temp, W); // b ->- (a-b)*W ->- b } } } double reciprocal = 1.0 / elements; if(flag==IFFT) for(int e=0; e<elements; c[y][e]=ImeMulCOMPLEX(c[y][e],reciprocal), e++); } } else {// DFT #pragma omp parallel for for(int y=0; y<ly; y++ ) { COMPLEX tmp[elements]; // 32xsizeof(float) is enough small for(int u=0; u<elements; u++ ){int v; COMPLEX t = (COMPLEX){0,0}; for( v=0; v<elements; v++ ) AppendCOMPLEX( t, MulCOMPLEX(c[y][v],rotor[(u*v)%elements]) ); tmp[u] = t; } double reciprocal = 1./elements; // tmp[u]/elements -> tmp[u]*reciprocal if(flag==IDFT) for(int e=0; e<elements; e++ ) c[y][e] = ImeMulCOMPLEX(tmp[e],reciprocal); else for(int e=0; e<elements; e++ ) c[y][e] = tmp[e]; } } if( lx!=(elements=ly )){ unit = (flag==IFFT) ? 2 * M_PI /elements : -2 * M_PI /elements; for(int e=0; e<elements; rotor[e]=RotorCOMPLEX(unit*e), e++); } if(ft_size_check(elements)){ #pragma omp parallel for for(int x=0; x<lx; x++ ){ COMPLEX W, temp; for(int i=0, j=1; j<elements-1; j++ ){// scrambler, for initial placement for(int k=elements>>1; k>(i^=k); k>>=1 ); if(j<i){temp=c[j][x], c[j][x]=c[i][x], c[i][x]=temp;} // exchange } for(int ir, i, k, m, mh=1; (m=mh<<1)<=elements; mh=m) { // Operations for(ir=0, i=0; i<elements; i+=m ) { // ir=0: Rotation angle is 0rad. 
for( W=rotor[ir], k=elements>>2; k>(ir^=k); k>>=1 ); for(int j=i; j<mh+i; j++ ) { int a = j, b = j + mh; temp = SubCOMPLEX(c[a][x], c[b][x]); // For butterfly operation c[a][x] = AddCOMPLEX(c[a][x], c[b][x]); // a ->- (a + b) ->- a c[b][x] = MulCOMPLEX(temp, W); // b ->- (a-b)*W ->- b } } } double reciprocal=1./elements; if(flag==IDFT) for(int e=0; e<elements; c[e][x]=ImeMulCOMPLEX(c[e][x],reciprocal), e++); } } else {// DFT #pragma omp parallel for for(int x=0; x<lx; x++ ) { COMPLEX tmp[elements]; // 32xsizeof(float) is enough small for(int u=0; u<elements; u++ ){int v; COMPLEX t = (COMPLEX){0,0}; for( v=0; v<elements; v++ ) AppendCOMPLEX( t, MulCOMPLEX(c[v][x],rotor[(u*v)%elements]) ); tmp[u] = t; } double reciprocal = 1./elements; // tmp[u]/elements -> tmp[u]*reciprocal if(flag==IDFT) for(int e=0; e<elements; e++ ) c[e][x] = ImeMulCOMPLEX(tmp[e],reciprocal); else for(int e=0; e<elements; e++ ) c[e][x] = tmp[e]; } } } void extract_labels(const char *filename, int *label1, int *label2) { int dots = 0, len = 0, start = 0; if(label1) *label1 = -1; else { fprintf(stderr, "invalid argument\n"); return; } if(label2) *label2 = -1; if(!filename || (len = strlen(filename)) < 1) { fprintf(stderr, "invalid argument\n"); return; } for(int e = 0; e < len; e++) if(filename[e] == '.') dots++; if(dots <= 1) return; char string[256] = {0}; for(int e = 0; e < len; e++) { if(filename[start = e] == '.') break; } snprintf(string, sizeof(string), "%s", &filename[start]); *strrchr(string, '.') = 0; int head, tail = strlen(string) - 1; for(head = tail; 0 <= head; head--) { if(string[head] != '.' && isdigit(string[head])) continue; if(string[head] != '.') { fprintf(stderr, "invalid label character 0x%2.2X\n", string[head]); return; } else { *label1 = atoi(&string[head + 1]); string[head--] = 0; } if(!label2 || dots < 3) break; for(; 0 <= head && isdigit(string[head]); head--); if(string[head] != '.') { fprintf(stderr, "invalid label character 0x%2.2X\n", string[head]); return; } else { *label2 = atoi(&string[head + 1]); return; } } } int load_image(char *filepath, ImageData *image, int width, int height, int channels) { int lx = 0, ly = 0, bpp = 0; unsigned char *data = stbi_load(filepath, &lx, &ly, &bpp, 4); if(!data || lx != width || ly != height || bpp != 4) { if(data) stbi_image_free(data); fprintf(stderr, "画像サイズ不一致または読み込み失敗: %s\n", image->filename); return 1; } for(int ey = (ly + 1) / 2, ty = ly - 1, y = 0; y < ey; y++) { for(int w = ty - y, x = 0; x < lx; x++) { PIXEL p = { v2Dp(lx, data)[y][x].R, v2Dp(lx, data)[y][x].G, v2Dp(lx, data)[y][x].B, 0 }; v2Dp(lx, data)[y][x] = (PIXEL){ v2Dp(lx, data)[w][x].R, v2Dp(lx, data)[w][x].G, v2Dp(lx, data)[w][x].B, 0 }; v2Dp(lx, data)[w][x] = p; } } for(int end = (4 < channels ? 
4 : channels), c = 0; c < end; c++) { for(int y=0; y<ly; y++) { for(int x=0; x<lx; x++) { v3Df(lx, ly, image->input)[c][y][x] = v2Da(lx, data)[y][x].v[c] / 255.0f; } } } stbi_image_free(data); extract_labels(image->filename, &image->label, &image->label2); return 0; } int sample(const char *dirname, ImageData **images, int *num_samples) { DIR *dir = opendir(dirname); if(!dir) { fprintf(stderr, "ディレクトリ %s のオープンに失敗\n", dirname); return 0; } char *ext = ".bmp"; struct dirent *entry; for(*num_samples = 0; (entry = readdir(dir)); ) if(strstr(entry->d_name, ext)) (*num_samples)++; *images = (ImageData *)calloc((*num_samples) + 2, sizeof(ImageData)); if(!*images) { perror("メモリ割り当て失敗"); closedir(dir); return 0; } rewinddir(dir); int idx = 0; while ((entry = readdir(dir))) { if(!strstr(entry->d_name, ext)) continue; char filepath[256]; snprintf(filepath, sizeof(filepath), "%s/%s", dirname, entry->d_name); snprintf((*images)[idx].filename, sizeof((*images)[idx].filename), "%s", entry->d_name); if(load_image(filepath, &(*images)[idx], INPUT_WIDTH, INPUT_HEIGHT, INPUT_CHANNELS)) continue; idx++; if(verbose && idx % 1000 == 0) printf("\r% 8d\t%8.2f%%", idx, 0.005+100.0f*(idx)/(*num_samples)); } if(verbose) printf("\n"); closedir(dir); *num_samples = idx; return 1; } void show_image_data(ImageData *images, int here){ char fname[256]; snprintf(fname,250,"debug_image_%4.4d.bmp",here); printf(" '%s' save as '%s'\n\n",images[here].filename, fname ); PIXEL d[INPUT_HEIGHT][INPUT_WIDTH]; int lx=INPUT_WIDTH, ly=INPUT_HEIGHT ; PROCESS_ALL_PIXELS d[y][x] = (PIXEL){ images[here].input[0][y][x]*255, images[here].input[1][y][x]*255,images[here].input[2][y][x]*255, 0 }; saveDataAsBMP32(fname, &d[0][0],lx, ly ); }
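

Build and run (a suggested invocation, not taken from the original; gcc with OpenMP on a POSIX-like environment is assumed, since the code uses omp.h and dirent.h, and stb_image.h must sit next to the source):

      gcc -std=c99 -O3 -fopenmp cnn_CIFAR.c -o cnn_CIFAR -lm
      ./cnn_CIFAR -v -s modelWeight.bin        (intended: train on Learning/, then save the weights)
      ./cnn_CIFAR -r modelWeight.bin           (intended: load saved weights and evaluate only)

      usage: cnn_CIFAR [-v] [-t number] [-s [save_file]] [-l [load_file]] [-r [load_file]]

Note: in the template as posted, the two "if(!skip_training) goto ..." checks in main() appear to be inverted, so whether training actually runs without -r should be verified before committing to a long run.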