cnn_CIFAR.c C99でスクラッチ実装した、CIFAR-10、CIFAR-100 用畳み込みニューラルネット(学習+評価)
検証環境が存在しないので、雛形としての提示
#define OUTPUT_SIZE CIFAR-10:10 CIFAR-100:100
EX.
CIFAR-10
+project/
├── Learning/
│ ├── image1.0.png
│ ├── image2.5.png
│ └── ...
├── Evaluation/
│ ├── test1.png
│ ├── test2.png
│ └── ...
├── stb_image.h
└── cifar.c
CIFAR-100
+project/
├── Learning/
│ ├── image1.5.0.png
│ ├── image2.8.5.png
│ └── ...
├── Evaluation/ (ラベルが有れば正解率の算出)
│ ├── test1.5.2.png
│ ├── test2.5.1.png
│ └── ...
├── stb_image.h
└── cifar.c
// https://github.com/nothings/stb/blob/master/stb_image.h
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <omp.h>
#include <time.h>
#include <dirent.h>
#include <stdint.h>
#include <ctype.h>
int verbose = 0, ActiveThread = 0;
#define PI 3.1415926535897932384626433832795L
#define M_PI PI
typedef struct COMPLEX { float r, i;} COMPLEX;
#define RotorCOMPLEX(a) (COMPLEX){ cos(a), sin(a) }
#define ConjCOMPLEX(a) (COMPLEX){ (a).r, -((a).i) }
#define AddCOMPLEX(a,b) (COMPLEX){ (a).r + (b).r, (a).i + (b).i }
#define SubCOMPLEX(a,b) (COMPLEX){ (a).r - (b).r, (a).i - (b).i }
#define MulCOMPLEX(a,b) (COMPLEX){ (a).r*(b).r - (a).i*(b).i, (a).r*(b).i + (a).i*(b).r }
#define ImeMulCOMPLEX(a,b) (COMPLEX){ (a).r * (b), (a).i * (b) }
#define AppendCOMPLEX(a,b) { (a).r += (b).r, (a).i += (b).i ; }
void FourierTransform2D(COMPLEX **c, int lx, int ly, int flag);
void **alloc2Darray(int size, int lx, int ly);
void ***alloc3Darray(int size, int lx, int ly, int lz);
void extract_labels(const char *filename, int *label1, int *label2);
int saveDataAsBMP32( char *fname, void *b, int lx, int ly );
void argumentAnalysis( int argc, char *argv[], int *skip_training,
int *save_flag, char **save_name, int *load_flag, char **load_name );
#define DEFAULT_WEIGHT_FILE "modelWeight.bin"
#define INPUT_CHANNELS 3
#define INPUT_HEIGHT 32
#define INPUT_WIDTH 32
#define CONV_FILTERS 128
#define KERNEL_SIZE 3
#define POOL_SIZE 2
#define FC_SIZE 512
#define OUTPUT_SIZE 10 // OUTPUT_SIZE ( mode==CIFAR-10 ? 10 : ( mode==CIFAR-100 ? 100 : 0 )
#define BATCH_SIZE 256
#define EPOCHS 20 // The minimum number of learning sessions is 100.
#define LEARNING_RATE (0.035*4)
#define MOMENTUM 0.9
#define FFT -1
#define IFFT 1
#define DFT FFT
#define IDFT IFFT
typedef struct {
char filename[256];
int label2;
int label;
float input[INPUT_CHANNELS][INPUT_HEIGHT][INPUT_WIDTH];
} ImageData;
void show_image_data(ImageData *images, int here);
int load_image(char *filepath, ImageData *image, int width, int height, int channels);
int sample(const char *dirname, ImageData **images, int *num_samples);
typedef struct {
int max_idx[CONV_FILTERS][INPUT_HEIGHT / POOL_SIZE][INPUT_WIDTH / POOL_SIZE][2];
} PoolIndices;
typedef struct {
float conv1[CONV_FILTERS][INPUT_CHANNELS][KERNEL_SIZE][KERNEL_SIZE];
float bias1[CONV_FILTERS];
float m_conv1[CONV_FILTERS][INPUT_CHANNELS][KERNEL_SIZE][KERNEL_SIZE];
float m_bias1[CONV_FILTERS];
float conv2[CONV_FILTERS][CONV_FILTERS][KERNEL_SIZE][KERNEL_SIZE];
float bias2[CONV_FILTERS];
float m_conv2[CONV_FILTERS][CONV_FILTERS][KERNEL_SIZE][KERNEL_SIZE];
float m_bias2[CONV_FILTERS];
float fc1[CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE)][FC_SIZE];
float bias_fc1[FC_SIZE];
float m_fc1[CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE)][FC_SIZE];
float m_bias_fc1[FC_SIZE];
float fc2[FC_SIZE][OUTPUT_SIZE];
float bias_fc2[OUTPUT_SIZE];
float m_fc2[FC_SIZE][OUTPUT_SIZE];
float m_bias_fc2[OUTPUT_SIZE];
float bn1_mean[CONV_FILTERS], bn1_var[CONV_FILTERS];
float bn1_gamma[CONV_FILTERS], bn1_beta[CONV_FILTERS];
float bn2_mean[CONV_FILTERS], bn2_var[CONV_FILTERS];
float bn2_gamma[CONV_FILTERS], bn2_beta[CONV_FILTERS];
float m_bn1_gamma[CONV_FILTERS], m_bn1_beta[CONV_FILTERS];
float m_bn2_gamma[CONV_FILTERS], m_bn2_beta[CONV_FILTERS];
} ConvolutionalNeuralNetwork;
void init_cnn(ConvolutionalNeuralNetwork *cnn) {
float limit;
limit = sqrtf(6.0f / (INPUT_CHANNELS * KERNEL_SIZE * KERNEL_SIZE));
for(int f=0; f<CONV_FILTERS; f++) {
for(int c=0; c<INPUT_CHANNELS; c++) {
for(int y=0; y<KERNEL_SIZE; y++) {
for(int x=0; x<KERNEL_SIZE; x++) {
cnn->conv1[f][c][y][x] = ((float)rand() / RAND_MAX * 2.0f - 1.0f) * limit;
cnn->m_conv1[f][c][y][x] = 0.0f;
}
}
}
cnn->bias1[f] = 0.0f;
cnn->m_bias1[f] = 0.0f;
cnn->bn1_mean[f] = 0.0f;
cnn->bn1_var[f] = 1.0f;
cnn->bn1_gamma[f] = 1.0f;
cnn->bn1_beta[f] = 0.0f;
cnn->m_bn1_gamma[f] = 0.0f;
cnn->m_bn1_beta[f] = 0.0f;
}
limit = sqrtf(6.0f / (CONV_FILTERS * KERNEL_SIZE * KERNEL_SIZE));
for(int f=0; f<CONV_FILTERS; f++) {
for(int c=0; c<CONV_FILTERS; c++) {
for(int y=0; y<KERNEL_SIZE; y++) {
for(int x=0; x<KERNEL_SIZE; x++) {
cnn->conv2[f][c][y][x] = ((float)rand() / RAND_MAX * 2.0f - 1.0f) * limit;
cnn->m_conv2[f][c][y][x] = 0.0f;
}
}
}
cnn->bias2[f] = 0.0f;
cnn->m_bias2[f] = 0.0f;
cnn->bn2_mean[f] = 0.0f;
cnn->bn2_var[f] = 1.0f;
cnn->bn2_gamma[f] = 1.0f;
cnn->bn2_beta[f] = 0.0f;
cnn->m_bn2_gamma[f] = 0.0f;
cnn->m_bn2_beta[f] = 0.0f;
}
limit = sqrtf(6.0f / (CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE)));
for(int i=0; i<CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE); i++) {
for(int j=0; j<FC_SIZE; j++) {
cnn->fc1[i][j] = ((float)rand() / RAND_MAX * 2.0f - 1.0f) * limit;
cnn->m_fc1[i][j] = 0.0f;
}
}
for(int i=0; i<FC_SIZE; i++) {
cnn->bias_fc1[i] = 0.0f;
cnn->m_bias_fc1[i] = 0.0f;
}
limit = sqrtf(6.0f / FC_SIZE);
for(int i=0; i<FC_SIZE; i++) {
for(int j=0; j<OUTPUT_SIZE; j++) {
cnn->fc2[i][j] = ((float)rand() / RAND_MAX * 2.0f - 1.0f) * limit;
cnn->m_fc2[i][j] = 0.0f;
}
}
for(int i=0; i<OUTPUT_SIZE; i++) {
cnn->bias_fc2[i] = 0.0f;
cnn->m_bias_fc2[i] = 0.0f;
}
}
void batch_norm(float *input, float *mean, float *var, float *gamma, float *beta, float *output, int size) {
#pragma omp parallel for
for(int f=0; f<size; f++) {
for(int y=0; y<INPUT_HEIGHT; y++)for(int x=0; x<INPUT_WIDTH; x++) {
int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x;
output[idx] = gamma[f] * (input[idx] - mean[f]) / sqrtf(var[f] + 1e-5) + beta[f];
}
}
}
void softmax(float *input, float *output, int size) {
float max_val = input[0];
for(int i=1; i<size; i++){ if(input[i]>max_val) max_val = input[i]; }
float sum = 0.0f;
for(int i=0; i<size; i++) sum += ( output[i] = expf(input[i] - max_val) );
for(int i=0; i<size; i++) output[i] /= sum;
}
void conv_layer_fft( int in_channels, int out_channels, float input[in_channels][INPUT_HEIGHT][INPUT_WIDTH],
float weights[out_channels][in_channels][KERNEL_SIZE][KERNEL_SIZE], float biases[out_channels], float *output) {
const int pad = KERNEL_SIZE / 2;
const int fft_size = 64;
#pragma omp parallel for
for(int f=0; f<out_channels; f++) {
COMPLEX **fft_input = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size);
COMPLEX **fft_weight = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size);
if(!fft_input || !fft_weight){ perror("メモリ割り当て失敗"); free(fft_input); free(fft_weight); continue; }
for(int c=0; c<in_channels; c++) {
for(int y=0; y<fft_size; y++) {
for(int x=0; x<fft_size; x++) {
fft_input[y][x].r = 0.0;
fft_input[y][x].i = 0.0;
fft_weight[y][x].r = 0.0;
fft_weight[y][x].i = 0.0;
}
}
for(int y=0; y<INPUT_HEIGHT; y++) {
for(int x=0; x<INPUT_WIDTH; x++) {
fft_input[y + pad][x + pad].r = input[c][y][x];
}
}
for(int y=0; y<KERNEL_SIZE; y++) {
for(int x=0; x<KERNEL_SIZE; x++) {
fft_weight[y][x].r = weights[f][c][y][x];
}
}
FourierTransform2D(fft_input, fft_size, fft_size, FFT);
FourierTransform2D(fft_weight, fft_size, fft_size, FFT);
for(int y=0; y<fft_size; y++) {
for(int x=0; x<fft_size; x++) {
fft_input[y][x] = MulCOMPLEX(fft_input[y][x], fft_weight[y][x]);
}
}
FourierTransform2D(fft_input, fft_size, fft_size, IFFT);
for(int y=0; y<INPUT_HEIGHT; y++) {
for(int x=0; x<INPUT_WIDTH; x++) {
output[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x] += fft_input[y + pad][x + pad].r;
}
}
}
for(int y=0; y<INPUT_HEIGHT; y++) {
for(int x=0; x<INPUT_WIDTH; x++) {
output[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x] += biases[f];
}
}
free(fft_input); free(fft_weight);
}
}
void fc_layer(int input_size, int output_size, float *input, float weights[input_size][output_size], float biases[output_size], float *output) {
#pragma omp parallel for
for(int j=0; j<output_size; j++) { output[j] = biases[j];
for(int i=0; i<input_size; i++) output[j] += input[i] * weights[i][j];
output[j] = output[j] > 0 ? output[j] : 0;
}
}
#define save_cnn save_weights
void save_weights(ConvolutionalNeuralNetwork *cnn, const char *filename) {
FILE *fp = fopen(filename, "wb");
if(!fp){ perror("重み保存失敗"); return; }
fwrite(cnn->conv1, sizeof(cnn->conv1), 1, fp);
fwrite(cnn->bias1, sizeof(cnn->bias1), 1, fp);
fwrite(cnn->conv2, sizeof(cnn->conv2), 1, fp);
fwrite(cnn->bias2, sizeof(cnn->bias2), 1, fp);
fwrite(cnn->fc1, sizeof(cnn->fc1), 1, fp);
fwrite(cnn->bias_fc1, sizeof(cnn->bias_fc1), 1, fp);
fwrite(cnn->fc2, sizeof(cnn->fc2), 1, fp);
fwrite(cnn->bias_fc2, sizeof(cnn->bias_fc2), 1, fp);
fwrite(cnn->bn1_mean, sizeof(cnn->bn1_mean), 1, fp);
fwrite(cnn->bn1_var, sizeof(cnn->bn1_var), 1, fp);
fwrite(cnn->bn1_gamma, sizeof(cnn->bn1_gamma), 1, fp);
fwrite(cnn->bn1_beta, sizeof(cnn->bn1_beta), 1, fp);
fwrite(cnn->bn2_mean, sizeof(cnn->bn2_mean), 1, fp);
fwrite(cnn->bn2_var, sizeof(cnn->bn2_var), 1, fp);
fwrite(cnn->bn2_gamma, sizeof(cnn->bn2_gamma), 1, fp);
fwrite(cnn->bn2_beta, sizeof(cnn->bn2_beta), 1, fp);
fclose(fp);
}
#define load_cnn load_weights
void load_weights(ConvolutionalNeuralNetwork *cnn, const char *filename) {
FILE *fp = fopen(filename, "rb");
if(!fp){ perror("重み読み込み失敗"); return; }
fread(cnn->conv1, sizeof(cnn->conv1), 1, fp);
fread(cnn->bias1, sizeof(cnn->bias1), 1, fp);
fread(cnn->conv2, sizeof(cnn->conv2), 1, fp);
fread(cnn->bias2, sizeof(cnn->bias2), 1, fp);
fread(cnn->fc1, sizeof(cnn->fc1), 1, fp);
fread(cnn->bias_fc1, sizeof(cnn->bias_fc1), 1, fp);
fread(cnn->fc2, sizeof(cnn->fc2), 1, fp);
fread(cnn->bias_fc2, sizeof(cnn->bias_fc2), 1, fp);
fread(cnn->bn1_mean, sizeof(cnn->bn1_mean), 1, fp);
fread(cnn->bn1_var, sizeof(cnn->bn1_var), 1, fp);
fread(cnn->bn1_gamma, sizeof(cnn->bn1_gamma), 1, fp);
fread(cnn->bn1_beta, sizeof(cnn->bn1_beta), 1, fp);
fread(cnn->bn2_mean, sizeof(cnn->bn2_mean), 1, fp);
fread(cnn->bn2_var, sizeof(cnn->bn2_var), 1, fp);
fread(cnn->bn2_gamma, sizeof(cnn->bn2_gamma), 1, fp);
fread(cnn->bn2_beta, sizeof(cnn->bn2_beta), 1, fp);
fclose(fp);
}
int evaluate(ConvolutionalNeuralNetwork *cnn, ImageData *images, int num_samples, double *eval_time) {
double start_time = omp_get_wtime();
float *conv1_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
float *bn1_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
float *conv2_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
float *bn2_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
float *pool_out = (float *)calloc(CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE), sizeof(float));
float *fc1_out = (float *)calloc(FC_SIZE, sizeof(float));
float *fc2_out = (float *)calloc(OUTPUT_SIZE, sizeof(float));
if(!conv1_out || !bn1_out || !conv2_out || !bn2_out || !pool_out || !fc1_out || !fc2_out) {
perror("メモリ割り当て失敗");
free(conv1_out); free(bn1_out); free(conv2_out); free(bn2_out);
free(pool_out); free(fc1_out); free(fc2_out);
return 1;
}
int correct = 0;
for(int i=0; i<num_samples; i++) {
conv_layer_fft(INPUT_CHANNELS, CONV_FILTERS, images[i].input, cnn->conv1, cnn->bias1, conv1_out);
batch_norm(conv1_out, cnn->bn1_mean, cnn->bn1_var, cnn->bn1_gamma, cnn->bn1_beta, bn1_out, CONV_FILTERS);
#pragma omp parallel for
for(int idx=0; idx < CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH; idx++) bn1_out[idx] = fmaxf(0.0f, bn1_out[idx]);
conv_layer_fft(CONV_FILTERS, CONV_FILTERS, (float (*)[INPUT_HEIGHT][INPUT_WIDTH])bn1_out, cnn->conv2, cnn->bias2, conv2_out);
#pragma omp parallel for
for(int f=0; f<CONV_FILTERS; f++) {
for(int y=0; y<INPUT_HEIGHT; y++) {
for(int x=0; x<INPUT_WIDTH; x++) {
int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x;
conv2_out[idx] += bn1_out[idx];
}
}
}
batch_norm(conv2_out, cnn->bn2_mean, cnn->bn2_var, cnn->bn2_gamma, cnn->bn2_beta, bn2_out, CONV_FILTERS);
#pragma omp parallel for
for(int idx=0; idx < CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH; idx++) bn2_out[idx] = fmaxf(0.0f, bn2_out[idx]);
#pragma omp parallel for collapse(2)
for(int f=0; f<CONV_FILTERS; f++) {
for(int y=0; y<INPUT_HEIGHT / POOL_SIZE; y++) {
for(int x=0; x<INPUT_WIDTH / POOL_SIZE; x++) {
float max_val = -INFINITY;
for(int py = 0; py < POOL_SIZE; py++) {
for(int px = 0; px < POOL_SIZE; px++) {
int iy = y * POOL_SIZE + py;
int ix = x * POOL_SIZE + px;
float val = bn2_out[f * INPUT_HEIGHT * INPUT_WIDTH + iy * INPUT_WIDTH + ix];
if(val > max_val) max_val = val;
}
}
pool_out[f * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE) + y * (INPUT_WIDTH / POOL_SIZE) + x] = max_val;
}
}
}
int pool_size = CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE);
fc_layer(pool_size, FC_SIZE, pool_out, cnn->fc1, cnn->bias_fc1, fc1_out);
fc_layer(FC_SIZE, OUTPUT_SIZE, fc1_out, cnn->fc2, cnn->bias_fc2, fc2_out);
float output[OUTPUT_SIZE];
softmax(fc2_out, output, OUTPUT_SIZE);
int pred = 0;
for(int j=1; j<OUTPUT_SIZE; j++) {
if(output[j] > output[pred]) pred = j;
}
if(pred == images[i].label) correct++;
}
*eval_time = omp_get_wtime() - start_time;
printf("評価精度: %.2f%%\n", 100.0f * correct / num_samples);
free(conv1_out); free(bn1_out); free(conv2_out); free(bn2_out);
free(pool_out); free(fc1_out); free(fc2_out);
return 0;
}
int train(ConvolutionalNeuralNetwork *cnn, ImageData *images, int num_samples, int start_epoch, double *train_time) {
double start_time = omp_get_wtime();
float *grad_fc2 = (float *)calloc(FC_SIZE * OUTPUT_SIZE, sizeof(float));
float grad_bias_fc2[OUTPUT_SIZE] = {0};
float *grad_fc1 = (float *)calloc(CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE)
* (INPUT_WIDTH / POOL_SIZE)
* FC_SIZE, sizeof(float));
float grad_bias_fc1[FC_SIZE] = {0};
float *grad_conv2 = (float *)calloc(CONV_FILTERS * CONV_FILTERS * KERNEL_SIZE
* KERNEL_SIZE, sizeof(float));
float grad_bias2[CONV_FILTERS] = {0};
float *grad_conv1 = (float *)calloc(CONV_FILTERS * INPUT_CHANNELS * KERNEL_SIZE
* KERNEL_SIZE, sizeof(float));
float grad_bias1[CONV_FILTERS] = {0};
float grad_bn1_gamma[CONV_FILTERS] = {0}, grad_bn1_beta[CONV_FILTERS] = {0};
float grad_bn2_gamma[CONV_FILTERS] = {0}, grad_bn2_beta[CONV_FILTERS] = {0};
float *conv1_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
float *bn1_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
float *conv2_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
float *bn2_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
float *pool_out = (float *)calloc(CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE)
* (INPUT_WIDTH / POOL_SIZE), sizeof(float));
float *fc1_out = (float *)calloc(FC_SIZE, sizeof(float));
float *fc2_out = (float *)calloc(OUTPUT_SIZE, sizeof(float));
float *grad_pool = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
float *grad_bn2_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
float *grad_bn1_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
float *grad_add = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
float *grad_from_conv2 = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
float *grad_conv1_out = (float *)calloc(CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH, sizeof(float));
PoolIndices pool_indices = {0};
if( !grad_fc2 || !grad_fc1 || !grad_conv2 || !grad_conv1 || !conv1_out || !bn1_out || !conv2_out ||
!bn2_out || !pool_out || !fc1_out || !fc2_out || !grad_pool || !grad_bn2_out || !grad_bn1_out ||
!grad_add || !grad_from_conv2 || !grad_conv1_out) {
perror("メモリ割り当て失敗"); // free all
free(grad_fc2); free(grad_fc1); free(grad_conv2); free(grad_conv1);
free(conv1_out); free(bn1_out); free(conv2_out); free(bn2_out);
free(pool_out); free(fc1_out); free(fc2_out);
free(grad_pool); free(grad_bn2_out); free(grad_bn1_out);
free(grad_add); free(grad_from_conv2); free(grad_conv1_out);
return 1;
}
float learning_rate = LEARNING_RATE;
for(int epoch = start_epoch; epoch < EPOCHS; epoch++) {
if(epoch % 10 == 0 && epoch > start_epoch) learning_rate *= 0.1;
float loss = 0.0f;
int correct = 0;
for(int b=0; b<num_samples; b+=BATCH_SIZE){ printf(" % 6d: % 6d (% 6d / %d ) % 4d / %d elapsed time = %.2fsec\n",
b, num_samples/BATCH_SIZE, num_samples, BATCH_SIZE, epoch, EPOCHS,
omp_get_wtime() - start_time );
memset(grad_fc2, 0, sizeof(float) * FC_SIZE * OUTPUT_SIZE );
memset(grad_bias_fc2, 0, sizeof(grad_bias_fc2));
memset(grad_fc1, 0, sizeof(float) * CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE) * FC_SIZE);
memset(grad_bias_fc1, 0, sizeof(grad_bias_fc1));
memset(grad_conv2, 0, sizeof(float) * CONV_FILTERS * CONV_FILTERS * KERNEL_SIZE * KERNEL_SIZE);
memset(grad_bias2, 0, sizeof(grad_bias2));
memset(grad_conv1, 0, sizeof(float) * CONV_FILTERS * INPUT_CHANNELS * KERNEL_SIZE * KERNEL_SIZE);
memset(grad_bias1, 0, sizeof(grad_bias1));
memset(grad_bn1_gamma, 0, sizeof(grad_bn1_gamma));
memset(grad_bn1_beta, 0, sizeof(grad_bn1_beta));
memset(grad_bn2_gamma, 0, sizeof(grad_bn2_gamma));
memset(grad_bn2_beta, 0, sizeof(grad_bn2_beta));
memset(grad_add, 0, sizeof(float) * CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH);
memset(grad_from_conv2,0, sizeof(float) * CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH);
memset(grad_conv1_out, 0, sizeof(float) * CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH);
memset(&pool_indices, 0, sizeof(PoolIndices));
for(int i = b; i < b + BATCH_SIZE && i < num_samples; i++) {
conv_layer_fft(INPUT_CHANNELS, CONV_FILTERS, images[i].input, cnn->conv1, cnn->bias1, conv1_out);
batch_norm(conv1_out, cnn->bn1_mean, cnn->bn1_var, cnn->bn1_gamma, cnn->bn1_beta, bn1_out, CONV_FILTERS);
#pragma omp parallel for
for(int idx=0; idx < CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH; idx++) bn1_out[idx] = fmaxf(0.0f, bn1_out[idx]);
conv_layer_fft(CONV_FILTERS, CONV_FILTERS, (float (*)[INPUT_HEIGHT][INPUT_WIDTH])bn1_out, cnn->conv2, cnn->bias2, conv2_out);
#pragma omp parallel for
for(int f=0; f<CONV_FILTERS; f++) {
for(int y=0; y<INPUT_HEIGHT; y++) {
for(int x=0; x<INPUT_WIDTH; x++) {
int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x;
conv2_out[idx] += bn1_out[idx];
}
}
}
batch_norm(conv2_out, cnn->bn2_mean, cnn->bn2_var, cnn->bn2_gamma, cnn->bn2_beta, bn2_out, CONV_FILTERS);
#pragma omp parallel for
for(int idx=0; idx < CONV_FILTERS * INPUT_HEIGHT * INPUT_WIDTH; idx++) bn2_out[idx] = fmaxf(0.0f, bn2_out[idx]);
#pragma omp parallel for collapse(2)
for(int f=0; f<CONV_FILTERS; f++) {
for(int y=0; y<INPUT_HEIGHT / POOL_SIZE; y++) {
for(int x=0; x<INPUT_WIDTH / POOL_SIZE; x++) {
float max_val = -INFINITY;
int max_y = y * POOL_SIZE, max_x = x * POOL_SIZE;
for(int py = 0; py < POOL_SIZE; py++) {
for(int px = 0; px < POOL_SIZE; px++) {
int iy = y * POOL_SIZE + py;
int ix = x * POOL_SIZE + px;
float val = bn2_out[f * INPUT_HEIGHT * INPUT_WIDTH + iy * INPUT_WIDTH + ix];
if(isnan(val) || isinf(val)) continue;
if(val > max_val) {
max_val = val;
max_y = iy;
max_x = ix;
}
}
}
pool_out[f * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE) + y * (INPUT_WIDTH / POOL_SIZE) + x] = max_val;
pool_indices.max_idx[f][y][x][0] = max_y;
pool_indices.max_idx[f][y][x][1] = max_x;
}
}
}
int pool_size = CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE);
fc_layer(pool_size, FC_SIZE, pool_out, cnn->fc1, cnn->bias_fc1, fc1_out);
fc_layer(FC_SIZE, OUTPUT_SIZE, fc1_out, cnn->fc2, cnn->bias_fc2, fc2_out);
float output[OUTPUT_SIZE];
softmax(fc2_out, output, OUTPUT_SIZE);
int label = images[i].label;
if(label < 0 || label >= OUTPUT_SIZE) continue;
loss -= logf(output[label] + 1e-10);
int pred = 0;
for(int j=1; j<OUTPUT_SIZE; j++) {
if(output[j] > output[pred]) pred = j;
}
correct += (pred == label) ? 1 : 0;
float grad_fc2_in[OUTPUT_SIZE];
for(int j=0; j<OUTPUT_SIZE; j++) {
grad_fc2_in[j] = output[j] - (j == label ? 1.0f : 0.0f);
}
#pragma omp parallel for
for(int j=0; j<OUTPUT_SIZE; j++) {
grad_bias_fc2[j] += grad_fc2_in[j];
for(int k=0; k<FC_SIZE; k++) {
grad_fc2[k * OUTPUT_SIZE + j] += grad_fc2_in[j] * fc1_out[k];
}
}
float grad_fc1_in[FC_SIZE] = {0};
#pragma omp parallel for
for(int k=0; k<FC_SIZE; k++) {
grad_fc1_in[k] = 0.0f;
if(fc1_out[k] > 0) {
for(int j=0; j<OUTPUT_SIZE; j++) {
grad_fc1_in[k] += grad_fc2_in[j] * cnn->fc2[k][j];
}
}
grad_bias_fc1[k] += grad_fc1_in[k];
for(int m=0; m<pool_size; m++) {
grad_fc1[m * FC_SIZE + k] += grad_fc1_in[k] * pool_out[m];
}
}
#pragma omp parallel for
for(int f=0; f<CONV_FILTERS; f++) {
for(int y=0; y<INPUT_HEIGHT / POOL_SIZE; y++) {
for(int x=0; x<INPUT_WIDTH / POOL_SIZE; x++) {
int max_y = pool_indices.max_idx[f][y][x][0];
int max_x = pool_indices.max_idx[f][y][x][1];
if(max_y >= 0 && max_y < INPUT_HEIGHT && max_x >= 0 && max_x < INPUT_WIDTH) {
grad_pool[f * INPUT_HEIGHT * INPUT_WIDTH + max_y * INPUT_WIDTH + max_x] =
grad_fc1_in[f * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE) + y * (INPUT_WIDTH / POOL_SIZE) + x];
}
}
}
}
#pragma omp parallel for collapse(2)
for(int f=0; f<CONV_FILTERS; f++) {
for(int y=0; y<INPUT_HEIGHT; y++) {
for(int x=0; x<INPUT_WIDTH; x++) {
int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x;
grad_bn2_out[idx] = grad_pool[idx] * (bn2_out[idx] > 0 ? 1.0f : 0.0f);
}
}
}
#pragma omp parallel for collapse(2)
for(int f=0; f<CONV_FILTERS; f++) {
for(int y=0; y<INPUT_HEIGHT; y++) {
for(int x=0; x<INPUT_WIDTH; x++) {
int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x;
grad_bn2_gamma[f] += grad_bn2_out[idx] * (conv2_out[idx] - cnn->bn2_mean[f]) / sqrtf(cnn->bn2_var[f] + 1e-5);
grad_bn2_beta[f] += grad_bn2_out[idx];
}
}
}
#pragma omp parallel for collapse(2)
for(int f=0; f<CONV_FILTERS; f++) {
for(int y=0; y<INPUT_HEIGHT; y++) {
for(int x=0; x<INPUT_WIDTH; x++) {
int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x;
grad_add[idx] = grad_bn2_out[idx] * cnn->bn2_gamma[f] / sqrtf(cnn->bn2_var[f] + 1e-5);
}
}
}
const int pad = KERNEL_SIZE / 2;
const int fft_size = 64;
COMPLEX **fft_grad = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size);
COMPLEX **fft_weight = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size);
if(!fft_grad || !fft_weight) { perror("メモリ割り当て失敗"); free(fft_grad); free(fft_weight); continue; }
#pragma omp parallel for
for(int f=0; f<CONV_FILTERS; f++) {
for(int c=0; c<CONV_FILTERS; c++) {
for(int y=0; y<fft_size; y++) {
for(int x=0; x<fft_size; x++) {
fft_grad[y][x].r = 0.0f;
fft_grad[y][x].i = 0.0f;
fft_weight[y][x].r = 0.0f;
fft_weight[y][x].i = 0.0f;
}
}
for(int y=0; y<INPUT_HEIGHT; y++) {
for(int x=0; x<INPUT_WIDTH; x++) {
fft_grad[y + pad][x + pad].r = grad_add[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x];
}
}
for(int y=0; y<KERNEL_SIZE; y++) {
for(int x=0; x<KERNEL_SIZE; x++) {
fft_weight[y][x].r = cnn->conv2[f][c][y][x];
}
}
FourierTransform2D(fft_grad, fft_size, fft_size, FFT);
FourierTransform2D(fft_weight, fft_size, fft_size, FFT);
for(int y=0; y<fft_size; y++) {
for(int x=0; x<fft_size; x++) {
fft_grad[y][x] = MulCOMPLEX(fft_grad[y][x], ConjCOMPLEX(fft_weight[y][x]));
}
}
FourierTransform2D(fft_grad, fft_size, fft_size, IFFT);
for(int ky = 0; ky < KERNEL_SIZE; ky++) {
for(int kx = 0; kx < KERNEL_SIZE; kx++) {
grad_conv2[f * CONV_FILTERS * KERNEL_SIZE * KERNEL_SIZE + c * KERNEL_SIZE * KERNEL_SIZE + ky * KERNEL_SIZE + kx] += fft_grad[ky][kx].r;
}
}
}
for(int y=0; y<INPUT_HEIGHT; y++) {
for(int x=0; x<INPUT_WIDTH; x++) {
grad_bias2[f] += grad_add[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x];
}
}
}
free(fft_grad);
free(fft_weight);
// Add the missing grad_from_conv2 (backprop for input of conv2)
fft_grad = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size);
fft_weight = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size);
if(!fft_grad || !fft_weight) { perror("メモリ割り当て失敗"); free(fft_grad); free(fft_weight); continue; }
#pragma omp parallel for
for(int c=0; c<CONV_FILTERS; c++) {
for(int f=0; f<CONV_FILTERS; f++) {
for(int y=0; y<fft_size; y++) {
for(int x=0; x<fft_size; x++) {
fft_grad[y][x].r = 0.0f;
fft_grad[y][x].i = 0.0f;
fft_weight[y][x].r = 0.0f;
fft_weight[y][x].i = 0.0f;
}
}
for(int y=0; y<INPUT_HEIGHT; y++) {
for(int x=0; x<INPUT_WIDTH; x++) {
fft_grad[y + pad][x + pad].r = grad_add[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x];
}
}
for(int y=0; y<KERNEL_SIZE; y++) {
for(int x=0; x<KERNEL_SIZE; x++) {
fft_weight[y][x].r = cnn->conv2[f][c][y][x];
}
}
FourierTransform2D(fft_grad, fft_size, fft_size, FFT);
FourierTransform2D(fft_weight, fft_size, fft_size, FFT);
for(int y=0; y<fft_size; y++) {
for(int x=0; x<fft_size; x++) {
fft_grad[y][x] = MulCOMPLEX(fft_grad[y][x], fft_weight[y][x]);
}
}
FourierTransform2D(fft_grad, fft_size, fft_size, IFFT);
for(int y=0; y<INPUT_HEIGHT; y++) {
for(int x=0; x<INPUT_WIDTH; x++) {
grad_from_conv2[c * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x] += fft_grad[y + pad][x + pad].r;
}
}
}
}
free(fft_grad);
free(fft_weight);
#pragma omp parallel for collapse(2)
for(int f=0; f<CONV_FILTERS; f++) {
for(int y=0; y<INPUT_HEIGHT; y++) {
for(int x=0; x<INPUT_WIDTH; x++) {
int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x;
grad_bn1_out[idx] = grad_from_conv2[idx] * (bn1_out[idx] > 0.0f ? 1.0f : 0.0f) + grad_add[idx];
grad_bn1_gamma[f] += grad_bn1_out[idx] * (conv1_out[idx] - cnn->bn1_mean[f]) / sqrtf(cnn->bn1_var[f] + 1e-5);
grad_bn1_beta[f] += grad_bn1_out[idx];
}
}
}
#pragma omp parallel for collapse(2)
for(int f=0; f<CONV_FILTERS; f++) {
for(int y=0; y<INPUT_HEIGHT; y++) {
for(int x=0; x<INPUT_WIDTH; x++) {
int idx = f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x;
grad_conv1_out[idx] = grad_bn1_out[idx] * cnn->bn1_gamma[f] / sqrtf(cnn->bn1_var[f] + 1e-5);
}
}
}
fft_grad = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size);
fft_weight = (COMPLEX **)alloc2Darray(sizeof(COMPLEX), fft_size, fft_size);
if(!fft_grad || !fft_weight) { perror("メモリ割り当て失敗"); free(fft_grad); free(fft_weight); continue; }
#pragma omp parallel for
for(int f=0; f<CONV_FILTERS; f++) {
for(int c=0; c<INPUT_CHANNELS; c++) {
for(int y=0; y<fft_size; y++) {
for(int x=0; x<fft_size; x++) {
fft_grad[y][x].r = 0.0f;
fft_grad[y][x].i = 0.0f;
fft_weight[y][x].r = 0.0f;
fft_weight[y][x].i = 0.0f;
}
}
for(int y=0; y<INPUT_HEIGHT; y++) {
for(int x=0; x<INPUT_WIDTH; x++) {
fft_grad[y + pad][x + pad].r = grad_conv1_out[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x];
fft_weight[y + pad][x + pad].r = images[i].input[c][y][x];
}
}
FourierTransform2D(fft_grad, fft_size, fft_size, FFT);
FourierTransform2D(fft_weight, fft_size, fft_size, FFT);
for(int y=0; y<fft_size; y++) {
for(int x=0; x<fft_size; x++) {
fft_grad[y][x] = MulCOMPLEX(fft_grad[y][x], ConjCOMPLEX(fft_weight[y][x]));
}
}
FourierTransform2D(fft_grad, fft_size, fft_size, IFFT);
for(int ky = 0; ky < KERNEL_SIZE; ky++) {
for(int kx = 0; kx < KERNEL_SIZE; kx++) {
grad_conv1[f * INPUT_CHANNELS * KERNEL_SIZE * KERNEL_SIZE + c * KERNEL_SIZE * KERNEL_SIZE + ky * KERNEL_SIZE + kx] += fft_grad[ky][kx].r;
}
}
}
for(int y=0; y<INPUT_HEIGHT; y++) {
for(int x=0; x<INPUT_WIDTH; x++) {
grad_bias1[f] += grad_conv1_out[f * INPUT_HEIGHT * INPUT_WIDTH + y * INPUT_WIDTH + x];
}
}
}
free(fft_grad);
free(fft_weight);
}
// Update parameters
#pragma omp parallel for
for(int j=0; j<OUTPUT_SIZE; j++) {
cnn->m_bias_fc2[j] = MOMENTUM * cnn->m_bias_fc2[j] - learning_rate * grad_bias_fc2[j] / BATCH_SIZE;
cnn->bias_fc2[j] += cnn->m_bias_fc2[j];
for(int k = 0; k < FC_SIZE; k++) {
cnn->m_fc2[k][j] = MOMENTUM * cnn->m_fc2[k][j] - learning_rate * grad_fc2[k * OUTPUT_SIZE + j] / BATCH_SIZE;
cnn->fc2[k][j] += cnn->m_fc2[k][j];
}
}
int pool_size = CONV_FILTERS * (INPUT_HEIGHT / POOL_SIZE) * (INPUT_WIDTH / POOL_SIZE);
#pragma omp parallel for
for(int k = 0; k < FC_SIZE; k++) {
cnn->m_bias_fc1[k] = MOMENTUM * cnn->m_bias_fc1[k] - learning_rate * grad_bias_fc1[k] / BATCH_SIZE;
cnn->bias_fc1[k] += cnn->m_bias_fc1[k];
for(int m = 0; m < pool_size; m++) {
cnn->m_fc1[m][k] = MOMENTUM * cnn->m_fc1[m][k] - learning_rate * grad_fc1[m * FC_SIZE + k] / BATCH_SIZE;
cnn->fc1[m][k] += cnn->m_fc1[m][k];
}
}
#pragma omp parallel for
for(int f=0; f<CONV_FILTERS; f++) {
cnn->m_bias2[f] = MOMENTUM * cnn->m_bias2[f] - learning_rate * grad_bias2[f] / BATCH_SIZE;
cnn->bias2[f] += cnn->m_bias2[f];
for(int c=0; c<CONV_FILTERS; c++) {
for(int ky = 0; ky < KERNEL_SIZE; ky++) {
for(int kx = 0; kx < KERNEL_SIZE; kx++) {
int idx = f * CONV_FILTERS * KERNEL_SIZE * KERNEL_SIZE + c * KERNEL_SIZE * KERNEL_SIZE + ky * KERNEL_SIZE + kx;
cnn->m_conv2[f][c][ky][kx] = MOMENTUM * cnn->m_conv2[f][c][ky][kx] - learning_rate * grad_conv2[idx] / BATCH_SIZE;
cnn->conv2[f][c][ky][kx] += cnn->m_conv2[f][c][ky][kx];
}
}
}
cnn->m_bn2_gamma[f] = MOMENTUM * cnn->m_bn2_gamma[f] - learning_rate * grad_bn2_gamma[f] / BATCH_SIZE;
cnn->bn2_gamma[f] += cnn->m_bn2_gamma[f];
cnn->m_bn2_beta[f] = MOMENTUM * cnn->m_bn2_beta[f] - learning_rate * grad_bn2_beta[f] / BATCH_SIZE;
cnn->bn2_beta[f] += cnn->m_bn2_beta[f];
}
#pragma omp parallel for
for(int f=0; f<CONV_FILTERS; f++) {
cnn->m_bias1[f] = MOMENTUM * cnn->m_bias1[f] - learning_rate * grad_bias1[f] / BATCH_SIZE;
cnn->bias1[f] += cnn->m_bias1[f];
for(int c=0; c<INPUT_CHANNELS; c++) {
for(int ky = 0; ky < KERNEL_SIZE; ky++) {
for(int kx = 0; kx < KERNEL_SIZE; kx++) {
int idx = f * INPUT_CHANNELS * KERNEL_SIZE * KERNEL_SIZE + c * KERNEL_SIZE * KERNEL_SIZE + ky * KERNEL_SIZE + kx;
cnn->m_conv1[f][c][ky][kx] = MOMENTUM * cnn->m_conv1[f][c][ky][kx] - learning_rate * grad_conv1[idx] / BATCH_SIZE;
cnn->conv1[f][c][ky][kx] += cnn->m_conv1[f][c][ky][kx];
}
}
}
cnn->m_bn1_gamma[f] = MOMENTUM * cnn->m_bn1_gamma[f] - learning_rate * grad_bn1_gamma[f] / BATCH_SIZE;
cnn->bn1_gamma[f] += cnn->m_bn1_gamma[f];
cnn->m_bn1_beta[f] = MOMENTUM * cnn->m_bn1_beta[f] - learning_rate * grad_bn1_beta[f] / BATCH_SIZE;
cnn->bn1_beta[f] += cnn->m_bn1_beta[f];
}
}
printf("エポック %d: 損失=%.4f, 精度=%.2f%%\n", epoch, loss / num_samples, 100.0f * correct / num_samples);
}
*train_time = omp_get_wtime() - start_time;
// free all
free(grad_fc2); free(grad_fc1); free(grad_conv2); free(grad_conv1);
free(conv1_out); free(bn1_out); free(conv2_out); free(bn2_out);
free(pool_out); free(fc1_out); free(fc2_out);
free(grad_pool); free(grad_bn2_out); free(grad_bn1_out);
free(grad_add); free(grad_from_conv2); free(grad_conv1_out);
return 0;
}
int main(int argc, char *argv[]){ srand(time(NULL));
double start_time = omp_get_wtime(), train_time = 0.0, eval_time = 0.0;
int skip_training = 0, save_flag = 0, load_flag = 0, status = 1;
char *save_file = DEFAULT_WEIGHT_FILE;
char *load_file = DEFAULT_WEIGHT_FILE;
argumentAnalysis( argc, argv, &skip_training, &save_flag, &save_file, &load_flag, &load_file );
ImageData *train_images = NULL;
ImageData *test_images = NULL;
ConvolutionalNeuralNetwork *cnn = (ConvolutionalNeuralNetwork *)calloc(1,sizeof(ConvolutionalNeuralNetwork));
if(!cnn){ fprintf(stderr, "Failed to allocate memory for ConvolutionalNeuralNetwork.\n");
goto cleanup; }
else if(load_flag) load_cnn(cnn, load_file);
else init_cnn(cnn);
int num_train_samples = 0;
if(!skip_training) goto SKIP_LOAD_TRAINING;
if(!sample("Learning", &train_images, &num_train_samples)) {
fprintf(stderr, "トレーニングデータ読み込み失敗\n");
goto cleanup;;
}
SKIP_LOAD_TRAINING:
int num_test_samples = 0;
if(!sample("Evaluation", &test_images, &num_test_samples)) {
fprintf(stderr, "テストデータ読み込み失敗\n");
goto cleanup;;
}
if(!skip_training) goto SKIP_THIS_TRAINING;
train(cnn, train_images, num_train_samples, 0, &train_time);
printf("トレーニング時間: %.2f秒\n", train_time);
if(save_flag) save_cnn(cnn, save_file);
SKIP_THIS_TRAINING:
evaluate(cnn, test_images, num_test_samples, &eval_time);
printf("評価時間: %.2f秒\n", eval_time);
double total_time = omp_get_wtime() - start_time;
char msg[256];
snprintf(msg, sizeof(msg), "Total processing time: %.2f seconds (Training: %.2f, Evaluation: %.2f)",
total_time, skip_training ? 0.0 : train_time, eval_time); printf("\n%s\n", msg);
status = 0;
cleanup:
free(train_images);
free(test_images);
free(cnn);
return status;
}
////////////////////////////////////////////////////////////////////////////////////////////////
// これ以降の部分は CNN プログラム(上部のコード)を実行するのに必要な環境を提供する
// プログラム群 CNN とは、分離して考える事。
#include <string.h>
// コマンドライン引数解析
void argumentAnalysis( int argc, char *argv[], int *skip_training,
int *save_flag, char **save_name, int *load_flag, char **load_name ){
int v_flg=0, s_flg=0, l_flg=0, r_flg=0, t_flg=0;
for(int i=1; i<argc; i++ ){ // プログラム起動時の引数解析処理
if( argv[i][0]=='-' ) {
switch(argv[i][1]){
case 's': if( s_flg++ ){
fprintf(stderr, "Error: '%c' is already specified.\n", argv[i][1] );
goto GOOD_BY;
} else if( r_flg ){
fprintf(stderr, "Error: -s and -r cannot be used together\n" );
goto GOOD_BY;
} else if( argv[i][2] ){ printf("'%c'\n",argv[i][2]); goto GOOD_BY;}
*save_flag = 1;
*save_name = ((i+1)<argc && argv[i+1][0]!='-') ? argv[++i]
: DEFAULT_WEIGHT_FILE;
break;
case 'l': if( l_flg++ ){
fprintf(stderr, "Error: '%c' is already specified.\n", argv[i][1] );
goto GOOD_BY;
} else if( r_flg ) {
fprintf(stderr, "Error: -l and -r cannot be used together\n" );
goto GOOD_BY;
} else if( argv[i][2] ){ printf("'%c'\n",argv[i][2]); goto GOOD_BY;}
*load_flag = 1;
*load_name = ((i+1)<argc && argv[i+1][0]!='-') ? argv[++i]
: DEFAULT_WEIGHT_FILE;
break;
case 'r':
if( r_flg++ ){ printf( "++r_flg = %d\n", r_flg );
fprintf(stderr, "Error: '%c' is already specified.\n", argv[i][1] );
goto GOOD_BY;
} else if( l_flg ){
fprintf(stderr, "Error: -l and -r cannot be used together\n" );
goto GOOD_BY;
} else if( s_flg ){
fprintf(stderr, "Error: -s and -r cannot be used together\n" );
goto GOOD_BY;
} else if( argv[i][2] ){ printf("'%c'\n",argv[i][2]); goto GOOD_BY;}
*skip_training = 1;
*load_flag = 1;
*load_name = ((i+1)<argc && argv[i+1][0]!='-') ? argv[++i]
: DEFAULT_WEIGHT_FILE;
break;
case 't': if( t_flg++ ){
fprintf(stderr, "Error: '%c' is already specified.\n", argv[i][1] );
goto GOOD_BY;
} else if( (i+1)>=argc || argv[i+1][0]=='-' ){
fprintf(stderr, "Error: No maximum number specified\n" );
goto GOOD_BY;
} else if( argv[i][2] ) goto GOOD_BY;
if( (ActiveThread = atoi(argv[++i]))<0 ){
fprintf(stderr, "Error: The number of threads must be specified.\n");
goto GOOD_BY;
}
break;
case 'V': //fallthrough
case 'v': if( v_flg++ ){
fprintf(stderr, "Error: '%c' is already specified.\n", argv[i][1] );
goto GOOD_BY;
} else if( argv[i][2] ) goto GOOD_BY;
verbose = argv[i][1]=='v' ? 1 : 2;
break;
case 'h': // fallthrough
default : printf( "'%c''%c' \n", argv[i][0], argv[i][1] );
goto GOOD_BY;
}
}
} return;
GOOD_BY:; int i; char *p=argv[0], *op="[-v] [-t number] [-s [save_file]] [-l [load_file]] [-r [load_file]]";
for(i=strlen(p); i>0 && p[i]!='\\' && p[i]!='/'; i-- );; if(i) i++;
fprintf( stderr, "\nusage: %s %s\n\n", &p[i], op );
exit( 1 );
}
#pragma pack(push,1)
typedef struct PIXEL { uint8_t B, G, R, A; } PIXEL ;
typedef struct STBPX { uint8_t R, G, B, A; } STBPX ;
typedef struct ARRAY { uint8_t v[4]; } ARRAY ;
#pragma pack(pop)
#define v2D(type,lx,name) ((type(*)[(lx)])name)
#define v2Dc(lx,name) v2D(char, lx, name)
#define v2Di(lx,name) v2D(int,lx,name)
#define v2Df(lx,name) v2D(float,lx,name)
#define v2Dp(lx,name) v2D(PIXEL,lx,name)
#define v2Da(lx,name) v2D(ARRAY,lx,name)
#define v3D(type,lx,ly,name) ((type(*)[(lx)][(ly)])name)
#define v3Df(lx,ly,name) v3D(float, lx, ly, name)
#define HERE fprintf(stderr,"% 6d: %s\n",__LINE__, __func__);
#define PROCESS_ALL_PIXELS for(int y=0;y<ly;y++)for(int x=0;x<lx;x++)
#define saveArray2bmp32(name,a,w,h) saveDataAsBMP32((name),(&(a)[0][0]),(w),(h))
// BMPヘッダ構造体
#pragma pack(push, 2)
typedef struct HEADER {
unsigned short bfType; // 'BM' (0x4D42)
unsigned int bfSize; // ファイルサイズ
unsigned short bfReserved1; // 0
unsigned short bfReserved2; // 0
unsigned int bfOffBits; // データオフセット
unsigned int biSize; // 情報ヘッダサイズ (40)
signed int biWidth; // 幅
signed int biHeight; // 高さ(負で上下反転)
unsigned short biPlanes; // 1
unsigned short biBitCount; // 32
unsigned int biCompression; // 0 (BI_RGB)
unsigned int biSizeImage; // 画像データサイズ
signed int biXPelsPerMeter; // 0
signed int biYPelsPerMeter; // 0
unsigned int biClrUsed; // 0
unsigned int biClrImportant; // 0
unsigned char lut[]; // パレット(8ビット用)
} HEADER;
#pragma pack(pop)
// 画像パラメータ
#define BMPID 0x4D42 // 'BM'
#define BMPIFHSIZE 40 // 情報ヘッダサイズ
int saveDataAsBMP32( char *fname, void *b, int lx, int ly ){
HEADER h = {BMPID, lx*abs(ly)*sizeof(PIXEL)+sizeof(HEADER), 0, 0, sizeof(HEADER),
BMPIFHSIZE, lx, ly, 1, 32, 0, lx*abs(ly)*sizeof(PIXEL), 0, 0, 0, 0 };
FILE *fp=NULL ;
int st=-1;
if( (fp=fopen(fname, "wb")) == NULL ) {
fprintf( stderr, "File '%s': cannot create.\n", fname );
} else if( fwrite( &h, sizeof(HEADER), 1, fp ) != 1 ) {
fprintf( stderr, "File '%s': Header write error.\n", fname );
} else if( fwrite( b, h.biSizeImage, 1, fp ) != 1 ) {
fprintf( stderr, "File '%s': Data write error.\n", fname );
} else st=0;
if(fp)fclose(fp) ;
return st;
}
static inline int ft_size_check(int n){ return (n > 0) && !(n & (n - 1));}
void **alloc2Darray(int s, int lx, int ly){ if(s<1 || lx<1 || ly<1) return NULL;
long long w = lx, h = ly, next = s*w, space = h*sizeof(void *) + w*h*s + 64;
void **d = (void **)calloc(1, space);
if(!d){fprintf(stderr,"alloc2Darray failed: %lld bytes\n",space); return NULL;}
void *base = (void*)((uintptr_t)((char*)d + h * sizeof(void*) + 63) & ~63LL);
for(long long i=0; i<h; i++) d[i] = (void*)((char*)base + next * i);
return d; // C99: uintptr_t same as (void *)
}
void ***alloc3Darray(int size, int lx, int ly, int lz) {
if(size < 1 || lx < 1 || ly < 1 || lz < 1) return NULL;
long long s = size, llx = lx, lly = ly, llz = lz, lyz = lly * llz, line = s * llx;
long long index = (llz * lly + llz) * sizeof(void *), body = llx * lly * llz * s, space;
void ***d = (void ***)calloc(1, space = (index + body + 64));
if(!d){ fprintf(stderr, "alloc3Darray failed: %lld bytes\n", space);
return NULL;
}else{ void **area=(void **)&d[llz], *base=(void*)(((uintptr_t)((char*)d+index+63))&~63LL);
for(int i=0; i<llz; i++) d[i] = &area[lly * i];
for(int i=0; i<lyz; i++) area[i] = (void *)((char *)base + line * i);
}
return d;
}
void FourierTransform2D(COMPLEX **c, int lx, int ly, int flag) {
int elements = lx;
double unit = (flag == IFFT) ? 2 * M_PI / lx : -2 * M_PI / lx;
COMPLEX rotor[lx>ly?lx:ly]; for(int e=0;e<lx;rotor[e]=RotorCOMPLEX(unit*e),e++);
if(ft_size_check(elements)) {
#pragma omp parallel for
for(int y=0; y<ly; y++){ COMPLEX W, temp;
for(int i=0, j=1; j<elements-1; j++){// scrambler, for initial placement
for(int k = elements>>1; k>(i^=k); k>>=1);
if(j<i){temp=c[y][j]; c[y][j]=c[y][i]; c[y][i]=temp;} // exchange
}
for(int ir, i, k, m, mh=1; (m=mh<<1)<=elements; mh=m) { // Operations
for(ir=0, i=0; i<elements; i+=m) { // ir=0: Rotation angle is 0rad.
for( W=rotor[ir], k=elements>>2; k>(ir^=k); k>>=1 );
for(int j=i; j<mh+i; j++ ) { int a = j, b = j + mh;
temp = SubCOMPLEX(c[y][a], c[y][b]); // For butterfly operation
c[y][a] = AddCOMPLEX(c[y][a], c[y][b]); // a ->- (a + b) ->- a
c[y][b] = MulCOMPLEX(temp, W); // b ->- (a-b)*W ->- b
} } } double reciprocal = 1.0 / elements;
if(flag==IFFT) for(int e=0; e<elements; c[y][e]=ImeMulCOMPLEX(c[y][e],reciprocal), e++);
}
} else {// DFT
#pragma omp parallel for
for(int y=0; y<ly; y++ ) { COMPLEX tmp[elements]; // 32xsizeof(float) is enough small
for(int u=0; u<elements; u++ ){int v; COMPLEX t = (COMPLEX){0,0};
for( v=0; v<elements; v++ ) AppendCOMPLEX( t, MulCOMPLEX(c[y][v],rotor[(u*v)%elements]) );
tmp[u] = t;
} double reciprocal = 1./elements; // tmp[u]/elements -> tmp[u]*reciprocal
if(flag==IDFT) for(int e=0; e<elements; e++ ) c[y][e] = ImeMulCOMPLEX(tmp[e],reciprocal);
else for(int e=0; e<elements; e++ ) c[y][e] = tmp[e];
}
}
if( lx!=(elements=ly )){ unit = (flag==IFFT) ? 2 * M_PI /elements : -2 * M_PI /elements;
for(int e=0; e<elements; rotor[e]=RotorCOMPLEX(unit*e), e++);
}
if(ft_size_check(elements)){
#pragma omp parallel for
for(int x=0; x<lx; x++ ){ COMPLEX W, temp;
for(int i=0, j=1; j<elements-1; j++ ){// scrambler, for initial placement
for(int k=elements>>1; k>(i^=k); k>>=1 );
if(j<i){temp=c[j][x], c[j][x]=c[i][x], c[i][x]=temp;} // exchange
}
for(int ir, i, k, m, mh=1; (m=mh<<1)<=elements; mh=m) { // Operations
for(ir=0, i=0; i<elements; i+=m ) { // ir=0: Rotation angle is 0rad.
for( W=rotor[ir], k=elements>>2; k>(ir^=k); k>>=1 );
for(int j=i; j<mh+i; j++ ) { int a = j, b = j + mh;
temp = SubCOMPLEX(c[a][x], c[b][x]); // For butterfly operation
c[a][x] = AddCOMPLEX(c[a][x], c[b][x]); // a ->- (a + b) ->- a
c[b][x] = MulCOMPLEX(temp, W); // b ->- (a-b)*W ->- b
} } } double reciprocal=1./elements;
if(flag==IDFT) for(int e=0; e<elements; c[e][x]=ImeMulCOMPLEX(c[e][x],reciprocal), e++);
}
} else {// DFT
#pragma omp parallel for
for(int x=0; x<lx; x++ ) { COMPLEX tmp[elements]; // 32xsizeof(float) is enough small
for(int u=0; u<elements; u++ ){int v; COMPLEX t = (COMPLEX){0,0};
for( v=0; v<elements; v++ ) AppendCOMPLEX( t, MulCOMPLEX(c[v][x],rotor[(u*v)%elements]) );
tmp[u] = t;
} double reciprocal = 1./elements; // tmp[u]/elements -> tmp[u]*reciprocal
if(flag==IDFT) for(int e=0; e<elements; e++ ) c[e][x] = ImeMulCOMPLEX(tmp[e],reciprocal);
else for(int e=0; e<elements; e++ ) c[e][x] = tmp[e];
}
}
}
void extract_labels(const char *filename, int *label1, int *label2) {
int dots = 0, len = 0, start = 0;
if(label1) *label1 = -1;
else { fprintf(stderr, "invalid argument\n"); return; }
if(label2) *label2 = -1;
if(!filename || (len = strlen(filename)) < 1) {
fprintf(stderr, "invalid argument\n");
return;
}
for(int e = 0; e < len; e++) if(filename[e] == '.') dots++;
if(dots <= 1) return;
char string[256] = {0};
for(int e = 0; e < len; e++) { if(filename[start = e] == '.') break; }
snprintf(string, sizeof(string), "%s", &filename[start]);
*strrchr(string, '.') = 0;
int head, tail = strlen(string) - 1;
for(head = tail; 0 <= head; head--) {
if(string[head] != '.' && isdigit(string[head])) continue;
if(string[head] != '.') {
fprintf(stderr, "invalid label character 0x%2.2X\n", string[head]);
return;
} else {
*label1 = atoi(&string[head + 1]);
string[head--] = 0;
}
if(!label2 || dots < 3) break;
for(; 0 <= head && isdigit(string[head]); head--);
if(string[head] != '.') {
fprintf(stderr, "invalid label character 0x%2.2X\n", string[head]);
return;
} else {
*label2 = atoi(&string[head + 1]);
return;
}
}
}
int load_image(char *filepath, ImageData *image, int width, int height, int channels) {
int lx = 0, ly = 0, bpp = 0;
unsigned char *data = stbi_load(filepath, &lx, &ly, &bpp, 4);
if(!data || lx != width || ly != height || bpp != 4) {
if(data) stbi_image_free(data);
fprintf(stderr, "画像サイズ不一致または読み込み失敗: %s\n", image->filename);
return 1;
}
for(int ey = (ly + 1) / 2, ty = ly - 1, y = 0; y < ey; y++) {
for(int w = ty - y, x = 0; x < lx; x++) {
PIXEL p = { v2Dp(lx, data)[y][x].R, v2Dp(lx, data)[y][x].G, v2Dp(lx, data)[y][x].B, 0 };
v2Dp(lx, data)[y][x] = (PIXEL){ v2Dp(lx, data)[w][x].R, v2Dp(lx, data)[w][x].G, v2Dp(lx, data)[w][x].B, 0 };
v2Dp(lx, data)[w][x] = p;
}
}
for(int end = (4 < channels ? 4 : channels), c = 0; c < end; c++) {
for(int y=0; y<ly; y++) {
for(int x=0; x<lx; x++) {
v3Df(lx, ly, image->input)[c][y][x] = v2Da(lx, data)[y][x].v[c] / 255.0f;
}
}
}
stbi_image_free(data);
extract_labels(image->filename, &image->label, &image->label2);
return 0;
}
int sample(const char *dirname, ImageData **images, int *num_samples) {
DIR *dir = opendir(dirname);
if(!dir) { fprintf(stderr, "ディレクトリ %s のオープンに失敗\n", dirname); return 0; }
char *ext = ".bmp";
struct dirent *entry;
for(*num_samples = 0; (entry = readdir(dir)); ) if(strstr(entry->d_name, ext)) (*num_samples)++;
*images = (ImageData *)calloc((*num_samples) + 2, sizeof(ImageData));
if(!*images) { perror("メモリ割り当て失敗"); closedir(dir); return 0; }
rewinddir(dir);
int idx = 0;
while ((entry = readdir(dir))) {
if(!strstr(entry->d_name, ext)) continue;
char filepath[256];
snprintf(filepath, sizeof(filepath), "%s/%s", dirname, entry->d_name);
snprintf((*images)[idx].filename, sizeof((*images)[idx].filename), "%s", entry->d_name);
if(load_image(filepath, &(*images)[idx], INPUT_WIDTH, INPUT_HEIGHT, INPUT_CHANNELS)) continue;
idx++;
if(verbose && idx % 1000 == 0) printf("\r% 8d\t%8.2f%%", idx, 0.005+100.0f*(idx)/(*num_samples));
} if(verbose) printf("\n");
closedir(dir);
*num_samples = idx;
return 1;
}
void show_image_data(ImageData *images, int here){
char fname[256]; snprintf(fname,250,"debug_image_%4.4d.bmp",here);
printf(" '%s' save as '%s'\n\n",images[here].filename, fname );
PIXEL d[INPUT_HEIGHT][INPUT_WIDTH];
int lx=INPUT_WIDTH, ly=INPUT_HEIGHT ;
PROCESS_ALL_PIXELS d[y][x] = (PIXEL){ images[here].input[0][y][x]*255,
images[here].input[1][y][x]*255,images[here].input[2][y][x]*255, 0 };
saveDataAsBMP32(fname, &d[0][0],lx, ly );
}