I chose “Dropped out auto-encoder” as my final project topic in the last semester deep learning course, it was simply dropping out units in regular sparse auto-encoder, and furthermore, in stacked sparse auto-encoder, both in visible layer and hidden layer. It does not work well on auto-encoders, except can be used in fine-tune process of stacked sparse auto-encoder. I think the right thing to do is using denoising auto-encoder, instead.
Denoising auto-encoder was raised by Pascal Vincent et al, the basic idea is to force the hidden layer to discover more robust features and prevent it from simply learning the identity, by training the auto-encoder to reconstruct the input from a corrupted version of it.
The corrupted version of input data can be simply gotten by randomly disable (set zero) some of the input data. For example, say our input is an image, we randomly set some of the pixels zero (pepper noise), and then like regular auto-encoders, train the encoder and decoder by try to reconstruct the original image (without pepper noise).
For stacked auto-encoder, we use denoising auto-encoder in each step we train the auto-encoder layer by layer (corrupt the reconstructed result of previous layer as input of next layer), and do not corrupt in the fine-tune process.
Denoising auto-encoder shares advantages of dropout neural nets, and is more adaptable to sparse auto-encoder. By randomly disable input data, means each single unit can be disabled by chance, the network we get is not only more robust, but also can be seen as an approximation of many networks with massive weight sharing.
CODE
// denoising auto-encoder // single layer dae // About corrupting part, check getBernoulliMatrix() // // Author: Eric Yuan // Blog: http://eric-yuan.me // You are FREE to use the following code for ANY purpose. // // To run this code, you should have OpenCV in your computer. // Have fun with it 🙂 #include "opencv2/core/core.hpp" #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/highgui/highgui.hpp" #include <math.h> #include <fstream> #include <iostream> #include <random> using namespace cv; using namespace std; #define IS_TEST_SA 0 #define ATD at<double> #define elif else if #define RELU 0 #define SIGMOID 1 #define TANH 2 int NL_Method = SIGMOID; double probDestruction = 1.0; int batch; typedef struct SparseAutoencoder{ Mat W1; Mat W2; Mat b1; Mat b2; Mat W1grad; Mat W2grad; Mat b1grad; Mat b2grad; double cost; }SA; typedef struct SparseAutoencoderActivation{ Mat aInput; Mat aHidden; Mat aOutput; Mat zHidden; Mat zOutput; }SAA; Mat concatenateMat(vector<Mat> &vec){ int height = vec[0].rows; int width = vec[0].cols; Mat res = Mat::zeros(height * width, vec.size(), CV_64FC1); for(int i=0; i<vec.size(); i++){ Mat img(height, width, CV_64FC1); vec[i].convertTo(img, CV_64FC1); // reshape(int cn, int rows=0), cn is num of channels. Mat ptmat = img.reshape(0, height * width); Rect roi = cv::Rect(i, 0, ptmat.cols, ptmat.rows); Mat subView = res(roi); ptmat.copyTo(subView); } divide(res, 255.0, res); return res; } int ReverseInt (int i){ unsigned char ch1, ch2, ch3, ch4; ch1 = i & 255; ch2 = (i >> 8) & 255; ch3 = (i >> 16) & 255; ch4 = (i >> 24) & 255; return((int) ch1 << 24) + ((int)ch2 << 16) + ((int)ch3 << 8) + ch4; } void read_Mnist(string filename, vector<Mat> &vec){ ifstream file(filename, ios::binary); if (file.is_open()){ int magic_number = 0; int number_of_images = 0; int n_rows = 0; int n_cols = 0; file.read((char*) &magic_number, sizeof(magic_number)); magic_number = ReverseInt(magic_number); file.read((char*) &number_of_images,sizeof(number_of_images)); number_of_images = ReverseInt(number_of_images); file.read((char*) &n_rows, sizeof(n_rows)); n_rows = ReverseInt(n_rows); file.read((char*) &n_cols, sizeof(n_cols)); n_cols = ReverseInt(n_cols); for(int i = 0; i < number_of_images; ++i){ Mat tpmat = Mat::zeros(n_rows, n_cols, CV_8UC1); for(int r = 0; r < n_rows; ++r){ for(int c = 0; c < n_cols; ++c){ unsigned char temp = 0; file.read((char*) &temp, sizeof(temp)); tpmat.at<uchar>(r, c) = (int) temp; } } vec.push_back(tpmat); } } } void read_Mnist_Label(string filename, Mat &mat) { ifstream file(filename, ios::binary); if (file.is_open()){ int magic_number = 0; int number_of_images = 0; int n_rows = 0; int n_cols = 0; file.read((char*) &magic_number, sizeof(magic_number)); magic_number = ReverseInt(magic_number); file.read((char*) &number_of_images,sizeof(number_of_images)); number_of_images = ReverseInt(number_of_images); for(int i = 0; i < number_of_images; ++i){ unsigned char temp = 0; file.read((char*) &temp, sizeof(temp)); mat.ATD(0, i) = (double)temp; } } } Mat getBernoulliMatrix(int height, int width, double prob){ Mat res = Mat::zeros(height, width, CV_64FC1); std::default_random_engine e; std::bernoulli_distribution b(prob); for(int i = 0; i < height; i++){ for(int j = 0; j < width; j++){ bool tmp = b(e); if(tmp) res.ATD(i, j) = 1.0; } } return res; } Mat sigmoid(Mat &M){ Mat temp; exp(-M, temp); return 1.0 / (temp + 1.0); } Mat dsigmoid(Mat &a){ Mat res = 1.0 - a; res = res.mul(a); return res; } Mat Tanh(Mat &M){ Mat res(M); for(int i=0; i<res.rows; i++){ for(int j=0; j<res.cols; j++){ res.ATD(i, j) = tanh(M.ATD(i, j)); } } return res; } Mat dTanh(Mat &M){ Mat res = Mat::ones(M.rows, M.cols, CV_64FC1); Mat temp; pow(M, 2.0, temp); res -= temp; return res; } Mat ReLU(Mat& M){ Mat res(M); for(int i=0; i<M.rows; i++){ for(int j=0; j<M.cols; j++){ if(M.ATD(i, j) < 0.0) res.ATD(i, j) = 0.0; } } return res; } Mat dReLU(Mat& M){ Mat res = Mat::zeros(M.rows, M.cols, CV_64FC1); for(int i=0; i<M.rows; i++){ for(int j=0; j<M.cols; j++){ if(M.ATD(i, j) > 0.0) res.ATD(i, j) = 1.0; } } return res; } Mat non_Linearity(Mat &M){ if(NL_Method == RELU){ return ReLU(M); }elif(NL_Method == TANH){ return Tanh(M); }else{ return sigmoid(M); } } Mat d_non_Linearity(Mat &M){ if(NL_Method == RELU){ return dReLU(M); }elif(NL_Method == TANH){ return dTanh(M); }else{ return dsigmoid(M); } } void weightRandomInit(SA &sa, int inputsize, int hiddensize, int nsamples, double epsilon){ double *pData; sa.W1 = Mat::ones(hiddensize, inputsize, CV_64FC1); for(int i=0; i<hiddensize; i++){ pData = sa.W1.ptr<double>(i); for(int j=0; j<inputsize; j++){ pData[j] = randu<double>(); } } sa.W1 = sa.W1 * (2 * epsilon) - epsilon; sa.W2 = Mat::ones(inputsize, hiddensize, CV_64FC1); for(int i=0; i<inputsize; i++){ pData = sa.W2.ptr<double>(i); for(int j=0; j<hiddensize; j++){ pData[j] = randu<double>(); } } sa.W2 = sa.W2 * (2 * epsilon) - epsilon; sa.b1 = Mat::ones(hiddensize, 1, CV_64FC1); for(int j=0; j<hiddensize; j++){ sa.b1.ATD(j, 0) = randu<double>(); } sa.b1 = sa.b1 * (2 * epsilon) - epsilon; sa.b2 = Mat::ones(inputsize, 1, CV_64FC1); for(int j=0; j<inputsize; j++){ sa.b2.ATD(j, 0) = randu<double>(); } sa.b2 = sa.b2 * (2 * epsilon) - epsilon; sa.W1grad = Mat::zeros(hiddensize, inputsize, CV_64FC1); sa.W2grad = Mat::zeros(inputsize, hiddensize, CV_64FC1); sa.b1grad = Mat::zeros(hiddensize, 1, CV_64FC1); sa.b2grad = Mat::zeros(inputsize, 1, CV_64FC1); sa.cost = 0.0; } void getSparseAutoencoderActivation(SA &sa, Mat &data, SAA &acti){ data.copyTo(acti.aInput); acti.zHidden = sa.W1 * acti.aInput + repeat(sa.b1, 1, data.cols); acti.aHidden = non_Linearity(acti.zHidden); acti.zOutput = sa.W2 * acti.aHidden + repeat(sa.b2, 1, data.cols); acti.aOutput = non_Linearity(acti.zOutput); } void sparseAutoencoderCost(SA &sa, Mat &corruptedData, Mat &data, double lambda, double sparsityParam, double beta){ int nfeatures = data.rows; int nsamples = data.cols; SAA acti; getSparseAutoencoderActivation(sa, corruptedData, acti); Mat errtp = acti.aOutput - data; pow(errtp, 2.0, errtp); errtp /= 2.0; double Jcost = sum(errtp)[0] / nsamples; // now calculate pj which is the average activation of hidden units Mat pj; reduce(acti.aHidden, pj, 1, CV_REDUCE_SUM); pj /= nsamples; // the second part is weight decay part double Jweight = 0.0; Mat temp; pow(sa.W1, 2.0, temp); Jweight += sum(temp)[0]; pow(sa.W2, 2.0, temp); Jweight += sum(temp)[0]; Jweight *= lambda / 2.0; // the third part of overall cost function is the sparsity part temp = sparsityParam / pj; log(temp, temp); errtp = temp * sparsityParam; temp = (1 - sparsityParam) / (1 - pj); log(temp, temp); errtp += temp * (1 - sparsityParam); double Jsparse = sum(errtp)[0] * beta; sa.cost = Jcost + Jweight + Jsparse; // following are for calculating the grad of weights. Mat delta3 = -(data - acti.aOutput); delta3 = delta3.mul(d_non_Linearity(acti.aOutput)); Mat temp2 = -sparsityParam / pj + (1 - sparsityParam) / (1 - pj); temp2 *= beta; Mat delta2 = sa.W2.t() * delta3 + repeat(temp2, 1, nsamples); delta2 = delta2.mul(d_non_Linearity(acti.aHidden)); Mat nablaW1 = delta2 * acti.aInput.t(); Mat nablaW2 = delta3 * acti.aHidden.t(); Mat nablab1, nablab2; delta3.copyTo(nablab2); delta2.copyTo(nablab1); sa.W1grad = nablaW1 / nsamples + lambda * sa.W1; sa.W2grad = nablaW2 / nsamples + lambda * sa.W2; reduce(nablab1, sa.b1grad, 1, CV_REDUCE_SUM); reduce(nablab2, sa.b2grad, 1, CV_REDUCE_SUM); sa.b1grad /= nsamples; sa.b2grad /= nsamples; } void gradientChecking(SA &sa, Mat &data, double lambda, double sparsityParam, double beta){ //Gradient Checking (remember to disable this part after you're sure the //cost function and dJ function are correct) sparseAutoencoderCost(sa, data, data, lambda, sparsityParam, beta); Mat w1g(sa.W1grad); cout<<"test sparse autoencoder !!!!"<<endl; double epsilon = 1e-4; for(int i=0; i<sa.W1.rows; i++){ for(int j=0; j<sa.W1.cols; j++){ double memo = sa.W1.ATD(i, j); sa.W1.ATD(i, j) = memo + epsilon; sparseAutoencoderCost(sa, data, data, lambda, sparsityParam, beta); double value1 = sa.cost; sa.W1.ATD(i, j) = memo - epsilon; sparseAutoencoderCost(sa, data, data, lambda, sparsityParam, beta); double value2 = sa.cost; double tp = (value1 - value2) / (2 * epsilon); cout<<i<<", "<<j<<", "<<tp<<", "<<w1g.ATD(i, j)<<", "<<w1g.ATD(i, j) / tp<<endl; sa.W1.ATD(i, j) = memo; } } } void trainSparseAutoencoder(SA &sa, Mat &data, int hiddenSize, double lambda, double sparsityParam, double beta){ int nfeatures = data.rows; int nsamples = data.cols; weightRandomInit(sa, nfeatures, hiddenSize, nsamples, 0.12); if (IS_TEST_SA){ gradientChecking(sa, data, lambda, sparsityParam, beta); }else{ cout<<"Sparse Autoencoder Learning................"<<endl; // define the velocity vectors. Mat v_W1 = Mat::zeros(sa.W1.rows, sa.W1.cols, CV_64FC1); Mat v_W2 = Mat::zeros(sa.W2.rows, sa.W2.cols, CV_64FC1); Mat v_b1 = Mat::zeros(sa.b1.rows, sa.b1.cols, CV_64FC1); Mat v_b2 = Mat::zeros(sa.b2.rows, sa.b2.cols, CV_64FC1); int epochs = 3000; double lrate = 0.1; int T = 200; double epsilon0 = 80.0; double epsilont; double f = 0.999; double pi = 0.5; double pf = 0.99; double pt; for(int t = 0; t < epochs; t++){ if(t > T) pt = pf; else pt = (double)t / T * pi + (1 - (double)t / T) * pf; epsilont = epsilon0 * pow(f, t); int randomNum = ((long)rand() + (long)rand()) % (data.cols - batch); Rect roi = Rect(randomNum, 0, batch, data.rows); Mat batchX = data(roi); Mat corrupted = getBernoulliMatrix(batchX.rows, batchX.cols, probDestruction); corrupted = batchX.mul(corrupted); sparseAutoencoderCost(sa, corrupted, batchX, lambda, sparsityParam, beta); v_W1 = pt * v_W1 - (1 - pt) * epsilont * (lrate * sa.W1grad); v_W2 = pt * v_W2 - (1 - pt) * epsilont * (lrate * sa.W2grad); v_b1 = pt * v_b1 - (1 - pt) * epsilont * (lrate * sa.b1grad); v_b2 = pt * v_b2 - (1 - pt) * epsilont * (lrate * sa.b2grad); sa.W1 += v_W1; sa.W2 += v_W2; sa.b1 += v_b1; sa.b2 += v_b2; cout<<"epoch: "<<t<<", Cost function value = "<<sa.cost<<endl; } } } void readData(Mat &x, Mat &y, string xpath, string ypath, int number_of_images){ //read MNIST iamge into OpenCV Mat vector int image_size = 28 * 28; vector<Mat> vec; //vec.resize(number_of_images, cv::Mat(28, 28, CV_8UC1)); read_Mnist(xpath, vec); //read MNIST label into double vector y = Mat::zeros(1, number_of_images, CV_64FC1); read_Mnist_Label(ypath, y); x = concatenateMat(vec); } void saveWeight2txt(Mat &data, SA &sa){ FILE *pOut = fopen("weights.txt", "w"); for(int i=0; i<sa.W1.rows; i++){ for(int j=0; j<sa.W1.cols; j++){ fprintf(pOut, "%lf", sa.W1.ATD(i, j)); if(j == sa.W1.cols - 1) fprintf(pOut, "\n"); else fprintf(pOut, " "); } } fclose(pOut); } int main(int argc, char** argv) { long start, end; start = clock(); Mat trainX, trainY; readData(trainX, trainY, "mnist/train-images-idx3-ubyte", "mnist/train-labels-idx1-ubyte", 60000); cout<<"Read trainX successfully, including "<<trainX.rows<<" features and "<<trainX.cols<<" samples."<<endl; batch = trainX.cols / 100; // Finished reading data // pre-processing data. Scalar mean, stddev; meanStdDev(trainX, mean, stddev); Mat normX = trainX - mean[0]; normX.copyTo(trainX); SA sa; trainSparseAutoencoder(sa, trainX, 200, 5e-4, 0.1, 3); saveWeight2txt(trainX, sa); end = clock(); cout<<"Totally used time: "<<((double)(end - start)) / CLOCKS_PER_SEC<<" second"<<endl; //waitKey(0); return 0; }
RESULT
Filters obtained after training the single layer auto-encoder:
Encoder with no destroyed inputs:
Encoder with 25% destruction:
Encoder with 50% destruction:
As we can see, without noise, some filters remain uninteresting or very little interesting; as we increase the noise level, the we get better filters.
Enjoy it 🙂