

Softmax Regression

Reposted from: http://eric-yuan.me/softmax/



WHAT IS SOFTMAX

Softmax regression usually appears as one module in a deep learning network, most often the last one: the output module. What is it? It is a generalized version of logistic regression. Like logistic regression, it is a supervised learning method; its advantage is that the class label y can take more than two values, so the number of classes is not restricted to two. However, one may recall that the "one vs. rest" method can also handle multi-class problems with binary classifiers. Let's hold that question for a while and first look at the formulae of softmax regression.

First, let's look at its hypothesis h_\theta(x):

h_\theta(x^{(i)}) = \begin{bmatrix} p(y^{(i)}=1 \mid x^{(i)}; \theta) \\ p(y^{(i)}=2 \mid x^{(i)}; \theta) \\ \vdots \\ p(y^{(i)}=k \mid x^{(i)}; \theta) \end{bmatrix} = \frac{1}{\sum_{j=1}^{k} e^{\theta_j^T x^{(i)}}} \begin{bmatrix} e^{\theta_1^T x^{(i)}} \\ e^{\theta_2^T x^{(i)}} \\ \vdots \\ e^{\theta_k^T x^{(i)}} \end{bmatrix}

Here, k means that given a test input x, the output y can take any value in the set {1, 2, ..., k}, and we want to estimate the probability of the class label taking on each of the k different possible values. So h_\theta(x) is a k-dimensional vector.
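As a concrete illustration (not from the original post), here is a minimal sketch of evaluating this hypothesis with Armadillo, the library used in the full program below; theta is a hypothetical k x n parameter matrix and x a single n x 1 example:

#include <armadillo>
using namespace arma;

// softmax hypothesis for one example: returns the k class probabilities
colvec softmax_hypothesis(const mat& theta, const colvec& x){
    colvec z = theta * x;        // one raw score per class
    colvec e = arma::exp(z);     // exponentiate each score
    return e / accu(e);          // normalize so the entries sum to 1
}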

However, when the products \theta_j^T x^{(i)} are large, the exponentials e^{\theta_j^T x^{(i)}} can become extremely large and overflow. To prevent overflow, we can simply subtract some large constant value from each of the \theta_j^T x^{(i)} terms before computing the exponential; in practice, for each example, the maximum of the \theta_j^T x^{(i)} terms is a good choice for that constant. Subtracting the same constant \alpha from every score leaves the hypothesis unchanged:

\frac{e^{\theta_j^T x^{(i)} - \alpha}}{\sum_{l=1}^{k} e^{\theta_l^T x^{(i)} - \alpha}} = \frac{e^{\theta_j^T x^{(i)}}}{\sum_{l=1}^{k} e^{\theta_l^T x^{(i)}}}
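The same trick, sketched in the Armadillo style of the previous snippet (again, an illustration rather than the original code):

// overflow-safe variant: shift every score by the maximum before exponentiating
colvec stable_softmax_hypothesis(const mat& theta, const colvec& x){
    colvec z = theta * x;
    z -= z.max();                // the largest shifted score is 0, so exp() <= 1
    colvec e = arma::exp(z);
    return e / accu(e);          // identical probabilities to the naive version
}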

Next come the cost function and its gradient. With m training samples, an indicator 1{·} that equals 1 when its argument is true and 0 otherwise, and a weight decay term:

J(\theta) = -\frac{1}{m}\left[\sum_{i=1}^{m}\sum_{j=1}^{k} 1\{y^{(i)} = j\} \log \frac{e^{\theta_j^T x^{(i)}}}{\sum_{l=1}^{k} e^{\theta_l^T x^{(i)}}}\right] + \frac{\lambda}{2}\sum_{i=1}^{k}\sum_{j=1}^{n}\theta_{ij}^2

\nabla_{\theta_j} J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[x^{(i)}\left(1\{y^{(i)} = j\} - p(y^{(i)} = j \mid x^{(i)}; \theta)\right)\right] + \lambda \theta_j

 

Here, the λ term is called weight decay; it penalizes large values of the parameters. After implementing these parts, we can use gradient descent or other, more advanced optimization methods to obtain the optimal θ by minimizing J(θ) with respect to θ.
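For reference, the plain gradient-descent update used in the code below (where the learning rate α corresponds to the variable lrate) is simply:

\theta_j := \theta_j - \alpha \nabla_{\theta_j} J(\theta)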

SOURCE CODE

// Softmax regression

#include <iostream>
#include <vector>
#include <cstdio>
#include <cmath>
#include <cfloat>
#include <ctime>
#include <armadillo>
using namespace arma;
using namespace std;

#define elif else if
#define MAX_ITER 100000

double cost = 0.0;
mat grad;
double lrate = 0.1;
double lambda = 0.0;
int nclasses = 2;

// Convert a std::vector into an Armadillo column vector.
colvec vec2colvec(vector<double>& vec){
    int length = vec.size();
    colvec A(length);
    for(int i=0; i<length; i++){
        A(i) = vec[i];
    }
    return A;
}

rowvec vec2rowvec(vector<double>& vec){
    colvec A = vec2colvec(vec);
    return A.t();
}

// Convert a vector of samples (one inner vector per sample) into a
// features-by-samples matrix, i.e. each column is one sample.
mat vec2mat(vector<vector<double> >& vec){
    int cols = vec.size();
    int rows = vec[0].size();
    mat A(rows, cols);
    for(int i=0; i<rows; i++){
        for(int j=0; j<cols; j++){
            A(i, j) = vec[j][i];
        }
    }
    return A;
}

void update_CostFunction_and_Gradient(mat x, rowvec y, mat weightsMatrix, double lambda){

    int nsamples = x.n_cols;
    int nfeatures = x.n_rows;

    // calculate cost function
    mat theta(weightsMatrix);
    mat M = theta * x;
    // subtract the per-sample maximum before exponentiating to avoid overflow
    mat temp = repmat(max(M, 0), nclasses, 1);
    M = M - temp;
    M = arma::exp(M);
    temp = repmat(sum(M, 0), nclasses, 1);
    M = M / temp;                // M(j, i) = p(y = j | x_i)

    mat groundTruth = zeros<mat>(nclasses, nsamples);
    for(int i=0; i<y.size(); i++){
        groundTruth((int)y(i), i) = 1;
    }
    temp = groundTruth % (arma::log(M));
    cost = -accu(temp) / nsamples;
    cost += accu(arma::pow(theta, 2)) * lambda / 2;   // weight decay term

    // calculate gradient
    temp = groundTruth - M;
    temp = temp * x.t();
    grad = -temp / nsamples;
    grad += lambda * theta;
}

// Predict the class of each column of x as the arg-max of the class probabilities.
rowvec calculateY(mat x, mat weightsMatrix){

    mat theta(weightsMatrix);
    mat M = theta * x;
    mat temp = repmat(max(M, 0), nclasses, 1);
    M = M - temp;
    M = arma::exp(M);
    temp = repmat(sum(M, 0), nclasses, 1);
    M = M / temp;
    M = arma::log(M);
    rowvec result = zeros<rowvec>(M.n_cols);
    for(int i=0; i<M.n_cols; i++){
        double maxele = -DBL_MAX;
        int which = 0;
        for(int j=0; j<M.n_rows; j++){
            if(M(j, i) > maxele){
                maxele = M(j, i);
                which = j;
            }
        }
        result(i) = which;
    }
    return result;
}

void softmax(vector<vector<double> >& vecX, vector<double>& vecY, vector<vector<double> >& testX, vector<double>& testY){

    int nsamples = vecX.size();
    int nfeatures = vecX[0].size();
    // change vecX and vecY into matrix or vector.
    rowvec y = vec2rowvec(vecY);
    mat x = vec2mat(vecX);

    double init_epsilon = 0.12;
    mat weightsMatrix = randu<mat>(nclasses, nfeatures);
    weightsMatrix = weightsMatrix * (2 * init_epsilon) - init_epsilon;
    grad = zeros<mat>(nclasses, nfeatures);

/*
    // Gradient Checking (remember to disable this part after you're sure the
    // cost function and dJ function are correct)
    update_CostFunction_and_Gradient(x, y, weightsMatrix, lambda);
    mat dJ(grad);
    cout<<"test!!!!"<<endl;
    double epsilon = 1e-4;
    for(int i=0; i<weightsMatrix.n_rows; i++){
        for(int j=0; j<weightsMatrix.n_cols; j++){
            double memo = weightsMatrix(i, j);
            weightsMatrix(i, j) = memo + epsilon;
            update_CostFunction_and_Gradient(x, y, weightsMatrix, lambda);
            double value1 = cost;
            weightsMatrix(i, j) = memo - epsilon;
            update_CostFunction_and_Gradient(x, y, weightsMatrix, lambda);
            double value2 = cost;
            double tp = (value1 - value2) / (2 * epsilon);
            cout<<i<<", "<<j<<", "<<tp<<", "<<dJ(i, j)<<endl;
            weightsMatrix(i, j) = memo;
        }
    }
*/

    // batch gradient descent until the cost stops improving or MAX_ITER is reached
    int converge = 0;
    double lastcost = 0.0;
    while(converge < MAX_ITER){
        update_CostFunction_and_Gradient(x, y, weightsMatrix, lambda);
        weightsMatrix -= lrate * grad;

        cout<<"learning step: "<<converge<<", Cost function value = "<<cost<<endl;
        if(fabs(cost - lastcost) <= 5e-6 && converge > 0) break;
        lastcost = cost;
        ++ converge;
    }
    cout<<"############result#############"<<endl;

    rowvec yT = vec2rowvec(testY);
    mat xT = vec2mat(testX);
    rowvec result = calculateY(xT, weightsMatrix);
    rowvec error = yT - result;
    int correct = error.size();
    for(int i=0; i<error.size(); i++){
        if(error(i) != 0) --correct;
    }
    cout<<"correct: "<<correct<<", total: "<<error.size()<<", accuracy: "<<double(correct) / (double)(error.size())<<endl;
}

int main(int argc, char** argv)
{
    long start, end;
    // read training X from .txt file
    FILE *streamX, *streamY;
    streamX = fopen("trainX.txt", "r");
    int numofX = 30;
    vector<vector<double> > vecX;
    double tpdouble;
    int counter = 0;
    while(1){
        if(fscanf(streamX, "%lf", &tpdouble) == EOF) break;
        if(counter / numofX >= vecX.size()){
            vector<double> tpvec;
            vecX.push_back(tpvec);
        }
        vecX[counter / numofX].push_back(tpdouble);
        ++ counter;
    }
    fclose(streamX);

    cout<<vecX.size()<<", "<<vecX[0].size()<<endl;

    // read training Y from .txt file
    streamY = fopen("trainY.txt", "r");
    vector<double> vecY;
    while(1){
        if(fscanf(streamY, "%lf", &tpdouble) == EOF) break;
        vecY.push_back(tpdouble);
    }
    fclose(streamY);

    // sanity checks: every sample has the same number of features,
    // and there is one label per sample
    for(int i=1; i<vecX.size(); i++){
        if(vecX[i].size() != vecX[i - 1].size()) return 0;
    }
    if(vecX.size() != vecY.size()) return 0;

    // read test X from .txt file
    streamX = fopen("testX.txt", "r");
    vector<vector<double> > vecTX;
    counter = 0;
    while(1){
        if(fscanf(streamX, "%lf", &tpdouble) == EOF) break;
        if(counter / numofX >= vecTX.size()){
            vector<double> tpvec;
            vecTX.push_back(tpvec);
        }
        vecTX[counter / numofX].push_back(tpdouble);
        ++ counter;
    }
    fclose(streamX);

    // read test Y from .txt file
    streamY = fopen("testY.txt", "r");
    vector<double> vecTY;
    while(1){
        if(fscanf(streamY, "%lf", &tpdouble) == EOF) break;
        vecTY.push_back(tpdouble);
    }
    fclose(streamY);

    start = clock();
    softmax(vecX, vecY, vecTX, vecTY);
    end = clock();
    cout<<"Training used time: "<<((double)(end - start)) / CLOCKS_PER_SEC<<" second"<<endl;
    return 0;
}
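If you want to try the listing yourself, a typical compile command on a system with Armadillo installed is something like the following (assuming the code is saved as softmax.cpp):

g++ -O2 softmax.cpp -o softmax -larmadillo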

TEST RESULT

I used the Wisconsin Breast Cancer dataset, which has 284 training samples and 284 test samples with 30 features per sample. You can download the dataset here:

trainX.txt   trainY.txt

testX.txt     testY.txt

Here’s the result.

(result screenshot)

 

You can see that training is very fast. That is because I used vectorization during the training process, meaning I minimized the number of explicit loops. Check my update_CostFunction_and_Gradient function and you will see the trick; a small illustration follows below.
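The key step, shown here in isolation (a sketch reusing the shapes from update_CostFunction_and_Gradient: groundTruth and M are k x m, x is n x m, theta is k x n), replaces nested loops over samples and classes with a single matrix product:

// vectorized gradient: one matrix product covers every sample and every class
mat vectorized_grad(const mat& groundTruth, const mat& M, const mat& x,
                    int nsamples, double lambda, const mat& theta){
    return -(groundTruth - M) * x.t() / nsamples + lambda * theta;
}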

SOFTMAX VS MULTI-BINARY CLASSIFIERS

Now let's return to the earlier question of softmax regression vs. multiple binary classifiers. As Prof. Andrew Ng says, which algorithm to use depends on whether the classes are mutually exclusive, that is, whether each example can belong to only one class. For example:

  • Case 1. classes are [1, 2, 3, 4, 5, 6]
  • Case 2. classes are [1, 2, 3, odd, even, positive]

For case 1, a softmax regression classifier works fine; for case 2, where the labels overlap, it is better to train a separate binary logistic regression classifier for each class.

:)


