Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2019/02/01 03:08:53 UTC

[GitHub] theSparta opened a new issue #8126: Not able to train a neural network [XOR added]
URL: https://github.com/apache/incubator-mxnet/issues/8126
 
 
   ## Environment info
   Operating System: Ubuntu 16.04 
   Compiler: g++
   Package used (Python/R/Scala/Julia): C++
   
   MXNet version: 0.11
   
   Or if installed from source:
   MXNet commit hash (`git rev-parse HEAD`):  branch 0.11.0
   
   ## Error Message:
   **There is no error, but the training accuracy always remains ZERO.** Is there a problem with the code that constructs the neural network (the `siamese` symbol), and/or is the training procedure correct, i.e. are the gradient updates specified correctly?
   
   ## Minimum reproducible example
   
   1. I defined a siamese-net-like architecture with `LogisticRegressionOutput` as the final layer, using the code provided below:
   
   ```C++
   #include <iostream>
   #include <map>
   #include <string>
   #include "mxnet-cpp/MxNetCpp.h"
   #include "../include/mxnet-cpp/op.h"
   
   using namespace std;
   using namespace mxnet::cpp;
   
    Symbol mlp(const vector<int> &layers, const vector<Symbol> &weights,
               const vector<Symbol> &biases, const string &inp_name)
   {
     auto x = Symbol::Variable(inp_name);
     vector<Symbol> outputs(layers.size());
   
     for (size_t i = 0; i < layers.size(); ++i)
     {
       string istr = to_string(i);
        Symbol fc = FullyConnected(
          i == 0 ? x : outputs[i - 1],  // data: input for the first layer, previous output after
          weights[i],
          biases[i],
          layers[i]);
        // hidden layers get a leaky ReLU activation; the final layer stays linear
        outputs[i] = i == layers.size() - 1
                         ? fc
                         : LeakyReLU(string("act") + istr, fc,
                                     LeakyReLUActType::kLeaky);
     }
     return outputs.back();
   }
   
   int main(int argc, char** argv)
   {
       const int feature_size = 2;
       const vector<int> layers{32, 16, 1};
       const int batch_size = 100;
       const int max_epoch = 10;
       const float learning_rate = 0.01;
       const float weight_decay = 1e-2;
   
       auto ctx = Context::gpu(); // Use GPU for training
       auto ctx_cpu = Context::cpu();
   
       vector<Symbol> weights(layers.size());
       vector<Symbol> biases(layers.size());
   
       for (size_t i = 0; i < layers.size(); ++i)
       {
           string istr = to_string(i);
           weights[i] = Symbol::Variable("w" + istr);
           biases[i] = Symbol::Variable("b" + istr);
       }
       
       // CONSTRUCT the network
       /**********************************************************************/
       auto Net = mlp(layers, weights, biases, "X");
       auto Net2 = mlp(layers, weights, biases, "X1");
   
       auto sym_label = Symbol::Variable("label");
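        // both branches share the same weight/bias symbols, so this trains
        // sigmoid(Net(X) - Net(X1)) against `label` with a logistic loss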
       auto siamese = LogisticRegressionOutput(string("sigmoid"), Net - Net2, sym_label);
   
       map<string, NDArray> args_map;
       map<string, NDArray> aux_map;
   
       /*we should tell mxnet the shape of data and label*/
       args_map["X"] = NDArray(Shape(batch_size, feature_size) , ctx);
       args_map["X1"] = NDArray(Shape(batch_size, feature_size) , ctx);
       args_map["label"] = NDArray(Shape(batch_size, 1), ctx);
   
       /*with data and label, executor can be generated automatically*/
       auto *exec = siamese.SimpleBind(ctx, args_map);
       siamese.InferArgsMap(ctx, &args_map, args_map);
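        // note: SimpleBind above was given only X, X1, and label; if I read the
        // C++ API correctly, InferArgsMap here fills args_map with freshly
        // allocated weight/bias arrays, separate from the ones the executor
        // allocated for itself at bind time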
       auto arg_names = siamese.ListArguments();
       /**********************************************************************/
   
       Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2.34);
       for (auto &arg : args_map)
       {
           if (arg.first == "X" || arg.first == "X1" || arg.first == "label") continue;
           xavier(arg.first, &arg.second);
       }
   
       Optimizer* opt = OptimizerRegistry::Find("adam");
        // opt->SetParam("rescale_grad", 1.0 / batch_size);
        opt->SetParam("lr", learning_rate)
           ->SetParam("wd", weight_decay);
   
        /** DATA SETUP **/
       /**********************************************************************/
       int data_count = batch_size * 100;
       mx_float* aptr_x = new mx_float[data_count * 2];
       mx_float* aptr_x1 = new mx_float[data_count * 2];
       mx_float* aptr_y = new mx_float[data_count];
   
        // hand-made data: row i of X is (i % 100, i % 100 + 1), row i of X1 is
        // its negation, and every label is 1
       for (int i = 0; i < data_count; i++)
       {
           for (int j = 0; j < 2; j++)
           {
          aptr_x[i * 2 + j] = (i % 100 + j) * 1.0f;
          aptr_x1[i * 2 + j] = -(i % 100 + j) * 1.0f;
           }
           aptr_y[i] = 1;
       }
   
       NDArray data_array = NDArray(Shape(data_count, 2), ctx_cpu, false);  
       NDArray data1_array = NDArray(Shape(data_count, 2), ctx_cpu, false);  
       NDArray label_array = NDArray(Shape(data_count), ctx_cpu, false);
       data_array.SyncCopyFromCPU(aptr_x, data_count * 2);
       data1_array.SyncCopyFromCPU(aptr_x1, data_count*2);
       label_array.SyncCopyFromCPU(aptr_y, data_count);
       data_array.WaitToRead();
       data1_array.WaitToRead();
       label_array.WaitToRead();
   
       int val_fold = 1;
       size_t train_num = data_count * (1 - val_fold / 10.0);
       NDArray train_data, train1_data, val_data, val1_data;
       NDArray train_label, val_label;
       train_data = data_array.Slice(0, train_num);
       train1_data = data1_array.Slice(0, train_num);
       train_label = label_array.Slice(0, train_num);
       /**********************************************************************/
   
       // TRAINING the network
       Accuracy acu_train;
        for (int ITER = 0; ITER < max_epoch * 100; ++ITER)
       {
           size_t start_index = 0;
           /*reset the metric every epoch*/
           acu_train.Reset();
           while (start_index < train_num)
           {
               if (start_index + batch_size > train_num)
               {
                 start_index = train_num - batch_size;
               }
               args_map["X"] =
                   train_data.Slice(start_index, start_index + batch_size).Copy(ctx);
               args_map["X1"] =
                   train1_data.Slice(start_index, start_index + batch_size).Copy(ctx);
               args_map["label"] =
                   train_label.Slice(start_index, start_index + batch_size).Copy(ctx);
               start_index += batch_size;
               NDArray::WaitAll();
   
               exec->Forward(true);
               exec->Backward();
               // Update parameters
               for (size_t i = 0; i < arg_names.size(); ++i)
               {
                   if (arg_names[i] == "X" || arg_names[i] == "X1" || arg_names[i] == "label") continue;
                   opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]);
               }
               NDArray::WaitAll();
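                // note: Accuracy seems to take an argmax over the output
                // channels; with a single sigmoid output that argmax is always
                // 0, so a label of 1 would never be counted as correct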
               acu_train.Update(args_map["label"], exec->outputs[0]);
           }
           LG << "ITER: " << ITER << " Train Accuracy: " << acu_train.Get();
       }
   
       delete exec;
       delete [] aptr_x;
       delete [] aptr_x1;
       delete [] aptr_y;
       MXNotifyShutdown();
       return 0;
    }
   ```
   
   The output of the code is always something like:
   ```
   [18:43:06] neural_net_eval.cpp:172: ITER: 0 Train Accuracy: 0
   [18:43:06] neural_net_eval.cpp:172: ITER: 1 Train Accuracy: 0
   [18:43:06] neural_net_eval.cpp:172: ITER: 2 Train Accuracy: 0
   [18:43:06] neural_net_eval.cpp:172: ITER: 3 Train Accuracy: 0
   [18:43:06] neural_net_eval.cpp:172: ITER: 4 Train Accuracy: 0
   [18:43:06] neural_net_eval.cpp:172: ITER: 5 Train Accuracy: 0
   [18:43:06] neural_net_eval.cpp:172: ITER: 6 Train Accuracy: 0
   ```
   
   ## What have you tried to solve it?
   
    1. Tried different variations of the training data, but the network always predicts ZERO as its output (checked with the snippet above).
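    
    2. Not tried yet: the bundled cpp-package examples (e.g. `mlp_gpu.cpp`) differ from my code in two places, so these are my next candidates. They call `InferArgsMap` and initialize the weights *before* `SimpleBind`, and they feed each batch by copying into the NDArrays that were handed to `SimpleBind` (whose handles the executor shares) instead of assigning new arrays into the map. An untested sketch of that variant, assuming `label` is created as `Shape(batch_size)` so the slice shapes match:
    
    ```C++
    // allocate and initialize every argument first, then bind
    siamese.InferArgsMap(ctx, &args_map, args_map);
    Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2.34);
    for (auto &arg : args_map)
    {
        if (arg.first == "X" || arg.first == "X1" || arg.first == "label") continue;
        xavier(arg.first, &arg.second);
    }
    auto *exec = siamese.SimpleBind(ctx, args_map);
    
    // in the training loop: CopyTo writes into the existing bound arrays,
    // so the executor actually sees each new batch
    train_data.Slice(start_index, start_index + batch_size)
        .CopyTo(&args_map["X"]);
    train1_data.Slice(start_index, start_index + batch_size)
        .CopyTo(&args_map["X1"]);
    train_label.Slice(start_index, start_index + batch_size)
        .CopyTo(&args_map["label"]);
    NDArray::WaitAll();
    ```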
   
