Posted to commits@singa.apache.org by wa...@apache.org on 2015/09/16 06:19:59 UTC

[20/21] incubator-singa git commit: SINGA-10 Add Support for Recurrent Neural Networks (RNN)

SINGA-10 Add Support for Recurrent Neural Networks (RNN)

Add a user-defined record type for words (word string, word id, class id,
class start position, class end position);
Implement RnnDataLayer, WordLayer and RnnLabelLayer;
Implement create_shard.cc to build shards from the sample dataset shipped with rnnlmlib;
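
For context, the three new layers are wired into a model through the job
configuration. The fragment below is a hypothetical sketch, not part of this
commit: the layer names, the user_type field, the shard path, and the window
size are illustrative assumptions, while the type strings match the
registrations in main.cc and the input_conf fields come from rnnlm.proto.

    # hypothetical job.conf fragment (illustrative only)
    layer {
      name: "data"
      user_type: "kRnnData"
      [singa.input_conf] {
        path: "examples/rnnlm/train_shard"  # shard built by create_shard.bin
        max_window: 10                      # illustrative window size
      }
    }
    layer {
      name: "word"
      user_type: "kWord"
      srclayers: "data"
    }
    layer {
      name: "label"
      user_type: "kRnnLabel"
      srclayers: "data"
    }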


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/b4a8d2b2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/b4a8d2b2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/b4a8d2b2

Branch: refs/heads/tutorial
Commit: b4a8d2b2beb077e6488569791a1581add4a957bc
Parents: e53a23c
Author: kaiping <ka...@comp.nus.edu.sg>
Authored: Sun Sep 13 20:07:11 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Wed Sep 16 11:39:16 2015 +0800

----------------------------------------------------------------------
 examples/rnnlm/Makefile.example |  26 +++
 examples/rnnlm/create_shard.cc  | 400 +++++++++++++++++++++++++++++++++++
 examples/rnnlm/main.cc          |   3 +
 examples/rnnlm/rnnlm.cc         |  77 +++++++
 examples/rnnlm/rnnlm.h          |  47 ++++
 examples/rnnlm/rnnlm.proto      |  15 ++
 6 files changed, 568 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4a8d2b2/examples/rnnlm/Makefile.example
----------------------------------------------------------------------
diff --git a/examples/rnnlm/Makefile.example b/examples/rnnlm/Makefile.example
index 5eeca78..b4505cf 100644
--- a/examples/rnnlm/Makefile.example
+++ b/examples/rnnlm/Makefile.example
@@ -1,5 +1,31 @@
 MSHADOW_FLAGS :=-DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
 
+libs :=singa glog protobuf
+filename = rnnlm-0.4b.tgz
+# note: filelink for rnnlm-0.4b may change
+filelink = https://f25ea9ccb7d3346ce6891573d543960492b92c30.googledrive.com/host/0ByxdPXuxLPS5RFM5dVNvWVhTd0U
+dirname = $(patsubst %.tgz,%, $(filename))
+numclass = 100
+dirshards = train_shard valid_shard test_shard
+
+
+.PHONY: all download create
+
+download: rnnlm
+
+rnnlm:
+	wget $(filelink)/$(filename)
+	tar zxf $(filename)
+	rm $(filename)
+
+create:
+	$(CXX) create_shard.cc -std=c++11 -lsinga -lprotobuf -lzookeeper_mt -lglog -I../../include \
+		-L../../.libs/ -L/usr/local/lib -Wl,-unresolved-symbols=ignore-in-shared-libs -Wl,-rpath=../../.libs/ \
+		-o create_shard.bin
+	for d in $(dirshards); do mkdir -p $${d}; done
+	./create_shard.bin -train $(dirname)/train -class_size $(numclass) -test $(dirname)/test
+
+
 all:
 	protoc --proto_path=../../src/proto --proto_path=. --cpp_out=. rnnlm.proto
 	$(CXX) main.cc rnnlm.cc rnnlm.pb.cc $(MSHADOW_FLAGS) -std=c++11 -lsinga -lglog -lprotobuf -lopenblas -I../../include\

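With these targets, the intended workflow is "make download" to fetch and
unpack rnnlm-0.4b, then "make create" to compile create_shard.bin and
populate the shard directories. Note that the create target above passes
only -train and -test to create_shard.bin, so valid_shard is created but
left empty.
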
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4a8d2b2/examples/rnnlm/create_shard.cc
----------------------------------------------------------------------
diff --git a/examples/rnnlm/create_shard.cc b/examples/rnnlm/create_shard.cc
new file mode 100644
index 0000000..dd56a84
--- /dev/null
+++ b/examples/rnnlm/create_shard.cc
@@ -0,0 +1,400 @@
+//
+// This code creates DataShards for the RNNLM dataset.
+// It is adapted from convert_mnist_data in Caffe.
+// The RNNLM dataset can be downloaded from
+//    http://www.rnnlm.org/
+//
+// Usage:
+//    create_shard.bin -train <train_file> -class_size <int> [-debug] [-valid <valid_file>] [-test <test_file>]
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+
+#include "utils/data_shard.h"
+#include "utils/common.h"
+#include "proto/common.pb.h"
+#include "singa.h"
+#include "rnnlm.pb.h"
+
+#define MAX_STRING 100
+
+#include <cstring>
+#include <cstdlib>
+#include <cstdio>
+#include <cmath>
+#include <algorithm>
+#include <fstream>
+
+using namespace std;
+using singa::DataShard;
+
+struct vocab_word {
+    int cn;
+    char word[MAX_STRING];
+    int class_index;
+};
+
+struct vocab_word *vocab;
+int vocab_max_size;
+int vocab_size;
+int *vocab_hash;
+int vocab_hash_size;
+int debug_mode;
+int old_classes;
+int *class_start;
+int *class_end;
+int class_size;
+
+char train_file[MAX_STRING];
+char valid_file[MAX_STRING];
+char test_file[MAX_STRING];
+
+int valid_mode;
+int test_mode;
+
+unsigned int getWordHash(char *word) {
+    unsigned int hash, a;
+
+    hash = 0;
+    for (a = 0; a < strlen(word); a++) hash = hash * 237 + word[a];
+    hash = hash % vocab_hash_size;
+
+    return hash;
+}
+
+int searchVocab(char *word) {
+    int a;
+    unsigned int hash;
+
+    hash = getWordHash(word);
+
+    if (vocab_hash[hash] == -1) return -1;
+    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
+
+    for (a = 0; a < vocab_size; a++) {                //search in vocabulary
+        if (!strcmp(word, vocab[a].word)) {
+            vocab_hash[hash] = a;
+            return a;
+        }
+    }
+
+    return -1;                            // not found: out-of-vocabulary (OOV)
+}
+
+int addWordToVocab(char *word) {
+    unsigned int hash;
+
+    strcpy(vocab[vocab_size].word, word);
+    vocab[vocab_size].cn = 0;
+    vocab_size++;
+
+    if (vocab_size + 2 >= vocab_max_size) {        //reallocate memory if needed
+        vocab_max_size += 100;
+        vocab = (struct vocab_word *) realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
+    }
+
+    hash = getWordHash(word);
+    vocab_hash[hash] = vocab_size - 1;
+
+    return vocab_size - 1;
+}
+
+void readWord(char *word, FILE *fin) {
+    int a = 0, ch;
+
+    while (!feof(fin)) {
+        ch = fgetc(fin);
+
+        if (ch == 13) continue;
+
+        if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
+            if (a > 0) {
+                if (ch == '\n') ungetc(ch, fin);
+                break;
+            }
+
+            if (ch == '\n') {
+                strcpy(word, (char *) "</s>");
+                return;
+            }
+            else continue;
+        }
+
+        word[a] = char(ch);
+        a++;
+
+        if (a >= MAX_STRING) {            // truncate words longer than MAX_STRING - 1 chars
+            a--;
+        }
+    }
+    word[a] = 0;
+}
+
+// selection sort by descending count; index 0 ("</s>") keeps its position
+void sortVocab() {
+    int a, b, max;
+    vocab_word swap;
+
+    for (a = 1; a < vocab_size; a++) {
+        max = a;
+        for (b = a + 1; b < vocab_size; b++) if (vocab[max].cn < vocab[b].cn) max = b;
+
+        swap = vocab[max];
+        vocab[max] = vocab[a];
+        vocab[a] = swap;
+    }
+}
+
+int learnVocabFromTrainFile() {
+    char word[MAX_STRING];
+    FILE *fin;
+    int a, i, train_wcn;
+
+    for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+
+    fin = fopen(train_file, "rb");
+
+    vocab_size = 0;
+
+    addWordToVocab((char *) "</s>");
+
+    train_wcn = 0;
+    while (1) {
+        readWord(word, fin);
+        if (feof(fin)) break;
+
+        train_wcn++;
+
+        i = searchVocab(word);
+        if (i == -1) {
+            a = addWordToVocab(word);
+            vocab[a].cn = 1;
+        } else vocab[i].cn++;
+    }
+
+    sortVocab();   // sorting invalidates vocab_hash; searchVocab's strcmp guard and linear fallback keep lookups correct
+
+    if (debug_mode > 0) {
+        printf("Vocab size: %d\n", vocab_size);
+        printf("Words in train file: %d\n", train_wcn);
+    }
+
+    //train_words = train_wcn;
+
+    fclose(fin);
+    return 0;
+}
+
+int splitClasses() {
+    double df, dd;
+    int i, a, b;
+
+    df = 0;
+    dd = 0;
+    a = 0;
+    b = 0;
+
+    class_start = (int *) calloc(class_size, sizeof(int));
+    memset(class_start, 0x7f, sizeof(int) * class_size);
+    class_end = (int *) calloc(class_size, sizeof(int));
+    memset(class_end, 0, sizeof(int) * class_size);
+
+    if (old_classes) {    // split classes by cumulative unigram frequency
+        for (i = 0; i < vocab_size; i++) b += vocab[i].cn;
+        for (i = 0; i < vocab_size; i++) {
+            df += vocab[i].cn / (double) b;
+            if (df > 1) df = 1;
+            if (df > (a + 1) / (double) class_size) {
+                vocab[i].class_index = a;
+                if (a < class_size - 1) a++;
+            } else {
+                vocab[i].class_index = a;
+            }
+        }
+    } else {            // split classes by cumulative sqrt-frequency
+        for (i = 0; i < vocab_size; i++) b += vocab[i].cn;
+        for (i = 0; i < vocab_size; i++) dd += sqrt(vocab[i].cn / (double) b);
+        for (i = 0; i < vocab_size; i++) {
+            df += sqrt(vocab[i].cn / (double) b) / dd;
+            if (df > 1) df = 1;
+            if (df > (a + 1) / (double) class_size) {
+                vocab[i].class_index = a;
+                if (a < class_size - 1) a++;
+            } else {
+                vocab[i].class_index = a;
+            }
+        }
+    }
+
+    // after dividing classes, update class start and class end information
+    for (i = 0; i < vocab_size; i++) {
+        a = vocab[i].class_index;
+        class_start[a] = min(i, class_start[a]);
+        class_end[a] = max(i + 1, class_end[a]);
+    }
+    return 0;
+}
+
+int init_class() {
+    //debug_mode = 1;
+    debug_mode = 0;
+    vocab_max_size = 100;  // initial capacity of the vocab array; grows in addWordToVocab
+    vocab_size = 0;
+    vocab = (struct vocab_word *) calloc(vocab_max_size, sizeof(struct vocab_word));
+    vocab_hash_size = 100000000;  // 100M buckets (~400 MB of ints)
+    vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
+    old_classes = 1;
+
+    // read vocab
+    learnVocabFromTrainFile();
+
+    // split classes
+    splitClasses();
+
+    return 0;
+}
+
+int create_shard(char *input_file, char *output_file) {
+    DataShard dataShard(output_file, DataShard::kCreate);
+    singa::WordRecord wordRecord;
+
+    char word[MAX_STRING];
+    FILE *fin;
+    int a, i;
+    fin = fopen(input_file, "rb");
+    while (1) {
+        readWord(word, fin);
+        if (feof(fin)) break;
+        i = searchVocab(word);
+        if (i == -1) {
+            if (debug_mode) printf("unknown word [%s] detected!\n", word);
+        } else {
+            wordRecord.set_word(string(word));
+            wordRecord.set_word_index(i);
+            int class_idx = vocab[i].class_index;
+            wordRecord.set_class_index(class_idx);
+            wordRecord.set_class_start(class_start[class_idx]);
+            wordRecord.set_class_end(class_end[class_idx]);
+            dataShard.Insert(word, wordRecord);
+        }
+    }
+
+    dataShard.Flush();
+    fclose(fin);
+    return 0;
+}
+
+int argPos(char *str, int argc, char **argv) {
+    int a;
+
+    for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) return a;
+
+    return -1;
+}
+
+int main(int argc, char **argv) {
+    int i;
+    FILE *f;
+
+    //set debug mode
+    i = argPos((char *) "-debug", argc, argv);
+    if (i > 0) {
+        debug_mode = 1;
+        if (debug_mode > 0)
+            printf("debug mode: %d\n", debug_mode);
+    }
+
+    //search for train file
+    i = argPos((char *) "-train", argc, argv);
+    if (i > 0) {
+        if (i + 1 == argc) {
+            printf("ERROR: training data file not specified!\n");
+            return 0;
+        }
+
+        strcpy(train_file, argv[i + 1]);
+
+        if (debug_mode > 0)
+            printf("train file: %s\n", train_file);
+
+        f = fopen(train_file, "rb");
+        if (f == NULL) {
+            printf("ERROR: training data file not found!\n");
+            return 0;
+        }
+        fclose(f);
+    } else {
+        printf("ERROR: training data must be set.\n");
+        return 0;
+    }
+
+    //search for valid file
+    i = argPos((char *) "-valid", argc, argv);
+    if (i > 0) {
+        if (i + 1 == argc) {
+            printf("ERROR: validating data file not specified!\n");
+            return 0;
+        }
+
+        strcpy(valid_file, argv[i + 1]);
+
+        if (debug_mode > 0)
+            printf("valid file: %s\n", valid_file);
+
+        f = fopen(valid_file, "rb");
+        if (f == NULL) {
+            printf("ERROR: validating data file not found!\n");
+            return 0;
+        }
+        fclose(f);
+        valid_mode = 1;
+    }
+
+    //search for test file
+    i = argPos((char *) "-test", argc, argv);
+    if (i > 0) {
+        if (i + 1 == argc) {
+            printf("ERROR: testing data file not specified!\n");
+            return 0;
+        }
+
+        strcpy(test_file, argv[i + 1]);
+
+        if (debug_mode > 0)
+            printf("test file: %s\n", test_file);
+
+        f = fopen(test_file, "rb");
+        if (f == NULL) {
+            printf("ERROR: testing data file not found!\n");
+            return 0;
+        }
+        fclose(f);
+        test_mode = 1;
+    }
+
+    //search for class size
+    i = argPos((char *) "-class_size", argc, argv);
+    if (i > 0) {
+        if (i + 1 == argc) {
+            printf("ERROR: class size not specified!\n");
+            return 0;
+        }
+
+        class_size = atoi(argv[i + 1]);
+
+        if (debug_mode > 0)
+            printf("class size: %d\n", class_size);
+    }
+    if (class_size <= 0) {
+        printf("ERROR: no or invalid class size received!\n");
+        return 0;
+    }
+
+    init_class();
+
+    create_shard(train_file, "train_shard");
+    if (valid_mode) create_shard(valid_file, "valid_shard");
+    if (test_mode) create_shard(test_file, "test_shard");
+
+    return 0;
+}
\ No newline at end of file
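
As a sanity check, a shard written above can be read back with the same
DataShard API the new layers use. Below is a minimal, hypothetical verifier,
not part of this commit; it assumes the include paths and link flags from
the Makefile's create target, and uses only the DataShard and WordRecord
calls that appear elsewhere in this diff:

    // read_shard.cc -- hypothetical sketch: dump the first records of a shard
    #include <cstdio>
    #include <string>
    #include "utils/data_shard.h"
    #include "rnnlm.pb.h"

    int main(int argc, char **argv) {
        const char *path = argc > 1 ? argv[1] : "train_shard";
        singa::DataShard shard(path, singa::DataShard::kRead);
        std::string key;
        singa::WordRecord rec;
        // print up to the first 10 records with their class ranges
        for (int i = 0; i < 10 && shard.Next(&key, &rec); i++)
            printf("%s: word_index=%d class=%d range=[%d, %d)\n",
                   rec.word().c_str(), rec.word_index(), rec.class_index(),
                   rec.class_start(), rec.class_end());
        return 0;
    }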

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4a8d2b2/examples/rnnlm/main.cc
----------------------------------------------------------------------
diff --git a/examples/rnnlm/main.cc b/examples/rnnlm/main.cc
index 690c158..3cb59f1 100644
--- a/examples/rnnlm/main.cc
+++ b/examples/rnnlm/main.cc
@@ -15,6 +15,9 @@ int main(int argc, char **argv) {
   driver.RegisterLayer<singa::EmbeddingLayer, std::string>("kEmbedding");
   driver.RegisterLayer<singa::HiddenLayer, std::string>("kHidden");
   driver.RegisterLayer<singa::OutputLayer, std::string>("kOutput");
+  driver.RegisterLayer<singa::RnnDataLayer, std::string>("kRnnData");
+  driver.RegisterLayer<singa::WordLayer, std::string>("kWord");
+  driver.RegisterLayer<singa::RnnLabelLayer, std::string>("kRnnLabel");
 
   singa::JobProto jobConf = driver.job_conf();
 
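The three RegisterLayer calls above bind the new classes to the string type
identifiers ("kRnnData", "kWord", "kRnnLabel") that a job configuration can
then refer to, as in the hypothetical fragment near the top of this message.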

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4a8d2b2/examples/rnnlm/rnnlm.cc
----------------------------------------------------------------------
diff --git a/examples/rnnlm/rnnlm.cc b/examples/rnnlm/rnnlm.cc
index ddb0f63..180300f 100644
--- a/examples/rnnlm/rnnlm.cc
+++ b/examples/rnnlm/rnnlm.cc
@@ -25,6 +25,83 @@ inline Tensor<cpu, 1> RTensor1(Blob<float>* blob) {
   return tensor;
 }
 
+
+/*******InputLayer**************/
+RnnDataLayer::~RnnDataLayer() {
+  if (shard_ != nullptr)
+    delete shard_;
+  shard_ = nullptr;
+}
+
+void RnnDataLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  shard_ = new DataShard(proto.GetExtension(input_conf).path(), DataShard::kRead);
+  string key;
+  max_window_ = proto.GetExtension(input_conf).max_window();
+  records_.resize(max_window_ + 1);  // # of records in data layer is max_window_ + 1
+  window_ = 0;
+  shard_->Next(&key, &records_[window_]);
+}
+
+void RnnDataLayer::ComputeFeature(int flag, Metric *perf) {
+  CHECK(records_.size() <= shard_->Count());
+  records_[0] = records_[window_];
+  window_ = max_window_;
+  singa::WordRecord wr;
+  for (int i = 1; i <= max_window_; i++) {
+    string key;
+    if (shard_->Next(&key, &records_[i])) {
+      wr = records_[i];
+      if (wr.word_index() == 0) {  // word_index 0 is "</s>": stop at sentence end
+        window_ = i;
+        break;
+      }
+    } else {  // end of shard reached: wrap around to the first record
+      shard_->SeekToFirst();
+      CHECK(shard_->Next(&key, &records_[i]));
+    }
+  }
+}
+
+/*******WordLayer**************/
+void WordLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  CHECK_EQ(srclayers_.size(), 1);
+  int max_window = static_cast<RnnDataLayer*>(srclayers_[0])->max_window();
+  data_.Reshape(vector<int>{max_window});
+}
+
+void WordLayer::ComputeFeature(int flag, Metric *perf) {
+  auto records = static_cast<RnnDataLayer*>(srclayers_[0])->records();
+  float *word = data_.mutable_cpu_data();
+  window_ = static_cast<RNNLayer*>(srclayers_[0])->window();
+  for (int i = 0; i < window_; i++) {
+    word[i] = records[i].word_index();
+  }
+}
+
+
+/*******LabelLayer**************/
+void RnnLabelLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  CHECK_EQ(srclayers_.size(), 1);
+  int max_window = static_cast<RnnDataLayer*>(srclayers_[0])->max_window();
+  data_.Reshape(vector<int>{max_window, 4});
+}
+
+void RnnLabelLayer::ComputeFeature(int flag, Metric *perf) {
+  auto records = static_cast<RnnDataLayer*>(srclayers_[0])->records();
+  float *label = data_.mutable_cpu_data();
+  window_ = static_cast<RNNLayer*>(srclayers_[0])->window();
+  for (int i = 0; i < window_; i++) {
+    label[4 * i + 0] = records[i + 1].class_start();
+    label[4 * i + 1] = records[i + 1].class_end();
+    label[4 * i + 2] = records[i + 1].word_index();
+    label[4 * i + 3] = records[i + 1].class_index();
+  }
+}
+
 /*******EmbeddingLayer**************/
 EmbeddingLayer::~EmbeddingLayer() {
   delete embed_;
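
The layout written by RnnLabelLayer packs, for each of the window_ positions,
four consecutive floats describing the target (next) word records_[i + 1]:
class start, class end, word index, class index. A downstream layer would
decode it along these lines (a hypothetical sketch; the variable names and
the blob accessor are illustrative, not part of this commit):

    // hypothetical consumer of RnnLabelLayer's output (shape: window x 4)
    const float *label = labels->cpu_data();  // labels: Blob<float>*
    for (int i = 0; i < window; i++) {
        int class_start = static_cast<int>(label[4 * i + 0]);
        int class_end   = static_cast<int>(label[4 * i + 1]);  // exclusive
        int word_idx    = static_cast<int>(label[4 * i + 2]);
        int class_idx   = static_cast<int>(label[4 * i + 3]);
        // e.g. softmax over classes, then over words in [class_start, class_end)
    }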

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4a8d2b2/examples/rnnlm/rnnlm.h
----------------------------------------------------------------------
diff --git a/examples/rnnlm/rnnlm.h b/examples/rnnlm/rnnlm.h
index 14d947c..e9b7c55 100644
--- a/examples/rnnlm/rnnlm.h
+++ b/examples/rnnlm/rnnlm.h
@@ -1,4 +1,5 @@
 #include "singa.h"
+#include "rnnlm.pb.h"
 namespace singa {
 
 /**
@@ -23,6 +24,52 @@ class RNNLayer : public NeuronLayer {
 };
 
 /**
+ * Input layer that reads records from the data shard
+ */
+class RnnDataLayer : public RNNLayer {
+ public:
+  ~RnnDataLayer();
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(int flag, Metric *perf) override;
+  void ComputeGradient(int flag, Metric* perf) override {}
+  int max_window() const {
+    return max_window_;
+  }
+
+  const std::vector<singa::WordRecord>& records() const {
+    return records_;
+  }
+
+ private:
+  int max_window_;
+  DataShard* shard_;
+  std::vector<singa::WordRecord> records_;
+};
+
+
+/**
+ * WordLayer that reads records_[0] to records_[window_ - 1] from RnnDataLayer to provide input word indices
+ */
+class WordLayer : public RNNLayer {
+ public:
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(int flag, Metric *perf) override;
+  void ComputeGradient(int flag, Metric* perf) override {}
+};
+
+
+/**
+ * LabelLayer that reads records_[1] to records_[window_] from RnnDataLayer to provide label information
+ */
+class RnnLabelLayer : public RNNLayer {
+ public:
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(int flag, Metric *perf) override;
+  void ComputeGradient(int flag, Metric* perf) override {}
+};
+
+
+/**
  * Word embedding layer that get one row from the embedding matrix for each
  * word based on the word index
  */

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4a8d2b2/examples/rnnlm/rnnlm.proto
----------------------------------------------------------------------
diff --git a/examples/rnnlm/rnnlm.proto b/examples/rnnlm/rnnlm.proto
index 35b6bc2..65c34ec 100644
--- a/examples/rnnlm/rnnlm.proto
+++ b/examples/rnnlm/rnnlm.proto
@@ -1,5 +1,6 @@
 package singa;
 import "job.proto";
+import "common.proto";
 
 
 message EmbeddingProto {
@@ -12,7 +13,21 @@ message OutputProto {
   optional int32 vocab_size = 2;
 }
 
+message InputProto {
+  required string path = 1;
+  optional int32 max_window = 2;
+}
+
 extend LayerProto {
   optional EmbeddingProto embedding_conf = 101;
   optional OutputProto output_conf = 102;
+  optional InputProto input_conf = 103;
 }
+
+message WordRecord {
+  optional string word = 1;
+  optional int32 word_index = 2;
+  optional int32 class_index = 3;
+  optional int32 class_start = 4;
+  optional int32 class_end = 5;
+}
\ No newline at end of file
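
For illustration, a single WordRecord serialized in protobuf text format
would look like the following (values invented; class_end is exclusive, per
splitClasses in create_shard.cc):

    word: "apple"
    word_index: 1234
    class_index: 37
    class_start: 1200
    class_end: 1260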