You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2015/09/16 06:19:59 UTC
[20/21] incubator-singa git commit: SINGA-10 Add Support for
Recurrent Neural Networks (RNN)
SINGA-10 Add Support for Recurrent Neural Networks (RNN)
Add user-defined records for word (word string, word id, class id,
startpos, endpos);
Implement RnnDataLayer, WordLayer and RnnLabelLayer;
Implement create_shard.cc for the sample dataset of rnnlmlib;
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/b4a8d2b2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/b4a8d2b2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/b4a8d2b2
Branch: refs/heads/tutorial
Commit: b4a8d2b2beb077e6488569791a1581add4a957bc
Parents: e53a23c
Author: kaiping <ka...@comp.nus.edu.sg>
Authored: Sun Sep 13 20:07:11 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Wed Sep 16 11:39:16 2015 +0800
----------------------------------------------------------------------
examples/rnnlm/Makefile.example | 26 +++
examples/rnnlm/create_shard.cc | 400 +++++++++++++++++++++++++++++++++++
examples/rnnlm/main.cc | 3 +
examples/rnnlm/rnnlm.cc | 77 +++++++
examples/rnnlm/rnnlm.h | 47 ++++
examples/rnnlm/rnnlm.proto | 15 ++
6 files changed, 568 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4a8d2b2/examples/rnnlm/Makefile.example
----------------------------------------------------------------------
diff --git a/examples/rnnlm/Makefile.example b/examples/rnnlm/Makefile.example
index 5eeca78..b4505cf 100644
--- a/examples/rnnlm/Makefile.example
+++ b/examples/rnnlm/Makefile.example
@@ -1,5 +1,31 @@
MSHADOW_FLAGS :=-DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
+libs :=singa glog protobuf
+filename = rnnlm-0.4b.tgz
+# note: filelink for rnnlm-0.4b may change
+filelink = https://f25ea9ccb7d3346ce6891573d543960492b92c30.googledrive.com/host/0ByxdPXuxLPS5RFM5dVNvWVhTd0U
+dirname = $(patsubst %.tgz,%, $(filename))
+numclass = 100
+dirshards = train_shard valid_shard test_shard
+
+
+.PHONY: all download create
+
+# "download" is an alias for the "rnnlm" target below.
+download: rnnlm
+
+# Fetches $(filename) from $(filelink), unpacks it, and removes the archive.
+# NOTE(review): nothing in this recipe visibly creates a file named "rnnlm",
+# so this target presumably re-runs on every invocation -- confirm.
+rnnlm:
+ wget $(filelink)/$(filename)
+ tar zxf $(filename)
+ rm $(filename)
+
+# Builds create_shard.bin from create_shard.cc, creates the directories listed
+# in $(dirshards), then runs the tool on the train and test files of
+# $(dirname) with $(numclass) classes. No -valid file is passed here.
+create:
+ $(CXX) create_shard.cc -std=c++11 -lsinga -lprotobuf -lzookeeper_mt -lglog -I../../include \
+ -L../../.libs/ -L/usr/local/lib -Wl,-unresolved-symbols=ignore-in-shared-libs -Wl,-rpath=../../.libs/ \
+ -o create_shard.bin
+ for d in $(dirshards); do mkdir -p $${d}; done
+ ./create_shard.bin -train $(dirname)/train -class_size $(numclass) -test $(dirname)/test
+
+
all:
protoc --proto_path=../../src/proto --proto_path=. --cpp_out=. rnnlm.proto
$(CXX) main.cc rnnlm.cc rnnlm.pb.cc $(MSHADOW_FLAGS) -std=c++11 -lsinga -lglog -lprotobuf -lopenblas -I../../include\
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4a8d2b2/examples/rnnlm/create_shard.cc
----------------------------------------------------------------------
diff --git a/examples/rnnlm/create_shard.cc b/examples/rnnlm/create_shard.cc
new file mode 100644
index 0000000..dd56a84
--- /dev/null
+++ b/examples/rnnlm/create_shard.cc
@@ -0,0 +1,400 @@
+//
+// This code creates DataShard for RNNLM dataset.
+// It is adapted from the convert_mnist_data from Caffe
+// The RNNLM dataset could be downloaded at
+// http://www.rnnlm.org/
+//
+// Usage:
+// create_shard.bin -train train_file -class_size class_size [-debug] [-valid valid_file] [-test test_file]
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+
+#include "utils/data_shard.h"
+#include "utils/common.h"
+#include "proto/common.pb.h"
+#include "singa.h"
+#include "rnnlm.pb.h"
+
+#define MAX_STRING 100
+
+#include <cstring>
+#include <cstdlib>
+#include <cstdio>
+#include <cmath>
+#include <algorithm>
+#include <fstream>
+
+using namespace std;
+using singa::DataShard;
+
+// One vocabulary entry.
+struct vocab_word {
+ // Number of occurrences counted while reading the training file.
+ int cn;
+ // NUL-terminated word text.
+ char word[MAX_STRING];
+ // Class assigned to this word by splitClasses().
+ int class_index;
+};
+
+// Vocabulary array; grown with realloc in addWordToVocab().
+struct vocab_word *vocab;
+// Number of entries currently allocated for vocab.
+int vocab_max_size;
+// Number of entries of vocab that are in use.
+int vocab_size;
+// Maps getWordHash(word) to an index into vocab; -1 marks an unused slot.
+int *vocab_hash;
+// Number of slots in vocab_hash.
+int vocab_hash_size;
+// Non-zero enables the diagnostic printf output.
+int debug_mode;
+// Non-zero: classes are split by word frequency; zero: by the square root
+// of word frequency (see splitClasses()).
+int old_classes;
+// Per class: smallest vocab index belonging to the class.
+int *class_start;
+// Per class: one past the largest vocab index belonging to the class.
+int *class_end;
+// Number of classes, taken from the -class_size argument.
+int class_size;
+
+// Input file paths taken from -train / -valid / -test.
+char train_file[MAX_STRING];
+char valid_file[MAX_STRING];
+char test_file[MAX_STRING];
+
+// Set to 1 by main() when the matching file argument was supplied.
+int valid_mode;
+int test_mode;
+
+// Maps a NUL-terminated word to a slot of vocab_hash: the characters are
+// folded into a base-237 polynomial (unsigned arithmetic, wrapping on
+// overflow) and the result is reduced modulo vocab_hash_size.
+unsigned int getWordHash(char *word) {
+  unsigned int folded = 0;
+
+  for (const char *cursor = word; *cursor != 0; ++cursor) {
+    folded = folded * 237 + *cursor;
+  }
+
+  return folded % vocab_hash_size;
+}
+
+// Looks a word up in the vocabulary.
+// Returns its index in vocab, or -1 when the word is unknown.
+int searchVocab(char *word) {
+  const unsigned int slot = getWordHash(word);
+  const int cached = vocab_hash[slot];
+
+  // An untouched slot means no word with this hash was ever added.
+  if (cached == -1) return -1;
+  if (strcmp(word, vocab[cached].word) == 0) return cached;
+
+  // The slot points at a different word (hash collision, or the vocabulary
+  // was reordered since the slot was written): scan the whole vocabulary and
+  // repoint the slot at the match.
+  for (int idx = 0; idx < vocab_size; ++idx) {
+    if (strcmp(word, vocab[idx].word) != 0) continue;
+    vocab_hash[slot] = idx;
+    return idx;
+  }
+
+  return -1;  // out of vocabulary
+}
+
+// Appends a word to the vocabulary with an occurrence count of 0 and points
+// its hash slot at the new entry.
+// The text is truncated to MAX_STRING - 1 characters so it always fits the
+// fixed-size vocab_word::word buffer.
+// Returns the index of the new entry. Terminates the program if the
+// vocabulary array cannot be grown.
+int addWordToVocab(char *word) {
+  const int index = vocab_size;
+
+  strncpy(vocab[index].word, word, MAX_STRING - 1);
+  vocab[index].word[MAX_STRING - 1] = 0;
+  vocab[index].cn = 0;
+  vocab_size++;
+
+  if (vocab_size + 2 >= vocab_max_size) {  // grow before the array fills up
+    const int grown_size = vocab_max_size + 100;
+    // Keep the old pointer until realloc succeeds; assigning the result
+    // directly would lose the array on failure.
+    struct vocab_word *grown =
+        (struct vocab_word *) realloc(vocab, grown_size * sizeof(struct vocab_word));
+    if (grown == NULL) {
+      printf("ERROR: out of memory while growing the vocabulary!\n");
+      exit(1);
+    }
+    vocab = grown;
+    vocab_max_size = grown_size;
+  }
+
+  // Hash the stored text so later lookups of the same text find this slot.
+  vocab_hash[getWordHash(vocab[index].word)] = index;
+
+  return index;
+}
+
+// Reads the next token from fin into word as a NUL-terminated string.
+// Tokens are separated by space, tab or newline. A newline that is not
+// preceded by token characters is returned as the token "</s>".
+// word must have room for MAX_STRING chars; longer tokens are truncated.
+// At end of file the value returned by fgetc is stored into word before the
+// loop re-tests feof; the callers in this file test feof(fin) right after the
+// call and discard the word in that case.
+void readWord(char *word, FILE *fin) {
+ int a = 0, ch;
+
+ while (!feof(fin)) {
+ ch = fgetc(fin);
+
+ // 13 is carriage return; it is skipped.
+ if (ch == 13) continue;
+
+ if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
+ if (a > 0) {
+ // A token is complete. A terminating newline is pushed back so that the
+ // next call reports it as "</s>".
+ if (ch == '\n') ungetc(ch, fin);
+ break;
+ }
+
+ if (ch == '\n') {
+ strcpy(word, (char *) "</s>");
+ return;
+ }
+ else continue;
+ }
+
+ word[a] = char(ch);
+ a++;
+
+ // Once the buffer is full, step back so that further characters keep
+ // overwriting the last slot and word[a] = 0 below stays in bounds.
+ if (a >= MAX_STRING) {
+ //printf("Too long word found!\n"); //truncate too long words
+ a--;
+ }
+ }
+ word[a] = 0;
+}
+
+// Orders vocab[1 .. vocab_size - 1] by descending occurrence count using a
+// selection sort. Entry 0 is left where it is.
+void sortVocab() {
+  for (int pos = 1; pos < vocab_size; ++pos) {
+    // Find the most frequent entry in the not-yet-sorted tail.
+    int best = pos;
+    for (int cand = pos + 1; cand < vocab_size; ++cand) {
+      if (vocab[cand].cn > vocab[best].cn) best = cand;
+    }
+
+    const vocab_word held = vocab[best];
+    vocab[best] = vocab[pos];
+    vocab[pos] = held;
+  }
+}
+
+// Builds the vocabulary from train_file.
+// Resets the hash table and the vocabulary, adds "</s>" as entry 0, counts
+// every token of the training file, then sorts the vocabulary by descending
+// count (entry 0 stays in place).
+// Returns 0 on success, -1 if the training file cannot be opened.
+int learnVocabFromTrainFile() {
+  for (int slot = 0; slot < vocab_hash_size; slot++) vocab_hash[slot] = -1;
+
+  vocab_size = 0;
+  addWordToVocab((char *) "</s>");
+
+  FILE *fin = fopen(train_file, "rb");
+  if (fin == NULL) {
+    // Without this check a NULL stream would be handed to readWord().
+    printf("ERROR: training data file not found!\n");
+    return -1;
+  }
+
+  char word[MAX_STRING];
+  int train_wcn = 0;
+  while (1) {
+    readWord(word, fin);
+    if (feof(fin)) break;
+
+    train_wcn++;
+
+    const int known = searchVocab(word);
+    if (known == -1) {
+      const int added = addWordToVocab(word);
+      vocab[added].cn = 1;
+    } else {
+      vocab[known].cn++;
+    }
+  }
+  fclose(fin);
+
+  sortVocab();
+
+  if (debug_mode > 0) {
+    printf("Vocab size: %d\n", vocab_size);
+    printf("Words in train file: %d\n", train_wcn);
+  }
+
+  return 0;
+}
+
+// Assigns every vocabulary entry to one of class_size classes and records,
+// per class, the range of vocab indices it covers.
+// Walking the vocabulary in order, each word adds its weight to a running
+// sum; the class number advances whenever the sum passes the next multiple
+// of 1 / class_size. The weight is the word's relative frequency when
+// old_classes is set, otherwise the normalised square root of it.
+// Allocates class_start and class_end. Always returns 0.
+int splitClasses() {
+  class_start = (int *) calloc(class_size, sizeof(int));
+  // Filling every byte with 0x7f yields a large positive int, so that the
+  // min() below picks up the first real index.
+  memset(class_start, 0x7f, sizeof(int) * class_size);
+  class_end = (int *) calloc(class_size, sizeof(int));
+  memset(class_end, 0, sizeof(int) * class_size);
+
+  int total = 0;
+  for (int w = 0; w < vocab_size; w++) total += vocab[w].cn;
+
+  // Normaliser for the square-root weighting; unused for old classes.
+  double norm = 0;
+  if (!old_classes) {
+    for (int w = 0; w < vocab_size; w++) norm += sqrt(vocab[w].cn / (double) total);
+  }
+
+  double mass = 0;
+  int cls = 0;
+  for (int w = 0; w < vocab_size; w++) {
+    if (old_classes) {
+      mass += vocab[w].cn / (double) total;
+    } else {
+      mass += sqrt(vocab[w].cn / (double) total) / norm;
+    }
+    if (mass > 1) mass = 1;
+
+    // The word that crosses a boundary still belongs to the current class.
+    vocab[w].class_index = cls;
+    if (mass > (cls + 1) / (double) class_size && cls < class_size - 1) cls++;
+  }
+
+  // After dividing the classes, record where each one starts and ends.
+  for (int w = 0; w < vocab_size; w++) {
+    const int c = vocab[w].class_index;
+    class_start[c] = min(w, class_start[c]);
+    class_end[c] = max(w + 1, class_end[c]);
+  }
+  return 0;
+}
+
+// Allocates the vocabulary and its hash table, learns the vocabulary from
+// the training file and splits it into classes.
+// debug_mode is deliberately left untouched: main() sets it from the -debug
+// argument before calling this function, and resetting it here would
+// silently disable that flag.
+// Returns 0 on success, -1 if memory cannot be allocated or the vocabulary
+// cannot be built.
+int init_class() {
+  vocab_max_size = 100;  // initial capacity of vocab, in entries
+  vocab_size = 0;
+  vocab = (struct vocab_word *) calloc(vocab_max_size, sizeof(struct vocab_word));
+  vocab_hash_size = 100000000;
+  vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
+  if (vocab == NULL || vocab_hash == NULL) {
+    printf("ERROR: out of memory while allocating the vocabulary!\n");
+    return -1;
+  }
+  old_classes = 1;
+
+  // read vocab
+  if (learnVocabFromTrainFile() != 0) return -1;
+
+  // split classes
+  if (splitClasses() != 0) return -1;
+
+  return 0;
+}
+
+// Converts a tokenised text file into a DataShard of WordRecord entries.
+// Every token found in the vocabulary produces one record holding the word,
+// its vocabulary index, its class index and the [start, end) vocab range of
+// that class. Tokens that are not in the vocabulary are skipped.
+// @param input_file  path of the text file to read
+// @param output_file path handed to the DataShard constructor
+// @return 0 on success, -1 if input_file cannot be opened
+int create_shard(const char *input_file, const char *output_file) {
+  FILE *fin = fopen(input_file, "rb");
+  if (fin == NULL) {
+    // Without this check a NULL stream would be handed to readWord().
+    printf("ERROR: cannot open input file %s\n", input_file);
+    return -1;
+  }
+
+  DataShard dataShard(output_file, DataShard::kCreate);
+  singa::WordRecord wordRecord;
+
+  char word[MAX_STRING];
+  int record_count = 0;
+  while (1) {
+    readWord(word, fin);
+    if (feof(fin)) break;
+
+    const int i = searchVocab(word);
+    if (i == -1) {
+      if (debug_mode) printf("unknown word [%s] detected!", word);
+      continue;
+    }
+
+    const int class_idx = vocab[i].class_index;
+    wordRecord.set_word(string(word));
+    wordRecord.set_word_index(i);
+    wordRecord.set_class_index(class_idx);
+    wordRecord.set_class_start(class_start[class_idx]);
+    wordRecord.set_class_end(class_end[class_idx]);
+
+    // Key each record by its position rather than by the word text, so that
+    // repeated words do not share a key.
+    // NOTE(review): DataShard::Insert presumably rejects a key it has already
+    // stored -- confirm against utils/data_shard.h.
+    dataShard.Insert(to_string(record_count++), wordRecord);
+  }
+
+  dataShard.Flush();
+  fclose(fin);
+  return 0;
+}
+
+// Returns the position of the first argument in argv[1 .. argc - 1] that is
+// equal to str, or -1 when there is none. argv[0] is never considered.
+int argPos(char *str, int argc, char **argv) {
+  int pos = 1;
+
+  while (pos < argc) {
+    if (strcmp(str, argv[pos]) == 0) return pos;
+    ++pos;
+  }
+
+  return -1;
+}
+
+// Parses one "<flag> <path>" pair from the command line.
+// @param flag  option name, e.g. "-train"
+// @param kind  wording used in the error messages, e.g. "training"
+// @param label wording used in the debug message, e.g. "train"
+// @param dest  buffer of MAX_STRING chars that receives the path
+// @return 1 if the option was given and the file can be opened,
+//         0 if the option is absent,
+//        -1 on error (an error message has been printed)
+static int readFileArg(const char *flag, const char *kind, const char *label,
+                       char *dest, int argc, char **argv) {
+  const int pos = argPos((char *) flag, argc, argv);
+  if (pos <= 0) return 0;
+
+  if (pos + 1 == argc) {
+    printf("ERROR: %s data file not specified!\n", kind);
+    return -1;
+  }
+
+  // dest holds MAX_STRING chars; refuse paths that would overflow it.
+  if (strlen(argv[pos + 1]) >= MAX_STRING) {
+    printf("ERROR: %s data file name too long!\n", kind);
+    return -1;
+  }
+  strcpy(dest, argv[pos + 1]);
+
+  if (debug_mode > 0)
+    printf("%s file: %s\n", label, dest);
+
+  FILE *f = fopen(dest, "rb");
+  if (f == NULL) {
+    printf("ERROR: %s data file not found!\n", kind);
+    return -1;
+  }
+  fclose(f);
+  return 1;
+}
+
+// Entry point.
+// Usage: create_shard.bin -train train_file -class_size class_size
+//                         [-debug] [-valid valid_file] [-test test_file]
+// Writes "train_shard" and, when the matching file is given, "valid_shard"
+// and "test_shard". Returns 0 on success and 1 on any error.
+int main(int argc, char **argv) {
+  // set debug mode
+  if (argPos((char *) "-debug", argc, argv) > 0) {
+    debug_mode = 1;
+    printf("debug mode: %d\n", debug_mode);
+  }
+
+  // search for train file; it is mandatory
+  const int train_state = readFileArg("-train", "training", "train", train_file, argc, argv);
+  if (train_state < 0) return 1;
+  if (train_state == 0) {
+    printf("ERROR: training data must be set.\n");
+    return 1;
+  }
+
+  // search for valid file
+  const int valid_state = readFileArg("-valid", "validating", "valid", valid_file, argc, argv);
+  if (valid_state < 0) return 1;
+  if (valid_state > 0) valid_mode = 1;
+
+  // search for test file
+  const int test_state = readFileArg("-test", "testing", "test", test_file, argc, argv);
+  if (test_state < 0) return 1;
+  if (test_state > 0) test_mode = 1;
+
+  // search for class size
+  const int class_pos = argPos((char *) "-class_size", argc, argv);
+  if (class_pos > 0) {
+    if (class_pos + 1 == argc) {
+      printf("ERROR: class size not specified!\n");
+      return 1;
+    }
+
+    class_size = atoi(argv[class_pos + 1]);
+
+    if (debug_mode > 0)
+      printf("class size: %d\n", class_size);
+  }
+  if (class_size <= 0) {
+    printf("ERROR: no or invalid class size received!\n");
+    return 1;
+  }
+
+  if (init_class() != 0) return 1;
+
+  if (create_shard(train_file, (char *) "train_shard") != 0) return 1;
+  if (valid_mode && create_shard(valid_file, (char *) "valid_shard") != 0) return 1;
+  if (test_mode && create_shard(test_file, (char *) "test_shard") != 0) return 1;
+
+  return 0;
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4a8d2b2/examples/rnnlm/main.cc
----------------------------------------------------------------------
diff --git a/examples/rnnlm/main.cc b/examples/rnnlm/main.cc
index 690c158..3cb59f1 100644
--- a/examples/rnnlm/main.cc
+++ b/examples/rnnlm/main.cc
@@ -15,6 +15,9 @@ int main(int argc, char **argv) {
driver.RegisterLayer<singa::EmbeddingLayer, std::string>("kEmbedding");
driver.RegisterLayer<singa::HiddenLayer, std::string>("kHidden");
driver.RegisterLayer<singa::OutputLayer, std::string>("kOutput");
+ driver.RegisterLayer<singa::RnnDataLayer, std::string>("kRnnData");
+ driver.RegisterLayer<singa::WordLayer, std::string>("kWord");
+ driver.RegisterLayer<singa::RnnLabelLayer, std::string>("kRnnLabel");
singa::JobProto jobConf = driver.job_conf();
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4a8d2b2/examples/rnnlm/rnnlm.cc
----------------------------------------------------------------------
diff --git a/examples/rnnlm/rnnlm.cc b/examples/rnnlm/rnnlm.cc
index ddb0f63..180300f 100644
--- a/examples/rnnlm/rnnlm.cc
+++ b/examples/rnnlm/rnnlm.cc
@@ -25,6 +25,83 @@ inline Tensor<cpu, 1> RTensor1(Blob<float>* blob) {
return tensor;
}
+
+/*******InputLayer**************/
+// Releases the data shard opened in Setup().
+RnnDataLayer::~RnnDataLayer() {
+  // delete on a null pointer is a no-op, so no guard is needed.
+  delete shard_;
+  shard_ = nullptr;
+}
+
+// Opens the shard named by the input_conf extension for reading, sizes the
+// record buffer to max_window + 1 entries and loads the first record into
+// slot 0.
+void RnnDataLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+
+  const auto& conf = proto.GetExtension(input_conf);
+  shard_ = new DataShard(conf.path(), DataShard::kRead);
+  max_window_ = conf.max_window();
+
+  // One extra slot: records_[0] carries the last record of the previous
+  // window into the next one (see ComputeFeature).
+  records_.resize(max_window_ + 1);
+  window_ = 0;
+
+  string first_key;
+  shard_->Next(&first_key, &records_[window_]);
+}
+
+// Loads the next window of records.
+// The last record of the previous window becomes records_[0]; up to
+// max_window_ further records are then read into records_[1..]. Reading
+// stops early after a record whose word_index is 0, and window_ is set to
+// the position of that record (otherwise to max_window_). When the shard is
+// exhausted, reading restarts from its first record.
+void RnnDataLayer::ComputeFeature(int flag, Metric *perf) {
+  CHECK(records_.size() <= shard_->Count());
+
+  records_[0] = records_[window_];
+  window_ = max_window_;
+
+  for (int pos = 1; pos <= max_window_; ++pos) {
+    string key;
+    if (!shard_->Next(&key, &records_[pos])) {
+      // End of shard: wrap around. The record read here is not tested for
+      // word_index 0.
+      // NOTE(review): presumably an oversight -- confirm whether the window
+      // should also end on a wrapped-around record.
+      shard_->SeekToFirst();
+      CHECK(shard_->Next(&key, &records_[pos]));
+      continue;
+    }
+    if (records_[pos].word_index() == 0) {
+      window_ = pos;
+      break;
+    }
+  }
+}
+
+/*******WordLayer**************/
+// Sizes the output blob to hold one value per position of the source data
+// layer's maximum window. Requires exactly one source layer.
+void WordLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  CHECK_EQ(srclayers_.size(), 1);
+
+  const auto* source = static_cast<RnnDataLayer*>(srclayers_[0]);
+  const int longest_window = source->max_window();
+  data_.Reshape(vector<int>{longest_window});
+}
+
+// Copies the word indices of records 0 .. window_ - 1 of the source data
+// layer into this layer's data blob, after adopting the source layer's
+// current window length.
+void WordLayer::ComputeFeature(int flag, Metric *perf) {
+  // Bind by reference: records() returns a const reference, and a plain
+  // `auto` would copy the whole record vector on every call.
+  const auto& records = static_cast<RnnDataLayer*>(srclayers_[0])->records();
+  float *word = data_.mutable_cpu_data();
+  window_ = static_cast<RNNLayer*>(srclayers_[0])->window();
+  for (int i = 0; i < window_; i++) {
+    word[i] = records[i].word_index();
+  }
+}
+
+
+/*******LabelLayer**************/
+// Sizes the output blob to max_window rows of 4 values each, taking
+// max_window from the source data layer. Requires exactly one source layer.
+void RnnLabelLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  CHECK_EQ(srclayers_.size(), 1);
+
+  const auto* source = static_cast<RnnDataLayer*>(srclayers_[0]);
+  const int longest_window = source->max_window();
+  data_.Reshape(vector<int>{longest_window, 4});
+}
+
+// Fills the label blob from records 1 .. window_ of the source data layer,
+// after adopting the source layer's current window length. Row i holds, for
+// record i + 1: class_start, class_end, word_index, class_index.
+void RnnLabelLayer::ComputeFeature(int flag, Metric *perf) {
+  // Bind by reference: records() returns a const reference, and a plain
+  // `auto` would copy the whole record vector on every call.
+  const auto& records = static_cast<RnnDataLayer*>(srclayers_[0])->records();
+  float *label = data_.mutable_cpu_data();
+  window_ = static_cast<RNNLayer*>(srclayers_[0])->window();
+  for (int i = 0; i < window_; i++) {
+    const auto& next = records[i + 1];
+    float *row = label + 4 * i;
+    row[0] = next.class_start();
+    row[1] = next.class_end();
+    row[2] = next.word_index();
+    row[3] = next.class_index();
+  }
+}
+
/*******EmbeddingLayer**************/
EmbeddingLayer::~EmbeddingLayer() {
delete embed_;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4a8d2b2/examples/rnnlm/rnnlm.h
----------------------------------------------------------------------
diff --git a/examples/rnnlm/rnnlm.h b/examples/rnnlm/rnnlm.h
index 14d947c..e9b7c55 100644
--- a/examples/rnnlm/rnnlm.h
+++ b/examples/rnnlm/rnnlm.h
@@ -1,4 +1,5 @@
#include "singa.h"
+#include "rnnlm.pb.h"
namespace singa {
/**
@@ -23,6 +24,52 @@ class RNNLayer : public NeuronLayer {
};
/**
+ * Input layer that reads records from a data shard
+ */
+class RnnDataLayer : public RNNLayer {
+ public:
+  ~RnnDataLayer();
+  // Opens the shard and loads the first record.
+  void Setup(const LayerProto& proto, int npartitions) override;
+  // Loads the next window of records.
+  void ComputeFeature(int flag, Metric *perf) override;
+  // A data layer has nothing to back-propagate.
+  void ComputeGradient(int flag, Metric* perf) override {}
+
+  // Maximum window length, as configured in Setup().
+  int max_window() const {
+    return max_window_;
+  }
+
+  // Record buffer of max_window() + 1 entries filled by ComputeFeature().
+  const std::vector<singa::WordRecord>& records() const {
+    return records_;
+  }
+
+ private:
+  // Initialised here so that the destructor is safe even if Setup() was
+  // never called.
+  int max_window_ = 0;
+  DataShard* shard_ = nullptr;  // owned; deleted in the destructor
+  std::vector<singa::WordRecord> records_;
+};
+
+
+/**
+ * WordLayer that reads records_[0] to records_[window_ - 1] from RnnDataLayer to provide the input data for computation
+ */
+class WordLayer : public RNNLayer {
+ public:
+ // Sizes the data blob from the source layer's max_window().
+ void Setup(const LayerProto& proto, int npartitions) override;
+ // Copies the word indices of the current window into the data blob.
+ void ComputeFeature(int flag, Metric *perf) override;
+ // Intentionally empty.
+ void ComputeGradient(int flag, Metric* perf) override {}
+};
+
+
+/**
+ * LabelLayer that reads records_[1] to records_[window_] from RnnDataLayer to provide the label information
+ */
+class RnnLabelLayer : public RNNLayer {
+ public:
+ // Sizes the data blob to max_window() rows of 4 values.
+ void Setup(const LayerProto& proto, int npartitions) override;
+ // Writes class_start, class_end, word_index and class_index per record.
+ void ComputeFeature(int flag, Metric *perf) override;
+ // Intentionally empty.
+ void ComputeGradient(int flag, Metric* perf) override {}
+};
+
+
+/**
* Word embedding layer that get one row from the embedding matrix for each
* word based on the word index
*/
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b4a8d2b2/examples/rnnlm/rnnlm.proto
----------------------------------------------------------------------
diff --git a/examples/rnnlm/rnnlm.proto b/examples/rnnlm/rnnlm.proto
index 35b6bc2..65c34ec 100644
--- a/examples/rnnlm/rnnlm.proto
+++ b/examples/rnnlm/rnnlm.proto
@@ -1,5 +1,6 @@
package singa;
import "job.proto";
+import "common.proto";
message EmbeddingProto {
@@ -12,7 +13,21 @@ message OutputProto {
optional int32 vocab_size = 2;
}
+// Configuration read by RnnDataLayer::Setup.
+message InputProto {
+ // Path of the data shard to read.
+ required string path = 1;
+ // Maximum number of records per window.
+ optional int32 max_window = 2;
+}
+
extend LayerProto {
optional EmbeddingProto embedding_conf = 101;
optional OutputProto output_conf = 102;
+ // Configuration of the RNN data layer.
+ optional InputProto input_conf = 103;
}
+
+// One word occurrence as written by create_shard.cc.
+message WordRecord {
+ // Word text.
+ optional string word = 1;
+ // Index of the word in the vocabulary.
+ optional int32 word_index = 2;
+ // Index of the class the word belongs to.
+ optional int32 class_index = 3;
+ // Smallest vocabulary index belonging to that class.
+ optional int32 class_start = 4;
+ // One past the largest vocabulary index belonging to that class.
+ optional int32 class_end = 5;
+}
\ No newline at end of file