You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2015/08/12 11:49:43 UTC
incubator-singa git commit: SINGA-47 Fix a bug in data layers that
leads to out-of-memory when group size is too large
Repository: incubator-singa
Updated Branches:
refs/heads/master 538736c4a -> 7a61a687c
SINGA-47 Fix a bug in data layers that leads to out-of-memory when group size is too large
The bug is fixed by closing the data source (e.g., lmdb or datashard) after reading a sample record in the Setup function.
The data source would cache memory, which would eat up all available memory if there are many data layers.
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/7a61a687
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/7a61a687
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/7a61a687
Branch: refs/heads/master
Commit: 7a61a687c2ceb4fc7e05c2d3bbd9817e8ba59e3f
Parents: 538736c
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Wed Aug 12 17:28:50 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Wed Aug 12 17:32:50 2015 +0800
----------------------------------------------------------------------
include/neuralnet/layer.h | 4 +++-
include/neuralnet/optional_layer.h | 2 ++
src/neuralnet/layer.cc | 14 +++++++++++--
src/neuralnet/neuralnet.cc | 3 +++
src/neuralnet/optional_layer.cc | 36 +++++++++++++++++++++------------
5 files changed, 43 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7a61a687/include/neuralnet/layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/layer.h b/include/neuralnet/layer.h
index 05db916..118da56 100644
--- a/include/neuralnet/layer.h
+++ b/include/neuralnet/layer.h
@@ -335,10 +335,12 @@ class ShardDataLayer: public DataLayer{
public:
using Layer::ComputeFeature;
+ ~ShardDataLayer();
void Setup(const LayerProto& proto, int npartitions) override;
void ComputeFeature(Phase phase, Metric *perf) override;
+
private:
- shared_ptr<DataShard> shard_;
+ DataShard* shard_;
};
/**
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7a61a687/include/neuralnet/optional_layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/optional_layer.h b/include/neuralnet/optional_layer.h
index 2cbcdb8..f6b60d3 100644
--- a/include/neuralnet/optional_layer.h
+++ b/include/neuralnet/optional_layer.h
@@ -9,6 +9,8 @@ class LMDBDataLayer: public DataLayer{
public:
using Layer::ComputeFeature;
+ ~LMDBDataLayer();
+ void OpenLMDB(const std::string& path);
void Setup(const LayerProto& proto, int npartitions) override;
void ComputeFeature(Phase phase, Metric *perf) override;
void ConvertCaffeDatumToRecord(const CaffeDatum& datum,
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7a61a687/src/neuralnet/layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc
index 1fa92fb..314bb14 100644
--- a/src/neuralnet/layer.cc
+++ b/src/neuralnet/layer.cc
@@ -666,6 +666,9 @@ void RGBImageLayer::Setup(const LayerProto& proto, int npartitions) {
/***************Implementation for ShardDataLayer**************************/
void ShardDataLayer::ComputeFeature(Phase phase, Metric* perf){
+ if (shard_ == nullptr)
+ shard_ = new DataShard(layer_proto_.sharddata_conf().path(),
+ DataShard::kRead);
if(random_skip_){
int nskip = rand() % random_skip_;
LOG(INFO)<<"Random Skip "<<nskip<<" records, there are "<<shard_->Count()
@@ -687,10 +690,11 @@ void ShardDataLayer::ComputeFeature(Phase phase, Metric* perf){
void ShardDataLayer::Setup(const LayerProto& proto, int npartitions) {
Layer::Setup(proto, npartitions);
- shard_= std::make_shared<DataShard>(proto.sharddata_conf().path(),
- DataShard::kRead);
+ shard_= new DataShard(proto.sharddata_conf().path(), DataShard::kRead);
string key;
shard_->Next(&key, &sample_);
+ delete shard_;
+ shard_ = nullptr;
batchsize_=proto.sharddata_conf().batchsize();
if(partition_dim() == 0)
batchsize_ /= npartitions;
@@ -698,6 +702,12 @@ void ShardDataLayer::Setup(const LayerProto& proto, int npartitions) {
records_.resize(batchsize_);
random_skip_=proto.sharddata_conf().random_skip();
}
+
+ShardDataLayer::~ShardDataLayer() {
+ if (shard_ != nullptr)
+ delete shard_;
+ shard_ = nullptr;
+}
/*******************Implementation of TanLayer***************************/
void TanhLayer::Setup(const LayerProto& proto, int npartitions){
Layer::Setup(proto, npartitions);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7a61a687/src/neuralnet/neuralnet.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuralnet.cc b/src/neuralnet/neuralnet.cc
index 10ddcf1..83bd4fd 100644
--- a/src/neuralnet/neuralnet.cc
+++ b/src/neuralnet/neuralnet.cc
@@ -3,6 +3,9 @@
#include "neuralnet/neuralnet.h"
#include "utils/singleton.h"
+#ifdef USE_OPTIONAL_LAYER
+#include "neuralnet/optional_layer.h"
+#endif
namespace singa {
// macros to shorten the code
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7a61a687/src/neuralnet/optional_layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/optional_layer.cc b/src/neuralnet/optional_layer.cc
index ba85807..06f413f 100644
--- a/src/neuralnet/optional_layer.cc
+++ b/src/neuralnet/optional_layer.cc
@@ -3,7 +3,9 @@
namespace singa {
/*********************LMDBDataLayer**********************************/
-void LMDBDataLayer::ComputeFeature(Phase phase, Metric* perf){
+void LMDBDataLayer::ComputeFeature(Phase phase, Metric* perf) {
+ if (mdb_cursor_ == nullptr)
+ OpenLMDB(layer_proto_.lmdbdata_conf().path());
if(random_skip_){
int nskip = rand() % random_skip_;
int n=0;
@@ -63,29 +65,31 @@ void LMDBDataLayer::ConvertCaffeDatumToRecord(const CaffeDatum& datum,
}
}
-void LMDBDataLayer::Setup(const LayerProto& proto, int npartitions) {
- Layer::Setup(proto, npartitions);
+void LMDBDataLayer::OpenLMDB(const std::string& path) {
CHECK_EQ(mdb_env_create(&mdb_env_), MDB_SUCCESS) << "mdb_env_create failed";
CHECK_EQ(mdb_env_set_mapsize(mdb_env_, 1099511627776), MDB_SUCCESS); // 1TB
- CHECK_EQ(mdb_env_open(mdb_env_,
- proto.lmdbdata_conf().path().c_str(),
- MDB_RDONLY, 0664), MDB_SUCCESS) << "cannot open lmdb "
- << proto.lmdbdata_conf().path();
+ CHECK_EQ(mdb_env_open(mdb_env_, path.c_str(),
+ MDB_RDONLY, 0664), MDB_SUCCESS) << "cannot open lmdb " << path;
CHECK_EQ(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn_), MDB_SUCCESS)
<< "mdb_txn_begin failed";
CHECK_EQ(mdb_open(mdb_txn_, NULL, 0, &mdb_dbi_), MDB_SUCCESS)
<< "mdb_open failed";
CHECK_EQ(mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_), MDB_SUCCESS)
<< "mdb_cursor_open failed";
- LOG(INFO) << "Opening lmdb " << proto.lmdbdata_conf().path();
+ LOG(INFO) << "Opening lmdb " << path;
CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_FIRST),
MDB_SUCCESS) << "mdb_cursor_get failed";
+}
+
+void LMDBDataLayer::Setup(const LayerProto& proto, int npartitions) {
+ Layer::Setup(proto, npartitions);
+ OpenLMDB(proto.lmdbdata_conf().path());
+ CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_NEXT),
+ MDB_SUCCESS);
+ mdb_cursor_close(mdb_cursor_);
+ mdb_txn_abort(mdb_txn_);
+ mdb_cursor_ = nullptr;
- if (mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_NEXT)
- != MDB_SUCCESS) {
- CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_,
- MDB_FIRST), MDB_SUCCESS);
- }
CaffeDatum datum;
datum.ParseFromArray(mdb_value_.mv_data, mdb_value_.mv_size);
SingleLabelImageRecord* record=sample_.mutable_image();
@@ -98,6 +102,12 @@ void LMDBDataLayer::Setup(const LayerProto& proto, int npartitions) {
random_skip_=proto.lmdbdata_conf().random_skip();
}
+LMDBDataLayer::~LMDBDataLayer() {
+ mdb_cursor_close(mdb_cursor_);
+ mdb_txn_abort(mdb_txn_);
+ mdb_cursor_ = nullptr;
+}
+
} /* singa */
#endif