You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by ha...@apache.org on 2018/11/30 02:02:26 UTC
[incubator-mxnet] branch master updated: Add DGL subgraph sampling
op (#13392)
This is an automated email from the ASF dual-hosted git repository.
haibin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new c72a38b Add DGL subgraph sampling op (#13392)
c72a38b is described below
commit c72a38b9afef8dc7261972f7cc43fffb13e7853b
Author: Chao Ma <mc...@gmail.com>
AuthorDate: Fri Nov 30 10:02:09 2018 +0800
Add DGL subgraph sampling op (#13392)
* add csr sample op
* fix compile error in some platform
* update
* update openmp
* speedup sampling
* update csr
* update csr
* update time seed
* update
* fix compiler error
* update doc
* fix ci error
---
src/operator/contrib/dgl_graph.cc | 872 +++++++++++++++++++++++++++++++-
tests/python/unittest/test_dgl_graph.py | 122 +++++
2 files changed, 991 insertions(+), 3 deletions(-)
diff --git a/src/operator/contrib/dgl_graph.cc b/src/operator/contrib/dgl_graph.cc
index d9bcdd4..1bb47b8 100644
--- a/src/operator/contrib/dgl_graph.cc
+++ b/src/operator/contrib/dgl_graph.cc
@@ -32,6 +32,875 @@
namespace mxnet {
namespace op {
+typedef int64_t dgl_id_t;
+
+////////////////////////////// Graph Sampling ///////////////////////////////
+
+/*
+ * ArrayHeap is used to sample elements from vector
+ */
+class ArrayHeap {
+ public:
+ explicit ArrayHeap(const std::vector<float>& prob) {
+ vec_size_ = prob.size();
+ bit_len_ = ceil(log2(vec_size_));
+ limit_ = 1 << bit_len_;
+ // allocate twice the size
+ heap_.resize(limit_ << 1, 0);
+ // allocate the leaves
+ for (int i = limit_; i < vec_size_+limit_; ++i) {
+ heap_[i] = prob[i-limit_];
+ }
+ // iterate up the tree (this is O(m))
+ for (int i = bit_len_-1; i >= 0; --i) {
+ for (int j = (1 << i); j < (1 << (i + 1)); ++j) {
+ heap_[j] = heap_[j << 1] + heap_[(j << 1) + 1];
+ }
+ }
+ }
+ ~ArrayHeap() {}
+
+ /*
+ * Remove term from index (this costs O(log m) steps)
+ */
+ void Delete(size_t index) {
+ size_t i = index + limit_;
+ float w = heap_[i];
+ for (int j = bit_len_; j >= 0; --j) {
+ heap_[i] -= w;
+ i = i >> 1;
+ }
+ }
+
+ /*
+ * Add value w to index (this costs O(log m) steps)
+ */
+ void Add(size_t index, float w) {
+ size_t i = index + limit_;
+ for (int j = bit_len_; j >= 0; --j) {
+ heap_[i] += w;
+ i = i >> 1;
+ }
+ }
+
+ /*
+ * Sample from arrayHeap
+ */
+ size_t Sample(unsigned int* seed) {
+ float xi = heap_[1] * (rand_r(seed)%100/101.0);
+ int i = 1;
+ while (i < limit_) {
+ i = i << 1;
+ if (xi >= heap_[i]) {
+ xi -= heap_[i];
+ i += 1;
+ }
+ }
+ return i - limit_;
+ }
+
+ /*
+ * Sample a vector by given the size n
+ */
+ void SampleWithoutReplacement(size_t n, std::vector<size_t>* samples, unsigned int* seed) {
+ // sample n elements
+ for (size_t i = 0; i < n; ++i) {
+ samples->at(i) = this->Sample(seed);
+ this->Delete(samples->at(i));
+ }
+ }
+
+ private:
+ int vec_size_; // sample size
+ int bit_len_; // bit size
+ int limit_;
+ std::vector<float> heap_;
+};
+
+struct NeighborSampleParam : public dmlc::Parameter<NeighborSampleParam> {
+ int num_args;
+ dgl_id_t num_hops;
+ dgl_id_t num_neighbor;
+ dgl_id_t max_num_vertices;
+ DMLC_DECLARE_PARAMETER(NeighborSampleParam) {
+ DMLC_DECLARE_FIELD(num_args).set_lower_bound(2)
+ .describe("Number of input NDArray.");
+ DMLC_DECLARE_FIELD(num_hops)
+ .set_default(1)
+ .describe("Number of hops.");
+ DMLC_DECLARE_FIELD(num_neighbor)
+ .set_default(2)
+ .describe("Number of neighbor.");
+ DMLC_DECLARE_FIELD(max_num_vertices)
+ .set_default(100)
+ .describe("Max number of vertices.");
+ }
+};
+
+DMLC_REGISTER_PARAMETER(NeighborSampleParam);
+
+/*
+ * Check uniform Storage Type
+ */
+static bool CSRNeighborUniformSampleStorageType(const nnvm::NodeAttrs& attrs,
+ const int dev_mask,
+ DispatchMode* dispatch_mode,
+ std::vector<int> *in_attrs,
+ std::vector<int> *out_attrs) {
+ const NeighborSampleParam& params = nnvm::get<NeighborSampleParam>(attrs.parsed);
+
+ size_t num_subgraphs = params.num_args - 1;
+ CHECK_EQ(out_attrs->size(), 3 * num_subgraphs);
+
+ // input[0] is csr_graph
+ CHECK_EQ(in_attrs->at(0), mxnet::kCSRStorage);
+ // the rest input ndarray is seed_vector
+ for (size_t i = 0; i < num_subgraphs; i++)
+ CHECK_EQ(in_attrs->at(1 + i), mxnet::kDefaultStorage);
+
+ bool success = true;
+ // sample_id
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ if (!type_assign(&(*out_attrs)[i], mxnet::kDefaultStorage)) {
+ success = false;
+ }
+ }
+ // sub_graph
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ if (!type_assign(&(*out_attrs)[i + num_subgraphs], mxnet::kCSRStorage)) {
+ success = false;
+ }
+ }
+ // sub_layer
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ if (!type_assign(&(*out_attrs)[i + 2*num_subgraphs], mxnet::kDefaultStorage)) {
+ success = false;
+ }
+ }
+
+ *dispatch_mode = DispatchMode::kFComputeEx;
+
+ return success;
+}
+
+/*
+ * Check non-uniform Storage Type
+ */
+static bool CSRNeighborNonUniformSampleStorageType(const nnvm::NodeAttrs& attrs,
+ const int dev_mask,
+ DispatchMode* dispatch_mode,
+ std::vector<int> *in_attrs,
+ std::vector<int> *out_attrs) {
+ const NeighborSampleParam& params =
+ nnvm::get<NeighborSampleParam>(attrs.parsed);
+
+ size_t num_subgraphs = params.num_args - 2;
+ CHECK_EQ(out_attrs->size(), 4 * num_subgraphs);
+
+ // input[0] is csr_graph
+ CHECK_EQ(in_attrs->at(0), mxnet::kCSRStorage);
+ // input[1] is probability
+ CHECK_EQ(in_attrs->at(1), mxnet::kDefaultStorage);
+
+ // the rest input ndarray is seed_vector
+ for (size_t i = 0; i < num_subgraphs; i++)
+ CHECK_EQ(in_attrs->at(2 + i), mxnet::kDefaultStorage);
+
+ bool success = true;
+ // sample_id
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ if (!type_assign(&(*out_attrs)[i], mxnet::kDefaultStorage)) {
+ success = false;
+ }
+ }
+ // sub_graph
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ if (!type_assign(&(*out_attrs)[i + num_subgraphs], mxnet::kCSRStorage)) {
+ success = false;
+ }
+ }
+ // sub_probability
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ if (!type_assign(&(*out_attrs)[i + 2*num_subgraphs], mxnet::kDefaultStorage)) {
+ success = false;
+ }
+ }
+ // sub_layer
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ if (!type_assign(&(*out_attrs)[i + 3*num_subgraphs], mxnet::kDefaultStorage)) {
+ success = false;
+ }
+ }
+
+ *dispatch_mode = DispatchMode::kFComputeEx;
+
+ return success;
+}
+
+/*
+ * Check uniform Shape
+ */
+static bool CSRNeighborUniformSampleShape(const nnvm::NodeAttrs& attrs,
+ std::vector<TShape> *in_attrs,
+ std::vector<TShape> *out_attrs) {
+ const NeighborSampleParam& params =
+ nnvm::get<NeighborSampleParam>(attrs.parsed);
+
+ size_t num_subgraphs = params.num_args - 1;
+ CHECK_EQ(out_attrs->size(), 3 * num_subgraphs);
+ // input[0] is csr graph
+ CHECK_EQ(in_attrs->at(0).ndim(), 2U);
+ CHECK_EQ(in_attrs->at(0)[0], in_attrs->at(0)[1]);
+
+ // the rest input ndarray is seed vector
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ CHECK_EQ(in_attrs->at(1 + i).ndim(), 1U);
+ }
+
+ // Output
+ bool success = true;
+ TShape out_shape(1);
+ // We use the last element to store the actual
+ // number of vertices in the subgraph.
+ out_shape[0] = params.max_num_vertices + 1;
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ SHAPE_ASSIGN_CHECK(*out_attrs, i, out_shape);
+ success = success &&
+ out_attrs->at(i).ndim() != 0U &&
+ out_attrs->at(i).Size() != 0U;
+ }
+ // sub_csr
+ TShape out_csr_shape(2);
+ out_csr_shape[0] = params.max_num_vertices;
+ out_csr_shape[1] = in_attrs->at(0)[1];
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ SHAPE_ASSIGN_CHECK(*out_attrs, i + num_subgraphs, out_csr_shape);
+ success = success &&
+ out_attrs->at(i + num_subgraphs).ndim() != 0U &&
+ out_attrs->at(i + num_subgraphs).Size() != 0U;
+ }
+ // sub_layer
+ TShape out_layer_shape(1);
+ out_layer_shape[0] = params.max_num_vertices;
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ SHAPE_ASSIGN_CHECK(*out_attrs, i + 2*num_subgraphs, out_layer_shape);
+ success = success &&
+ out_attrs->at(i + 2*num_subgraphs).ndim() != 0U &&
+ out_attrs->at(i + 2*num_subgraphs).Size() != 0U;
+ }
+
+ return success;
+}
+
+/*
+ * Check non-uniform Shape
+ */
+static bool CSRNeighborNonUniformSampleShape(const nnvm::NodeAttrs& attrs,
+ std::vector<TShape> *in_attrs,
+ std::vector<TShape> *out_attrs) {
+ const NeighborSampleParam& params =
+ nnvm::get<NeighborSampleParam>(attrs.parsed);
+
+ size_t num_subgraphs = params.num_args - 2;
+ CHECK_EQ(out_attrs->size(), 4 * num_subgraphs);
+ // input[0] is csr graph
+ CHECK_EQ(in_attrs->at(0).ndim(), 2U);
+ CHECK_EQ(in_attrs->at(0)[0], in_attrs->at(0)[1]);
+
+ // input[1] is probability
+ CHECK_EQ(in_attrs->at(1).ndim(), 1U);
+
+ // the rest ndarray is seed vector
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ CHECK_EQ(in_attrs->at(2 + i).ndim(), 1U);
+ }
+
+ // Output
+ bool success = true;
+ TShape out_shape(1);
+ // We use the last element to store the actual
+ // number of vertices in the subgraph.
+ out_shape[0] = params.max_num_vertices + 1;
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ SHAPE_ASSIGN_CHECK(*out_attrs, i, out_shape);
+ success = success &&
+ out_attrs->at(i).ndim() != 0U &&
+ out_attrs->at(i).Size() != 0U;
+ }
+ // sub_csr
+ TShape out_csr_shape(2);
+ out_csr_shape[0] = params.max_num_vertices;
+ out_csr_shape[1] = in_attrs->at(0)[1];
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ SHAPE_ASSIGN_CHECK(*out_attrs, i + num_subgraphs, out_csr_shape);
+ success = success &&
+ out_attrs->at(i + num_subgraphs).ndim() != 0U &&
+ out_attrs->at(i + num_subgraphs).Size() != 0U;
+ }
+ // sub_probability
+ TShape out_prob_shape(1);
+ out_prob_shape[0] = params.max_num_vertices;
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ SHAPE_ASSIGN_CHECK(*out_attrs, i + 2*num_subgraphs, out_prob_shape);
+ success = success &&
+ out_attrs->at(i + 2*num_subgraphs).ndim() != 0U &&
+ out_attrs->at(i + 2*num_subgraphs).Size() != 0U;
+ }
+ // sub_layer
+ TShape out_layer_shape(1);
+ out_layer_shape[0] = params.max_num_vertices;
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ SHAPE_ASSIGN_CHECK(*out_attrs, i + 3*num_subgraphs, out_prob_shape);
+ success = success &&
+ out_attrs->at(i + 3*num_subgraphs).ndim() != 0U &&
+ out_attrs->at(i + 3*num_subgraphs).Size() != 0U;
+ }
+
+ return success;
+}
+
+/*
+ * Check uniform Type
+ */
+static bool CSRNeighborUniformSampleType(const nnvm::NodeAttrs& attrs,
+ std::vector<int> *in_attrs,
+ std::vector<int> *out_attrs) {
+ const NeighborSampleParam& params =
+ nnvm::get<NeighborSampleParam>(attrs.parsed);
+
+ size_t num_subgraphs = params.num_args - 1;
+ CHECK_EQ(out_attrs->size(), 3 * num_subgraphs);
+
+ bool success = true;
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ TYPE_ASSIGN_CHECK(*out_attrs, i, in_attrs->at(1));
+ TYPE_ASSIGN_CHECK(*out_attrs, i + num_subgraphs, in_attrs->at(0));
+ TYPE_ASSIGN_CHECK(*out_attrs, i + 2*num_subgraphs, in_attrs->at(1));
+ success = success &&
+ out_attrs->at(i) != -1 &&
+ out_attrs->at(i + num_subgraphs) != -1 &&
+ out_attrs->at(i + 2*num_subgraphs) != -1;
+ }
+
+ return success;
+}
+
+/*
+ * Check non-uniform Type
+ */
+static bool CSRNeighborNonUniformSampleType(const nnvm::NodeAttrs& attrs,
+ std::vector<int> *in_attrs,
+ std::vector<int> *out_attrs) {
+ const NeighborSampleParam& params =
+ nnvm::get<NeighborSampleParam>(attrs.parsed);
+
+ size_t num_subgraphs = params.num_args - 2;
+ CHECK_EQ(out_attrs->size(), 4 * num_subgraphs);
+
+ bool success = true;
+ for (size_t i = 0; i < num_subgraphs; i++) {
+ TYPE_ASSIGN_CHECK(*out_attrs, i, in_attrs->at(2));
+ TYPE_ASSIGN_CHECK(*out_attrs, i + num_subgraphs, in_attrs->at(0));
+ TYPE_ASSIGN_CHECK(*out_attrs, i + 2*num_subgraphs, in_attrs->at(1));
+ TYPE_ASSIGN_CHECK(*out_attrs, i + 3*num_subgraphs, in_attrs->at(2));
+ success = success &&
+ out_attrs->at(i) != -1 &&
+ out_attrs->at(i + num_subgraphs) != -1 &&
+ out_attrs->at(i + 2*num_subgraphs) != -1 &&
+ out_attrs->at(i + 3*num_subgraphs) != -1;
+ }
+
+ return success;
+}
+
+/*
+ * Get src vertex and edge id for a destination vertex
+ */
+static void GetSrcList(const dgl_id_t* val_list,
+ const dgl_id_t* col_list,
+ const dgl_id_t* indptr,
+ const dgl_id_t dst_id,
+ std::vector<dgl_id_t>* src_list,
+ std::vector<dgl_id_t>* edge_list) {
+ for (dgl_id_t i = *(indptr+dst_id); i < *(indptr+dst_id+1); ++i) {
+ src_list->push_back(col_list[i]);
+ edge_list->push_back(val_list[i]);
+ }
+}
+
+static void RandomSample(size_t set_size,
+ size_t num,
+ std::vector<size_t>* out,
+ unsigned int* seed) {
+ std::unordered_set<size_t> sampled_idxs;
+ while (sampled_idxs.size() < num) {
+ sampled_idxs.insert(rand_r(seed) % set_size);
+ }
+ out->clear();
+ for (auto it = sampled_idxs.begin(); it != sampled_idxs.end(); it++) {
+ out->push_back(*it);
+ }
+}
+
+static void NegateSet(const std::vector<size_t> &idxs,
+ size_t set_size,
+ std::vector<size_t>* out) {
+ // idxs must have been sorted.
+ auto it = idxs.begin();
+ size_t i = 0;
+ CHECK_GT(set_size, idxs.back());
+ for (; i < set_size && it != idxs.end(); i++) {
+ if (*it == i) {
+ it++;
+ continue;
+ }
+ out->push_back(i);
+ }
+ for (; i < set_size; i++) {
+ out->push_back(i);
+ }
+}
+
+/*
+ * Uniform sample
+ */
+static void GetUniformSample(const std::vector<dgl_id_t>& ver_list,
+ const std::vector<dgl_id_t>& edge_list,
+ const size_t max_num_neighbor,
+ std::vector<dgl_id_t>* out_ver,
+ std::vector<dgl_id_t>* out_edge,
+ unsigned int* seed) {
+ CHECK_EQ(ver_list.size(), edge_list.size());
+ // Copy ver_list to output
+ if (ver_list.size() <= max_num_neighbor) {
+ for (size_t i = 0; i < ver_list.size(); ++i) {
+ out_ver->push_back(ver_list[i]);
+ out_edge->push_back(edge_list[i]);
+ }
+ return;
+ }
+ // If we just sample a small number of elements from a large neighbor list.
+ std::vector<size_t> sorted_idxs;
+ if (ver_list.size() > max_num_neighbor * 2) {
+ sorted_idxs.reserve(max_num_neighbor);
+ RandomSample(ver_list.size(), max_num_neighbor, &sorted_idxs, seed);
+ std::sort(sorted_idxs.begin(), sorted_idxs.end());
+ } else {
+ std::vector<size_t> negate;
+ negate.reserve(ver_list.size() - max_num_neighbor);
+ RandomSample(ver_list.size(), ver_list.size() - max_num_neighbor,
+ &negate, seed);
+ std::sort(negate.begin(), negate.end());
+ NegateSet(negate, ver_list.size(), &sorted_idxs);
+ }
+ // verify the result.
+ CHECK_EQ(sorted_idxs.size(), max_num_neighbor);
+ for (size_t i = 1; i < sorted_idxs.size(); i++) {
+ CHECK_GT(sorted_idxs[i], sorted_idxs[i - 1]);
+ }
+ for (auto idx : sorted_idxs) {
+ out_ver->push_back(ver_list[idx]);
+ out_edge->push_back(edge_list[idx]);
+ }
+}
+
+/*
+ * Non-uniform sample via ArrayHeap
+ */
+static void GetNonUniformSample(const float* probability,
+ const std::vector<dgl_id_t>& ver_list,
+ const std::vector<dgl_id_t>& edge_list,
+ const size_t max_num_neighbor,
+ std::vector<dgl_id_t>* out_ver,
+ std::vector<dgl_id_t>* out_edge,
+ unsigned int* seed) {
+ CHECK_EQ(ver_list.size(), edge_list.size());
+ // Copy ver_list to output
+ if (ver_list.size() <= max_num_neighbor) {
+ for (size_t i = 0; i < ver_list.size(); ++i) {
+ out_ver->push_back(ver_list[i]);
+ out_edge->push_back(edge_list[i]);
+ }
+ return;
+ }
+ // Make sample
+ std::vector<size_t> sp_index(max_num_neighbor);
+ std::vector<float> sp_prob(ver_list.size());
+ for (size_t i = 0; i < ver_list.size(); ++i) {
+ sp_prob[i] = probability[ver_list[i]];
+ }
+ ArrayHeap arrayHeap(sp_prob);
+ arrayHeap.SampleWithoutReplacement(max_num_neighbor, &sp_index, seed);
+ out_ver->resize(max_num_neighbor);
+ out_edge->resize(max_num_neighbor);
+ for (size_t i = 0; i < max_num_neighbor; ++i) {
+ size_t idx = sp_index[i];
+ out_ver->at(i) = ver_list[idx];
+ out_edge->at(i) = edge_list[idx];
+ }
+ sort(out_ver->begin(), out_ver->end());
+ sort(out_edge->begin(), out_edge->end());
+}
+
+/*
+ * This is used for BFS traversal
+ */
+struct ver_node {
+ dgl_id_t vertex_id;
+ int level;
+};
+
+/*
+ * Used for subgraph sampling
+ */
+struct neigh_list {
+ std::vector<dgl_id_t> neighs;
+ std::vector<dgl_id_t> edges;
+ neigh_list(const std::vector<dgl_id_t> &_neighs,
+ const std::vector<dgl_id_t> &_edges)
+ : neighs(_neighs), edges(_edges) {}
+};
+
+/*
+ * Sample sub-graph from csr graph
+ */
+static void SampleSubgraph(const NDArray &csr,
+ const NDArray &seed_arr,
+ const NDArray &sampled_ids,
+ const NDArray &sub_csr,
+ float* sub_prob,
+ const NDArray &sub_layer,
+ const float* probability,
+ dgl_id_t num_hops,
+ dgl_id_t num_neighbor,
+ dgl_id_t max_num_vertices) {
+ unsigned int time_seed = time(nullptr);
+ size_t num_seeds = seed_arr.shape().Size();
+ CHECK_GE(max_num_vertices, num_seeds);
+
+ const dgl_id_t* val_list = csr.data().dptr<dgl_id_t>();
+ const dgl_id_t* col_list = csr.aux_data(csr::kIdx).dptr<dgl_id_t>();
+ const dgl_id_t* indptr = csr.aux_data(csr::kIndPtr).dptr<dgl_id_t>();
+ const dgl_id_t* seed = seed_arr.data().dptr<dgl_id_t>();
+ dgl_id_t* out = sampled_ids.data().dptr<dgl_id_t>();
+ dgl_id_t* out_layer = sub_layer.data().dptr<dgl_id_t>();
+
+ // BFS traverse the graph and sample vertices
+ dgl_id_t sub_vertices_count = 0;
+ // <vertex_id, layer_id>
+ std::unordered_map<dgl_id_t, int> sub_ver_mp;
+ std::queue<ver_node> node_queue;
+ // add seed vertices
+ for (size_t i = 0; i < num_seeds; ++i) {
+ ver_node node;
+ node.vertex_id = seed[i];
+ node.level = 0;
+ node_queue.push(node);
+ }
+ std::vector<dgl_id_t> tmp_src_list;
+ std::vector<dgl_id_t> tmp_edge_list;
+ std::vector<dgl_id_t> tmp_sampled_src_list;
+ std::vector<dgl_id_t> tmp_sampled_edge_list;
+ std::unordered_map<dgl_id_t, neigh_list> neigh_mp;
+ size_t num_edges = 0;
+ while (!node_queue.empty() &&
+ sub_vertices_count <= max_num_vertices ) {
+ ver_node& cur_node = node_queue.front();
+ dgl_id_t dst_id = cur_node.vertex_id;
+ if (cur_node.level < num_hops) {
+ auto ret = sub_ver_mp.find(dst_id);
+ if (ret != sub_ver_mp.end()) {
+ node_queue.pop();
+ continue;
+ }
+ tmp_src_list.clear();
+ tmp_edge_list.clear();
+ tmp_sampled_src_list.clear();
+ tmp_sampled_edge_list.clear();
+ GetSrcList(val_list,
+ col_list,
+ indptr,
+ dst_id,
+ &tmp_src_list,
+ &tmp_edge_list);
+ if (probability == nullptr) { // uniform-sample
+ GetUniformSample(tmp_src_list,
+ tmp_edge_list,
+ num_neighbor,
+ &tmp_sampled_src_list,
+ &tmp_sampled_edge_list,
+ &time_seed);
+ } else { // non-uniform-sample
+ GetNonUniformSample(probability,
+ tmp_src_list,
+ tmp_edge_list,
+ num_neighbor,
+ &tmp_sampled_src_list,
+ &tmp_sampled_edge_list,
+ &time_seed);
+ }
+ neigh_mp.insert(std::pair<dgl_id_t, neigh_list>(dst_id,
+ neigh_list(tmp_sampled_src_list,
+ tmp_sampled_edge_list)));
+ num_edges += tmp_sampled_src_list.size();
+ sub_ver_mp[cur_node.vertex_id] = cur_node.level;
+ for (size_t i = 0; i < tmp_sampled_src_list.size(); ++i) {
+ auto ret = sub_ver_mp.find(tmp_sampled_src_list[i]);
+ if (ret == sub_ver_mp.end()) {
+ ver_node new_node;
+ new_node.vertex_id = tmp_sampled_src_list[i];
+ new_node.level = cur_node.level + 1;
+ node_queue.push(new_node);
+ }
+ }
+ } else { // vertex without any neighbor
+ auto ret = sub_ver_mp.find(dst_id);
+ if (ret != sub_ver_mp.end()) {
+ node_queue.pop();
+ continue;
+ }
+ tmp_sampled_src_list.clear();
+ tmp_sampled_edge_list.clear();
+ neigh_mp.insert(std::pair<dgl_id_t, neigh_list>(dst_id,
+ neigh_list(tmp_sampled_src_list, // empty vector
+ tmp_sampled_edge_list))); // empty vector
+ sub_ver_mp[cur_node.vertex_id] = cur_node.level;
+ }
+ sub_vertices_count++;
+ node_queue.pop();
+ }
+
+ // Copy sub_ver_mp to output[0]
+ size_t idx = 0;
+ for (auto& data : sub_ver_mp) {
+ *(out+idx) = data.first;
+ idx++;
+ }
+ size_t num_vertices = sub_ver_mp.size();
+ std::sort(out, out + num_vertices);
+ // The rest data will be set to -1
+ for (dgl_id_t i = idx; i < max_num_vertices; ++i) {
+ *(out+i) = -1;
+ }
+ // The last element stores the actual
+ // number of vertices in the subgraph.
+ out[max_num_vertices] = sub_ver_mp.size();
+ // Copy sub_probability
+ if (sub_prob != nullptr) {
+ for (dgl_id_t i = 0; i < max_num_vertices; ++i) {
+ dgl_id_t idx = out[i];
+ if (idx != -1) {
+ sub_prob[i] = probability[idx];
+ } else {
+ sub_prob[i] = -1;
+ }
+ }
+ }
+ // Copy layer
+ for (dgl_id_t i = 0; i < max_num_vertices; ++i) {
+ dgl_id_t idx = out[i];
+ if (idx != -1) {
+ out_layer[i] = sub_ver_mp[idx];
+ } else {
+ out_layer[i] = -1;
+ }
+ }
+ // Construct sub_csr_graph
+ TShape shape_1(1);
+ TShape shape_2(1);
+ shape_1[0] = num_edges;
+ shape_2[0] = max_num_vertices+1;
+ sub_csr.CheckAndAllocData(shape_1);
+ sub_csr.CheckAndAllocAuxData(csr::kIdx, shape_1);
+ sub_csr.CheckAndAllocAuxData(csr::kIndPtr, shape_2);
+ dgl_id_t* val_list_out = sub_csr.data().dptr<dgl_id_t>();
+ dgl_id_t* col_list_out = sub_csr.aux_data(1).dptr<dgl_id_t>();
+ dgl_id_t* indptr_out = sub_csr.aux_data(0).dptr<dgl_id_t>();
+ indptr_out[0] = 0;
+ size_t collected_nedges = 0;
+ for (size_t i = 0; i < num_vertices; i++) {
+ dgl_id_t dst_id = *(out + i);
+ auto it = neigh_mp.find(dst_id);
+ const auto &edges = it->second.edges;
+ const auto &neighs = it->second.neighs;
+ CHECK_EQ(edges.size(), neighs.size());
+ if (!edges.empty()) {
+ std::copy(edges.begin(), edges.end(), val_list_out + collected_nedges);
+ std::copy(neighs.begin(), neighs.end(), col_list_out + collected_nedges);
+ collected_nedges += edges.size();
+ }
+ indptr_out[i+1] = indptr_out[i] + edges.size();
+ }
+ for (dgl_id_t i = num_vertices+1; i <= max_num_vertices; ++i) {
+ indptr_out[i] = indptr_out[i-1];
+ }
+}
+
+/*
+ * Operator: contrib_csr_neighbor_uniform_sample
+ */
+static void CSRNeighborUniformSampleComputeExCPU(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
+ const NeighborSampleParam& params =
+ nnvm::get<NeighborSampleParam>(attrs.parsed);
+
+ int num_subgraphs = inputs.size() - 1;
+ CHECK_EQ(outputs.size(), 3 * num_subgraphs);
+
+#pragma omp parallel for
+ for (int i = 0; i < num_subgraphs; i++) {
+ SampleSubgraph(inputs[0], // graph_csr
+ inputs[i + 1], // seed vector
+ outputs[i], // sample_id
+ outputs[i + 1*num_subgraphs], // sub_csr
+ nullptr, // sample_id_probability
+ outputs[i + 2*num_subgraphs], // sample_id_layer
+ nullptr, // probability
+ params.num_hops,
+ params.num_neighbor,
+ params.max_num_vertices);
+ }
+}
+
+NNVM_REGISTER_OP(_contrib_dgl_csr_neighbor_uniform_sample)
+.describe(R"code(This operator samples sub-graph from a csr graph via an
+uniform probability.
+Example::
+
+ shape = (5, 5)
+ data_np = np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], dtype=np.int64)
+ indices_np = np.array([1,2,3,4,0,2,3,4,0,1,3,4,0,1,2,4,0,1,2,3], dtype=np.int64)
+ indptr_np = np.array([0,4,8,12,16,20], dtype=np.int64)
+ a = mx.nd.sparse.csr_matrix((data_np, indices_np, indptr_np), shape=shape)
+ a.asnumpy()
+ seed = mx.nd.array([0,1,2,3,4], dtype=np.int64)
+ out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1, num_neighbor=2, max_num_vertices=5)
+
+ out[0]
+ [0 1 2 3 4 5]
+ <NDArray 6 @cpu(0)>
+
+ out[1].asnumpy()
+ array([[ 0, 1, 0, 3, 0],
+ [ 5, 0, 0, 7, 0],
+ [ 9, 0, 0, 11, 0],
+ [13, 0, 15, 0, 0],
+ [17, 0, 19, 0, 0]])
+
+ out[2]
+ [0 0 0 0 0]
+<NDArray 5 @cpu(0)>
+)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<NeighborSampleParam>)
+.set_num_inputs([](const NodeAttrs& attrs) {
+ const NeighborSampleParam& params =
+ nnvm::get<NeighborSampleParam>(attrs.parsed);
+ return params.num_args;
+})
+.set_num_outputs([](const NodeAttrs& attrs) {
+ const NeighborSampleParam& params =
+ nnvm::get<NeighborSampleParam>(attrs.parsed);
+ size_t num_subgraphs = params.num_args - 1;
+ return num_subgraphs * 3;
+})
+.set_attr<FInferStorageType>("FInferStorageType", CSRNeighborUniformSampleStorageType)
+.set_attr<nnvm::FInferShape>("FInferShape", CSRNeighborUniformSampleShape)
+.set_attr<nnvm::FInferType>("FInferType", CSRNeighborUniformSampleType)
+.set_attr<FComputeEx>("FComputeEx<cpu>", CSRNeighborUniformSampleComputeExCPU)
+.add_argument("csr_matrix", "NDArray-or-Symbol", "csr matrix")
+.add_argument("seed_arrays", "NDArray-or-Symbol[]", "seed vertices")
+.set_attr<std::string>("key_var_num_args", "num_args")
+.add_arguments(NeighborSampleParam::__FIELDS__());
+
+/*
+ * Operator: contrib_csr_neighbor_non_uniform_sample
+ */
+static void CSRNeighborNonUniformSampleComputeExCPU(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
+ const NeighborSampleParam& params =
+ nnvm::get<NeighborSampleParam>(attrs.parsed);
+
+ int num_subgraphs = inputs.size() - 2;
+ CHECK_EQ(outputs.size(), 4 * num_subgraphs);
+
+ const float* probability = inputs[1].data().dptr<float>();
+
+#pragma omp parallel for
+ for (int i = 0; i < num_subgraphs; i++) {
+ float* sub_prob = outputs[i+2*num_subgraphs].data().dptr<float>();
+ SampleSubgraph(inputs[0], // graph_csr
+ inputs[i + 2], // seed vector
+ outputs[i], // sample_id
+ outputs[i + 1*num_subgraphs], // sub_csr
+ sub_prob, // sample_id_probability
+ outputs[i + 3*num_subgraphs], // sample_id_layer
+ probability,
+ params.num_hops,
+ params.num_neighbor,
+ params.max_num_vertices);
+ }
+}
+
+NNVM_REGISTER_OP(_contrib_dgl_csr_neighbor_non_uniform_sample)
+.describe(R"code(This operator samples sub-graph from a csr graph via an
+uniform probability.
+Example::
+
+ shape = (5, 5)
+ prob = mx.nd.array([0.9, 0.8, 0.2, 0.4, 0.1], dtype=np.float32)
+ data_np = np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], dtype=np.int64)
+ indices_np = np.array([1,2,3,4,0,2,3,4,0,1,3,4,0,1,2,4,0,1,2,3], dtype=np.int64)
+ indptr_np = np.array([0,4,8,12,16,20], dtype=np.int64)
+ a = mx.nd.sparse.csr_matrix((data_np, indices_np, indptr_np), shape=shape)
+ seed = mx.nd.array([0,1,2,3,4], dtype=np.int64)
+ out = mx.nd.contrib.dgl_csr_neighbor_non_uniform_sample(a, prob, seed, num_args=3, num_hops=1, num_neighbor=2, max_num_vertices=5)
+
+ out[0]
+ [0 1 2 3 4 5]
+ <NDArray 6 @cpu(0)>
+
+ out[1].asnumpy()
+ array([[ 0, 1, 2, 0, 0],
+ [ 5, 0, 6, 0, 0],
+ [ 9, 10, 0, 0, 0],
+ [13, 14, 0, 0, 0],
+ [ 0, 18, 19, 0, 0]])
+
+ out[2]
+ [0.9 0.8 0.2 0.4 0.1]
+ <NDArray 5 @cpu(0)>
+
+ out[3]
+ [0 0 0 0 0]
+ <NDArray 5 @cpu(0)>
+)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<NeighborSampleParam>)
+.set_num_inputs([](const NodeAttrs& attrs) {
+ const NeighborSampleParam& params =
+ nnvm::get<NeighborSampleParam>(attrs.parsed);
+ return params.num_args;
+})
+.set_num_outputs([](const NodeAttrs& attrs) {
+ const NeighborSampleParam& params =
+ nnvm::get<NeighborSampleParam>(attrs.parsed);
+ size_t num_subgraphs = params.num_args - 2;
+ return num_subgraphs * 4;
+})
+.set_attr<FInferStorageType>("FInferStorageType", CSRNeighborNonUniformSampleStorageType)
+.set_attr<nnvm::FInferShape>("FInferShape", CSRNeighborNonUniformSampleShape)
+.set_attr<nnvm::FInferType>("FInferType", CSRNeighborNonUniformSampleType)
+.set_attr<FComputeEx>("FComputeEx<cpu>", CSRNeighborNonUniformSampleComputeExCPU)
+.add_argument("csr_matrix", "NDArray-or-Symbol", "csr matrix")
+.add_argument("probability", "NDArray-or-Symbol", "probability vector")
+.add_argument("seed_arrays", "NDArray-or-Symbol[]", "seed vertices")
+.set_attr<std::string>("key_var_num_args", "num_args")
+.add_arguments(NeighborSampleParam::__FIELDS__());
///////////////////////// Create induced subgraph ///////////////////////////
@@ -104,8 +973,6 @@ static bool DGLSubgraphType(const nnvm::NodeAttrs& attrs,
return true;
}
-typedef int64_t dgl_id_t;
-
class Bitmap {
const size_t size = 1024 * 1024 * 4;
const size_t mask = size - 1;
@@ -523,6 +1390,5 @@ Example::
.set_attr<FComputeEx>("FComputeEx<cpu>", DGLAdjacencyForwardEx<cpu>)
.add_argument("data", "NDArray-or-Symbol", "Input ndarray");
-
} // namespace op
} // namespace mxnet
diff --git a/tests/python/unittest/test_dgl_graph.py b/tests/python/unittest/test_dgl_graph.py
index 1d758b9..f996d7f 100644
--- a/tests/python/unittest/test_dgl_graph.py
+++ b/tests/python/unittest/test_dgl_graph.py
@@ -26,6 +26,128 @@ from numpy.testing import assert_allclose, assert_array_equal
from mxnet.test_utils import *
import unittest
+def check_uniform(out, num_hops, max_num_vertices):
+ sample_id = out[0]
+ sub_csr = out[1]
+ layer = out[2]
+ # check sample_id
+ assert (len(sample_id) == max_num_vertices+1)
+ count = 0
+ for data in sample_id:
+ if data != -1:
+ count = count + 1
+ assert (mx.nd.array([count-1], dtype=np.int64) == sample_id[-1])
+ # check sub_csr
+ sub_csr.check_format(full_check=True)
+ # check layer
+ for data in layer:
+ assert(data <= num_hops)
+
+def check_non_uniform(out, num_hops, max_num_vertices):
+ sample_id = out[0]
+ sub_csr = out[1]
+ prob = out[2]
+ layer = out[3]
+ # check sample_id
+ assert (len(sample_id) == max_num_vertices+1)
+ count = 0
+ for data in sample_id:
+ if data != -1:
+ count = count + 1
+ assert (mx.nd.array([count-1], dtype=np.int64) == sample_id[-1])
+ # check sub_csr
+ sub_csr.check_format(full_check=True)
+ # check prob
+ assert (len(prob) == max_num_vertices)
+ # check layer
+ for data in layer:
+ assert(data <= num_hops)
+
+def test_uniform_sample():
+ shape = (5, 5)
+ data_np = np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], dtype=np.int64)
+ indices_np = np.array([1,2,3,4,0,2,3,4,0,1,3,4,0,1,2,4,0,1,2,3], dtype=np.int64)
+ indptr_np = np.array([0,4,8,12,16,20], dtype=np.int64)
+ a = mx.nd.sparse.csr_matrix((data_np, indices_np, indptr_np), shape=shape)
+
+ seed = mx.nd.array([0,1,2,3,4], dtype=np.int64)
+ out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1, num_neighbor=2, max_num_vertices=5)
+ assert (len(out) == 3)
+ check_uniform(out, num_hops=1, max_num_vertices=5)
+
+ seed = mx.nd.array([0], dtype=np.int64)
+ out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1, num_neighbor=1, max_num_vertices=4)
+ assert (len(out) == 3)
+ check_uniform(out, num_hops=1, max_num_vertices=4)
+
+ seed = mx.nd.array([0], dtype=np.int64)
+ out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=2, num_neighbor=1, max_num_vertices=4)
+ assert (len(out) == 3)
+ check_uniform(out, num_hops=2, max_num_vertices=4)
+
+ seed = mx.nd.array([0,2,4], dtype=np.int64)
+ out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1, num_neighbor=2, max_num_vertices=5)
+ assert (len(out) == 3)
+ check_uniform(out, num_hops=1, max_num_vertices=5)
+
+ seed = mx.nd.array([0,4], dtype=np.int64)
+ out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1, num_neighbor=2, max_num_vertices=5)
+ assert (len(out) == 3)
+ check_uniform(out, num_hops=1, max_num_vertices=5)
+
+ seed = mx.nd.array([0,4], dtype=np.int64)
+ out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=2, num_neighbor=2, max_num_vertices=5)
+ assert (len(out) == 3)
+ check_uniform(out, num_hops=2, max_num_vertices=5)
+
+ seed = mx.nd.array([0,4], dtype=np.int64)
+ out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1, num_neighbor=2, max_num_vertices=5)
+ assert (len(out) == 3)
+ check_uniform(out, num_hops=1, max_num_vertices=5)
+
+def test_non_uniform_sample():
+ shape = (5, 5)
+ prob = mx.nd.array([0.9, 0.8, 0.2, 0.4, 0.1], dtype=np.float32)
+ data_np = np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], dtype=np.int64)
+ indices_np = np.array([1,2,3,4,0,2,3,4,0,1,3,4,0,1,2,4,0,1,2,3], dtype=np.int64)
+ indptr_np = np.array([0,4,8,12,16,20], dtype=np.int64)
+ a = mx.nd.sparse.csr_matrix((data_np, indices_np, indptr_np), shape=shape)
+
+ seed = mx.nd.array([0,1,2,3,4], dtype=np.int64)
+ out = mx.nd.contrib.dgl_csr_neighbor_non_uniform_sample(a, prob, seed, num_args=3, num_hops=1, num_neighbor=2, max_num_vertices=5)
+ assert (len(out) == 4)
+ check_non_uniform(out, num_hops=1, max_num_vertices=5)
+
+ seed = mx.nd.array([0], dtype=np.int64)
+ out = mx.nd.contrib.dgl_csr_neighbor_non_uniform_sample(a, prob, seed, num_args=3, num_hops=1, num_neighbor=1, max_num_vertices=4)
+ assert (len(out) == 4)
+ check_non_uniform(out, num_hops=1, max_num_vertices=4)
+
+ seed = mx.nd.array([0], dtype=np.int64)
+ out = mx.nd.contrib.dgl_csr_neighbor_non_uniform_sample(a, prob, seed, num_args=3, num_hops=2, num_neighbor=1, max_num_vertices=4)
+ assert (len(out) == 4)
+ check_non_uniform(out, num_hops=2, max_num_vertices=4)
+
+ seed = mx.nd.array([0,2,4], dtype=np.int64)
+ out = mx.nd.contrib.dgl_csr_neighbor_non_uniform_sample(a, prob, seed, num_args=3, num_hops=1, num_neighbor=2, max_num_vertices=5)
+ assert (len(out) == 4)
+ check_non_uniform(out, num_hops=1, max_num_vertices=5)
+
+ seed = mx.nd.array([0,4], dtype=np.int64)
+ out = mx.nd.contrib.dgl_csr_neighbor_non_uniform_sample(a, prob, seed, num_args=3, num_hops=1, num_neighbor=2, max_num_vertices=5)
+ assert (len(out) == 4)
+ check_non_uniform(out, num_hops=1, max_num_vertices=5)
+
+ seed = mx.nd.array([0,4], dtype=np.int64)
+ out = mx.nd.contrib.dgl_csr_neighbor_non_uniform_sample(a, prob, seed, num_args=3, num_hops=2, num_neighbor=2, max_num_vertices=5)
+ assert (len(out) == 4)
+ check_non_uniform(out, num_hops=2, max_num_vertices=5)
+
+ seed = mx.nd.array([0,4], dtype=np.int64)
+ out = mx.nd.contrib.dgl_csr_neighbor_non_uniform_sample(a, prob, seed, num_args=3, num_hops=1, num_neighbor=2, max_num_vertices=5)
+ assert (len(out) == 4)
+ check_non_uniform(out, num_hops=1, max_num_vertices=5)
+
def test_edge_id():
shape = rand_shape_2d()
data = rand_ndarray(shape, stype='csr', density=0.4)