You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@marmotta.apache.org by ss...@apache.org on 2016/11/19 12:30:12 UTC

[1/2] marmotta git commit: Ostrich code improvements (well known URI compression etc)

Repository: marmotta
Updated Branches:
  refs/heads/develop d8252c632 -> 7275a000e


Ostrich code improvements (well known URI compression etc)


Project: http://git-wip-us.apache.org/repos/asf/marmotta/repo
Commit: http://git-wip-us.apache.org/repos/asf/marmotta/commit/d6694171
Tree: http://git-wip-us.apache.org/repos/asf/marmotta/tree/d6694171
Diff: http://git-wip-us.apache.org/repos/asf/marmotta/diff/d6694171

Branch: refs/heads/develop
Commit: d66941719b0a145c3a885c4bdf0db9b49d55578c
Parents: d3f9f73
Author: Sebastian Schaffert <ss...@apache.org>
Authored: Sun Oct 30 12:45:05 2016 +0100
Committer: Sebastian Schaffert <ss...@apache.org>
Committed: Sun Oct 30 12:45:05 2016 +0100

----------------------------------------------------------------------
 libraries/ostrich/backend/model/CMakeLists.txt  |   2 +-
 .../ostrich/backend/model/rdf_namespaces.cc     |  42 +++++++
 .../ostrich/backend/model/rdf_namespaces.h      |  35 ++++++
 .../backend/persistence/base_persistence.cc     |  28 +++++
 .../backend/persistence/base_persistence.h      | 120 +++++++++++++++++++
 .../backend/persistence/leveldb_persistence.cc  |   4 +-
 .../backend/persistence/rocksdb_persistence.cc  |  23 +++-
 .../backend/persistence/rocksdb_persistence.h   |   3 -
 .../ostrich/backend/test/PersistenceTest.cc     |  52 ++++++++
 9 files changed, 298 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/marmotta/blob/d6694171/libraries/ostrich/backend/model/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/libraries/ostrich/backend/model/CMakeLists.txt b/libraries/ostrich/backend/model/CMakeLists.txt
index 473e6c8..94923be 100644
--- a/libraries/ostrich/backend/model/CMakeLists.txt
+++ b/libraries/ostrich/backend/model/CMakeLists.txt
@@ -2,5 +2,5 @@ file(GLOB ProtoFiles "${CMAKE_CURRENT_SOURCE_DIR}/*.proto")
 PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS ${ProtoFiles})
 include_directories(.. ${CMAKE_CURRENT_BINARY_DIR}/..)
 
-add_library(marmotta_model rdf_model.cc rdf_model.h ${PROTO_SRCS} ${PROTO_HDRS} rdf_operators.h rdf_operators.cc)
+add_library(marmotta_model rdf_model.cc rdf_model.h ${PROTO_SRCS} ${PROTO_HDRS} rdf_operators.h rdf_operators.cc rdf_namespaces.h rdf_namespaces.cc)
 target_link_libraries(marmotta_model ${CMAKE_THREAD_LIBS_INIT} ${PROTOBUF_LIBRARIES})
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/marmotta/blob/d6694171/libraries/ostrich/backend/model/rdf_namespaces.cc
----------------------------------------------------------------------
diff --git a/libraries/ostrich/backend/model/rdf_namespaces.cc b/libraries/ostrich/backend/model/rdf_namespaces.cc
new file mode 100644
index 0000000..3f54017
--- /dev/null
+++ b/libraries/ostrich/backend/model/rdf_namespaces.cc
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "model/rdf_namespaces.h"
+
+namespace marmotta {
+namespace rdf {
+
+// Returns the table of well-known namespaces. Each key is a prefix including
+// the trailing ":" and each value is the namespace URI it abbreviates. The
+// table is a function-local static; entries are listed in key order.
+const std::map<std::string, std::string>& NamespacesByPrefix() {
+    using PrefixTable = std::map<std::string, std::string>;
+    static const PrefixTable kWellKnownNamespaces{
+            {"dbpedia:", "http://dbpedia.org/resource/"},
+            {"dcelems:", "http://purl.org/dc/elements/1.1/"},
+            {"dcterms:", "http://purl.org/dc/terms/"},
+            {"dctypes:", "http://purl.org/dc/dcmitype/"},
+            {"foaf:", "http://xmlns.com/foaf/0.1/"},
+            {"owl:", "http://www.w3.org/2002/07/owl#"},
+            {"rdf:", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"},
+            {"rdfs:", "http://www.w3.org/2000/01/rdf-schema#"},
+            {"skos:", "http://www.w3.org/2004/02/skos/core#"},
+            {"xmls:", "http://www.w3.org/2001/XMLSchema#"},
+    };
+    return kWellKnownNamespaces;
+}
+
+}  // namespace rdf
+}  // namespace marmotta
+

http://git-wip-us.apache.org/repos/asf/marmotta/blob/d6694171/libraries/ostrich/backend/model/rdf_namespaces.h
----------------------------------------------------------------------
diff --git a/libraries/ostrich/backend/model/rdf_namespaces.h b/libraries/ostrich/backend/model/rdf_namespaces.h
new file mode 100644
index 0000000..793ac2f
--- /dev/null
+++ b/libraries/ostrich/backend/model/rdf_namespaces.h
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MARMOTTA_RDF_NAMESPACES_H
+#define MARMOTTA_RDF_NAMESPACES_H
+
+#include <map>
+#include <string>
+
+// Contains maps of well-known default namespaces.
+namespace marmotta {
+namespace rdf {
+
+// Return a map from namespace prefix name (including ":") to
+// namespace URI.
+const std::map<std::string, std::string>& NamespacesByPrefix();
+
+}  // namespace rdf
+}  // namespace marmotta
+
+#endif //MARMOTTA_RDF_NAMESPACES_H

http://git-wip-us.apache.org/repos/asf/marmotta/blob/d6694171/libraries/ostrich/backend/persistence/base_persistence.cc
----------------------------------------------------------------------
diff --git a/libraries/ostrich/backend/persistence/base_persistence.cc b/libraries/ostrich/backend/persistence/base_persistence.cc
index 6635647..ce1727b 100644
--- a/libraries/ostrich/backend/persistence/base_persistence.cc
+++ b/libraries/ostrich/backend/persistence/base_persistence.cc
@@ -19,6 +19,7 @@
 
 #include <cstring>
 
+#include "model/rdf_namespaces.h"
 #include "model/rdf_operators.h"
 #include "util/murmur3.h"
 
@@ -188,6 +189,33 @@ bool Matches(const Statement& pattern, const Statement& stmt) {
     return !(pattern.has_object() && stmt.object() != pattern.object());
 }
 
+
+// Apply prefix substitution for well-known URIs to save disk space.
+// If *uri starts with one of the namespace URIs from
+// rdf::NamespacesByPrefix(), that leading namespace is replaced in place by
+// its short prefix (e.g. "owl:"). Only the first matching namespace is
+// applied; strings matching no namespace are left untouched.
+void EncodeWellknownURI(std::string* uri) {
+    for (const auto& ns : rdf::NamespacesByPrefix()) {
+        const std::string& prefix = ns.first;
+        const std::string& full_ns = ns.second;
+        // compare() clamps the requested length to the string size, so a URI
+        // shorter than the namespace simply fails to match.
+        if (uri->compare(0, full_ns.size(), full_ns) == 0) {
+            // Rewrite the head in place instead of building temporaries.
+            uri->replace(0, full_ns.size(), prefix);
+            return;
+        }
+    }
+}
+
+// Unapply prefix substitution for well-known URIs.
+// If *uri starts with one of the short prefixes from
+// rdf::NamespacesByPrefix() (e.g. "owl:"), that prefix is replaced in place
+// by the full namespace URI. Only the first matching prefix is applied;
+// strings matching no prefix are left untouched.
+// NOTE(review): the check is purely textual, so a stored string that begins
+// with one of the prefixes is expanded whether or not it was produced by
+// EncodeWellknownURI.
+void DecodeWellknownURI(std::string* uri) {
+    for (const auto& ns : rdf::NamespacesByPrefix()) {
+        const std::string& prefix = ns.first;
+        const std::string& full_ns = ns.second;
+        // compare() clamps the requested length to the string size, so a
+        // string shorter than the prefix simply fails to match.
+        if (uri->compare(0, prefix.size(), prefix) == 0) {
+            // Rewrite the head in place instead of building temporaries.
+            uri->replace(0, prefix.size(), full_ns);
+            return;
+        }
+    }
+}
+
 }  // namespace persistence
 }  // namespace marmotta
 

http://git-wip-us.apache.org/repos/asf/marmotta/blob/d6694171/libraries/ostrich/backend/persistence/base_persistence.h
----------------------------------------------------------------------
diff --git a/libraries/ostrich/backend/persistence/base_persistence.h b/libraries/ostrich/backend/persistence/base_persistence.h
index 89a5822..f1cc494 100644
--- a/libraries/ostrich/backend/persistence/base_persistence.h
+++ b/libraries/ostrich/backend/persistence/base_persistence.h
@@ -27,6 +27,125 @@
 namespace marmotta {
 namespace persistence {
 
+// Apply prefix substitution for well-known URIs to save disk space.
+// Modifies the string passed as argument.
+void EncodeWellknownURI(std::string* uri);
+
+// Apply prefix substitution for well-known URIs to save disk space.
+// Operates on the uri field of the given URI message, rewriting it in place.
+inline void EncodeWellknownURI(rdf::proto::URI* value) {
+    auto* uri_field = value->mutable_uri();
+    EncodeWellknownURI(uri_field);
+}
+
+// Apply prefix substitution for well-known URIs to save disk space.
+// Operates on the datatype field of the given DatatypeLiteral, rewriting it
+// in place; no other field of the literal is passed on.
+inline void EncodeWellknownURI(rdf::proto::DatatypeLiteral* value) {
+    auto* datatype_field = value->mutable_datatype();
+    EncodeWellknownURI(datatype_field);
+}
+
+// Apply prefix substitution for well-known URIs to save disk space.
+// A resource that holds a URI has that URI encoded in place; any other
+// resource is left unchanged.
+inline void EncodeWellknownURI(rdf::proto::Resource* value) {
+    if (!value->has_uri()) {
+        return;  // no URI set: nothing to compress
+    }
+    EncodeWellknownURI(value->mutable_uri());
+}
+
+// Apply prefix substitution for well-known URIs to save disk space.
+// Cases:
+// - value holds a resource: delegate to the Resource overload
+// - value holds a literal that is a DatatypeLiteral: encode its type URI
+// - otherwise: do nothing
+inline void EncodeWellknownURI(rdf::proto::Value* value) {
+    if (value->has_resource()) {
+        EncodeWellknownURI(value->mutable_resource());
+        return;
+    }
+    // The presence check is read-only, so it uses the const accessor
+    // literal() rather than mutable_literal().
+    if (value->has_literal() && value->literal().has_dataliteral()) {
+        EncodeWellknownURI(value->mutable_literal()->mutable_dataliteral());
+    }
+}
+
+// Apply prefix substitution for well-known URIs to save disk space.
+// Each of subject, predicate, object and context is encoded in place, in
+// that order, but only if the field is set on the statement.
+inline void EncodeWellknownURI(rdf::proto::Statement* stmt) {
+    if (stmt->has_subject())   { EncodeWellknownURI(stmt->mutable_subject()); }
+    if (stmt->has_predicate()) { EncodeWellknownURI(stmt->mutable_predicate()); }
+    if (stmt->has_object())    { EncodeWellknownURI(stmt->mutable_object()); }
+    if (stmt->has_context())   { EncodeWellknownURI(stmt->mutable_context()); }
+}
+
+// Compatibility placeholder: a no-op overload, so Namespace messages are left
+// exactly as they are.
+inline void EncodeWellknownURI(rdf::proto::Namespace* /*ns*/) {}
+
+// Unapply prefix substitution for well-known URIs.
+// Modifies the string passed as argument.
+void DecodeWellknownURI(std::string* uri);
+
+// Unapply prefix substitution for well-known URIs.
+// Operates on the uri field of the given URI message, rewriting it in place.
+inline void DecodeWellknownURI(rdf::proto::URI* value) {
+    auto* uri_field = value->mutable_uri();
+    DecodeWellknownURI(uri_field);
+}
+
+// Unapply prefix substitution for well-known URIs.
+// Operates on the datatype field of the given DatatypeLiteral, rewriting it
+// in place; no other field of the literal is passed on.
+inline void DecodeWellknownURI(rdf::proto::DatatypeLiteral* value) {
+    auto* datatype_field = value->mutable_datatype();
+    DecodeWellknownURI(datatype_field);
+}
+
+// Unapply prefix substitution for well-known URIs.
+// A resource that holds a URI has that URI decoded in place; any other
+// resource is left unchanged.
+inline void DecodeWellknownURI(rdf::proto::Resource* value) {
+    if (!value->has_uri()) {
+        return;  // no URI set: nothing to expand
+    }
+    DecodeWellknownURI(value->mutable_uri());
+}
+
+// Unapply prefix substitution for well-known URIs.
+// Cases:
+// - value holds a resource: delegate to the Resource overload
+// - value holds a literal that is a DatatypeLiteral: decode its type URI
+// - otherwise: do nothing
+inline void DecodeWellknownURI(rdf::proto::Value* value) {
+    if (value->has_resource()) {
+        DecodeWellknownURI(value->mutable_resource());
+        return;
+    }
+    // The presence check is read-only, so it uses the const accessor
+    // literal() rather than mutable_literal().
+    if (value->has_literal() && value->literal().has_dataliteral()) {
+        DecodeWellknownURI(value->mutable_literal()->mutable_dataliteral());
+    }
+}
+
+// Unapply prefix substitution for well-known URIs.
+// Each of subject, predicate, object and context is decoded in place, in
+// that order, but only if the field is set on the statement.
+inline void DecodeWellknownURI(rdf::proto::Statement* stmt) {
+    if (stmt->has_subject())   { DecodeWellknownURI(stmt->mutable_subject()); }
+    if (stmt->has_predicate()) { DecodeWellknownURI(stmt->mutable_predicate()); }
+    if (stmt->has_object())    { DecodeWellknownURI(stmt->mutable_object()); }
+    if (stmt->has_context())   { DecodeWellknownURI(stmt->mutable_context()); }
+}
+
+// Compatibility placeholder: a no-op overload, so Namespace messages are left
+// exactly as they are.
+inline void DecodeWellknownURI(rdf::proto::Namespace* /*ns*/) {}
+
+// Length of key in bytes per field S, P, O and C.
 constexpr int kKeyLength = 16;
 
 enum IndexTypes {
@@ -188,6 +307,7 @@ class DBIterator : public util::CloseableIterator<T> {
     const T& next() override {
         // Parse the value at the current position into proto, then advance the iterator for the next call.
         proto.ParseFromString(it->value().ToString());  // NOTE(review): the return value of ParseFromString is ignored
+        DecodeWellknownURI(&proto);  // expand well-known prefixes before the message is handed out
         it->Next();
         return proto;  // reference to proto, which is re-parsed on every call
     };

http://git-wip-us.apache.org/repos/asf/marmotta/blob/d6694171/libraries/ostrich/backend/persistence/leveldb_persistence.cc
----------------------------------------------------------------------
diff --git a/libraries/ostrich/backend/persistence/leveldb_persistence.cc b/libraries/ostrich/backend/persistence/leveldb_persistence.cc
index a14542b..55e8d91 100644
--- a/libraries/ostrich/backend/persistence/leveldb_persistence.cc
+++ b/libraries/ostrich/backend/persistence/leveldb_persistence.cc
@@ -469,8 +469,10 @@ void LevelDBPersistence::AddStatement(
 
     Key key(stmt);
 
+    Statement encoded = stmt;
+    EncodeWellknownURI(&encoded);
     std::string buffer;
-    stmt.SerializeToString(&buffer);
+    encoded.SerializeToString(&buffer);
 
     char *k_spoc = key.Create(IndexTypes::SPOC);
     spoc.Put(leveldb::Slice(k_spoc, 4 * KEY_LENGTH), buffer);

http://git-wip-us.apache.org/repos/asf/marmotta/blob/d6694171/libraries/ostrich/backend/persistence/rocksdb_persistence.cc
----------------------------------------------------------------------
diff --git a/libraries/ostrich/backend/persistence/rocksdb_persistence.cc b/libraries/ostrich/backend/persistence/rocksdb_persistence.cc
index db64b4d..1231285 100644
--- a/libraries/ostrich/backend/persistence/rocksdb_persistence.cc
+++ b/libraries/ostrich/backend/persistence/rocksdb_persistence.cc
@@ -19,10 +19,12 @@
 
 #include <chrono>
 #include <memory>
+#include <queue>
 
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <rocksdb/filter_policy.h>
+#include <rocksdb/statistics.h>
 #include <rocksdb/write_batch.h>
 #include <google/protobuf/wrappers.pb.h>
 #include <thread>
@@ -33,8 +35,10 @@
 
 #define CHECK_STATUS(s) CHECK(s.ok()) << "Writing to database failed: " << s.ToString()
 
-DEFINE_int64(write_batch_size, 1000000,
+DEFINE_int64(write_batch_size, 100000,
              "Maximum number of statements to write in a single batch to the database");
+DEFINE_bool(enable_statistics, false,
+             "Enable statistics collection and output.");
 
 
 constexpr char kSPOC[] = "spoc";
@@ -88,8 +92,7 @@ class StatementRangeIterator : public RocksDBIterator<Statement> {
 }  // namespace
 
 
-RocksDBPersistence::RocksDBPersistence(const std::string &path, int64_t cacheSize)
-        : workers_(8) {
+RocksDBPersistence::RocksDBPersistence(const std::string &path, int64_t cacheSize) {
     rocksdb::Options options;
     options.create_if_missing = true;
     options.create_missing_column_families = true;
@@ -103,6 +106,11 @@ RocksDBPersistence::RocksDBPersistence(const std::string &path, int64_t cacheSiz
     // Write buffer size 16MB (fast bulk imports)
     options.write_buffer_size = 16384 * 1024;
 
+    if (FLAGS_enable_statistics) {
+        options.statistics = rocksdb::CreateDBStatistics();
+        options.stats_dump_period_sec = 300;
+    }
+
     ColumnFamilyOptions cfOptions;
     cfOptions.OptimizeLevelStyleCompaction();
 
@@ -215,6 +223,7 @@ service::proto::UpdateResponse RocksDBPersistence::AddStatements(StatementIterat
     }
 
     CHECK_STATUS(database_->Write(rocksdb::WriteOptions(), &batch));
+    batch.Clear();
 
     LOG(INFO) << "Imported " << count << " statements (time="
               << std::chrono::duration <double, std::milli> (
@@ -379,10 +388,12 @@ void RocksDBPersistence::AddStatement(
         const Statement &stmt, WriteBatch &batch) {
     DLOG(INFO) << "Adding statement " << stmt.DebugString();
 
-    Key key(stmt);
-
     std::string buffer;
-    stmt.SerializeToString(&buffer);
+    Statement encoded = stmt;
+    EncodeWellknownURI(&encoded);
+    encoded.SerializeToString(&buffer);
+
+    Key key(stmt);
 
     char *k_spoc = key.Create(IndexTypes::SPOC);
     batch.Put(handles_[Handles::ISPOC], rocksdb::Slice(k_spoc, 4 * KEY_LENGTH), buffer);

http://git-wip-us.apache.org/repos/asf/marmotta/blob/d6694171/libraries/ostrich/backend/persistence/rocksdb_persistence.h
----------------------------------------------------------------------
diff --git a/libraries/ostrich/backend/persistence/rocksdb_persistence.h b/libraries/ostrich/backend/persistence/rocksdb_persistence.h
index a04169b..0c9b3ab 100644
--- a/libraries/ostrich/backend/persistence/rocksdb_persistence.h
+++ b/libraries/ostrich/backend/persistence/rocksdb_persistence.h
@@ -27,7 +27,6 @@
 #include <rocksdb/comparator.h>
 
 #include "persistence/base_persistence.h"
-#include "util/threadpool.h"
 
 namespace marmotta {
 namespace persistence {
@@ -124,8 +123,6 @@ class RocksDBPersistence : public Persistence {
      */
     int64_t Size() override;
  private:
-    ctpl::thread_pool workers_;
-
     KeyComparator comparator_;
     std::unique_ptr<rocksdb::DB> database_;
 

http://git-wip-us.apache.org/repos/asf/marmotta/blob/d6694171/libraries/ostrich/backend/test/PersistenceTest.cc
----------------------------------------------------------------------
diff --git a/libraries/ostrich/backend/test/PersistenceTest.cc b/libraries/ostrich/backend/test/PersistenceTest.cc
index 7b415e1..7f37d92 100644
--- a/libraries/ostrich/backend/test/PersistenceTest.cc
+++ b/libraries/ostrich/backend/test/PersistenceTest.cc
@@ -60,6 +60,58 @@ TEST(KeyTest, BoundsDiffer) {
     }
 }
 
+// A URI in a well-known namespace is shortened to its prefixed form; a URI
+// outside those namespaces comes back unchanged.
+TEST(URITest, EncodeURI) {
+    std::string wellKnown = "http://www.w3.org/2002/07/owl#sameAs";
+    EncodeWellknownURI(&wellKnown);
+    EXPECT_EQ("owl:sameAs", wellKnown);
+
+    std::string other = "http://marmotta.apache.org/test/uri1";
+    EncodeWellknownURI(&other);
+    EXPECT_EQ("http://marmotta.apache.org/test/uri1", other);
+}
+
+// Same checks as EncodeURI, but through the rdf::proto::URI overload.
+TEST(URITest, EncodeURIProto) {
+    rdf::URI wellKnown = "http://www.w3.org/2002/07/owl#sameAs";
+    rdf::URI other = "http://marmotta.apache.org/test/uri1";
+
+    rdf::proto::URI wellKnownMsg = wellKnown.getMessage();
+    rdf::proto::URI otherMsg = other.getMessage();
+
+    EncodeWellknownURI(&wellKnownMsg);
+    EXPECT_EQ("owl:sameAs", wellKnownMsg.uri());
+
+    EncodeWellknownURI(&otherMsg);
+    EXPECT_EQ("http://marmotta.apache.org/test/uri1", otherMsg.uri());
+}
+
+
+// A prefixed string is expanded to the full namespace URI; a string without
+// a well-known prefix comes back unchanged.
+TEST(URITest, DecodeURI) {
+    std::string encoded = "owl:sameAs";
+    DecodeWellknownURI(&encoded);
+    EXPECT_EQ("http://www.w3.org/2002/07/owl#sameAs", encoded);
+
+    std::string plain = "http://marmotta.apache.org/test/uri1";
+    DecodeWellknownURI(&plain);
+    EXPECT_EQ("http://marmotta.apache.org/test/uri1", plain);
+}
+
+// Same checks as DecodeURI, but through the rdf::proto::URI overload.
+TEST(URITest, DecodeURIProto) {
+    rdf::URI encoded = "owl:sameAs";
+    rdf::URI plain = "http://marmotta.apache.org/test/uri1";
+
+    rdf::proto::URI encodedMsg = encoded.getMessage();
+    rdf::proto::URI plainMsg = plain.getMessage();
+
+    DecodeWellknownURI(&encodedMsg);
+    EXPECT_EQ("http://www.w3.org/2002/07/owl#sameAs", encodedMsg.uri());
+
+    DecodeWellknownURI(&plainMsg);
+    EXPECT_EQ("http://marmotta.apache.org/test/uri1", plainMsg.uri());
+}
+
+
 }  // namespace test
 }  // namespace persistence
 }  // namespace marmotta


[2/2] marmotta git commit: Merge remote-tracking branch 'origin/develop' into develop

Posted by ss...@apache.org.
Merge remote-tracking branch 'origin/develop' into develop


Project: http://git-wip-us.apache.org/repos/asf/marmotta/repo
Commit: http://git-wip-us.apache.org/repos/asf/marmotta/commit/7275a000
Tree: http://git-wip-us.apache.org/repos/asf/marmotta/tree/7275a000
Diff: http://git-wip-us.apache.org/repos/asf/marmotta/diff/7275a000

Branch: refs/heads/develop
Commit: 7275a000e3281b52a54b486517c7707f15343899
Parents: d669417 d8252c6
Author: Sebastian Schaffert <ss...@apache.org>
Authored: Sun Oct 30 12:45:21 2016 +0100
Committer: Sebastian Schaffert <ss...@apache.org>
Committed: Sun Oct 30 12:45:21 2016 +0100

----------------------------------------------------------------------
 README.md                                       |  2 ++
 .../kiwi/sparql/test/KiWiSparqlTest.java        | 19 ++++++++++++++
 .../kiwi/sparql/test/MARMOTTA-651_1.sparql      | 26 ++++++++++++++++++++
 parent/pom.xml                                  |  2 +-
 .../ldp/webservices/LdpWebServiceTest.java      |  7 +++---
 .../sparql/webservices/SparqlWebService.java    |  2 +-
 .../src/main/resources/web/admin/sgvizler.html  |  4 +--
 7 files changed, 55 insertions(+), 7 deletions(-)
----------------------------------------------------------------------