You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@marmotta.apache.org by wi...@apache.org on 2016/05/02 11:49:54 UTC

[17/50] [abbrv] marmotta git commit: [MARMOTTA-621] implement a loader backend for Ostrich, experiment with DBPedia gives around 45k triples/sec

[MARMOTTA-621] implement a loader backend for Ostrich, experiment with DBPedia gives around 45k triples/sec


Project: http://git-wip-us.apache.org/repos/asf/marmotta/repo
Commit: http://git-wip-us.apache.org/repos/asf/marmotta/commit/8f60cf64
Tree: http://git-wip-us.apache.org/repos/asf/marmotta/tree/8f60cf64
Diff: http://git-wip-us.apache.org/repos/asf/marmotta/diff/8f60cf64

Branch: refs/heads/MARMOTTA-584
Commit: 8f60cf64dc8b8e99d89dbf4e54fa81af3635606f
Parents: 4eb074d
Author: Sebastian Schaffert <ss...@apache.org>
Authored: Sun Dec 20 12:40:52 2015 +0100
Committer: Sebastian Schaffert <ss...@apache.org>
Committed: Sun Dec 20 12:40:52 2015 +0100

----------------------------------------------------------------------
 libraries/ostrich/backend/CMakeLists.txt        |   2 +-
 .../ostrich/sail/OstrichSailConnection.java     |  14 +-
 loader/marmotta-loader-ostrich/pom.xml          |  83 ++++++++++
 .../loader/ostrich/OstrichLoaderBackend.java    | 100 ++++++++++++
 .../loader/ostrich/OstrichLoaderHandler.java    | 156 +++++++++++++++++++
 ...org.apache.marmotta.loader.api.LoaderBackend |  18 +++
 .../src/main/resources/logback.xml              |  32 ++++
 loader/pom.xml                                  |  10 ++
 8 files changed, 407 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/marmotta/blob/8f60cf64/libraries/ostrich/backend/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/libraries/ostrich/backend/CMakeLists.txt b/libraries/ostrich/backend/CMakeLists.txt
index 5a8f110..61156a5 100644
--- a/libraries/ostrich/backend/CMakeLists.txt
+++ b/libraries/ostrich/backend/CMakeLists.txt
@@ -17,7 +17,7 @@ find_package (GLog REQUIRED)
 find_package (Boost 1.54.0 COMPONENTS iostreams filesystem system)
 find_package (Tcmalloc)
 
-#add_definitions(-DNDEBUG)
+add_definitions(-DNDEBUG)
 
 if (Boost_IOSTREAMS_FOUND)
     message(STATUS "Enabling gzip/bzip2 support (Boost iostreams found)")

http://git-wip-us.apache.org/repos/asf/marmotta/blob/8f60cf64/libraries/ostrich/client/src/main/java/org/apache/marmotta/ostrich/sail/OstrichSailConnection.java
----------------------------------------------------------------------
diff --git a/libraries/ostrich/client/src/main/java/org/apache/marmotta/ostrich/sail/OstrichSailConnection.java b/libraries/ostrich/client/src/main/java/org/apache/marmotta/ostrich/sail/OstrichSailConnection.java
index 93e21ac..e85fdd2 100644
--- a/libraries/ostrich/client/src/main/java/org/apache/marmotta/ostrich/sail/OstrichSailConnection.java
+++ b/libraries/ostrich/client/src/main/java/org/apache/marmotta/ostrich/sail/OstrichSailConnection.java
@@ -107,7 +107,7 @@ public class OstrichSailConnection extends NotifyingSailConnectionBase {
 
     @Override
     protected void addStatementInternal(Resource subj, URI pred, Value obj, Resource... contexts) throws SailException {
-        log.info("Adding statements.");
+        log.debug("Adding statements.");
         ensureTransaction();
 
         if (contexts.length > 0) {
@@ -319,7 +319,7 @@ public class OstrichSailConnection extends NotifyingSailConnectionBase {
 
     @Override
     protected void removeStatementsInternal(Resource subj, URI pred, Value obj, Resource... contexts) throws SailException {
-        log.info("Removing statements.");
+        log.debug("Removing statements.");
         commitForQuery();
         ensureTransaction();
 
@@ -338,7 +338,7 @@ public class OstrichSailConnection extends NotifyingSailConnectionBase {
 
     @Override
     protected void clearInternal(Resource... contexts) throws SailException {
-        log.info("Clearing statements.");
+        log.debug("Clearing statements.");
         commitForQuery();
         ensureTransaction();
 
@@ -357,7 +357,7 @@ public class OstrichSailConnection extends NotifyingSailConnectionBase {
 
     @Override
     protected CloseableIteration<? extends Namespace, SailException> getNamespacesInternal() throws SailException {
-        log.info("Getting namespaces.");
+        log.debug("Getting namespaces.");
         commitForQuery();
 
         Empty pattern = Empty.getDefaultInstance();
@@ -382,7 +382,7 @@ public class OstrichSailConnection extends NotifyingSailConnectionBase {
 
     @Override
     protected void setNamespaceInternal(String prefix, String name) throws SailException {
-        log.info("Setting namespace {} = {}.", prefix, name);
+        log.debug("Setting namespace {} = {}.", prefix, name);
         ensureTransaction();
 
         ProtoNamespace ns = new ProtoNamespace(prefix, name);
@@ -393,7 +393,7 @@ public class OstrichSailConnection extends NotifyingSailConnectionBase {
 
     @Override
     protected void removeNamespaceInternal(String prefix) throws SailException {
-        log.info("Removing namespace {}.", prefix);
+        log.debug("Removing namespace {}.", prefix);
         commitForQuery();
         ensureTransaction();
 
@@ -404,7 +404,7 @@ public class OstrichSailConnection extends NotifyingSailConnectionBase {
 
     @Override
     protected void clearNamespacesInternal() throws SailException {
-        log.info("Clearing namespaces.");
+        log.debug("Clearing namespaces.");
         commitForQuery();
         ensureTransaction();
 

http://git-wip-us.apache.org/repos/asf/marmotta/blob/8f60cf64/loader/marmotta-loader-ostrich/pom.xml
----------------------------------------------------------------------
diff --git a/loader/marmotta-loader-ostrich/pom.xml b/loader/marmotta-loader-ostrich/pom.xml
new file mode 100644
index 0000000..5816b68
--- /dev/null
+++ b/loader/marmotta-loader-ostrich/pom.xml
@@ -0,0 +1,83 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.marmotta</groupId>
+        <artifactId>marmotta-parent</artifactId>
+        <version>3.4.0-SNAPSHOT</version>
+        <relativePath>../../parent</relativePath>
+    </parent>
+
+    <artifactId>marmotta-loader-ostrich</artifactId>
+    <name>Loader: Ostrich Backend</name>
+
+    <description>
+        Apache Marmotta bulk loader backend for loading large datasets into an Ostrich/LevelDB backend.
+    </description>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>2.2</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <createDependencyReducedPom>false</createDependencyReducedPom>
+                            <transformers>
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer" />
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                                    <mainClass>org.apache.marmotta.loader.core.MarmottaLoader</mainClass>
+                                </transformer>
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer">
+                                    <addHeader>false</addHeader>
+                                </transformer>
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.marmotta</groupId>
+            <artifactId>marmotta-loader-core</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.marmotta</groupId>
+            <artifactId>ostrich-model</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.marmotta</groupId>
+            <artifactId>ostrich-client</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+    </dependencies>
+</project>

http://git-wip-us.apache.org/repos/asf/marmotta/blob/8f60cf64/loader/marmotta-loader-ostrich/src/main/java/org/apache/marmotta/loader/ostrich/OstrichLoaderBackend.java
----------------------------------------------------------------------
diff --git a/loader/marmotta-loader-ostrich/src/main/java/org/apache/marmotta/loader/ostrich/OstrichLoaderBackend.java b/loader/marmotta-loader-ostrich/src/main/java/org/apache/marmotta/loader/ostrich/OstrichLoaderBackend.java
new file mode 100644
index 0000000..bd6e49e
--- /dev/null
+++ b/loader/marmotta-loader-ostrich/src/main/java/org/apache/marmotta/loader/ostrich/OstrichLoaderBackend.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.marmotta.loader.ostrich;
+
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.configuration.Configuration;
+import org.apache.marmotta.loader.api.LoaderBackend;
+import org.apache.marmotta.loader.api.LoaderHandler;
+
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Loader backend for an Ostrich/LevelDB server. Declares the command line options of the backend and creates
+ * the OstrichLoaderHandler from the loader configuration.
+ *
+ * @author Sebastian Schaffert (sschaffert@apache.org)
+ */
+public class OstrichLoaderBackend implements LoaderBackend {
+
+    /**
+     * Build the handler used for bulk-loading from the {@code backend.ostrich.*} configuration keys.
+     *
+     * @param configuration read for host, port and batch size; localhost, 10000 and 500000 are passed as defaults
+     * @return a newly created OstrichLoaderHandler
+     */
+    @Override
+    public LoaderHandler createLoader(Configuration configuration) {
+        String host = configuration.getString("backend.ostrich.host", "localhost");
+        int port = configuration.getInt("backend.ostrich.port", 10000);
+        long batchSize = configuration.getLong("backend.ostrich.batchsize", 500000);
+
+        return new OstrichLoaderHandler(host, port, batchSize);
+    }
+
+    /**
+     * Identifier under which this backend is selected on the command line when more than one loader
+     * implementation is available.
+     * <p/>
+     * Should match with the regular expression [a-z][a-z0-9]*
+     *
+     * @return the constant identifier "ostrich"
+     */
+    @Override
+    public String getIdentifier() {
+        return "ostrich";
+    }
+
+    /**
+     * Additional command line options offered by this backend: host (-H), port (-P) and batchsize (-B),
+     * each declared with one argument value.
+     *
+     * @return a new set holding the three options
+     */
+    @Override
+    public Collection<Option> getOptions() {
+        Set<Option> result = new HashSet<>();
+
+        result.add(singleArgOption('H', "host", "hostname or IP address of Ostrich/LevelDB server"));
+        result.add(singleArgOption('P', "port", "port used by Ostrich/LevelDB server"));
+        result.add(singleArgOption('B', "batchsize", "maximum number of statements to commit in one batch"));
+
+        return result;
+    }
+
+    /**
+     * Create an option declared with one argument value that uses its long name as the argument name.
+     *
+     * @param shortName   single-character option name
+     * @param longName    long option name, also used as the argument name
+     * @param description description text of the option
+     * @return the option produced by OptionBuilder
+     */
+    private static Option singleArgOption(char shortName, String longName, String description) {
+        // OptionBuilder collects its settings in static state that create() consumes
+        OptionBuilder.withLongOpt(longName);
+        OptionBuilder.withArgName(longName);
+        OptionBuilder.hasArgs(1);
+        OptionBuilder.withDescription(description);
+
+        return OptionBuilder.create(shortName);
+    }
+}

http://git-wip-us.apache.org/repos/asf/marmotta/blob/8f60cf64/loader/marmotta-loader-ostrich/src/main/java/org/apache/marmotta/loader/ostrich/OstrichLoaderHandler.java
----------------------------------------------------------------------
diff --git a/loader/marmotta-loader-ostrich/src/main/java/org/apache/marmotta/loader/ostrich/OstrichLoaderHandler.java b/loader/marmotta-loader-ostrich/src/main/java/org/apache/marmotta/loader/ostrich/OstrichLoaderHandler.java
new file mode 100644
index 0000000..675d25a
--- /dev/null
+++ b/loader/marmotta-loader-ostrich/src/main/java/org/apache/marmotta/loader/ostrich/OstrichLoaderHandler.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.marmotta.loader.ostrich;
+
+import org.apache.marmotta.loader.api.LoaderHandler;
+import org.apache.marmotta.ostrich.sail.OstrichSail;
+import org.openrdf.model.Statement;
+import org.openrdf.rio.RDFHandlerException;
+import org.openrdf.sail.SailConnection;
+import org.openrdf.sail.SailException;
+
+/**
+ * Loader handler that adds parsed statements to an Ostrich server through an {@link OstrichSail} connection.
+ *
+ * @author Sebastian Schaffert (sschaffert@apache.org)
+ */
+public class OstrichLoaderHandler implements LoaderHandler {
+
+    private OstrichSail sail;
+    private SailConnection con;
+
+    long count = 0;
+    long batchSize = 500000;
+
+    /**
+     * @param host      host name handed to the {@link OstrichSail}
+     * @param port      port handed to the {@link OstrichSail}
+     * @param batchSize number of statements per intermediate commit; 0 disables intermediate commits
+     */
+    public OstrichLoaderHandler(String host, int port, long batchSize) {
+        this.batchSize = batchSize;
+        this.sail = new OstrichSail(host, port);
+    }
+
+    /**
+     * Initialise the Ostrich sail and open the connection used by all other methods of this handler.
+     *
+     * @throws RDFHandlerException if the sail cannot be initialised or the connection cannot be opened
+     */
+    @Override
+    public void initialise() throws RDFHandlerException {
+        try {
+            sail.initialize();
+            con = sail.getConnection();
+        } catch (SailException e) {
+            throw new RDFHandlerException("Could not establish Ostrich connection", e);
+        }
+    }
+
+    /**
+     * Close the connection (if one was opened) and shut down the sail, even when closing the connection fails.
+     *
+     * @throws RDFHandlerException if closing the connection or shutting down the sail fails
+     */
+    @Override
+    public void shutdown() throws RDFHandlerException {
+        try {
+            try {
+                // con stays null when initialise() failed or was never called
+                if (con != null) {
+                    con.close();
+                }
+            } finally {
+                sail.shutDown();
+            }
+        } catch (SailException e) {
+            throw new RDFHandlerException("Could not close Ostrich connection", e);
+        }
+    }
+
+    /**
+     * Begin the transaction that the loaded statements are added to.
+     *
+     * @throws RDFHandlerException if the transaction cannot be started
+     */
+    @Override
+    public void startRDF() throws RDFHandlerException {
+        try {
+            con.begin();
+        } catch (SailException e) {
+            throw new RDFHandlerException("Could not start transaction", e);
+        }
+    }
+
+    /**
+     * Commit the transaction that is still open once all data has been reported.
+     *
+     * @throws RDFHandlerException if the commit fails
+     */
+    @Override
+    public void endRDF() throws RDFHandlerException {
+        try {
+            con.commit();
+        } catch (SailException e) {
+            throw new RDFHandlerException("Could not commit transaction", e);
+        }
+    }
+
+    /**
+     * Set the namespace declaration on the connection.
+     *
+     * @param prefix the prefix for the namespace, or an empty string in case of a default namespace
+     * @param uri    the URI that the prefix maps to
+     * @throws RDFHandlerException if the namespace cannot be set
+     */
+    @Override
+    public void handleNamespace(String prefix, String uri) throws RDFHandlerException {
+        try {
+            con.setNamespace(prefix, uri);
+        } catch (SailException e) {
+            throw new RDFHandlerException("Could not add namespace", e);
+        }
+    }
+
+    /**
+     * Add the statement to the open transaction. After every {@code batchSize} statements the transaction is
+     * committed and a new one is begun; a batch size of 0 disables these intermediate commits.
+     *
+     * @param st the statement to add
+     * @throws RDFHandlerException if the statement cannot be added or the batch cannot be committed
+     */
+    @Override
+    public void handleStatement(Statement st) throws RDFHandlerException {
+        try {
+            con.addStatement(st.getSubject(), st.getPredicate(), st.getObject(), st.getContext());
+            count++;
+            if (batchSize != 0 && count % batchSize == 0) { // avoid the modulo-by-zero ArithmeticException
+                con.commit();
+                con.begin();
+            }
+        } catch (SailException e) {
+            throw new RDFHandlerException("Could not add statement", e);
+        }
+    }
+
+    /** Comments are ignored by this handler. */
+    @Override
+    public void handleComment(String comment) throws RDFHandlerException {
+        // intentionally empty
+    }
+}

http://git-wip-us.apache.org/repos/asf/marmotta/blob/8f60cf64/loader/marmotta-loader-ostrich/src/main/resources/META-INF/services/org.apache.marmotta.loader.api.LoaderBackend
----------------------------------------------------------------------
diff --git a/loader/marmotta-loader-ostrich/src/main/resources/META-INF/services/org.apache.marmotta.loader.api.LoaderBackend b/loader/marmotta-loader-ostrich/src/main/resources/META-INF/services/org.apache.marmotta.loader.api.LoaderBackend
new file mode 100644
index 0000000..ca1e9a1
--- /dev/null
+++ b/loader/marmotta-loader-ostrich/src/main/resources/META-INF/services/org.apache.marmotta.loader.api.LoaderBackend
@@ -0,0 +1,18 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+org.apache.marmotta.loader.ostrich.OstrichLoaderBackend
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/marmotta/blob/8f60cf64/loader/marmotta-loader-ostrich/src/main/resources/logback.xml
----------------------------------------------------------------------
diff --git a/loader/marmotta-loader-ostrich/src/main/resources/logback.xml b/loader/marmotta-loader-ostrich/src/main/resources/logback.xml
new file mode 100644
index 0000000..ebc7937
--- /dev/null
+++ b/loader/marmotta-loader-ostrich/src/main/resources/logback.xml
@@ -0,0 +1,32 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~      http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<configuration>
+    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
+        <encoder>
+            <pattern>%d{HH:mm:ss.SSS} %level %logger{15} - %m%n</pattern>
+        </encoder>
+    </appender>
+
+    <logger name="org.apache.zookeeper" level="WARN" />
+    <logger name="org.apache.hadoop" level="WARN" />
+
+
+    <root level="${root-level:-INFO}">
+        <appender-ref ref="CONSOLE"/>
+    </root>
+</configuration>

http://git-wip-us.apache.org/repos/asf/marmotta/blob/8f60cf64/loader/pom.xml
----------------------------------------------------------------------
diff --git a/loader/pom.xml b/loader/pom.xml
index dbdc0f1..4771f28 100644
--- a/loader/pom.xml
+++ b/loader/pom.xml
@@ -92,4 +92,14 @@
         <module>marmotta-loader-berkeley</module>
     </modules>
 
+    <profiles>
+        <profile>
+            <id>ostrich</id>
+            <modules>
+                <module>marmotta-loader-ostrich</module>
+            </modules>
+        </profile>
+    </profiles>
+
+
 </project>