You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/03/21 14:19:39 UTC

svn commit: r1459296 - in /stanbol/trunk/entityhub/indexing/source/jenatdb: ./ src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/

Author: rwesten
Date: Thu Mar 21 13:19:38 2013
New Revision: 1459296

URL: http://svn.apache.org/r1459296
Log:
STANBOL-992: updated the Jena TDB indexing source to the current Jena / Jena TDB version. Also fixed bz2 decoding (see http://s.apache.org/QbK)

Added:
    stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java
Modified:
    stanbol/trunk/entityhub/indexing/source/jenatdb/pom.xml
    stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java
    stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/ResourceFilterIterator.java
    stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/Utils.java

Modified: stanbol/trunk/entityhub/indexing/source/jenatdb/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/pom.xml?rev=1459296&r1=1459295&r2=1459296&view=diff
==============================================================================
--- stanbol/trunk/entityhub/indexing/source/jenatdb/pom.xml (original)
+++ stanbol/trunk/entityhub/indexing/source/jenatdb/pom.xml Thu Mar 21 13:19:38 2013
@@ -115,8 +115,9 @@
       <artifactId>commons-compress</artifactId>
     </dependency>
     <dependency>
-      <groupId>com.hp.hpl.jena</groupId>
-      <artifactId>jena</artifactId>
+      <groupId>org.apache.jena</groupId>
+      <artifactId>jena-core</artifactId>
+      <version>2.10.0</version>
       <exclusions>
         <exclusion>
           <artifactId>slf4j-log4j12</artifactId>
@@ -125,12 +126,14 @@
       </exclusions>
     </dependency>
     <dependency>
-      <groupId>com.hp.hpl.jena</groupId>
-      <artifactId>tdb</artifactId>
+      <groupId>org.apache.jena</groupId>
+      <artifactId>jena-tdb</artifactId>
+      <version>0.10.0</version>
     </dependency>
     <dependency>
-      <groupId>com.hp.hpl.jena</groupId>
-      <artifactId>arq</artifactId>
+      <groupId>org.apache.jena</groupId>
+      <artifactId>jena-arq</artifactId>
+      <version>2.10.0</version>
     </dependency>
     <!-- dependencies for testing -->
     <dependency>  <!-- used for debug level logging during tests -->

Added: stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java?rev=1459296&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java (added)
+++ stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java Thu Mar 21 13:19:38 2013
@@ -0,0 +1,120 @@
+package org.apache.stanbol.entityhub.indexing.source.jenatdb;
+
+import org.apache.jena.atlas.lib.Tuple;
+import org.slf4j.Logger;
+
+import com.hp.hpl.jena.graph.Node;
+import com.hp.hpl.jena.graph.Triple;
+import com.hp.hpl.jena.sparql.core.Quad;
+import com.hp.hpl.jena.tdb.TDBException;
+import com.hp.hpl.jena.tdb.TDBLoader;
+import com.hp.hpl.jena.tdb.solver.stats.Stats;
+import com.hp.hpl.jena.tdb.solver.stats.StatsCollector;
+import com.hp.hpl.jena.tdb.store.DatasetGraphTDB;
+import com.hp.hpl.jena.tdb.store.TripleTable;
+import com.hp.hpl.jena.tdb.store.bulkloader.BulkLoader;
+import com.hp.hpl.jena.tdb.store.bulkloader.BulkStreamRDF;
+import com.hp.hpl.jena.tdb.store.bulkloader.LoadMonitor;
+import com.hp.hpl.jena.tdb.store.bulkloader.LoaderNodeTupleTable;
+import com.hp.hpl.jena.tdb.sys.Names;
+
+/**
+ * Special version of a {@link BulkStreamRDF} that stores Triples to the
+ * {@link TripleTable} of the {@link DatasetGraphTDB} passed to the constructor.
+ * {@link Quad}s and {@link Tuple}s with &gt;= 3 nodes are converted to triples.
+ * <p>
+ * This code is based on the DestinationGraph implementation private to the 
+ * {@link TDBLoader} class.
+ * 
+ * @author Rupert Westenthaler
+ *
+ */
+class DestinationTripleGraph implements BulkStreamRDF {
+    final private DatasetGraphTDB dsg ; // dataset whose triple table receives the data
+    final private LoadMonitor monitor ; // created in the constructor and shared with loaderTriples
+    final private LoaderNodeTupleTable loaderTriples ; // loader over the NodeTupleTable of dsg's triple table
+    final private boolean startedEmpty ; // value of dsg.isEmpty() as observed in the constructor
+    private long count = 0 ; // incremented per loaded triple/tuple; never read within this class
+    private StatsCollector stats ; // assigned in startBulk(); null before that
+
+    DestinationTripleGraph(final DatasetGraphTDB dsg, Logger log) { // log is handed to the LoadMonitor
+        this.dsg = dsg ;
+        startedEmpty = dsg.isEmpty() ;
+        monitor = new LoadMonitor(dsg, log, "triples", BulkLoader.DataTickPoint, BulkLoader.IndexTickPoint) ;
+        loaderTriples = new LoaderNodeTupleTable(dsg.getTripleTable().getNodeTupleTable(), "triples", monitor) ;
+    }
+
+    @Override
+    final public void startBulk() // starts the load/data phases and creates a fresh StatsCollector
+    {
+        loaderTriples.loadStart() ;
+        loaderTriples.loadDataStart() ;
+
+        this.stats = new StatsCollector() ;
+    }
+    @Override
+    final public void triple(Triple triple) // loads subject/predicate/object and records them in stats
+    {
+        Node s = triple.getSubject() ;
+        Node p = triple.getPredicate() ;
+        Node o = triple.getObject() ;
+
+        loaderTriples.load(s, p, o)  ;
+        stats.record(null, s, p, o) ; // stats is still null (NPE) if startBulk() was not called first
+        count++ ;
+    }
+
+    @Override
+    final public void finishBulk() // ends data and index phases, optionally writes stats, then syncs
+    {
+        loaderTriples.loadDataFinish() ;
+        loaderTriples.loadIndexStart() ;
+        loaderTriples.loadIndexFinish() ;
+        loaderTriples.loadFinish() ;
+
+        if ( ! dsg.getLocation().isMem() && startedEmpty ) // stats only for non-memory datasets that were empty at construction
+        {
+            String filename = dsg.getLocation().getPath(Names.optStats) ;
+            Stats.write(filename, stats) ;
+        }
+        forceSync(dsg) ;
+    }
+
+    @Override
+    public void start()                     {} // intentionally a no-op
+    @Override
+    public void quad(Quad quad) { 
+        triple(quad.asTriple()); // handled exactly like a triple; delegates to triple(Triple)
+    }
+    @Override
+    public void tuple(Tuple<Node> tuple) { 
+        if(tuple.size() >= 3){ // only the first three nodes are used; further nodes are ignored
+            loaderTriples.load(tuple.get(0), tuple.get(1), tuple.get(2))  ;
+            stats.record(null, tuple.get(0), tuple.get(1), tuple.get(2)) ; 
+            count++ ;
+        } else {
+            throw new TDBException("Tuple with < 3 Nodes encountered while loading a single graph");
+        }
+    }
+    @Override
+    public void base(String base)           { } // base is ignored
+    @Override
+    public void prefix(String prefix, String iri)  { } // TODO: prefixes are currently ignored
+    @Override
+    public void finish()                    {} // intentionally a no-op
+
+
+    static void forceSync(DatasetGraphTDB dsg) // syncs the node tables individually, then the dataset
+    {
+        // Force sync - we have been bypassing DSG tables.
+        // THIS DOES NOT WORK IF modules check for SYNC necessity.
+        dsg.getTripleTable().getNodeTupleTable().getNodeTable().sync();
+        dsg.getQuadTable().getNodeTupleTable().getNodeTable().sync();
+        dsg.getQuadTable().getNodeTupleTable().getNodeTable().sync(); // NOTE(review): same call as the previous line
+        dsg.getPrefixes().getNodeTupleTable().getNodeTable().sync();                
+        // This is not enough -- modules check whether sync needed.
+        dsg.sync() ;
+        
+    }
+}
+

Modified: stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java?rev=1459296&r1=1459295&r2=1459296&view=diff
==============================================================================
--- stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java (original)
+++ stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java Thu Mar 21 13:19:38 2013
@@ -16,42 +16,33 @@
 */
 package org.apache.stanbol.entityhub.indexing.source.jenatdb;
 
-import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.zip.GZIPInputStream;
 
 import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
 import org.apache.commons.io.FilenameUtils;
-import org.apache.stanbol.entityhub.indexing.core.source.ResourceState;
+import org.apache.jena.riot.Lang;
+import org.apache.jena.riot.RDFLanguages;
+import org.apache.jena.riot.RiotReader;
 import org.apache.stanbol.entityhub.indexing.core.source.ResourceImporter;
-import org.openjena.riot.Lang;
-import org.openjena.riot.RiotReader;
+import org.apache.stanbol.entityhub.indexing.core.source.ResourceState;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.hp.hpl.jena.graph.Node;
-import com.hp.hpl.jena.graph.Triple;
-import com.hp.hpl.jena.rdf.model.AnonId;
-import com.hp.hpl.jena.rdf.model.Model;
-import com.hp.hpl.jena.rdf.model.ModelFactory;
-import com.hp.hpl.jena.sparql.core.Quad;
-import com.hp.hpl.jena.tdb.TDBLoader;
 import com.hp.hpl.jena.tdb.store.DatasetGraphTDB;
-import com.hp.hpl.jena.tdb.store.bulkloader.BulkLoader;
-import com.hp.hpl.jena.tdb.store.bulkloader.Destination;
-import com.hp.hpl.jena.tdb.store.bulkloader.LoadMonitor;
-import com.hp.hpl.jena.tdb.store.bulkloader.LoaderNodeTupleTable;
 
 public class RdfResourceImporter implements ResourceImporter {
 
     private static final Logger log = LoggerFactory.getLogger(RdfResourceImporter.class);
-    private final DatasetGraphTDB indexingDataset;
+   // private final DatasetGraphTDB indexingDataset;
+    private final DestinationTripleGraph destination;
     public RdfResourceImporter(DatasetGraphTDB indexingDataset){
         if(indexingDataset == null){
             throw new IllegalArgumentException("The parsed DatasetGraphTDB instance MUST NOT be NULL!");
         }
-        this.indexingDataset = indexingDataset;
+        //this.indexingDataset = indexingDataset;
+        this.destination = new DestinationTripleGraph(indexingDataset,log);
     }
 
     @Override
@@ -62,119 +53,51 @@ public class RdfResourceImporter impleme
             name = FilenameUtils.removeExtension(name);
             log.debug("   - from GZIP Archive");
         } else if ("bz2".equalsIgnoreCase(FilenameUtils.getExtension(name))) {
-            is = new BZip2CompressorInputStream(is);
+            is = new BZip2CompressorInputStream(is,
+                true); //use true as 2nd param (see http://s.apache.org/QbK) 
             name = FilenameUtils.removeExtension(name);
             log.debug("   - from BZip2 Archive");
         }// TODO: No Zip Files inside Zip Files supported :o( ^^
-        Lang format = Lang.guess(name);
-        // For N-Triple we can use the TDBLoader
+        Lang format = RDFLanguages.filenameToLang(name);
         if (format == null) {
             log.warn("ignore File {} because of unknown extension ");
             return ResourceState.IGNORED;
-        } else if (format == Lang.NTRIPLES) {
-            TDBLoader.load(indexingDataset, is, true);
-        } else if(format == Lang.NQUADS || format == Lang.TRIG){ //quads
-            TDBLoader loader = new TDBLoader();
-            loader.setShowProgress(true);
-            Destination<Quad> dest = createQuad2TripleDestination();
-            dest.start();
-            RiotReader.parseQuads(is,format,null, dest);
-            dest.finish();
-        } else if (format != Lang.RDFXML) {
-            // use RIOT to parse the format but with a special configuration
-            // RiotReader!
-            TDBLoader loader = new TDBLoader();
-            loader.setShowProgress(true);
-            Destination<Triple> dest = createDestination();
-            dest.start();
-            RiotReader.parseTriples(is, format, null, dest);
-            dest.finish();
-        } else { // RDFXML
-            // in that case we need to use ARP
-            Model model = ModelFactory.createModelForGraph(indexingDataset.getDefaultGraph());
-            model.read(is, null);
+        } else {
+            log.info("    - bulk loading File {} using Format {}",resourceName,format);
+            try {
+            destination.startBulk() ;
+            RiotReader.parse(is, format, null, destination) ;
+            }catch (RuntimeException e) {
+                return ResourceState.ERROR;
+            } finally {
+                destination.finishBulk() ;
+            }
         }
+// old code - just keep it in case the above else does not support any of the below RDF formats.
+//        if (format == Lang.NTRIPLES) {
+//            BulkLoader.
+//            TDBLoader.load(indexingDataset, is, true);
+//        } else if(format == Lang.NQUADS || format == Lang.TRIG){ //quads
+//            TDBLoader loader = new TDBLoader();
+//            loader.setShowProgress(true);
+//            RDFSt dest = createQuad2TripleDestination();
+//            dest.start();
+//            RiotReader.parseQuads(is,format,null, dest);
+//            dest.finish();
+//        } else if (format != Lang.RDFXML) {
+//            // use RIOT to parse the format but with a special configuration
+//            // RiotReader!
+//            TDBLoader loader = new TDBLoader();
+//            loader.setShowProgress(true);
+//            Destination<Triple> dest = createDestination();
+//            dest.start();
+//            RiotReader.parseTriples(is, format, null, dest);
+//            dest.finish();
+//        } else { // RDFXML
+//            // in that case we need to use ARP
+//            Model model = ModelFactory.createModelForGraph(indexingDataset.getDefaultGraph());
+//            model.read(is, null);
+//        }
         return ResourceState.LOADED;
     }
-    /**
-     * Creates a triple destination for the default dataset of the
-     * {@link #indexingDataset}.
-     * This code is based on how Destinations are created in the {@link BulkLoader},
-     * implementation. Note that
-     * {@link BulkLoader#loadDefaultGraph(DatasetGraphTDB, InputStream, boolean)}
-     * can not be used for formats other than {@link Lang#NTRIPLES} because it
-     * hard codes this format for loading data form the parsed InputStream.
-     * @return the destination!
-     */
-    private Destination<Triple> createDestination() {
-        LoadMonitor monitor = new LoadMonitor(indexingDataset, 
-            log, "triples",50000,100000);
-        final LoaderNodeTupleTable loaderTriples = new LoaderNodeTupleTable(
-            indexingDataset.getTripleTable().getNodeTupleTable(), "triples", monitor) ;
-
-        Destination<Triple> sink = new Destination<Triple>() {
-            long count = 0 ;
-            public final void start()
-            {
-                loaderTriples.loadStart() ;
-                loaderTriples.loadDataStart() ;
-            }
-            public final void send(Triple triple)
-            {
-                loaderTriples.load(triple.getSubject(), triple.getPredicate(), 
-                    triple.getObject()) ;
-                count++ ;
-            }
-
-            public final void flush() { }
-            public void close() { }
-
-            public final void finish()
-            {
-                loaderTriples.loadDataFinish() ;
-                loaderTriples.loadIndexStart() ;
-                loaderTriples.loadIndexFinish() ;
-                loaderTriples.loadFinish() ;
-            }
-        } ;
-        return sink ;
-    }
-    /**
-     * Creates a Destination that consumes {@link Quad}s and stores
-     * {@link Triple}s to the {@link #indexingDataset}
-     * @return
-     */
-    private Destination<Quad> createQuad2TripleDestination() {
-        LoadMonitor monitor = new LoadMonitor(indexingDataset, 
-            log, "triples",50000,100000);
-        final LoaderNodeTupleTable loaderTriples = new LoaderNodeTupleTable(
-            indexingDataset.getTripleTable().getNodeTupleTable(), "triples", monitor) ;
-
-        Destination<Quad> sink = new Destination<Quad>() {
-            //long count = 0 ;
-            public final void start()
-            {
-                loaderTriples.loadStart() ;
-                loaderTriples.loadDataStart() ;
-            }
-            public final void send(Quad quad)
-            {
-                loaderTriples.load(quad.getSubject(), quad.getPredicate(), quad.getObject()) ;
-                //count++ ;
-            }
-
-            public final void flush() { }
-            public void close() { }
-
-            public final void finish()
-            {
-                loaderTriples.loadDataFinish() ;
-                loaderTriples.loadIndexStart() ;
-                loaderTriples.loadIndexFinish() ;
-                loaderTriples.loadFinish() ;
-            }
-
-        } ;
-        return sink ;
-    }
 }

Modified: stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/ResourceFilterIterator.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/ResourceFilterIterator.java?rev=1459296&r1=1459295&r2=1459296&view=diff
==============================================================================
--- stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/ResourceFilterIterator.java (original)
+++ stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/ResourceFilterIterator.java Thu Mar 21 13:19:38 2013
@@ -33,6 +33,7 @@ import org.slf4j.LoggerFactory;
 
 import com.hp.hpl.jena.graph.Node;
 import com.hp.hpl.jena.graph.Triple;
+import com.hp.hpl.jena.sparql.core.DatasetGraph;
 import com.hp.hpl.jena.tdb.store.DatasetGraphTDB;
 import com.hp.hpl.jena.util.iterator.ExtendedIterator;
 
@@ -69,7 +70,7 @@ public class ResourceFilterIterator impl
     /**
      * The RDF data
      */
-    private DatasetGraphTDB indexingDataset;
+    private DatasetGraph indexingDataset;
     /**
      * The Iterator over the current EntityFilter (or <code>null</code> if not
      * yet initialised)

Modified: stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/Utils.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/Utils.java?rev=1459296&r1=1459295&r2=1459296&view=diff
==============================================================================
--- stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/Utils.java (original)
+++ stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/Utils.java Thu Mar 21 13:19:38 2013
@@ -26,9 +26,11 @@ import java.util.Map;
 import org.apache.stanbol.entityhub.indexing.core.IndexingComponent;
 import org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig;
 
+import com.hp.hpl.jena.sparql.core.DatasetGraph;
 import com.hp.hpl.jena.tdb.TDBFactory;
 import com.hp.hpl.jena.tdb.base.file.Location;
 import com.hp.hpl.jena.tdb.store.DatasetGraphTDB;
+import com.hp.hpl.jena.tdb.sys.TDBMaker;
 
 public final class Utils {
 
@@ -57,7 +59,10 @@ public final class Utils {
             }
         }
         Location location = new Location(modelLocation.getAbsolutePath());
-        return TDBFactory.createDatasetGraph(location);
+        //TODO: change this to support transactions
+        //    TDBMaker.createDatasetGraphTransaction(location);
+        //  if we need transaction support!
+        return TDBMaker.createDatasetGraphTDB(location);
     }
 
     /**