You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/03/21 14:19:39 UTC
svn commit: r1459296 - in /stanbol/trunk/entityhub/indexing/source/jenatdb:
./ src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/
Author: rwesten
Date: Thu Mar 21 13:19:38 2013
New Revision: 1459296
URL: http://svn.apache.org/r1459296
Log:
STANBOL-992: updated the Jena TDB indexing source to the current Jena / Jena TDB version. Also fixed bz2 decoding (see http://s.apache.org/QbK)
Added:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java
Modified:
stanbol/trunk/entityhub/indexing/source/jenatdb/pom.xml
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/ResourceFilterIterator.java
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/Utils.java
Modified: stanbol/trunk/entityhub/indexing/source/jenatdb/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/pom.xml?rev=1459296&r1=1459295&r2=1459296&view=diff
==============================================================================
--- stanbol/trunk/entityhub/indexing/source/jenatdb/pom.xml (original)
+++ stanbol/trunk/entityhub/indexing/source/jenatdb/pom.xml Thu Mar 21 13:19:38 2013
@@ -115,8 +115,9 @@
<artifactId>commons-compress</artifactId>
</dependency>
<dependency>
- <groupId>com.hp.hpl.jena</groupId>
- <artifactId>jena</artifactId>
+ <groupId>org.apache.jena</groupId>
+ <artifactId>jena-core</artifactId>
+ <version>2.10.0</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
@@ -125,12 +126,14 @@
</exclusions>
</dependency>
<dependency>
- <groupId>com.hp.hpl.jena</groupId>
- <artifactId>tdb</artifactId>
+ <groupId>org.apache.jena</groupId>
+ <artifactId>jena-tdb</artifactId>
+ <version>0.10.0</version>
</dependency>
<dependency>
- <groupId>com.hp.hpl.jena</groupId>
- <artifactId>arq</artifactId>
+ <groupId>org.apache.jena</groupId>
+ <artifactId>jena-arq</artifactId>
+ <version>2.10.0</version>
</dependency>
<!-- dependencies for testing -->
<dependency> <!-- used for debug level logging during tests -->
Added: stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java?rev=1459296&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java (added)
+++ stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java Thu Mar 21 13:19:38 2013
@@ -0,0 +1,120 @@
+package org.apache.stanbol.entityhub.indexing.source.jenatdb;
+
+import org.apache.jena.atlas.lib.Tuple;
+import org.slf4j.Logger;
+
+import com.hp.hpl.jena.graph.Node;
+import com.hp.hpl.jena.graph.Triple;
+import com.hp.hpl.jena.sparql.core.Quad;
+import com.hp.hpl.jena.tdb.TDBException;
+import com.hp.hpl.jena.tdb.TDBLoader;
+import com.hp.hpl.jena.tdb.solver.stats.Stats;
+import com.hp.hpl.jena.tdb.solver.stats.StatsCollector;
+import com.hp.hpl.jena.tdb.store.DatasetGraphTDB;
+import com.hp.hpl.jena.tdb.store.TripleTable;
+import com.hp.hpl.jena.tdb.store.bulkloader.BulkLoader;
+import com.hp.hpl.jena.tdb.store.bulkloader.BulkStreamRDF;
+import com.hp.hpl.jena.tdb.store.bulkloader.LoadMonitor;
+import com.hp.hpl.jena.tdb.store.bulkloader.LoaderNodeTupleTable;
+import com.hp.hpl.jena.tdb.sys.Names;
+
+/**
+ * A {@link BulkStreamRDF} implementation that loads everything it receives
+ * into the {@link TripleTable} of the {@link DatasetGraphTDB} passed to the
+ * constructor. {@link Quad}s are reduced to their triple and {@link Tuple}s
+ * with &gt;= 3 nodes to their first three nodes; shorter tuples are rejected.
+ * <p>
+ * This code is based on the DestinationGraph implementation private to the
+ * {@link TDBLoader} class.
+ *
+ * @author Rupert Westenthaler
+ */
+class DestinationTripleGraph implements BulkStreamRDF {
+ final private DatasetGraphTDB dsg ; // dataset whose triple table is loaded
+ final private LoadMonitor monitor ; // load monitor handed to loaderTriples
+ final private LoaderNodeTupleTable loaderTriples ; // writes into the node tuple table of dsg's triple table
+ final private boolean startedEmpty ; // value of dsg.isEmpty() at construction time
+ private long count = 0 ; // incremented per loaded triple/tuple; never read within this class
+ private StatsCollector stats ; // null until startBulk() is called; replaced on every startBulk()
+
+ DestinationTripleGraph(final DatasetGraphTDB dsg, Logger log) { // log is only passed on to the LoadMonitor
+ this.dsg = dsg ;
+ startedEmpty = dsg.isEmpty() ;
+ monitor = new LoadMonitor(dsg, log, "triples", BulkLoader.DataTickPoint, BulkLoader.IndexTickPoint) ;
+ loaderTriples = new LoaderNodeTupleTable(dsg.getTripleTable().getNodeTupleTable(), "triples", monitor) ;
+ }
+
+ @Override
+ final public void startBulk() // opens the load and its data phase, then starts a fresh StatsCollector
+ {
+ loaderTriples.loadStart() ;
+ loaderTriples.loadDataStart() ;
+
+ this.stats = new StatsCollector() ;
+ }
+ @Override
+ final public void triple(Triple triple) // loads one triple and records it in the stats; needs a prior startBulk()
+ {
+ Node s = triple.getSubject() ;
+ Node p = triple.getPredicate() ;
+ Node o = triple.getObject() ;
+
+ loaderTriples.load(s, p, o) ;
+ stats.record(null, s, p, o) ; // first argument is null - presumably the graph node; TODO confirm against StatsCollector
+ count++ ;
+ }
+
+ @Override
+ final public void finishBulk() // closes data phase, runs index phase, optionally writes stats, then syncs
+ {
+ loaderTriples.loadDataFinish() ;
+ loaderTriples.loadIndexStart() ;
+ loaderTriples.loadIndexFinish() ;
+ loaderTriples.loadFinish() ;
+
+ if ( ! dsg.getLocation().isMem() && startedEmpty ) // stats file only for non-memory datasets that were empty at construction
+ {
+ String filename = dsg.getLocation().getPath(Names.optStats) ;
+ Stats.write(filename, stats) ; // NOTE(review): stats is null if startBulk() was never called - confirm callers
+ }
+ forceSync(dsg) ;
+ }
+
+ @Override
+ public void start() {} // intentionally empty
+ @Override
+ public void quad(Quad quad) { // the quad is converted with asTriple() and handled like any other triple
+ triple(quad.asTriple());
+ }
+ @Override
+ public void tuple(Tuple<Node> tuple) { // same load/record/count steps as triple(), applied to the first three nodes
+ if(tuple.size() >= 3){
+ loaderTriples.load(tuple.get(0), tuple.get(1), tuple.get(2)) ; // nodes at index 3 and above are not used
+ stats.record(null, tuple.get(0), tuple.get(1), tuple.get(2)) ;
+ count++ ;
+ } else {
+ throw new TDBException("Tuple with < 3 Nodes encountered while loading a single graph");
+ }
+ }
+ @Override
+ public void base(String base) { } // intentionally empty: the base is ignored
+ @Override
+ public void prefix(String prefix, String iri) { } // TODO: prefix declarations are currently ignored
+ @Override
+ public void finish() {} // intentionally empty
+
+
+ static void forceSync(DatasetGraphTDB dsg) // syncs the node tables reachable from dsg, then dsg itself
+ {
+ // Force a sync - the bulk load has been bypassing the DSG tables.
+ // THIS DOES NOT WORK IF modules check for SYNC necessity.
+ dsg.getTripleTable().getNodeTupleTable().getNodeTable().sync();
+ dsg.getQuadTable().getNodeTupleTable().getNodeTable().sync();
+ dsg.getQuadTable().getNodeTupleTable().getNodeTable().sync(); // NOTE(review): identical to the previous line - redundant or meant for another table? confirm
+ dsg.getPrefixes().getNodeTupleTable().getNodeTable().sync();
+ // This is not enough -- modules check whether a sync is needed.
+ dsg.sync() ;
+
+ }
+}
+
Modified: stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java?rev=1459296&r1=1459295&r2=1459296&view=diff
==============================================================================
--- stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java (original)
+++ stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java Thu Mar 21 13:19:38 2013
@@ -16,42 +16,33 @@
*/
package org.apache.stanbol.entityhub.indexing.source.jenatdb;
-import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.io.FilenameUtils;
-import org.apache.stanbol.entityhub.indexing.core.source.ResourceState;
+import org.apache.jena.riot.Lang;
+import org.apache.jena.riot.RDFLanguages;
+import org.apache.jena.riot.RiotReader;
import org.apache.stanbol.entityhub.indexing.core.source.ResourceImporter;
-import org.openjena.riot.Lang;
-import org.openjena.riot.RiotReader;
+import org.apache.stanbol.entityhub.indexing.core.source.ResourceState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.hp.hpl.jena.graph.Node;
-import com.hp.hpl.jena.graph.Triple;
-import com.hp.hpl.jena.rdf.model.AnonId;
-import com.hp.hpl.jena.rdf.model.Model;
-import com.hp.hpl.jena.rdf.model.ModelFactory;
-import com.hp.hpl.jena.sparql.core.Quad;
-import com.hp.hpl.jena.tdb.TDBLoader;
import com.hp.hpl.jena.tdb.store.DatasetGraphTDB;
-import com.hp.hpl.jena.tdb.store.bulkloader.BulkLoader;
-import com.hp.hpl.jena.tdb.store.bulkloader.Destination;
-import com.hp.hpl.jena.tdb.store.bulkloader.LoadMonitor;
-import com.hp.hpl.jena.tdb.store.bulkloader.LoaderNodeTupleTable;
public class RdfResourceImporter implements ResourceImporter {
private static final Logger log = LoggerFactory.getLogger(RdfResourceImporter.class);
- private final DatasetGraphTDB indexingDataset;
+ // private final DatasetGraphTDB indexingDataset;
+ private final DestinationTripleGraph destination;
public RdfResourceImporter(DatasetGraphTDB indexingDataset){
if(indexingDataset == null){
throw new IllegalArgumentException("The parsed DatasetGraphTDB instance MUST NOT be NULL!");
}
- this.indexingDataset = indexingDataset;
+ //this.indexingDataset = indexingDataset;
+ this.destination = new DestinationTripleGraph(indexingDataset,log);
}
@Override
@@ -62,119 +53,51 @@ public class RdfResourceImporter impleme
name = FilenameUtils.removeExtension(name);
log.debug(" - from GZIP Archive");
} else if ("bz2".equalsIgnoreCase(FilenameUtils.getExtension(name))) {
- is = new BZip2CompressorInputStream(is);
+ is = new BZip2CompressorInputStream(is,
+ true); //use true as 2nd param (see http://s.apache.org/QbK)
name = FilenameUtils.removeExtension(name);
log.debug(" - from BZip2 Archive");
}// TODO: No Zip Files inside Zip Files supported :o( ^^
- Lang format = Lang.guess(name);
- // For N-Triple we can use the TDBLoader
+ Lang format = RDFLanguages.filenameToLang(name);
if (format == null) {
log.warn("ignore File {} because of unknown extension ");
return ResourceState.IGNORED;
- } else if (format == Lang.NTRIPLES) {
- TDBLoader.load(indexingDataset, is, true);
- } else if(format == Lang.NQUADS || format == Lang.TRIG){ //quads
- TDBLoader loader = new TDBLoader();
- loader.setShowProgress(true);
- Destination<Quad> dest = createQuad2TripleDestination();
- dest.start();
- RiotReader.parseQuads(is,format,null, dest);
- dest.finish();
- } else if (format != Lang.RDFXML) {
- // use RIOT to parse the format but with a special configuration
- // RiotReader!
- TDBLoader loader = new TDBLoader();
- loader.setShowProgress(true);
- Destination<Triple> dest = createDestination();
- dest.start();
- RiotReader.parseTriples(is, format, null, dest);
- dest.finish();
- } else { // RDFXML
- // in that case we need to use ARP
- Model model = ModelFactory.createModelForGraph(indexingDataset.getDefaultGraph());
- model.read(is, null);
+ } else {
+ log.info(" - bulk loading File {} using Format {}",resourceName,format);
+ try {
+ destination.startBulk() ;
+ RiotReader.parse(is, format, null, destination) ;
+ }catch (RuntimeException e) {
+ return ResourceState.ERROR;
+ } finally {
+ destination.finishBulk() ;
+ }
}
+// old code - just keep it in case the above else does not support any of the below RDF formats.
+// if (format == Lang.NTRIPLES) {
+// BulkLoader.
+// TDBLoader.load(indexingDataset, is, true);
+// } else if(format == Lang.NQUADS || format == Lang.TRIG){ //quads
+// TDBLoader loader = new TDBLoader();
+// loader.setShowProgress(true);
+// RDFSt dest = createQuad2TripleDestination();
+// dest.start();
+// RiotReader.parseQuads(is,format,null, dest);
+// dest.finish();
+// } else if (format != Lang.RDFXML) {
+// // use RIOT to parse the format but with a special configuration
+// // RiotReader!
+// TDBLoader loader = new TDBLoader();
+// loader.setShowProgress(true);
+// Destination<Triple> dest = createDestination();
+// dest.start();
+// RiotReader.parseTriples(is, format, null, dest);
+// dest.finish();
+// } else { // RDFXML
+// // in that case we need to use ARP
+// Model model = ModelFactory.createModelForGraph(indexingDataset.getDefaultGraph());
+// model.read(is, null);
+// }
return ResourceState.LOADED;
}
- /**
- * Creates a triple destination for the default dataset of the
- * {@link #indexingDataset}.
- * This code is based on how Destinations are created in the {@link BulkLoader},
- * implementation. Note that
- * {@link BulkLoader#loadDefaultGraph(DatasetGraphTDB, InputStream, boolean)}
- * can not be used for formats other than {@link Lang#NTRIPLES} because it
- * hard codes this format for loading data form the parsed InputStream.
- * @return the destination!
- */
- private Destination<Triple> createDestination() {
- LoadMonitor monitor = new LoadMonitor(indexingDataset,
- log, "triples",50000,100000);
- final LoaderNodeTupleTable loaderTriples = new LoaderNodeTupleTable(
- indexingDataset.getTripleTable().getNodeTupleTable(), "triples", monitor) ;
-
- Destination<Triple> sink = new Destination<Triple>() {
- long count = 0 ;
- public final void start()
- {
- loaderTriples.loadStart() ;
- loaderTriples.loadDataStart() ;
- }
- public final void send(Triple triple)
- {
- loaderTriples.load(triple.getSubject(), triple.getPredicate(),
- triple.getObject()) ;
- count++ ;
- }
-
- public final void flush() { }
- public void close() { }
-
- public final void finish()
- {
- loaderTriples.loadDataFinish() ;
- loaderTriples.loadIndexStart() ;
- loaderTriples.loadIndexFinish() ;
- loaderTriples.loadFinish() ;
- }
- } ;
- return sink ;
- }
- /**
- * Creates a Destination that consumes {@link Quad}s and stores
- * {@link Triple}s to the {@link #indexingDataset}
- * @return
- */
- private Destination<Quad> createQuad2TripleDestination() {
- LoadMonitor monitor = new LoadMonitor(indexingDataset,
- log, "triples",50000,100000);
- final LoaderNodeTupleTable loaderTriples = new LoaderNodeTupleTable(
- indexingDataset.getTripleTable().getNodeTupleTable(), "triples", monitor) ;
-
- Destination<Quad> sink = new Destination<Quad>() {
- //long count = 0 ;
- public final void start()
- {
- loaderTriples.loadStart() ;
- loaderTriples.loadDataStart() ;
- }
- public final void send(Quad quad)
- {
- loaderTriples.load(quad.getSubject(), quad.getPredicate(), quad.getObject()) ;
- //count++ ;
- }
-
- public final void flush() { }
- public void close() { }
-
- public final void finish()
- {
- loaderTriples.loadDataFinish() ;
- loaderTriples.loadIndexStart() ;
- loaderTriples.loadIndexFinish() ;
- loaderTriples.loadFinish() ;
- }
-
- } ;
- return sink ;
- }
}
Modified: stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/ResourceFilterIterator.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/ResourceFilterIterator.java?rev=1459296&r1=1459295&r2=1459296&view=diff
==============================================================================
--- stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/ResourceFilterIterator.java (original)
+++ stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/ResourceFilterIterator.java Thu Mar 21 13:19:38 2013
@@ -33,6 +33,7 @@ import org.slf4j.LoggerFactory;
import com.hp.hpl.jena.graph.Node;
import com.hp.hpl.jena.graph.Triple;
+import com.hp.hpl.jena.sparql.core.DatasetGraph;
import com.hp.hpl.jena.tdb.store.DatasetGraphTDB;
import com.hp.hpl.jena.util.iterator.ExtendedIterator;
@@ -69,7 +70,7 @@ public class ResourceFilterIterator impl
/**
* The RDF data
*/
- private DatasetGraphTDB indexingDataset;
+ private DatasetGraph indexingDataset;
/**
* The Iterator over the current EntityFilter (or <code>null</code> if not
* yet initialised)
Modified: stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/Utils.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/Utils.java?rev=1459296&r1=1459295&r2=1459296&view=diff
==============================================================================
--- stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/Utils.java (original)
+++ stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/Utils.java Thu Mar 21 13:19:38 2013
@@ -26,9 +26,11 @@ import java.util.Map;
import org.apache.stanbol.entityhub.indexing.core.IndexingComponent;
import org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig;
+import com.hp.hpl.jena.sparql.core.DatasetGraph;
import com.hp.hpl.jena.tdb.TDBFactory;
import com.hp.hpl.jena.tdb.base.file.Location;
import com.hp.hpl.jena.tdb.store.DatasetGraphTDB;
+import com.hp.hpl.jena.tdb.sys.TDBMaker;
public final class Utils {
@@ -57,7 +59,10 @@ public final class Utils {
}
}
Location location = new Location(modelLocation.getAbsolutePath());
- return TDBFactory.createDatasetGraph(location);
+ //TODO: change this to support transactions
+ // TDBMaker.createDatasetGraphTransaction(location);
+ // if we need transaction support!
+ return TDBMaker.createDatasetGraphTDB(location);
}
/**