You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by mr...@apache.org on 2014/05/02 10:41:29 UTC

svn commit: r1591826 - in /jackrabbit/oak/trunk/oak-run: pom.xml src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java

Author: mreutegg
Date: Fri May  2 08:41:29 2014
New Revision: 1591826

URL: http://svn.apache.org/r1591826
Log:
OAK-1790: Import of compressed wikipedia dump

Modified:
    jackrabbit/oak/trunk/oak-run/pom.xml
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java

Modified: jackrabbit/oak/trunk/oak-run/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/pom.xml?rev=1591826&r1=1591825&r2=1591826&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/pom.xml (original)
+++ jackrabbit/oak/trunk/oak-run/pom.xml Fri May  2 08:41:29 2014
@@ -183,6 +183,11 @@
       <version>2.0</version>
     </dependency>
     <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-compress</artifactId>
+      <version>1.8</version>
+    </dependency>
+    <dependency>
       <groupId>org.eclipse.jetty</groupId>
       <artifactId>jetty-servlet</artifactId>
       <version>${jetty.version}</version>

Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java?rev=1591826&r1=1591825&r2=1591826&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java Fri May  2 08:41:29 2014
@@ -19,7 +19,9 @@ package org.apache.jackrabbit.oak.benchm
 import static com.google.common.base.Preconditions.checkState;
 import static java.lang.Math.min;
 
+import java.io.BufferedInputStream;
 import java.io.File;
+import java.io.FileInputStream;
 
 import javax.jcr.Node;
 import javax.jcr.NodeIterator;
@@ -32,6 +34,7 @@ import javax.xml.stream.XMLStreamConstan
 import javax.xml.stream.XMLStreamReader;
 import javax.xml.transform.stream.StreamSource;
 
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.apache.jackrabbit.commons.JcrUtils;
 import org.apache.jackrabbit.oak.benchmark.Benchmark;
 import org.apache.jackrabbit.oak.fixture.RepositoryFixture;
@@ -101,7 +104,7 @@ public class WikipediaImport extends Ben
         }
 
         String type = "nt:unstructured";
-        if (flat) {
+        if (session.getWorkspace().getNodeTypeManager().hasNodeType("oak:Unstructured")) {
             type = "oak:Unstructured";
         }
         Node wikipedia = session.getRootNode().addNode("wikipedia", type);
@@ -118,8 +121,15 @@ public class WikipediaImport extends Ben
         String title = null;
         String text = null;
         XMLInputFactory factory = XMLInputFactory.newInstance();
-        XMLStreamReader reader =
-                factory.createXMLStreamReader(new StreamSource(dump));
+        StreamSource source;
+        if (dump.getName().endsWith(".xml")) {
+            source = new StreamSource(dump);
+        } else {
+            CompressorStreamFactory csf = new CompressorStreamFactory();
+            source = new StreamSource(csf.createCompressorInputStream(
+                    new BufferedInputStream(new FileInputStream(dump))));
+        }
+        XMLStreamReader reader = factory.createXMLStreamReader(source);
         while (reader.hasNext()) {
             switch (reader.next()) {
             case XMLStreamConstants.START_ELEMENT: