Posted to oak-commits@jackrabbit.apache.org by ju...@apache.org on 2014/04/12 10:34:47 UTC

svn commit: r1586835 - in /jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark: BenchmarkRunner.java FullTextSearchTest.java wikipedia/WikipediaImport.java

Author: jukka
Date: Sat Apr 12 08:34:47 2014
New Revision: 1586835

URL: http://svn.apache.org/r1586835
Log:
OAK-1702: Create a benchmark for Full text search

Reuse the existing WikipediaImport code in FullTextSearch.
Make the flatness of the Wikipedia import configurable so that we can
also use larger test sets with Jackrabbit Classic.
Use a better word pattern to avoid the need for convoluted escaping.
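
For context, a minimal sketch of the new sampling step (it mirrors the
WORD_PATTERN added to FullTextSearchTest below; the input string and local
variable names are illustrative only):

    // tokens of three or more letters/digits, taken straight from the page text
    Pattern wordPattern = Pattern.compile("\\p{IsLetterOrDigit}{3,}");
    Matcher matcher = wordPattern.matcher("Lorem ipsum <ref>dolor</ref> sit amet");
    List<String> words = new ArrayList<String>();
    while (matcher.find()) {
        words.add(matcher.group());          // "Lorem", "ipsum", "ref", "dolor", ...
    }
    // the sampled word contains only letters and digits, so it no longer needs
    // Text.escapeIllegalJcrChars before being embedded in a query string
    String sample = words.get(words.size() / 2);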

Modified:
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/BenchmarkRunner.java
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/FullTextSearchTest.java
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java
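
To make the new flatness switch concrete, an oak-run invocation might look
roughly like the following (the exact command-line form, the option names other
than flatStructure, and the fixture name are assumptions for illustration, not
part of this diff):

    java -jar oak-run-*.jar benchmark \
        --wikipedia=/path/to/enwiki-dump.xml \
        --flatStructure=false \
        --report=true \
        FullTextSearchTest Oak-Tar

With flatStructure=false (the default) the import builds the new hash-bucketed
hierarchy, which keeps child counts small enough that larger dumps can also be
run against Jackrabbit Classic style repositories.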

Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/BenchmarkRunner.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/BenchmarkRunner.java?rev=1586835&r1=1586834&r2=1586835&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/BenchmarkRunner.java (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/BenchmarkRunner.java Sat Apr 12 08:34:47 2014
@@ -72,7 +72,7 @@ public class BenchmarkRunner {
                         .defaultsTo(Boolean.FALSE);
         OptionSpec<File> csvFile = parser.accepts("csvFile", "File to write a CSV version of the benchmark data.")
                 .withOptionalArg().ofType(File.class);
-        OptionSpec<Boolean> flatStructure = parser.accepts("flatStructure", "Whether user/group should be setup with a flat structure or not.")
+        OptionSpec<Boolean> flatStructure = parser.accepts("flatStructure", "Whether the test should use a flat structure or not.")
                 .withOptionalArg().ofType(Boolean.class).defaultsTo(Boolean.FALSE);
         OptionSpec<Integer> numberOfUsers = parser.accepts("numberOfUsers")
                 .withOptionalArg().ofType(Integer.class).defaultsTo(10000);
@@ -138,7 +138,10 @@ public class BenchmarkRunner {
             new CreateManyNodesTest(),
             new UpdateManyChildNodesTest(),
             new TransientManyChildNodesTest(),
-            new WikipediaImport(wikipedia.value(options)),
+            new WikipediaImport(
+                    wikipedia.value(options),
+                    flatStructure.value(options),
+                    report.value(options)),
             new CreateNodesBenchmark(),
             new ManyNodes(),
             new ObservationTest(),
@@ -210,6 +213,7 @@ public class BenchmarkRunner {
                     flatStructure.value(options)),
             new FullTextSearchTest(
                     wikipedia.value(options),
+                    flatStructure.value(options),
                     report.value(options))
         };
 

Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/FullTextSearchTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/FullTextSearchTest.java?rev=1586835&r1=1586834&r2=1586835&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/FullTextSearchTest.java (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/FullTextSearchTest.java Sat Apr 12 08:34:47 2014
@@ -16,13 +16,18 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.jackrabbit.oak.benchmark;
 
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.collect.Lists.newArrayList;
+import static com.google.common.collect.Sets.newHashSet;
+
 import java.io.File;
 import java.util.List;
 import java.util.Random;
 import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import javax.jcr.Node;
 import javax.jcr.Repository;
@@ -31,15 +36,8 @@ import javax.jcr.query.Query;
 import javax.jcr.query.QueryManager;
 import javax.jcr.query.QueryResult;
 import javax.jcr.query.RowIterator;
-import javax.xml.stream.XMLInputFactory;
-import javax.xml.stream.XMLStreamConstants;
-import javax.xml.stream.XMLStreamReader;
-import javax.xml.transform.stream.StreamSource;
-
-import com.google.common.base.CharMatcher;
-import com.google.common.base.Splitter;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
+
+import org.apache.jackrabbit.oak.benchmark.wikipedia.WikipediaImport;
 import org.apache.jackrabbit.oak.fixture.JcrCustomizer;
 import org.apache.jackrabbit.oak.fixture.OakRepositoryFixture;
 import org.apache.jackrabbit.oak.fixture.RepositoryFixture;
@@ -47,29 +45,57 @@ import org.apache.jackrabbit.oak.jcr.Jcr
 import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexEditorProvider;
 import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexProvider;
 import org.apache.jackrabbit.oak.plugins.index.lucene.util.LuceneInitializerHelper;
-import org.apache.jackrabbit.util.Text;
-
-import static com.google.common.base.Preconditions.checkArgument;
 
 public class FullTextSearchTest extends AbstractTest<FullTextSearchTest.TestContext> {
+
+    /**
+     * Pattern used to find words and other searchable tokens within the
+     * imported Wikipedia pages.
+     */
+    private static final Pattern WORD_PATTERN =
+            Pattern.compile("\\p{IsLetterOrDigit}{3,}");
+
     private int maxSampleSize = 100;
-    private final File dump;
-    private final boolean doReport;
-    private List<String> sampleSet;
-    private Random random;
+
+    private final WikipediaImport importer;
+
+    private final Set<String> sampleSet = newHashSet();
+
+    private final Random random = new Random(42); //fixed seed
+
     private int maxRowsToFetch = Integer.getInteger("maxRowsToFetch",10000);
+
     private TestContext defaultContext;
 
-    public FullTextSearchTest(File dump, boolean doReport) {
-        this.dump = dump;
-        this.doReport = doReport;
+    public FullTextSearchTest(File dump, boolean flat, boolean doReport) {
+        this.importer = new WikipediaImport(dump, flat, doReport) {
+            private int count = 0;
+            @Override
+            protected void pageAdded(String title, String text) {
+                count++;
+                if (count % 1000 == 0
+                        && sampleSet.size() < maxSampleSize
+                        && text != null) {
+                    List<String> words = newArrayList();
+
+                    Matcher matcher = WORD_PATTERN.matcher(text);
+                    while (matcher.find()) {
+                        words.add(matcher.group());
+                    }
+
+                    if (!words.isEmpty()) {
+                        sampleSet.add(words.get(words.size() / 2));
+                    }
+                }
+            }
+        };
     }
 
     @Override
     public void beforeSuite() throws Exception {
-        random = new Random(42); //fixed seed
-        Session importSession = loginWriter();
-        sampleSet = importWikipedia(importSession);
+        importer.importWikipedia(loginWriter());
+        Thread.sleep(5); // allow some time for the indexer to catch up
+
         defaultContext = new TestContext();
     }
 
@@ -105,7 +131,7 @@ public class FullTextSearchTest extends 
 
     class TestContext {
         final Session session = loginWriter();
-        final String word = Text.escapeIllegalJcrChars(sampleSet.get(random.nextInt(sampleSet.size())));
+        final String word = newArrayList(sampleSet).get(random.nextInt(sampleSet.size()));
     }
 
     @Override
@@ -124,68 +150,4 @@ public class FullTextSearchTest extends 
         return super.createRepository(fixture);
     }
 
-    private List<String> importWikipedia(Session session) throws Exception {
-        long start = System.currentTimeMillis();
-        int count = 0;
-        Set<String> sampleWords = Sets.newHashSet();
-
-        checkArgument(dump.exists(), "Dump file %s does not exist", dump.getAbsolutePath());
-        if (doReport) {
-            System.out.format("Importing %s...%n", dump);
-        }
-        Node wikipedia = session.getRootNode().addNode("wikipedia", "nt:unstructured");
-
-        String title = null;
-        String text = null;
-        XMLInputFactory factory = XMLInputFactory.newInstance();
-        XMLStreamReader reader =
-                factory.createXMLStreamReader(new StreamSource(dump));
-        while (reader.hasNext()) {
-            switch (reader.next()) {
-                case XMLStreamConstants.START_ELEMENT:
-                    if ("title".equals(reader.getLocalName())) {
-                        title = reader.getElementText();
-                    } else if ("text".equals(reader.getLocalName())) {
-                        text = reader.getElementText();
-                    }
-                    break;
-                case XMLStreamConstants.END_ELEMENT:
-                    if ("page".equals(reader.getLocalName())) {
-                        String name = Text.escapeIllegalJcrChars(title);
-                        Node page = wikipedia.addNode(name);
-                        page.setProperty("title", title);
-                        page.setProperty("text", Text.escapeIllegalJcrChars(text));
-                        count++;
-
-                        if (count % 1000 == 0
-                                && sampleWords.size() < maxSampleSize
-                                && text != null) {
-                            List<String> words = Splitter.on(CharMatcher.BREAKING_WHITESPACE)
-                                    .trimResults().splitToList(text);
-                            if (!words.isEmpty()) {
-                                sampleWords.add(words.get(words.size() / 2));
-                            }
-                        }
-
-                        if (doReport && count % 1000 == 0) {
-                            long millis = System.currentTimeMillis() - start;
-                            System.out.format(
-                                    "Added %d pages in %d seconds (%.2fms/page)%n",
-                                    count, millis / 1000, (double) millis / count);
-                        }
-                    }
-                    break;
-            }
-        }
-
-        session.save();
-
-        if (doReport) {
-            long millis = System.currentTimeMillis() - start;
-            System.out.format(
-                    "Imported %d pages in %d seconds (%.2fms/page)%n",
-                    count, millis / 1000, (double) millis / count);
-        }
-        return Lists.newArrayList(sampleWords);
-    }
 }
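
The runTest() body is untouched by this commit and therefore not shown above.
Purely as an illustration of how the sampled word could feed a full-text query,
a minimal sketch using the standard JCR query API; the JCR-SQL2 statement and
the use of maxRowsToFetch here are assumptions, not part of this diff:

    QueryManager qm = session.getWorkspace().getQueryManager();
    Query query = qm.createQuery(
            "SELECT * FROM [nt:base] WHERE CONTAINS(*, '" + word + "')",
            Query.JCR_SQL2);
    RowIterator rows = query.execute().getRows();
    int fetched = 0;
    while (rows.hasNext() && fetched++ < maxRowsToFetch) {
        rows.nextRow();   // touch each row to force result materialization
    }

Because WORD_PATTERN only yields letters and digits, the sampled word can be
embedded in the statement without escaping, which is what made the earlier
Text.escapeIllegalJcrChars call unnecessary.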

Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java?rev=1586835&r1=1586834&r2=1586835&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java Sat Apr 12 08:34:47 2014
@@ -17,12 +17,14 @@
 package org.apache.jackrabbit.oak.benchmark.wikipedia;
 
 import static com.google.common.base.Preconditions.checkState;
+import static java.lang.Math.min;
 
 import java.io.File;
 
 import javax.jcr.Node;
 import javax.jcr.NodeIterator;
 import javax.jcr.Repository;
+import javax.jcr.RepositoryException;
 import javax.jcr.Session;
 import javax.jcr.SimpleCredentials;
 import javax.xml.stream.XMLInputFactory;
@@ -30,6 +32,7 @@ import javax.xml.stream.XMLStreamConstan
 import javax.xml.stream.XMLStreamReader;
 import javax.xml.transform.stream.StreamSource;
 
+import org.apache.jackrabbit.commons.JcrUtils;
 import org.apache.jackrabbit.oak.benchmark.Benchmark;
 import org.apache.jackrabbit.oak.fixture.RepositoryFixture;
 import org.apache.jackrabbit.util.Text;
@@ -38,8 +41,14 @@ public class WikipediaImport extends Ben
 
     private final File dump;
 
-    public WikipediaImport(File dump) {
+    private final boolean doReport;
+
+    private final boolean flat;
+
+    public WikipediaImport(File dump, boolean flat, boolean doReport) {
         this.dump = dump;
+        this.flat = flat;
+        this.doReport = doReport;
     }
 
     @Override
@@ -75,21 +84,34 @@ public class WikipediaImport extends Ben
                 new SimpleCredentials("admin", "admin".toCharArray()));
         try {
             int before = importWikipedia(session);
-            int after = traverseWikipedia(session);
+            int after = new Traversal().traverse(session);
             checkState(before == after, "Import vs. traverse mismatch");
         } finally {
             session.logout();
         }
     }
 
-    private int importWikipedia(Session session) throws Exception {
+    public int importWikipedia(Session session) throws Exception {
         long start = System.currentTimeMillis();
         int count = 0;
         int code = 0;
 
         System.out.format("Importing %s...%n", dump);
-        Node wikipedia = session.getRootNode().addNode(
-                "wikipedia", "oak:Unstructured");
+
+        String type = "nt:unstructured";
+        if (flat) {
+            type = "oak:Unstructured";
+        }
+        Node wikipedia = session.getRootNode().addNode("wikipedia", type);
+
+        int levels = 0;
+        if (!flat) {
+            // calculate the number of levels needed, based on the rough
+            // estimate that the average XML size of a page is about 1kB
+            for (long pages = dump.length() / 1024; pages > 256; pages /= 256) {
+                levels++;
+            }
+        }
 
         String title = null;
         String text = null;
@@ -108,18 +130,34 @@ public class WikipediaImport extends Ben
             case XMLStreamConstants.END_ELEMENT:
                 if ("page".equals(reader.getLocalName())) {
                     String name = Text.escapeIllegalJcrChars(title);
-                    Node page = wikipedia.addNode(name);
+                    Node parent = wikipedia;
+                    if (levels > 0) {
+                        int n = name.length();
+                        for (int i = 0; i < levels; i++) {
+                            int hash = name.substring(min(i, n)).hashCode();
+                            parent = JcrUtils.getOrAddNode(
+                                    parent, String.format("%02x", hash & 0xff));
+                        }
+                    }
+                    Node page = parent.addNode(name);
                     page.setProperty("title", title);
                     page.setProperty("text", text);
                     code += title.hashCode();
                     code += text.hashCode();
                     count++;
                     if (count % 1000 == 0) {
-                        long millis = System.currentTimeMillis() - start;
-                        System.out.format(
-                                "Added %d pages in %d seconds (%.2fms/page)%n",
-                                count, millis / 1000, (double) millis / count);
+                        if (!flat) {
+                            session.save();
+                        }
+                        if (doReport) {
+                            long millis = System.currentTimeMillis() - start;
+                            System.out.format(
+                                    "Added %d pages in %d seconds (%.2fms/page)%n",
+                                    count, millis / 1000, (double) millis / count);
+                        }
                     }
+
+                    pageAdded(title, text);
                 }
                 break;
             }
@@ -127,40 +165,61 @@ public class WikipediaImport extends Ben
 
         session.save();
 
-        long millis = System.currentTimeMillis() - start;
-        System.out.format(
-                "Imported %d pages in %d seconds (%.2fms/page)%n",
-                count, millis / 1000, (double) millis / count);
+        if (doReport) {
+            long millis = System.currentTimeMillis() - start;
+            System.out.format(
+                    "Imported %d pages in %d seconds (%.2fms/page)%n",
+                    count, millis / 1000, (double) millis / count);
+        }
+
         return code;
     }
 
-    private int traverseWikipedia(Session session) throws Exception {
-        long start = System.currentTimeMillis();
-        int count = 0;
-        int code = 0;
+    protected void pageAdded(String title, String text) {
+    }
 
-        System.out.format("Traversing imported pages...%n");
-        Node wikipedia = session.getNode("/wikipedia");
+    private class Traversal {
 
-        NodeIterator pages = wikipedia.getNodes();
-        while (pages.hasNext()) {
-            Node page = pages.nextNode();
-            code += page.getProperty("title").getString().hashCode();
-            code += page.getProperty("text").getString().hashCode();
-            count++;
-            if (count % 1000 == 0) {
+        private final long start = System.currentTimeMillis();
+        private int count = 0;
+        private int code = 0;
+
+        private int traverse(Session session) throws Exception {
+            System.out.format("Traversing imported pages...%n");
+            Node wikipedia = session.getNode("/wikipedia");
+
+            traverse(wikipedia);
+
+            if (doReport) {
                 long millis = System.currentTimeMillis() - start;
                 System.out.format(
-                        "Read %d pages in %d seconds (%.2fms/page)%n",
+                        "Traversed %d pages in %d seconds (%.2fms/page)%n",
                         count, millis / 1000, (double) millis / count);
             }
+
+            return code;
+        }
+
+        private void traverse(Node parent) throws RepositoryException {
+            NodeIterator pages = parent.getNodes();
+            while (pages.hasNext()) {
+                Node page = pages.nextNode();
+
+                code += page.getProperty("title").getString().hashCode();
+                code += page.getProperty("text").getString().hashCode();
+
+                count++;
+                if (count % 1000 == 0 && doReport) {
+                    long millis = System.currentTimeMillis() - start;
+                    System.out.format(
+                            "Read %d pages in %d seconds (%.2fms/page)%n",
+                            count, millis / 1000, (double) millis / count);
+                }
+
+                traverse(page);
+            }
         }
 
-        long millis = System.currentTimeMillis() - start;
-        System.out.format(
-                "Traversed %d pages in %d seconds (%.2fms/page)%n",
-                count, millis / 1000, (double) millis / count);
-        return code;
     }
 
 }
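
As a worked example of the level calculation in importWikipedia() above: a
roughly 1 GB dump is estimated at 2^30 / 1024 = 1,048,576 pages; 1,048,576 /
256 = 4,096 is still greater than 256, while 4,096 / 256 = 16 is not, so
levels ends up as 2 and each page lands under a two-level hash-bucket path
such as /wikipedia/a3/7f/<escaped title> (the concrete bucket names here are
illustrative), keeping every intermediate node to roughly 256 children.

    // standalone sketch of the sizing loop from importWikipedia();
    // the dump size is an assumed example value
    long dumpBytes = 1L << 30;    // ~1 GB dump file
    int levels = 0;
    for (long pages = dumpBytes / 1024; pages > 256; pages /= 256) {
        levels++;                 // 1,048,576 -> 4,096 -> 16, so levels == 2
    }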