You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/03/22 11:22:48 UTC
[lucene] branch main updated: LUCENE-9854: Clean up utilities to download and extract test/ benchmark data sets. (#27)

This is an automated email from the ASF dual-hosted git repository.

dweiss pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new 246c4be  LUCENE-9854: Clean up utilities to download and extract test/ benchmark data sets. (#27)
246c4be is described below

commit 246c4beb22c8b172f381b2eeae2becb9982a4c1b
Author: Dawid Weiss <da...@carrotsearch.com>
AuthorDate: Mon Mar 22 12:22:39 2021 +0100

    LUCENE-9854: Clean up utilities to download and extract test/ benchmark data sets. (#27)
---
 build.gradle                                       |   4 +-
 .../lucene/gradle/datasets}/ExtractReuters.java    |  23 +--
 gradle/datasets/external-datasets.gradle           | 174 +++++++++++++++++++++
 gradle/testing/randomization.gradle                |   2 +-
 gradle/validation/validate-source-patterns.gradle  |   3 +-
 help/tests.txt                                     |   9 ++
 lucene/benchmark/.gitignore                        |   3 +-
 lucene/benchmark/build.gradle                      |  93 -----------
 .../org/apache/lucene/util/LuceneTestCase.java     |  13 +-
 .../lucene/util/RunListenerPrintReproduceInfo.java |   6 +-
 10 files changed, 216 insertions(+), 114 deletions(-)

diff --git a/build.gradle b/build.gradle
index 5c0baca..2789d03 100644
--- a/build.gradle
+++ b/build.gradle
@@ -23,7 +23,7 @@ plugins {
   id "com.palantir.consistent-versions" version "1.14.0"
   id "org.owasp.dependencycheck" version "5.3.0"
   id 'de.thetaphi.forbiddenapis' version '3.1' apply false
-  id "de.undercouch.download" version "4.0.2" apply false
+  id "de.undercouch.download" version "4.1.1" apply false
   id "net.ltgt.errorprone" version "1.2.1" apply false
   id 'com.diffplug.spotless' version "5.8.2" apply false
 }
@@ -156,6 +156,8 @@ apply from: file('gradle/generation/nori.gradle')
 apply from: file('gradle/generation/icu.gradle')
 apply from: file('gradle/generation/javacc.gradle')
 
+apply from: file('gradle/datasets/external-datasets.gradle')
+
 // Shared configuration of subprojects containing native code.
 apply from: file('gradle/native/disable-native.gradle')
 
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java b/buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java
similarity index 90%
rename from lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java
rename to buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java
index 4e3003d..b8d6735 100644
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java
+++ b/buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.benchmark.utils;
+package org.apache.lucene.gradle.datasets;
 
 import java.io.BufferedReader;
 import java.io.BufferedWriter;
@@ -27,10 +27,10 @@ import java.nio.file.Paths;
 import java.nio.file.StandardCopyOption;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-import org.apache.lucene.util.IOUtils;
 
 /**
- * Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
+ * Split the Reuters SGML documents into Simple Text files containing:
+ * Title, Date, Dateline, Body
  */
 public class ExtractReuters {
   private Path reutersDir;
@@ -39,13 +39,16 @@ public class ExtractReuters {
   public ExtractReuters(Path reutersDir, Path outputDir) throws IOException {
     this.reutersDir = reutersDir;
     this.outputDir = outputDir;
-    System.out.println("Deleting all files in " + outputDir);
-    IOUtils.rm(outputDir);
   }
 
   public void extract() throws IOException {
     long count = 0;
     Files.createDirectories(outputDir);
+
+    if (Files.list(outputDir).count() > 0) {
+      throw new IOException("The output directory must be empty: " + outputDir);
+    }
+
     try (DirectoryStream<Path> stream = Files.newDirectoryStream(reutersDir, "*.sgm")) {
       for (Path sgmFile : stream) {
         extractFile(sgmFile);
@@ -53,7 +56,7 @@ public class ExtractReuters {
       }
     }
     if (count == 0) {
-      System.err.println("No .sgm files in " + reutersDir);
+      throw new IOException("No .sgm files in " + reutersDir);
     }
   }
 
@@ -65,7 +68,7 @@ public class ExtractReuters {
   private static String[] META_CHARS_SERIALIZATIONS = {"&amp;", "&lt;", "&gt;", "&quot;", "&apos;"};
 
   /** Override if you wish to change what is extracted */
-  protected void extractFile(Path sgmFile) {
+  protected void extractFile(Path sgmFile) throws IOException {
     try (BufferedReader reader = Files.newBufferedReader(sgmFile, StandardCharsets.ISO_8859_1)) {
       StringBuilder buffer = new StringBuilder(1024);
       StringBuilder outBuffer = new StringBuilder(1024);
@@ -105,8 +108,6 @@ public class ExtractReuters {
           buffer.setLength(0);
         }
       }
-    } catch (IOException e) {
-      throw new RuntimeException(e);
     }
   }
 
@@ -135,6 +136,8 @@ public class ExtractReuters {
     System.err.println(
         "Usage: "
             + msg
-            + " :: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters <Path to Reuters SGM files> <Output Path>");
+            + " :: java -cp <...> "
+            + ExtractReuters.class.getName()
+            + " <Path to Reuters SGM files> <Output Path>");
   }
 }
diff --git a/gradle/datasets/external-datasets.gradle b/gradle/datasets/external-datasets.gradle
new file mode 100644
index 0000000..f466909
--- /dev/null
+++ b/gradle/datasets/external-datasets.gradle
@@ -0,0 +1,174 @@
+import org.apache.lucene.gradle.datasets.ExtractReuters
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// TODO: not sure whether this should live in benchmarks, but for now
+// let it be.
+configure(project(":lucene:benchmark")) {
+  apply plugin: "java"
+  apply plugin: "de.undercouch.download"
+
+  ext {
+    dataDir = file("data")
+  }
+
+  task getEnWiki(type: Download) {
+    ext {
+      name = "enwiki-20070527-pages-articles.xml"
+      src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
+      intermediate = file("${dataDir}/${name}.bz2")
+      dst = file("${dataDir}/${name}")
+    }
+
+    outputs.file ext.dst
+
+    src ext.src
+    dest ext.intermediate
+    overwrite false
+    compress false
+
+    doLast {
+      logger.lifecycle("Decompressing ${ext.name}...")
+      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
+    }
+  }
+
+  task getEnWikiRandomLines(type: Download) {
+    ext {
+      name = "enwiki.random.lines.txt"
+      src = "https://home.apache.org/~mikemccand/${name}.bz2"
+      intermediate = file("${dataDir}/${name}.bz2")
+      dst = file("${dataDir}/${name}")
+    }
+
+    outputs.file ext.dst
+
+    src ext.src
+    dest ext.intermediate
+    overwrite false
+    compress false
+
+    doLast {
+      logger.lifecycle("Decompressing ${ext.name}...")
+      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
+    }
+  }
+
+  task getGeoNames(type: Download) {
+    // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
+    //       and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
+    //       and then compress with: bzip2 -9 -k file_random.txt
+    ext {
+      name = "geonames_20130921_randomOrder_allCountries.txt"
+      src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
+      intermediate = file("${dataDir}/${name}.bz2")
+      dst = file("${dataDir}/${name}")
+    }
+
+    outputs.file ext.dst
+
+    src ext.src
+    dest ext.intermediate
+    overwrite false
+    compress false
+
+    doLast {
+      logger.lifecycle("Decompressing ${ext.name}...")
+      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
+    }
+  }
+
+  task getTop100kWikiWordFiles(type: Download) {
+    ext {
+      name = "top.100k.words.de.en.fr.uk.wikipedia.2009-11"
+      src = "https://home.apache.org/~rmuir/wikipedia/${name}.tar.bz2"
+      intermediate = file("${dataDir}/${name}.bz2")
+      dst = file("${dataDir}/${name}")
+    }
+
+    outputs.dir ext.dst
+
+    src ext.src
+    dest ext.intermediate
+    overwrite false
+    compress false
+
+    doLast {
+      logger.lifecycle("Decompressing ${ext.name}...")
+      project.sync {
+        from tarTree(ext.intermediate) // defined above. Will decompress on the fly
+        into ext.dst
+      }
+    }
+  }
+
+  task getReuters(type: Download) {
+    ext {
+      name = "reuters21578"
+      // note: there is no HTTPS url and we don't care because this is merely test/perf data
+      src = "http://www.daviddlewis.com/resources/testcollections/reuters21578/${name}.tar.gz"
+      intermediate = file("${dataDir}/${name}.tar.gz")
+      dst = file("${dataDir}/${name}")
+    }
+
+    outputs.dir ext.dst
+
+    src ext.src
+    dest ext.intermediate
+    overwrite false
+    compress false
+
+    doLast {
+      def untarPath = file("$temporaryDir/reuters-untar")
+
+      logger.lifecycle("Decompressing ${ext.name}...")
+      project.sync {
+        from(tarTree(intermediate)) {
+          exclude '*.txt'
+        }
+        into untarPath
+      }
+
+      logger.lifecycle("Extracting ${ext.name} into ${ext.dst}...")
+      ext.dst.deleteDir()
+      ExtractReuters.main(untarPath.toString(), ext.dst.toString())
+    }
+  }
+
+  task downloadDatasets() {
+    group "Data set download"
+    description "Download all data sets."
+  }
+
+  [
+      getEnWiki,
+      getGeoNames,
+      getTop100kWikiWordFiles,
+      getReuters,
+      getEnWikiRandomLines
+  ].each { task ->
+    task.group "Data set download"
+    task.description "Download the ${task.ext.name} data set."
+
+    downloadDatasets.dependsOn(task)
+
+    task.doFirst {
+      logger.lifecycle("Downloading data set ${task.ext.name} from ${task.ext.src} to ${task.ext.dst}...")
+    }
+  }
+}
\ No newline at end of file
diff --git a/gradle/testing/randomization.gradle b/gradle/testing/randomization.gradle
index 370b692..fc6c991 100644
--- a/gradle/testing/randomization.gradle
+++ b/gradle/testing/randomization.gradle
@@ -96,7 +96,7 @@ allprojects {
              RandomPicks.randomFrom(new Random(projectSeedLong), ["US-ASCII", "ISO-8859-1", "UTF-8"])
            },
            description: "Sets the default file.encoding on test JVM.", buildOnly: true],
-          // test data
+          // Test data file used.
           [propName: 'tests.linedocsfile', value: 'europarl.lines.txt.gz', description: "Test data file path."],
           // miscellaneous; some of them very weird.
           [propName: 'tests.LUCENE_VERSION', value: baseVersion, description: "Base Lucene version."],
diff --git a/gradle/validation/validate-source-patterns.gradle b/gradle/validation/validate-source-patterns.gradle
index d82f8b3..e9939a1 100644
--- a/gradle/validation/validate-source-patterns.gradle
+++ b/gradle/validation/validate-source-patterns.gradle
@@ -90,8 +90,7 @@ subprojects {
 
 configure(project(':lucene:benchmark')) {
   project.tasks.withType(ValidateSourcePatternsTask) {
-    sourceFiles.exclude 'temp/**'
-    sourceFiles.exclude 'work/**'
+    sourceFiles.exclude 'data/**'
   }
 }
 
diff --git a/help/tests.txt b/help/tests.txt
index 5054c0e..b4a1444 100644
--- a/help/tests.txt
+++ b/help/tests.txt
@@ -155,3 +155,12 @@ Using these additional options will make the results more sparse, so it may be u
 to increase the top-N count:
 
 gradlew -p lucene/core test -Ptests.profile=true -Ptests.profile.count=100
+
+
+External data sets
+------------------
+
+Some tests may require external (and large) data sets. To see relevant tasks
+that download and extract these data files automatically, run the following:
+
+gradlew tasks --group "Data set download"
diff --git a/lucene/benchmark/.gitignore b/lucene/benchmark/.gitignore
index a20524a..249cda9 100644
--- a/lucene/benchmark/.gitignore
+++ b/lucene/benchmark/.gitignore
@@ -1,2 +1 @@
-/temp
-/work
\ No newline at end of file
+/data
\ No newline at end of file
diff --git a/lucene/benchmark/build.gradle b/lucene/benchmark/build.gradle
index 73f1dd7..9271b00 100644
--- a/lucene/benchmark/build.gradle
+++ b/lucene/benchmark/build.gradle
@@ -17,7 +17,6 @@
 
 plugins {
   id "java"
-  id "de.undercouch.download"
 }
 
 description = 'System for benchmarking Lucene'
@@ -44,9 +43,6 @@ dependencies {
   testImplementation project(':lucene:test-framework')
 }
 
-def tempDir = file("temp")
-def workDir = file("work")
-
 task run(type: JavaExec) {
   description "Run a perf test (optional: -PtaskAlg=conf/your-algorithm-file -PmaxHeapSize=1G)"
   main 'org.apache.lucene.benchmark.byTask.Benchmark'
@@ -67,92 +63,3 @@ task run(type: JavaExec) {
     suspend = true
   }
 }
-
-/* Old "collation" Ant target:
-gradle getTop100kWikiWordFiles run -PtaskAlg=conf/collation.alg -PstandardOutput=work/collation.benchmark.output.txt
-perl -CSD scripts/collation.bm2jira.pl work/collation.benchmark.output.txt
- */
-
-/* Old "shingle" Ant target:
-gradle getReuters run -PtaskAlg=conf/shingle.alg -PstandardOutput=work/shingle.benchmark.output.txt
-perl -CSD scripts/shingle.bm2jira.pl work/shingle.benchmark.output.txt
- */
-
-// The remaining tasks just get / extract / prepare data
-
-task getEnWiki(type: Download) {
-  def finalName = "enwiki-20070527-pages-articles.xml"
-  src "https://home.apache.org/~dsmiley/data/" + finalName + ".bz2"
-  dest file("$tempDir/" + finalName + ".bz2")
-  overwrite false
-  compress false
-
-  doLast {
-    ant.bunzip2(src: dest, dest: tempDir)
-  }
-  outputs.file file("$tempDir/$finalName")
-}
-
-task getGeoNames(type: Download) {
-  // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
-  //       and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
-  //       and then compress with: bzip2 -9 -k file_random.txt
-  def finalName = "geonames_20130921_randomOrder_allCountries.txt"
-  src "https://home.apache.org/~dsmiley/data/" + finalName + ".bz2"
-  dest file("$tempDir/" + finalName + ".bz2")
-  overwrite false
-  compress false
-
-  doLast {
-    ant.bunzip2(src: dest, dest: tempDir) // will chop off .bz2
-  }
-  outputs.file file("$tempDir/$finalName")
-}
-
-task getTop100kWikiWordFiles(type: Download) {
-  src "https://home.apache.org/~rmuir/wikipedia/top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2"
-  dest file("$tempDir/${src.file.split('/').last()}")
-  overwrite false
-  compress false
-
-  def finalPath = file("$workDir/top100k-out")
-
-  doLast {
-    project.sync {
-      from tarTree(dest) // defined above.  Will decompress on the fly
-      into finalPath
-    }
-  }
-  outputs.dir finalPath
-}
-
-task getReuters(type: Download) {
-  // note: there is no HTTPS url and we don't care because this is merely test/perf data
-  src "http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz"
-  dest file("$tempDir/${src.file.split('/').last()}")
-  overwrite false
-  compress false
-
-  def untarPath = file("$workDir/reuters")
-  def finalPath = file("$workDir/reuters-out")
-  dependsOn sourceSets.main.runtimeClasspath
-
-  doLast {
-    project.sync {
-      from(tarTree(dest)) { // defined above.  Will decompress on the fly
-        exclude '*.txt'
-      }
-      into untarPath
-    }
-    println "Extracting reuters to $finalPath"
-    finalPath.deleteDir() // necessary
-    // TODO consider porting ExtractReuters to groovy?
-    project.javaexec {
-      main = 'org.apache.lucene.benchmark.utils.ExtractReuters'
-      classpath = sourceSets.main.runtimeClasspath
-      maxHeapSize = '1G'
-      args = [untarPath, finalPath]
-    }
-  }
-  outputs.dir finalPath
-}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
index f05b62f..08d18bc 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
@@ -387,10 +387,17 @@ public abstract class LuceneTestCase extends Assert {
 
   public static final boolean TEST_ASSERTS_ENABLED = systemPropertyAsBoolean("tests.asserts", true);
 
-  /** TODO: javadoc? */
+  /**
+   * The default (embedded resource) lines file.
+   *
+   * @see #TEST_LINE_DOCS_FILE
+   */
   public static final String DEFAULT_LINE_DOCS_FILE = "europarl.lines.txt.gz";
 
-  /** TODO: javadoc? */
+  /**
+   * Random sample from enwiki used in tests. See {@code help/tests.txt}. The Gradle task that
+   * downloads this data set is {@code gradlew getEnWikiRandomLines}.
+   */
   public static final String JENKINS_LARGE_LINE_DOCS_FILE = "enwiki.random.lines.txt";
 
   /** Gets the codec to run tests with. */
@@ -407,7 +414,7 @@ public abstract class LuceneTestCase extends Assert {
   /** Gets the directory to run tests with */
   public static final String TEST_DIRECTORY = System.getProperty("tests.directory", "random");
 
-  /** the line file used by LineFileDocs */
+  /** The line file used in tests (by {@link LineFileDocs}). */
   public static final String TEST_LINE_DOCS_FILE =
       System.getProperty("tests.linedocsfile", DEFAULT_LINE_DOCS_FILE);
 
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/RunListenerPrintReproduceInfo.java b/lucene/test-framework/src/java/org/apache/lucene/util/RunListenerPrintReproduceInfo.java
index e088a71..0f9664b 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/RunListenerPrintReproduceInfo.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/RunListenerPrintReproduceInfo.java
@@ -151,8 +151,10 @@ public final class RunListenerPrintReproduceInfo extends RunListener {
     }
     if (TEST_LINE_DOCS_FILE.endsWith(JENKINS_LARGE_LINE_DOCS_FILE)) {
       System.err.println(
-          "NOTE: download the large Jenkins line-docs file by running "
-              + "'ant get-jenkins-line-docs' in the lucene directory.");
+          "NOTE: large line-docs file was used in this run. You have to download "
+              + "it manually ('gradlew getEnWikiRandomLines') and use the "
+              + "-Ptests.linedocsfile=... property "
+              + "to point to it.");
     }
 
     final StringBuilder b = new StringBuilder();