You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/03/22 11:22:48 UTC
[lucene] branch main updated: LUCENE-9854: Clean up utilities to
download and extract test/ benchmark data sets. (#27)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 246c4be LUCENE-9854: Clean up utilities to download and extract test/ benchmark data sets. (#27)
246c4be is described below
commit 246c4beb22c8b172f381b2eeae2becb9982a4c1b
Author: Dawid Weiss <da...@carrotsearch.com>
AuthorDate: Mon Mar 22 12:22:39 2021 +0100
LUCENE-9854: Clean up utilities to download and extract test/ benchmark data sets. (#27)
---
build.gradle | 4 +-
.../lucene/gradle/datasets}/ExtractReuters.java | 23 +--
gradle/datasets/external-datasets.gradle | 174 +++++++++++++++++++++
gradle/testing/randomization.gradle | 2 +-
gradle/validation/validate-source-patterns.gradle | 3 +-
help/tests.txt | 9 ++
lucene/benchmark/.gitignore | 3 +-
lucene/benchmark/build.gradle | 93 -----------
.../org/apache/lucene/util/LuceneTestCase.java | 13 +-
.../lucene/util/RunListenerPrintReproduceInfo.java | 6 +-
10 files changed, 216 insertions(+), 114 deletions(-)
diff --git a/build.gradle b/build.gradle
index 5c0baca..2789d03 100644
--- a/build.gradle
+++ b/build.gradle
@@ -23,7 +23,7 @@ plugins {
id "com.palantir.consistent-versions" version "1.14.0"
id "org.owasp.dependencycheck" version "5.3.0"
id 'de.thetaphi.forbiddenapis' version '3.1' apply false
- id "de.undercouch.download" version "4.0.2" apply false
+ id "de.undercouch.download" version "4.1.1" apply false
id "net.ltgt.errorprone" version "1.2.1" apply false
id 'com.diffplug.spotless' version "5.8.2" apply false
}
@@ -156,6 +156,8 @@ apply from: file('gradle/generation/nori.gradle')
apply from: file('gradle/generation/icu.gradle')
apply from: file('gradle/generation/javacc.gradle')
+apply from: file('gradle/datasets/external-datasets.gradle')
+
// Shared configuration of subprojects containing native code.
apply from: file('gradle/native/disable-native.gradle')
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java b/buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java
similarity index 90%
rename from lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java
rename to buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java
index 4e3003d..b8d6735 100644
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java
+++ b/buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.benchmark.utils;
+package org.apache.lucene.gradle.datasets;
import java.io.BufferedReader;
import java.io.BufferedWriter;
@@ -27,10 +27,10 @@ import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.lucene.util.IOUtils;
/**
- * Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
+ * Split the Reuters SGML documents into Simple Text files containing:
+ * Title, Date, Dateline, Body
*/
public class ExtractReuters {
private Path reutersDir;
@@ -39,13 +39,16 @@ public class ExtractReuters {
public ExtractReuters(Path reutersDir, Path outputDir) throws IOException {
this.reutersDir = reutersDir;
this.outputDir = outputDir;
- System.out.println("Deleting all files in " + outputDir);
- IOUtils.rm(outputDir);
}
public void extract() throws IOException {
long count = 0;
Files.createDirectories(outputDir);
+
+ if (Files.list(outputDir).count() > 0) {
+ throw new IOException("The output directory must be empty: " + outputDir);
+ }
+
try (DirectoryStream<Path> stream = Files.newDirectoryStream(reutersDir, "*.sgm")) {
for (Path sgmFile : stream) {
extractFile(sgmFile);
@@ -53,7 +56,7 @@ public class ExtractReuters {
}
}
if (count == 0) {
- System.err.println("No .sgm files in " + reutersDir);
+ throw new IOException("No .sgm files in " + reutersDir);
}
}
@@ -65,7 +68,7 @@ public class ExtractReuters {
private static String[] META_CHARS_SERIALIZATIONS = {"&amp;", "&lt;", "&gt;", "&quot;", "&apos;"};
/** Override if you wish to change what is extracted */
- protected void extractFile(Path sgmFile) {
+ protected void extractFile(Path sgmFile) throws IOException {
try (BufferedReader reader = Files.newBufferedReader(sgmFile, StandardCharsets.ISO_8859_1)) {
StringBuilder buffer = new StringBuilder(1024);
StringBuilder outBuffer = new StringBuilder(1024);
@@ -105,8 +108,6 @@ public class ExtractReuters {
buffer.setLength(0);
}
}
- } catch (IOException e) {
- throw new RuntimeException(e);
}
}
@@ -135,6 +136,8 @@ public class ExtractReuters {
System.err.println(
"Usage: "
+ msg
- + " :: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters <Path to Reuters SGM files> <Output Path>");
+ + " :: java -cp <...> "
+ + ExtractReuters.class.getName()
+ + " <Path to Reuters SGM files> <Output Path>");
}
}
diff --git a/gradle/datasets/external-datasets.gradle b/gradle/datasets/external-datasets.gradle
new file mode 100644
index 0000000..f466909
--- /dev/null
+++ b/gradle/datasets/external-datasets.gradle
@@ -0,0 +1,174 @@
+import org.apache.lucene.gradle.datasets.ExtractReuters
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// TODO: not sure whether this should live in benchmarks, but for now
+// let it be.
+configure(project(":lucene:benchmark")) {
+ apply plugin: "java"
+ apply plugin: "de.undercouch.download"
+
+ ext {
+ dataDir = file("data")
+ }
+
+ task getEnWiki(type: Download) {
+ ext {
+ name = "enwiki-20070527-pages-articles.xml"
+ src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
+ intermediate = file("${dataDir}/${name}.bz2")
+ dst = file("${dataDir}/${name}")
+ }
+
+ outputs.file ext.dst
+
+ src ext.src
+ dest ext.intermediate
+ overwrite false
+ compress false
+
+ doLast {
+ logger.lifecycle("Decompressing ${ext.name}...")
+ ant.bunzip2(src: ext.intermediate, dest: ext.dst)
+ }
+ }
+
+ task getEnWikiRandomLines(type: Download) {
+ ext {
+ name = "enwiki.random.lines.txt"
+ src = "https://home.apache.org/~mikemccand/${name}.bz2"
+ intermediate = file("${dataDir}/${name}.bz2")
+ dst = file("${dataDir}/${name}")
+ }
+
+ outputs.file ext.dst
+
+ src ext.src
+ dest ext.intermediate
+ overwrite false
+ compress false
+
+ doLast {
+ logger.lifecycle("Decompressing ${ext.name}...")
+ ant.bunzip2(src: ext.intermediate, dest: ext.dst)
+ }
+ }
+
+ task getGeoNames(type: Download) {
+ // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
+ // and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
+ // and then compress with: bzip2 -9 -k file_random.txt
+ ext {
+ name = "geonames_20130921_randomOrder_allCountries.txt"
+ src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
+ intermediate = file("${dataDir}/${name}.bz2")
+ dst = file("${dataDir}/${name}")
+ }
+
+ outputs.file ext.dst
+
+ src ext.src
+ dest ext.intermediate
+ overwrite false
+ compress false
+
+ doLast {
+ logger.lifecycle("Decompressing ${ext.name}...")
+ ant.bunzip2(src: ext.intermediate, dest: ext.dst)
+ }
+ }
+
+ task getTop100kWikiWordFiles(type: Download) {
+ ext {
+ name = "top.100k.words.de.en.fr.uk.wikipedia.2009-11"
+ src = "https://home.apache.org/~rmuir/wikipedia/${name}.tar.bz2"
+ intermediate = file("${dataDir}/${name}.bz2")
+ dst = file("${dataDir}/${name}")
+ }
+
+ outputs.dir ext.dst
+
+ src ext.src
+ dest ext.intermediate
+ overwrite false
+ compress false
+
+ doLast {
+ logger.lifecycle("Decompressing ${ext.name}...")
+ project.sync {
+ from tarTree(ext.intermediate) // defined above. Will decompress on the fly
+ into ext.dst
+ }
+ }
+ }
+
+ task getReuters(type: Download) {
+ ext {
+ name = "reuters21578"
+ // note: there is no HTTPS url and we don't care because this is merely test/perf data
+ src = "http://www.daviddlewis.com/resources/testcollections/reuters21578/${name}.tar.gz"
+ intermediate = file("${dataDir}/${name}.tar.gz")
+ dst = file("${dataDir}/${name}")
+ }
+
+ outputs.dir ext.dst
+
+ src ext.src
+ dest ext.intermediate
+ overwrite false
+ compress false
+
+ doLast {
+ def untarPath = file("$temporaryDir/reuters-untar")
+
+ logger.lifecycle("Decompressing ${ext.name}...")
+ project.sync {
+ from(tarTree(intermediate)) {
+ exclude '*.txt'
+ }
+ into untarPath
+ }
+
+ logger.lifecycle("Extracting ${ext.name} into ${ext.dst}...")
+ ext.dst.deleteDir()
+ ExtractReuters.main(untarPath.toString(), ext.dst.toString())
+ }
+ }
+
+ task downloadDatasets() {
+ group "Data set download"
+ description "Download all data sets."
+ }
+
+ [
+ getEnWiki,
+ getGeoNames,
+ getTop100kWikiWordFiles,
+ getReuters,
+ getEnWikiRandomLines
+ ].each { task ->
+ task.group "Data set download"
+ task.description "Download the ${task.ext.name} data set."
+
+ downloadDatasets.dependsOn(task)
+
+ task.doFirst {
+ logger.lifecycle("Downloading data set ${task.ext.name} from ${task.ext.src} to ${task.ext.dst}...")
+ }
+ }
+}
\ No newline at end of file
diff --git a/gradle/testing/randomization.gradle b/gradle/testing/randomization.gradle
index 370b692..fc6c991 100644
--- a/gradle/testing/randomization.gradle
+++ b/gradle/testing/randomization.gradle
@@ -96,7 +96,7 @@ allprojects {
RandomPicks.randomFrom(new Random(projectSeedLong), ["US-ASCII", "ISO-8859-1", "UTF-8"])
},
description: "Sets the default file.encoding on test JVM.", buildOnly: true],
- // test data
+ // Test data file used.
[propName: 'tests.linedocsfile', value: 'europarl.lines.txt.gz', description: "Test data file path."],
// miscellaneous; some of them very weird.
[propName: 'tests.LUCENE_VERSION', value: baseVersion, description: "Base Lucene version."],
diff --git a/gradle/validation/validate-source-patterns.gradle b/gradle/validation/validate-source-patterns.gradle
index d82f8b3..e9939a1 100644
--- a/gradle/validation/validate-source-patterns.gradle
+++ b/gradle/validation/validate-source-patterns.gradle
@@ -90,8 +90,7 @@ subprojects {
configure(project(':lucene:benchmark')) {
project.tasks.withType(ValidateSourcePatternsTask) {
- sourceFiles.exclude 'temp/**'
- sourceFiles.exclude 'work/**'
+ sourceFiles.exclude 'data/**'
}
}
diff --git a/help/tests.txt b/help/tests.txt
index 5054c0e..b4a1444 100644
--- a/help/tests.txt
+++ b/help/tests.txt
@@ -155,3 +155,12 @@ Using these additional options will make the results more sparse, so it may be u
to increase the top-N count:
gradlew -p lucene/core test -Ptests.profile=true -Ptests.profile.count=100
+
+
+External data sets
+------------------
+
+Some tests may require external (and large) data sets. To see relevant tasks
+that download and extract these data files automatically, run the following:
+
+gradlew tasks --group "Data set download"
diff --git a/lucene/benchmark/.gitignore b/lucene/benchmark/.gitignore
index a20524a..249cda9 100644
--- a/lucene/benchmark/.gitignore
+++ b/lucene/benchmark/.gitignore
@@ -1,2 +1 @@
-/temp
-/work
\ No newline at end of file
+/data
\ No newline at end of file
diff --git a/lucene/benchmark/build.gradle b/lucene/benchmark/build.gradle
index 73f1dd7..9271b00 100644
--- a/lucene/benchmark/build.gradle
+++ b/lucene/benchmark/build.gradle
@@ -17,7 +17,6 @@
plugins {
id "java"
- id "de.undercouch.download"
}
description = 'System for benchmarking Lucene'
@@ -44,9 +43,6 @@ dependencies {
testImplementation project(':lucene:test-framework')
}
-def tempDir = file("temp")
-def workDir = file("work")
-
task run(type: JavaExec) {
description "Run a perf test (optional: -PtaskAlg=conf/your-algorithm-file -PmaxHeapSize=1G)"
main 'org.apache.lucene.benchmark.byTask.Benchmark'
@@ -67,92 +63,3 @@ task run(type: JavaExec) {
suspend = true
}
}
-
-/* Old "collation" Ant target:
-gradle getTop100kWikiWordFiles run -PtaskAlg=conf/collation.alg -PstandardOutput=work/collation.benchmark.output.txt
-perl -CSD scripts/collation.bm2jira.pl work/collation.benchmark.output.txt
- */
-
-/* Old "shingle" Ant target:
-gradle getReuters run -PtaskAlg=conf/shingle.alg -PstandardOutput=work/shingle.benchmark.output.txt
-perl -CSD scripts/shingle.bm2jira.pl work/shingle.benchmark.output.txt
- */
-
-// The remaining tasks just get / extract / prepare data
-
-task getEnWiki(type: Download) {
- def finalName = "enwiki-20070527-pages-articles.xml"
- src "https://home.apache.org/~dsmiley/data/" + finalName + ".bz2"
- dest file("$tempDir/" + finalName + ".bz2")
- overwrite false
- compress false
-
- doLast {
- ant.bunzip2(src: dest, dest: tempDir)
- }
- outputs.file file("$tempDir/$finalName")
-}
-
-task getGeoNames(type: Download) {
- // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
- // and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
- // and then compress with: bzip2 -9 -k file_random.txt
- def finalName = "geonames_20130921_randomOrder_allCountries.txt"
- src "https://home.apache.org/~dsmiley/data/" + finalName + ".bz2"
- dest file("$tempDir/" + finalName + ".bz2")
- overwrite false
- compress false
-
- doLast {
- ant.bunzip2(src: dest, dest: tempDir) // will chop off .bz2
- }
- outputs.file file("$tempDir/$finalName")
-}
-
-task getTop100kWikiWordFiles(type: Download) {
- src "https://home.apache.org/~rmuir/wikipedia/top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2"
- dest file("$tempDir/${src.file.split('/').last()}")
- overwrite false
- compress false
-
- def finalPath = file("$workDir/top100k-out")
-
- doLast {
- project.sync {
- from tarTree(dest) // defined above. Will decompress on the fly
- into finalPath
- }
- }
- outputs.dir finalPath
-}
-
-task getReuters(type: Download) {
- // note: there is no HTTPS url and we don't care because this is merely test/perf data
- src "http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz"
- dest file("$tempDir/${src.file.split('/').last()}")
- overwrite false
- compress false
-
- def untarPath = file("$workDir/reuters")
- def finalPath = file("$workDir/reuters-out")
- dependsOn sourceSets.main.runtimeClasspath
-
- doLast {
- project.sync {
- from(tarTree(dest)) { // defined above. Will decompress on the fly
- exclude '*.txt'
- }
- into untarPath
- }
- println "Extracting reuters to $finalPath"
- finalPath.deleteDir() // necessary
- // TODO consider porting ExtractReuters to groovy?
- project.javaexec {
- main = 'org.apache.lucene.benchmark.utils.ExtractReuters'
- classpath = sourceSets.main.runtimeClasspath
- maxHeapSize = '1G'
- args = [untarPath, finalPath]
- }
- }
- outputs.dir finalPath
-}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
index f05b62f..08d18bc 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
@@ -387,10 +387,17 @@ public abstract class LuceneTestCase extends Assert {
public static final boolean TEST_ASSERTS_ENABLED = systemPropertyAsBoolean("tests.asserts", true);
- /** TODO: javadoc? */
+ /**
+ * The default (embedded resource) lines file.
+ *
+ * @see #TEST_LINE_DOCS_FILE
+ */
public static final String DEFAULT_LINE_DOCS_FILE = "europarl.lines.txt.gz";
- /** TODO: javadoc? */
+ /**
+ * Random sample from enwiki used in tests. See {@code help/tests.txt}. gradle task downloading
+ * this data set: {@code gradlew getEnWikiRandomLines}.
+ */
public static final String JENKINS_LARGE_LINE_DOCS_FILE = "enwiki.random.lines.txt";
/** Gets the codec to run tests with. */
@@ -407,7 +414,7 @@ public abstract class LuceneTestCase extends Assert {
/** Gets the directory to run tests with */
public static final String TEST_DIRECTORY = System.getProperty("tests.directory", "random");
- /** the line file used by LineFileDocs */
+ /** The line file used in tests (by {@link LineFileDocs}). */
public static final String TEST_LINE_DOCS_FILE =
System.getProperty("tests.linedocsfile", DEFAULT_LINE_DOCS_FILE);
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/RunListenerPrintReproduceInfo.java b/lucene/test-framework/src/java/org/apache/lucene/util/RunListenerPrintReproduceInfo.java
index e088a71..0f9664b 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/RunListenerPrintReproduceInfo.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/RunListenerPrintReproduceInfo.java
@@ -151,8 +151,10 @@ public final class RunListenerPrintReproduceInfo extends RunListener {
}
if (TEST_LINE_DOCS_FILE.endsWith(JENKINS_LARGE_LINE_DOCS_FILE)) {
System.err.println(
- "NOTE: download the large Jenkins line-docs file by running "
- + "'ant get-jenkins-line-docs' in the lucene directory.");
+ "NOTE: large line-docs file was used in this run. You have to download "
+ + "it manually ('gradlew getEnWikiRandomLines') and use -P"
+ + TEST_LINE_DOCS_FILE
+ + "=... property to point to it.");
}
final StringBuilder b = new StringBuilder();