You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2020/02/16 04:17:43 UTC
[lucene-solr] branch jira/LUCENE-9220 updated: LUCENE-9220: automate generation of (bsd license-only, sampled) test data
This is an automated email from the ASF dual-hosted git repository.
rmuir pushed a commit to branch jira/LUCENE-9220
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/jira/LUCENE-9220 by this push:
new d9a285c LUCENE-9220: automate generation of (bsd license-only, sampled) test data
d9a285c is described below
commit d9a285c857e632a32f7762c49d2ab8363ae8c876
Author: Robert Muir <rm...@apache.org>
AuthorDate: Sat Feb 15 23:17:38 2020 -0500
LUCENE-9220: automate generation of (bsd license-only, sampled) test data
---
.../analysis/snowball/TestSnowballVocab.java | 37 ++++++---------------
.../analysis/snowball/TestSnowballVocabData.zip | Bin 3568843 -> 0 bytes
.../org/apache/lucene/analysis/snowball/danish.zip | Bin 0 -> 16287 bytes
.../org/apache/lucene/analysis/snowball/dutch.zip | Bin 0 -> 20628 bytes
.../apache/lucene/analysis/snowball/english.zip | Bin 0 -> 16365 bytes
.../apache/lucene/analysis/snowball/finnish.zip | Bin 0 -> 19301 bytes
.../org/apache/lucene/analysis/snowball/german.zip | Bin 0 -> 18745 bytes
.../apache/lucene/analysis/snowball/german2.zip | Bin 0 -> 18740 bytes
.../apache/lucene/analysis/snowball/hungarian.zip | Bin 0 -> 20252 bytes
.../org/apache/lucene/analysis/snowball/irish.zip | Bin 0 -> 20185 bytes
.../apache/lucene/analysis/snowball/italian.zip | Bin 0 -> 16180 bytes
.../org/apache/lucene/analysis/snowball/kp.zip | Bin 0 -> 20366 bytes
.../org/apache/lucene/analysis/snowball/lovins.zip | Bin 0 -> 15833 bytes
.../org/apache/lucene/analysis/snowball/nepali.zip | Bin 0 -> 17856 bytes
.../apache/lucene/analysis/snowball/norwegian.zip | Bin 0 -> 18274 bytes
.../org/apache/lucene/analysis/snowball/porter.zip | Bin 0 -> 16037 bytes
.../apache/lucene/analysis/snowball/portuguese.zip | Bin 0 -> 16593 bytes
.../apache/lucene/analysis/snowball/romanian.zip | Bin 0 -> 16812 bytes
.../apache/lucene/analysis/snowball/russian.zip | Bin 0 -> 20221 bytes
.../apache/lucene/analysis/snowball/spanish.zip | Bin 0 -> 16322 bytes
.../apache/lucene/analysis/snowball/swedish.zip | Bin 0 -> 17167 bytes
.../lucene/analysis/snowball/test_languages.txt | 20 +++++++++++
.../apache/lucene/analysis/snowball/turkish.zip | Bin 0 -> 19412 bytes
23 files changed, 31 insertions(+), 26 deletions(-)
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java
index 864a8e5..1937c57 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java
@@ -18,52 +18,38 @@ package org.apache.lucene.analysis.snowball;
import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.LuceneTestCase.Nightly;
import static org.apache.lucene.analysis.VocabularyAssert.*;
/**
* Test the snowball filters against the snowball data tests
*/
-@Nightly
public class TestSnowballVocab extends LuceneTestCase {
/**
* Run all languages against their snowball vocabulary tests.
*/
public void testStemmers() throws IOException {
- // nocommit: regenerate test data!
- assertCorrectOutput("Arabic", "arabic");
- assertCorrectOutput("Danish", "danish");
- assertCorrectOutput("Dutch", "dutch");
- assertCorrectOutput("English", "english");
- //assertCorrectOutput("Finnish", "finnish");
- //assertCorrectOutput("French", "french");
- //assertCorrectOutput("German", "german");
- assertCorrectOutput("German2", "german2");
- //assertCorrectOutput("Hungarian", "hungarian");
- assertCorrectOutput("Italian", "italian");
- assertCorrectOutput("Kp", "kraaij_pohlmann");
- assertCorrectOutput("Lovins", "lovins");
- assertCorrectOutput("Norwegian", "norwegian");
- assertCorrectOutput("Porter", "porter");
- //assertCorrectOutput("Portuguese", "portuguese");
- assertCorrectOutput("Romanian", "romanian");
- assertCorrectOutput("Russian", "russian");
- assertCorrectOutput("Spanish", "spanish");
- assertCorrectOutput("Swedish", "swedish");
- assertCorrectOutput("Turkish", "turkish");
+ try (InputStream in = getClass().getResourceAsStream("test_languages.txt")) {
+ for (String datafile : WordlistLoader.getLines(in, StandardCharsets.UTF_8)) {
+ String language = "" + Character.toUpperCase(datafile.charAt(0)) + datafile.substring(1);
+ assertCorrectOutput(language, datafile + ".zip");
+ }
+ }
}
/**
* For the supplied language, run the stemmer against all strings in voc.txt
* The output should be the same as the string in output.txt
*/
- private void assertCorrectOutput(final String snowballLanguage, String dataDirectory)
+ private void assertCorrectOutput(final String snowballLanguage, String zipfile)
throws IOException {
if (VERBOSE) System.out.println("checking snowball language: " + snowballLanguage);
@@ -75,8 +61,7 @@ public class TestSnowballVocab extends LuceneTestCase {
}
};
- assertVocabulary(a, getDataPath("TestSnowballVocabData.zip"),
- dataDirectory + "/voc.txt", dataDirectory + "/output.txt");
+ assertVocabulary(a, getDataPath(zipfile), "voc.txt", "output.txt");
a.close();
}
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocabData.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocabData.zip
deleted file mode 100644
index e3cae65..0000000
Binary files a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocabData.zip and /dev/null differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/danish.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/danish.zip
new file mode 100644
index 0000000..ac867dc
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/danish.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/dutch.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/dutch.zip
new file mode 100644
index 0000000..89a2729
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/dutch.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/english.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/english.zip
new file mode 100644
index 0000000..0c5c22d
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/english.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/finnish.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/finnish.zip
new file mode 100644
index 0000000..ba6ed09
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/finnish.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/german.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/german.zip
new file mode 100644
index 0000000..3a9dccb
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/german.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/german2.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/german2.zip
new file mode 100644
index 0000000..1dd220e
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/german2.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/hungarian.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/hungarian.zip
new file mode 100644
index 0000000..5a5e506
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/hungarian.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/irish.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/irish.zip
new file mode 100644
index 0000000..1222261
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/irish.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/italian.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/italian.zip
new file mode 100644
index 0000000..31e0a9d
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/italian.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/kp.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/kp.zip
new file mode 100644
index 0000000..6278202
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/kp.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/lovins.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/lovins.zip
new file mode 100644
index 0000000..b662a1a
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/lovins.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/nepali.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/nepali.zip
new file mode 100644
index 0000000..6544dd1
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/nepali.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/norwegian.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/norwegian.zip
new file mode 100644
index 0000000..db20318
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/norwegian.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/porter.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/porter.zip
new file mode 100644
index 0000000..2c531e9
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/porter.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/portuguese.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/portuguese.zip
new file mode 100644
index 0000000..41785c9
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/portuguese.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/romanian.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/romanian.zip
new file mode 100644
index 0000000..932200e
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/romanian.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/russian.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/russian.zip
new file mode 100644
index 0000000..7dcac50
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/russian.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/spanish.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/spanish.zip
new file mode 100644
index 0000000..03eb313
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/spanish.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/swedish.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/swedish.zip
new file mode 100644
index 0000000..36e5d08
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/swedish.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/test_languages.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/test_languages.txt
new file mode 100644
index 0000000..5a7629e
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/test_languages.txt
@@ -0,0 +1,20 @@
+danish
+dutch
+english
+finnish
+german
+german2
+hungarian
+irish
+italian
+kp
+lovins
+nepali
+norwegian
+porter
+portuguese
+romanian
+russian
+spanish
+swedish
+turkish
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/turkish.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/turkish.zip
new file mode 100644
index 0000000..b017086
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/turkish.zip differ