You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2020/02/16 04:17:43 UTC

[lucene-solr] branch jira/LUCENE-9220 updated: LUCENE-9220: automate generation of (bsd license-only, sampled) test data

This is an automated email from the ASF dual-hosted git repository.

rmuir pushed a commit to branch jira/LUCENE-9220
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/jira/LUCENE-9220 by this push:
     new d9a285c  LUCENE-9220: automate generation of (bsd license-only, sampled) test data
d9a285c is described below

commit d9a285c857e632a32f7762c49d2ab8363ae8c876
Author: Robert Muir <rm...@apache.org>
AuthorDate: Sat Feb 15 23:17:38 2020 -0500

    LUCENE-9220: automate generation of (bsd license-only, sampled) test data
---
 .../analysis/snowball/TestSnowballVocab.java       |  37 ++++++---------------
 .../analysis/snowball/TestSnowballVocabData.zip    | Bin 3568843 -> 0 bytes
 .../org/apache/lucene/analysis/snowball/danish.zip | Bin 0 -> 16287 bytes
 .../org/apache/lucene/analysis/snowball/dutch.zip  | Bin 0 -> 20628 bytes
 .../apache/lucene/analysis/snowball/english.zip    | Bin 0 -> 16365 bytes
 .../apache/lucene/analysis/snowball/finnish.zip    | Bin 0 -> 19301 bytes
 .../org/apache/lucene/analysis/snowball/german.zip | Bin 0 -> 18745 bytes
 .../apache/lucene/analysis/snowball/german2.zip    | Bin 0 -> 18740 bytes
 .../apache/lucene/analysis/snowball/hungarian.zip  | Bin 0 -> 20252 bytes
 .../org/apache/lucene/analysis/snowball/irish.zip  | Bin 0 -> 20185 bytes
 .../apache/lucene/analysis/snowball/italian.zip    | Bin 0 -> 16180 bytes
 .../org/apache/lucene/analysis/snowball/kp.zip     | Bin 0 -> 20366 bytes
 .../org/apache/lucene/analysis/snowball/lovins.zip | Bin 0 -> 15833 bytes
 .../org/apache/lucene/analysis/snowball/nepali.zip | Bin 0 -> 17856 bytes
 .../apache/lucene/analysis/snowball/norwegian.zip  | Bin 0 -> 18274 bytes
 .../org/apache/lucene/analysis/snowball/porter.zip | Bin 0 -> 16037 bytes
 .../apache/lucene/analysis/snowball/portuguese.zip | Bin 0 -> 16593 bytes
 .../apache/lucene/analysis/snowball/romanian.zip   | Bin 0 -> 16812 bytes
 .../apache/lucene/analysis/snowball/russian.zip    | Bin 0 -> 20221 bytes
 .../apache/lucene/analysis/snowball/spanish.zip    | Bin 0 -> 16322 bytes
 .../apache/lucene/analysis/snowball/swedish.zip    | Bin 0 -> 17167 bytes
 .../lucene/analysis/snowball/test_languages.txt    |  20 +++++++++++
 .../apache/lucene/analysis/snowball/turkish.zip    | Bin 0 -> 19412 bytes
 23 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java
index 864a8e5..1937c57 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java
@@ -18,52 +18,38 @@ package org.apache.lucene.analysis.snowball;
 
 
 import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.LuceneTestCase.Nightly;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
 /**
  * Test the snowball filters against the snowball data tests
  */
-@Nightly
 public class TestSnowballVocab extends LuceneTestCase {
   /**
    * Run all languages against their snowball vocabulary tests.
    */
   public void testStemmers() throws IOException {
-    // nocommit: regenerate test data!
-    assertCorrectOutput("Arabic", "arabic");
-    assertCorrectOutput("Danish", "danish");
-    assertCorrectOutput("Dutch", "dutch");
-    assertCorrectOutput("English", "english");
-    //assertCorrectOutput("Finnish", "finnish");
-    //assertCorrectOutput("French", "french");
-    //assertCorrectOutput("German", "german");
-    assertCorrectOutput("German2", "german2");
-    //assertCorrectOutput("Hungarian", "hungarian");
-    assertCorrectOutput("Italian", "italian");
-    assertCorrectOutput("Kp", "kraaij_pohlmann");
-    assertCorrectOutput("Lovins", "lovins");
-    assertCorrectOutput("Norwegian", "norwegian");
-    assertCorrectOutput("Porter", "porter");
-    //assertCorrectOutput("Portuguese", "portuguese");
-    assertCorrectOutput("Romanian", "romanian");
-    assertCorrectOutput("Russian", "russian");
-    assertCorrectOutput("Spanish", "spanish");
-    assertCorrectOutput("Swedish", "swedish");
-    assertCorrectOutput("Turkish", "turkish");
+    try (InputStream in = getClass().getResourceAsStream("test_languages.txt")) {
+      for (String datafile : WordlistLoader.getLines(in, StandardCharsets.UTF_8)) {
+        String language = "" + Character.toUpperCase(datafile.charAt(0)) + datafile.substring(1);
+        assertCorrectOutput(language, datafile + ".zip");
+      }
+    }
   }
     
   /**
    * For the supplied language, run the stemmer against all strings in voc.txt
    * The output should be the same as the string in output.txt
    */
-  private void assertCorrectOutput(final String snowballLanguage, String dataDirectory)
+  private void assertCorrectOutput(final String snowballLanguage, String zipfile)
       throws IOException {
     if (VERBOSE) System.out.println("checking snowball language: " + snowballLanguage);
     
@@ -75,8 +61,7 @@ public class TestSnowballVocab extends LuceneTestCase {
       }  
     };
     
-    assertVocabulary(a, getDataPath("TestSnowballVocabData.zip"), 
-        dataDirectory + "/voc.txt", dataDirectory + "/output.txt");
+    assertVocabulary(a, getDataPath(zipfile), "voc.txt", "output.txt");
     a.close();
   }
 }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocabData.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocabData.zip
deleted file mode 100644
index e3cae65..0000000
Binary files a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocabData.zip and /dev/null differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/danish.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/danish.zip
new file mode 100644
index 0000000..ac867dc
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/danish.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/dutch.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/dutch.zip
new file mode 100644
index 0000000..89a2729
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/dutch.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/english.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/english.zip
new file mode 100644
index 0000000..0c5c22d
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/english.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/finnish.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/finnish.zip
new file mode 100644
index 0000000..ba6ed09
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/finnish.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/german.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/german.zip
new file mode 100644
index 0000000..3a9dccb
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/german.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/german2.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/german2.zip
new file mode 100644
index 0000000..1dd220e
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/german2.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/hungarian.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/hungarian.zip
new file mode 100644
index 0000000..5a5e506
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/hungarian.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/irish.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/irish.zip
new file mode 100644
index 0000000..1222261
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/irish.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/italian.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/italian.zip
new file mode 100644
index 0000000..31e0a9d
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/italian.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/kp.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/kp.zip
new file mode 100644
index 0000000..6278202
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/kp.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/lovins.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/lovins.zip
new file mode 100644
index 0000000..b662a1a
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/lovins.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/nepali.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/nepali.zip
new file mode 100644
index 0000000..6544dd1
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/nepali.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/norwegian.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/norwegian.zip
new file mode 100644
index 0000000..db20318
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/norwegian.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/porter.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/porter.zip
new file mode 100644
index 0000000..2c531e9
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/porter.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/portuguese.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/portuguese.zip
new file mode 100644
index 0000000..41785c9
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/portuguese.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/romanian.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/romanian.zip
new file mode 100644
index 0000000..932200e
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/romanian.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/russian.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/russian.zip
new file mode 100644
index 0000000..7dcac50
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/russian.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/spanish.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/spanish.zip
new file mode 100644
index 0000000..03eb313
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/spanish.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/swedish.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/swedish.zip
new file mode 100644
index 0000000..36e5d08
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/swedish.zip differ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/test_languages.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/test_languages.txt
new file mode 100644
index 0000000..5a7629e
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/test_languages.txt
@@ -0,0 +1,20 @@
+danish
+dutch
+english
+finnish
+german
+german2
+hungarian
+irish
+italian
+kp
+lovins
+nepali
+norwegian
+porter
+portuguese
+romanian
+russian
+spanish
+swedish
+turkish
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/turkish.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/turkish.zip
new file mode 100644
index 0000000..b017086
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/turkish.zip differ