You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/01/13 22:51:52 UTC
svn commit: r898950 - in /lucene/java/trunk/contrib/snowball: build.xml
src/test/org/apache/lucene/analysis/snowball/
src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java
Author: rmuir
Date: Wed Jan 13 21:51:51 2010
New Revision: 898950
URL: http://svn.apache.org/viewvc?rev=898950&view=rev
Log:
LUCENE-2203: use the snowball vocabulary tests for improved testing
Added:
lucene/java/trunk/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java (with props)
Modified:
lucene/java/trunk/contrib/snowball/build.xml
lucene/java/trunk/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/ (props changed)
Modified: lucene/java/trunk/contrib/snowball/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/snowball/build.xml?rev=898950&r1=898949&r2=898950&view=diff
==============================================================================
--- lucene/java/trunk/contrib/snowball/build.xml (original)
+++ lucene/java/trunk/contrib/snowball/build.xml Wed Jan 13 21:51:51 2010
@@ -131,10 +131,26 @@
</target>
<target name="compile-core" depends="build-analyzers, common.compile-core" />
+ <!-- Test compilation first runs download-vocab-tests, then the shared
+      common.compile-test target. -->
+ <target name="compile-test" depends="download-vocab-tests, common.compile-test" />
<target name="build-analyzers" unless="analyzers.jar.present">
<echo>Snowball building dependency ${analyzers.jar}</echo>
<ant antfile="../analyzers/build.xml" target="default" inheritall="false" dir="../analyzers" />
</target>
+ <!-- Snowball vocabulary test data: the svn revision to check out, the
+      repository URL of the data directory, and the local directory the
+      checkout is run in (the test package's source directory). -->
+ <property name="snowball.vocab.rev" value="500"/>
+ <property name="snowball.vocab.url"
+ value="svn://svn.tartarus.org/snowball/trunk/data"/>
+ <property name="vocab.dir" value="src/test/org/apache/lucene/analysis/snowball"/>
+
+ <!-- Creates ${vocab.dir} if needed and runs "svn checkout" of
+      ${snowball.vocab.url} at revision ${snowball.vocab.rev} inside it.
+      failifexecutionfails="false" asks Ant not to stop the build when the
+      svn program cannot be started, so the download is best-effort.
+      NOTE(review): ${svn.exe} is not defined in the visible part of this
+      build file; presumably it is set elsewhere (confirm). -->
+ <target name="download-vocab-tests" depends="compile-core"
+ description="Downloads Snowball vocabulary tests">
+ <sequential>
+ <mkdir dir="${vocab.dir}"/>
+ <exec dir="${vocab.dir}" executable="${svn.exe}"
+ failifexecutionfails="false">
+ <arg line="checkout -r ${snowball.vocab.rev} ${snowball.vocab.url}"/>
+ </exec>
+ </sequential>
+ </target>
</project>
Propchange: lucene/java/trunk/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Wed Jan 13 21:51:51 2010
@@ -0,0 +1 @@
+data
Added: lucene/java/trunk/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java?rev=898950&view=auto
==============================================================================
--- lucene/java/trunk/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java (added)
+++ lucene/java/trunk/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java Wed Jan 13 21:51:51 2010
@@ -0,0 +1,98 @@
+package org.apache.lucene.analysis.snowball;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * Tests the snowball filters against the snowball vocabulary data.
+ * <p>
+ * For each language the data directory is expected to hold a
+ * <code>voc.txt</code> (one input word per line) and an
+ * <code>output.txt</code> (the expected stem for the word on the same line).
+ * The data is looked up under the directory named by the <code>dataDir</code>
+ * system property (default <code>./bin</code>).
+ */
+public class TestSnowballVocab extends BaseTokenStreamTestCase {
+  /** Keyword tokenizer that is re-pointed at each vocabulary word in turn. */
+  private Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
+  static final File dataDir = new File(System.getProperty("dataDir", "./bin"));
+  static final File dataRoot = new File(dataDir,
+      "org/apache/lucene/analysis/snowball/data");
+
+  /**
+   * Run all languages against their snowball vocabulary tests.
+   * <p>
+   * The build downloads the vocabulary data on a best-effort basis, so when
+   * the data directory is absent the test prints a warning and returns
+   * instead of failing with a FileNotFoundException.
+   */
+  public void testStemmers() throws IOException {
+    if (!dataRoot.isDirectory()) {
+      System.err.println("WARN: snowball vocabulary tests skipped, "
+          + "test data not found: " + dataRoot);
+      return;
+    }
+    assertCorrectOutput("Danish", "danish");
+    assertCorrectOutput("Dutch", "dutch");
+    assertCorrectOutput("English", "english");
+    // disabled due to snowball java code generation bug:
+    // see http://article.gmane.org/gmane.comp.search.snowball/1139
+    // assertCorrectOutput("Finnish", "finnish");
+    assertCorrectOutput("French", "french");
+    assertCorrectOutput("German", "german");
+    assertCorrectOutput("German2", "german2");
+    assertCorrectOutput("Hungarian", "hungarian");
+    assertCorrectOutput("Italian", "italian");
+    assertCorrectOutput("Kp", "kraaij_pohlmann");
+    // disabled due to snowball java code generation bug:
+    // see http://article.gmane.org/gmane.comp.search.snowball/1139
+    // assertCorrectOutput("Lovins", "lovins");
+    assertCorrectOutput("Norwegian", "norwegian");
+    assertCorrectOutput("Porter", "porter");
+    assertCorrectOutput("Portuguese", "portuguese");
+    assertCorrectOutput("Romanian", "romanian");
+    assertCorrectOutput("Russian", "russian");
+    assertCorrectOutput("Spanish", "spanish");
+    assertCorrectOutput("Swedish", "swedish");
+    assertCorrectOutput("Turkish", "turkish");
+  }
+
+  /**
+   * For the supplied language, run the stemmer against all strings in
+   * voc.txt. The output should be the same as the string on the same line
+   * of output.txt.
+   *
+   * @param snowballLanguage stemmer name passed to {@link SnowballFilter}
+   * @param dataDirectory sub-directory of the data root holding voc.txt and
+   *        output.txt for this language
+   * @throws IOException if either data file cannot be opened or read
+   */
+  private void assertCorrectOutput(String snowballLanguage, String dataDirectory)
+      throws IOException {
+    System.err.println("checking snowball language: " + snowballLanguage);
+    TokenStream filter = new SnowballFilter(tokenizer, snowballLanguage);
+    // Nested try/finally so both files are released even when an assertion
+    // fails part-way through or the second file cannot be opened.
+    BufferedReader vocReader = openUtf8(new File(dataRoot,
+        dataDirectory + "/voc.txt"));
+    try {
+      BufferedReader outputReader = openUtf8(new File(dataRoot,
+          dataDirectory + "/output.txt"));
+      try {
+        String inputWord = null;
+        while ((inputWord = vocReader.readLine()) != null) {
+          String expectedWord = outputReader.readLine();
+          assertNotNull("no expected stem in " + dataDirectory
+              + "/output.txt for input word '" + inputWord + "'", expectedWord);
+          tokenizer.reset(new StringReader(inputWord));
+          filter.reset();
+          assertTokenStreamContents(filter, new String[] {expectedWord});
+        }
+      } finally {
+        outputReader.close();
+      }
+    } finally {
+      vocReader.close();
+    }
+  }
+
+  /** Opens the given file as a buffered UTF-8 character reader. */
+  private static BufferedReader openUtf8(File file) throws IOException {
+    InputStream in = new FileInputStream(file);
+    return new BufferedReader(new InputStreamReader(in, "UTF-8"));
+  }
+}
Propchange: lucene/java/trunk/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java
------------------------------------------------------------------------------
svn:eol-style = native