You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/05/12 00:53:39 UTC
svn commit: r1102127 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/backwards/ lucene/contrib/
lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/
lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/
lucen...
Author: rmuir
Date: Wed May 11 22:53:38 2011
New Revision: 1102127
URL: http://svn.apache.org/viewvc?rev=1102127&view=rev
Log:
LUCENE-3086: add ElisionFilter to ItalianAnalyzer
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/backwards/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/path/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java
lucene/dev/branches/branch_3x/solr/ (props changed)
Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1102127&r1=1102126&r2=1102127&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Wed May 11 22:53:38 2011
@@ -2,6 +2,11 @@ Lucene contrib change Log
======================= Lucene 3.x (not yet released) =======================
+Changes in runtime behavior
+
+ * LUCENE-3086: ItalianAnalyzer now uses ElisionFilter with a set of Italian
+ contractions by default. (Robert Muir)
+
Bug Fixes
* LUCENE-3045: fixed QueryNodeImpl.containsTag(String key) that was
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java?rev=1102127&r1=1102126&r2=1102127&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java Wed May 11 22:53:38 2011
@@ -30,8 +30,6 @@ import org.apache.lucene.util.Version;
/**
* Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be
* tokenized as "avion" (plane).
- * <p>
- * Note that {@link StandardTokenizer} sees " ' " as a space, and cuts it out.
*
* @see <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Elision in Wikipedia</a>
*/
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java?rev=1102127&r1=1102126&r2=1102127&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java Wed May 11 22:53:38 2011
@@ -19,6 +19,7 @@ package org.apache.lucene.analysis.it;
import java.io.IOException;
import java.io.Reader;
+import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
@@ -27,6 +28,7 @@ import org.apache.lucene.analysis.Keywor
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
@@ -38,6 +40,14 @@ import org.tartarus.snowball.ext.Italian
/**
* {@link Analyzer} for Italian.
+ * <p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating ItalianAnalyzer:
+ * <ul>
+ * <li> As of 3.2, ElisionFilter with a set of Italian
+ * contractions is used by default.
+ * </ul>
*/
public final class ItalianAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet;
@@ -45,6 +55,13 @@ public final class ItalianAnalyzer exten
/** File containing default Italian stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt";
+ private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
+ new CharArraySet(Version.LUCENE_CURRENT,
+ Arrays.asList(
+ "c", "l", "all", "dall", "dell", "nell", "sull", "coll", "pell",
+ "gl", "agl", "dagl", "degl", "negl", "sugl", "un", "m", "t", "s", "v", "d"
+ ), true));
+
/**
* Returns an unmodifiable instance of the default stop words set.
* @return default stop words set.
@@ -112,7 +129,7 @@ public final class ItalianAnalyzer exten
* @return A
* {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
- * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+ * {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SnowballFilter}.
*/
@@ -121,6 +138,9 @@ public final class ItalianAnalyzer exten
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
+ if (matchVersion.onOrAfter(Version.LUCENE_32)) {
+ result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
+ }
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java?rev=1102127&r1=1102126&r2=1102127&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java Wed May 11 22:53:38 2011
@@ -23,6 +23,7 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
@@ -55,4 +56,18 @@ public class TestItalianAnalyzer extends
public void testRandomStrings() throws Exception {
checkRandomData(random, new ItalianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
}
+
+ /** test that the elisionfilter is working */
+ public void testContractions() throws IOException {
+ Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT);
+ assertAnalyzesTo(a, "dell'Italia", new String[] { "ital" });
+ assertAnalyzesTo(a, "l'Italiano", new String[] { "ital" });
+ }
+
+ /** test that we don't enable this before 3.2*/
+ public void testContractionsBackwards() throws IOException {
+ Analyzer a = new ItalianAnalyzer(Version.LUCENE_31);
+ assertAnalyzesTo(a, "dell'Italia", new String[] { "dell'ital" });
+ assertAnalyzesTo(a, "l'Italiano", new String[] { "l'ital" });
+ }
}
Modified: lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java?rev=1102127&r1=1102126&r2=1102127&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java Wed May 11 22:53:38 2011
@@ -113,7 +113,7 @@ public abstract class LuceneTestCase ext
/** Use this constant when creating Analyzers and any other version-dependent stuff.
* <p><b>NOTE:</b> Change this when development starts for new Lucene version:
*/
- public static final Version TEST_VERSION_CURRENT = Version.LUCENE_31;
+ public static final Version TEST_VERSION_CURRENT = Version.LUCENE_32;
/**
* If this is set, it is the only method that should run.