You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/05/12 00:53:39 UTC

svn commit: r1102127 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/backwards/ lucene/contrib/ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ lucen...

Author: rmuir
Date: Wed May 11 22:53:38 2011
New Revision: 1102127

URL: http://svn.apache.org/viewvc?rev=1102127&view=rev
Log:
LUCENE-3086: add ElisionFilter to ItalianAnalyzer

Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/backwards/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/path/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
    lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java
    lucene/dev/branches/branch_3x/solr/   (props changed)

Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1102127&r1=1102126&r2=1102127&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Wed May 11 22:53:38 2011
@@ -2,6 +2,11 @@ Lucene contrib change Log
 
 ======================= Lucene 3.x (not yet released) =======================
 
+Changes in runtime behavior
+
+ * LUCENE-3086: ItalianAnalyzer now uses ElisionFilter with a set of Italian
+   contractions by default.  (Robert Muir)
+
 Bug Fixes
 
  * LUCENE-3045: fixed QueryNodeImpl.containsTag(String key) that was

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java?rev=1102127&r1=1102126&r2=1102127&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java Wed May 11 22:53:38 2011
@@ -30,8 +30,6 @@ import org.apache.lucene.util.Version;
 /**
  * Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be
  * tokenized as "avion" (plane).
- * <p>
- * Note that {@link StandardTokenizer} sees " ' " as a space, and cuts it out.
  * 
  * @see <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Elision in Wikipedia</a>
  */

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java?rev=1102127&r1=1102126&r2=1102127&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java Wed May 11 22:53:38 2011
@@ -19,6 +19,7 @@ package org.apache.lucene.analysis.it;
 
 import java.io.IOException;
 import java.io.Reader;
+import java.util.Arrays;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -27,6 +28,7 @@ import org.apache.lucene.analysis.Keywor
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.fr.ElisionFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
@@ -38,6 +40,14 @@ import org.tartarus.snowball.ext.Italian
 
 /**
  * {@link Analyzer} for Italian.
+ * <p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating ItalianAnalyzer:
+ * <ul>
+ *   <li> As of 3.2, ElisionFilter with a set of Italian 
+ *        contractions is used by default.
+ * </ul>
  */
 public final class ItalianAnalyzer extends StopwordAnalyzerBase {
   private final Set<?> stemExclusionSet;
@@ -45,6 +55,13 @@ public final class ItalianAnalyzer exten
   /** File containing default Italian stopwords. */
   public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt";
   
+  private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
+      new CharArraySet(Version.LUCENE_CURRENT, 
+          Arrays.asList(
+          "c", "l", "all", "dall", "dell", "nell", "sull", "coll", "pell", 
+          "gl", "agl", "dagl", "degl", "negl", "sugl", "un", "m", "t", "s", "v", "d"
+          ), true));
+
   /**
    * Returns an unmodifiable instance of the default stop words set.
    * @return default stop words set.
@@ -112,7 +129,7 @@ public final class ItalianAnalyzer exten
    * @return A
    *         {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
    *         built from an {@link StandardTokenizer} filtered with
-   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter}, {@link StopFilter}
    *         , {@link KeywordMarkerFilter} if a stem exclusion set is
    *         provided and {@link SnowballFilter}.
    */
@@ -121,6 +138,9 @@ public final class ItalianAnalyzer exten
       Reader reader) {
     final Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
+    if (matchVersion.onOrAfter(Version.LUCENE_32)) {
+      result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
+    }
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java?rev=1102127&r1=1102126&r2=1102127&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java Wed May 11 22:53:38 2011
@@ -23,6 +23,7 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
 
 public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the 
@@ -55,4 +56,18 @@ public class TestItalianAnalyzer extends
   public void testRandomStrings() throws Exception {
     checkRandomData(random, new ItalianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
   }
+  
+  /** test that the ElisionFilter removes Italian contractions (e.g. "dell'Italia" analyzes to "ital") */
+  public void testContractions() throws IOException {
+    Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT);
+    assertAnalyzesTo(a, "dell'Italia", new String[] { "ital" });
+    assertAnalyzesTo(a, "l'Italiano", new String[] { "ital" });
+  }
+  
+  /** test that the ElisionFilter is not applied for versions before 3.2 (e.g. LUCENE_31) */
+  public void testContractionsBackwards() throws IOException {
+    Analyzer a = new ItalianAnalyzer(Version.LUCENE_31);
+    assertAnalyzesTo(a, "dell'Italia", new String[] { "dell'ital" });
+    assertAnalyzesTo(a, "l'Italiano", new String[] { "l'ital" });
+  }
 }

Modified: lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java?rev=1102127&r1=1102126&r2=1102127&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java Wed May 11 22:53:38 2011
@@ -113,7 +113,7 @@ public abstract class LuceneTestCase ext
   /** Use this constant when creating Analyzers and any other version-dependent stuff.
    * <p><b>NOTE:</b> Change this when development starts for new Lucene version:
    */
-  public static final Version TEST_VERSION_CURRENT = Version.LUCENE_31;
+  public static final Version TEST_VERSION_CURRENT = Version.LUCENE_32;
 
   /**
    * If this is set, it is the only method that should run.