You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2011/10/06 00:02:28 UTC

svn commit: r1179458 - in /lucene/dev/branches/branch_3x: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/ solr/ solr/core/src/java/org/apache/solr/analysis/

Author: janhoy
Date: Wed Oct  5 22:02:28 2011
New Revision: 1179458

URL: http://svn.apache.org/viewvc?rev=1179458&view=rev
Log:
SOLR-2792: Allow case insensitive Hunspell stemming

Modified:
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.aff
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.dic
    lucene/dev/branches/branch_3x/solr/CHANGES.txt
    lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java?rev=1179458&r1=1179457&r2=1179458&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java Wed Oct  5 22:02:28 2011
@@ -27,6 +27,7 @@ import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.Locale;
 
 public class HunspellDictionary {
 
@@ -43,11 +44,15 @@ public class HunspellDictionary {
   private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
   private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
 
+  private static final boolean IGNORE_CASE_DEFAULT = false;
+
   private CharArrayMap<List<HunspellWord>> words;
   private CharArrayMap<List<HunspellAffix>> prefixes;
   private CharArrayMap<List<HunspellAffix>> suffixes;
 
   private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
+  private boolean ignoreCase = IGNORE_CASE_DEFAULT;
+
   private final Version version;
 
   /**
@@ -61,7 +66,22 @@ public class HunspellDictionary {
    * @throws ParseException Can be thrown if the content of the files does not meet expected formats
    */
   public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
-    this(affix, Arrays.asList(dictionary), version);
+    this(affix, Arrays.asList(dictionary), version, IGNORE_CASE_DEFAULT);
+  }
+
+  /**
+   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
+   * and dictionary files
+   *
+   * @param affix InputStream for reading the hunspell affix file
+   * @param dictionary InputStream for reading the hunspell dictionary file
+   * @param version Lucene Version
+   * @param ignoreCase If true, dictionary matching will be case insensitive
+   * @throws IOException Can be thrown while reading from the InputStreams
+   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
+   */
+  public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException {
+    this(affix, Arrays.asList(dictionary), version, ignoreCase);
   }
 
   /**
@@ -71,15 +91,17 @@ public class HunspellDictionary {
    * @param affix InputStream for reading the hunspell affix file
    * @param dictionaries InputStreams for reading the hunspell dictionary file
    * @param version Lucene Version
+   * @param ignoreCase If true, dictionary matching will be case insensitive
    * @throws IOException Can be thrown while reading from the InputStreams
    * @throws ParseException Can be thrown if the content of the files does not meet expected formats
    */
-  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version) throws IOException, ParseException {
+  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
     this.version = version;
+    this.ignoreCase = ignoreCase;
     String encoding = getDictionaryEncoding(affix);
     CharsetDecoder decoder = getJavaEncoding(encoding);
     readAffixFile(affix, decoder);
-    words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, false);
+    words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);
     for (InputStream dictionary : dictionaries) {
       readDictionaryFile(dictionary, decoder);
     }
@@ -129,8 +151,8 @@ public class HunspellDictionary {
    * @throws IOException Can be thrown while reading from the InputStream
    */
   private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {
-    prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, false);
-    suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, false);
+    prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
+    suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
     
     BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
     String line = null;
@@ -308,6 +330,9 @@ public class HunspellDictionary {
         wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end)));
         Arrays.sort(wordForm.getFlags());
         entry = line.substring(0, flagSep);
+        if(ignoreCase) {
+          entry = entry.toLowerCase(Locale.ENGLISH);
+        }
       }
       
       List<HunspellWord> entries = words.get(entry);
@@ -408,4 +433,8 @@ public class HunspellDictionary {
       return flags;
     }
   }
+
+  public boolean isIgnoreCase() {
+    return ignoreCase;
+  }
 }

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java?rev=1179458&r1=1179457&r2=1179458&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java Wed Oct  5 22:02:28 2011
@@ -21,9 +21,14 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.text.ParseException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Scanner;
 
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.util.CharacterUtils;
 import org.apache.lucene.util.Version;
 
 /**
@@ -36,6 +41,7 @@ public class HunspellStemmer {
   
   private final HunspellDictionary dictionary;
   private final StringBuilder segment = new StringBuilder();
+  private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_34);
 
   /**
    * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
@@ -79,7 +85,7 @@ public class HunspellStemmer {
    */
   public List<Stem> uniqueStems(char word[], int length) {
     List<Stem> stems = new ArrayList<Stem>();
-    CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, false);
+    CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, dictionary.isIgnoreCase());
     if (dictionary.lookupWord(word, 0, length) != null) {
       stems.add(new Stem(word, length));
       terms.add(word);
@@ -167,6 +173,12 @@ public class HunspellStemmer {
    */
   @SuppressWarnings("unchecked")
   public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
+    if(dictionary.isIgnoreCase()) {
+      for(int i=0;i<strippedWord.length;){
+        i += Character.toChars(
+              Character.toLowerCase(charUtils.codePointAt(strippedWord, i)), strippedWord, i);
+      }
+    }
     segment.setLength(0);
     segment.append(strippedWord, 0, length);
     if (!affix.checkCondition(segment)) {
@@ -174,7 +186,7 @@ public class HunspellStemmer {
     }
 
     List<Stem> stems = new ArrayList<Stem>();
-    
+
     List<HunspellWord> words = dictionary.lookupWord(strippedWord, 0, length);
     if (words != null) {
       for (HunspellWord hunspellWord : words) {
@@ -294,15 +306,24 @@ public class HunspellStemmer {
    * @throws ParseException Can be thrown while parsing the files
    */
   public static void main(String[] args) throws IOException, ParseException {
-    if (args.length != 2) {
-      System.out.println("usage: HunspellStemmer <affix location> <dic location>");
+    boolean ignoreCase = false;
+    int offset = 0;
+    
+    if (args.length < 2) {
+      System.out.println("usage: HunspellStemmer [-i] <affix location> <dic location>");
       System.exit(1);
     }
 
-    InputStream affixInputStream = new FileInputStream(args[0]);
-    InputStream dicInputStream = new FileInputStream(args[1]);
+    if(args[offset].equals("-i")) {
+      ignoreCase = true;
+      System.out.println("Ignoring case. All stems will be returned lowercased");
+      offset++;
+    }
+    
+    InputStream affixInputStream = new FileInputStream(args[offset++]);
+    InputStream dicInputStream = new FileInputStream(args[offset++]);
 
-    HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_34);
+    HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_34, ignoreCase);
 
     affixInputStream.close();
     dicInputStream.close();

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java?rev=1179458&r1=1179457&r2=1179458&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java Wed Oct  5 22:02:28 2011
@@ -34,7 +34,7 @@ public class HunspellDictionaryTest {
     InputStream dictStream = getClass().getResourceAsStream("test.dic");
 
     HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_34);
-    assertEquals(2, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
     assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
     assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
 

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java?rev=1179458&r1=1179457&r2=1179458&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java Wed Oct  5 22:02:28 2011
@@ -34,14 +34,7 @@ public class HunspellStemmerTest {
 
   @BeforeClass
   public static void beforeClass() throws IOException, ParseException {
-    InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
-    InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
-
-    HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_34);
-    stemmer = new HunspellStemmer(dictionary);
-
-    affixStream.close();
-    dictStream.close();
+    createStemmer(true);
   }
 
   @Test
@@ -73,4 +66,61 @@ public class HunspellStemmerTest {
     assertEquals("ab", stems.get(0).getStemString());
   }
 
+  @Test
+  public void testStem_ignoreCase() throws IOException, ParseException {
+    List<HunspellStemmer.Stem> stems;
+    createStemmer(true);
+
+    stems = stemmer.stem("apache");
+    assertEquals(1, stems.size());
+    assertEquals("apach", stems.get(0).getStemString());
+
+    stems = stemmer.stem("APACHE");
+    assertEquals(1, stems.size());
+    assertEquals("apach", stems.get(0).getStemString());
+
+    stems = stemmer.stem("Apache");
+    assertEquals(1, stems.size());
+    assertEquals("apach", stems.get(0).getStemString());
+    
+    stems = stemmer.stem("foos");
+    assertEquals(1, stems.size());
+    assertEquals("foo", stems.get(0).getStemString());
+    
+    stems = stemmer.stem("food");
+    assertEquals(1, stems.size());
+    assertEquals("foo", stems.get(0).getStemString());
+    
+    stems = stemmer.stem("Foos");
+    assertEquals(1, stems.size());
+    assertEquals("foo", stems.get(0).getStemString());
+    
+    stems = stemmer.stem("Food");
+    assertEquals(1, stems.size());
+    assertEquals("foo", stems.get(0).getStemString());
+  }
+
+  @Test
+  public void testStem_caseSensitive() throws IOException, ParseException {
+    createStemmer(false);
+    List<HunspellStemmer.Stem> stems = stemmer.stem("apache");
+    assertEquals(0, stems.size());
+
+    stems = stemmer.stem("Apache");
+    assertEquals(1, stems.size());
+    assertEquals("Apach", stems.get(0).getStemString());
+  }
+
+  
+  private static void createStemmer(boolean ignoreCase) throws IOException, ParseException {
+    InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
+    InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
+
+    HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_34, ignoreCase);
+    stemmer = new HunspellStemmer(dictionary);
+
+    affixStream.close();
+    dictStream.close();
+  }
+
 }

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.aff
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.aff?rev=1179458&r1=1179457&r2=1179458&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.aff (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.aff Wed Oct  5 22:02:28 2011
@@ -1,13 +1,20 @@
 SET UTF-8
 TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
 
-SFX A Y 2
+SFX A Y 3
 SFX A   0     e         n
 SFX A   0     e         t
+SFX A   0     e         h
 
 SFX C Y 2
 SFX C   0     d/C       c
 SFX C   0     c         b
 
+SFX D Y 1
+SFX D   0     s         o
+
+SFX E Y 1
+SFX E   0     d         o
+
 PFX B Y 1
 PFX B   0     s         o
\ No newline at end of file

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.dic?rev=1179458&r1=1179457&r2=1179458&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.dic (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.dic Wed Oct  5 22:02:28 2011
@@ -1,6 +1,9 @@
-5
+6
 lucen/A
 lucene
 mahout/A
 olr/B
-ab/C
\ No newline at end of file
+ab/C
+Apach/A
+foo/D
+Foo/E
\ No newline at end of file

Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=1179458&r1=1179457&r2=1179458&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Wed Oct  5 22:02:28 2011
@@ -105,6 +105,8 @@ Bug Fixes
 
 * SOLR-2372: Upgrade Solr to Tika 0.10 (janhoy)
 
+* SOLR-2792: Allow case insensitive Hunspell stemming (janhoy, rmuir)
+
 ==================  3.4.0  ==================
 
 Upgrading from Solr 3.3

Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java?rev=1179458&r1=1179457&r2=1179458&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java Wed Oct  5 22:02:28 2011
@@ -25,21 +25,36 @@ import org.apache.lucene.analysis.TokenS
 import org.apache.lucene.analysis.hunspell.HunspellDictionary;
 import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
 import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 
 /**
  * TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}.
- * Example config for British English including a custom dictionary:
+ * Example config for British English including a custom dictionary, case insensitive matching:
  * <pre class="prettyprint" >
  * &lt;filter class=&quot;solr.HunspellStemFilterFactory&quot;
  *    dictionary=&quot;en_GB.dic,my_custom.dic&quot;
- *    affix=&quot;en_GB.aff&quot;/&gt;</pre>
- * Dictionaries for many languages are available through the OpenOffice project
- * <p>See: <a href="http://wiki.services.openoffice.org/wiki/Dictionaries">OpenOffice Dictionaries</a>
+ *    affix=&quot;en_GB.aff&quot;
+ *    ignoreCase=&quot;true&quot; /&gt;</pre>
+ * Both parameters dictionary and affix are mandatory.
+ * <br/>
+ * The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false.
+ * <br/> 
+ * Dictionaries for many languages are available through the OpenOffice project.
+ * 
+ * See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
  */
 public class HunspellStemFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
   
+  private static final String PARAM_DICTIONARY = "dictionary";
+  private static final String PARAM_AFFIX = "affix";
+  private static final String PARAM_IGNORE_CASE = "ignoreCase";
+  private static final String TRUE = "true";
+  private static final String FALSE = "false";
+  
   private HunspellDictionary dictionary;
+  private boolean ignoreCase = false;
 
   /**
    * Loads the hunspell dictionary and affix files defined in the configuration
@@ -48,15 +63,21 @@ public class HunspellStemFilterFactory e
    */
   public void inform(ResourceLoader loader) {
     assureMatchVersion();
-    String dictionaryFiles[] = args.get("dictionary").split(",");
-    String affixFile = args.get("affix");
+    String dictionaryFiles[] = args.get(PARAM_DICTIONARY).split(",");
+    String affixFile = args.get(PARAM_AFFIX);
+    String pic = args.get(PARAM_IGNORE_CASE);
+    if(pic != null) {
+      if(pic.equalsIgnoreCase(TRUE)) ignoreCase = true;
+      else if(pic.equalsIgnoreCase(FALSE)) ignoreCase = false;
+      else throw new SolrException(ErrorCode.UNKNOWN, "Unknown value for "+PARAM_IGNORE_CASE+": "+pic+". Must be true or false");
+    }
 
     try {
       List<InputStream> dictionaries = new ArrayList<InputStream>();
       for (String file : dictionaryFiles) {
         dictionaries.add(loader.openResource(file));
       }
-      this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion);
+      this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion, ignoreCase);
     } catch (Exception e) {
       throw new RuntimeException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e);
     }