You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2011/10/06 00:02:28 UTC
svn commit: r1179458 - in /lucene/dev/branches/branch_3x:
lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/
lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/
solr/ solr/core/src/java/org/apache/so...
Author: janhoy
Date: Wed Oct 5 22:02:28 2011
New Revision: 1179458
URL: http://svn.apache.org/viewvc?rev=1179458&view=rev
Log:
SOLR-2792: Allow case-insensitive Hunspell stemming
Modified:
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.aff
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.dic
lucene/dev/branches/branch_3x/solr/CHANGES.txt
lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java?rev=1179458&r1=1179457&r2=1179458&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java Wed Oct 5 22:02:28 2011
@@ -27,6 +27,7 @@ import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
+import java.util.Locale;
public class HunspellDictionary {
@@ -43,11 +44,15 @@ public class HunspellDictionary {
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
+ private static final boolean IGNORE_CASE_DEFAULT = false;
+
private CharArrayMap<List<HunspellWord>> words;
private CharArrayMap<List<HunspellAffix>> prefixes;
private CharArrayMap<List<HunspellAffix>> suffixes;
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
+ private boolean ignoreCase = IGNORE_CASE_DEFAULT;
+
private final Version version;
/**
@@ -61,7 +66,22 @@ public class HunspellDictionary {
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
*/
public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
- this(affix, Arrays.asList(dictionary), version);
+ this(affix, Arrays.asList(dictionary), version, IGNORE_CASE_DEFAULT);
+ }
+
+ /**
+ * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
+ * and dictionary files
+ *
+ * @param affix InputStream for reading the hunspell affix file
+ * @param dictionary InputStream for reading the hunspell dictionary file
+ * @param version Lucene Version
+ * @param ignoreCase If true, dictionary matching will be case insensitive
+ * @throws IOException Can be thrown while reading from the InputStreams
+ * @throws ParseException Can be thrown if the content of the files does not meet expected formats
+ */
+ public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException {
+ this(affix, Arrays.asList(dictionary), version, ignoreCase);
}
/**
@@ -71,15 +91,17 @@ public class HunspellDictionary {
* @param affix InputStream for reading the hunspell affix file
* @param dictionaries InputStreams for reading the hunspell dictionary file
* @param version Lucene Version
+ * @param ignoreCase If true, dictionary matching will be case insensitive
* @throws IOException Can be thrown while reading from the InputStreams
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
*/
- public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version) throws IOException, ParseException {
+ public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
this.version = version;
+ this.ignoreCase = ignoreCase;
String encoding = getDictionaryEncoding(affix);
CharsetDecoder decoder = getJavaEncoding(encoding);
readAffixFile(affix, decoder);
- words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, false);
+ words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);
for (InputStream dictionary : dictionaries) {
readDictionaryFile(dictionary, decoder);
}
@@ -129,8 +151,8 @@ public class HunspellDictionary {
* @throws IOException Can be thrown while reading from the InputStream
*/
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {
- prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, false);
- suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, false);
+ prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
+ suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
String line = null;
@@ -308,6 +330,9 @@ public class HunspellDictionary {
wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end)));
Arrays.sort(wordForm.getFlags());
entry = line.substring(0, flagSep);
+ if(ignoreCase) {
+ entry = entry.toLowerCase(Locale.ENGLISH);
+ }
}
List<HunspellWord> entries = words.get(entry);
@@ -408,4 +433,8 @@ public class HunspellDictionary {
return flags;
}
}
+
+ public boolean isIgnoreCase() {
+ return ignoreCase;
+ }
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java?rev=1179458&r1=1179457&r2=1179458&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java Wed Oct 5 22:02:28 2011
@@ -21,9 +21,14 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Scanner;
import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.util.CharacterUtils;
import org.apache.lucene.util.Version;
/**
@@ -36,6 +41,7 @@ public class HunspellStemmer {
private final HunspellDictionary dictionary;
private final StringBuilder segment = new StringBuilder();
+ private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_34);
/**
* Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
@@ -79,7 +85,7 @@ public class HunspellStemmer {
*/
public List<Stem> uniqueStems(char word[], int length) {
List<Stem> stems = new ArrayList<Stem>();
- CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, false);
+ CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, dictionary.isIgnoreCase());
if (dictionary.lookupWord(word, 0, length) != null) {
stems.add(new Stem(word, length));
terms.add(word);
@@ -167,6 +173,12 @@ public class HunspellStemmer {
*/
@SuppressWarnings("unchecked")
public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
+ if(dictionary.isIgnoreCase()) {
+ for(int i=0;i<strippedWord.length;){
+ i += Character.toChars(
+ Character.toLowerCase(charUtils.codePointAt(strippedWord, i)), strippedWord, i);
+ }
+ }
segment.setLength(0);
segment.append(strippedWord, 0, length);
if (!affix.checkCondition(segment)) {
@@ -174,7 +186,7 @@ public class HunspellStemmer {
}
List<Stem> stems = new ArrayList<Stem>();
-
+
List<HunspellWord> words = dictionary.lookupWord(strippedWord, 0, length);
if (words != null) {
for (HunspellWord hunspellWord : words) {
@@ -294,15 +306,24 @@ public class HunspellStemmer {
* @throws ParseException Can be thrown while parsing the files
*/
public static void main(String[] args) throws IOException, ParseException {
- if (args.length != 2) {
- System.out.println("usage: HunspellStemmer <affix location> <dic location>");
+ boolean ignoreCase = false;
+ int offset = 0;
+
+ if (args.length < 2) {
+ System.out.println("usage: HunspellStemmer [-i] <affix location> <dic location>");
System.exit(1);
}
- InputStream affixInputStream = new FileInputStream(args[0]);
- InputStream dicInputStream = new FileInputStream(args[1]);
+ if(args[offset].equals("-i")) {
+ ignoreCase = true;
+ System.out.println("Ignoring case. All stems will be returned lowercased");
+ offset++;
+ }
+
+ InputStream affixInputStream = new FileInputStream(args[offset++]);
+ InputStream dicInputStream = new FileInputStream(args[offset++]);
- HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_34);
+ HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_34, ignoreCase);
affixInputStream.close();
dicInputStream.close();
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java?rev=1179458&r1=1179457&r2=1179458&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java Wed Oct 5 22:02:28 2011
@@ -34,7 +34,7 @@ public class HunspellDictionaryTest {
InputStream dictStream = getClass().getResourceAsStream("test.dic");
HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_34);
- assertEquals(2, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
+ assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java?rev=1179458&r1=1179457&r2=1179458&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java Wed Oct 5 22:02:28 2011
@@ -34,14 +34,7 @@ public class HunspellStemmerTest {
@BeforeClass
public static void beforeClass() throws IOException, ParseException {
- InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
- InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
-
- HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_34);
- stemmer = new HunspellStemmer(dictionary);
-
- affixStream.close();
- dictStream.close();
+ createStemmer(true);
}
@Test
@@ -73,4 +66,61 @@ public class HunspellStemmerTest {
assertEquals("ab", stems.get(0).getStemString());
}
+ @Test
+ public void testStem_ignoreCase() throws IOException, ParseException {
+ List<HunspellStemmer.Stem> stems;
+ createStemmer(true);
+
+ stems = stemmer.stem("apache");
+ assertEquals(1, stems.size());
+ assertEquals("apach", stems.get(0).getStemString());
+
+ stems = stemmer.stem("APACHE");
+ assertEquals(1, stems.size());
+ assertEquals("apach", stems.get(0).getStemString());
+
+ stems = stemmer.stem("Apache");
+ assertEquals(1, stems.size());
+ assertEquals("apach", stems.get(0).getStemString());
+
+ stems = stemmer.stem("foos");
+ assertEquals(1, stems.size());
+ assertEquals("foo", stems.get(0).getStemString());
+
+ stems = stemmer.stem("food");
+ assertEquals(1, stems.size());
+ assertEquals("foo", stems.get(0).getStemString());
+
+ stems = stemmer.stem("Foos");
+ assertEquals(1, stems.size());
+ assertEquals("foo", stems.get(0).getStemString());
+
+ stems = stemmer.stem("Food");
+ assertEquals(1, stems.size());
+ assertEquals("foo", stems.get(0).getStemString());
+ }
+
+ @Test
+ public void testStem_caseSensitive() throws IOException, ParseException {
+ createStemmer(false);
+ List<HunspellStemmer.Stem> stems = stemmer.stem("apache");
+ assertEquals(0, stems.size());
+
+ stems = stemmer.stem("Apache");
+ assertEquals(1, stems.size());
+ assertEquals("Apach", stems.get(0).getStemString());
+ }
+
+
+ private static void createStemmer(boolean ignoreCase) throws IOException, ParseException {
+ InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
+ InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
+
+ HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_34, ignoreCase);
+ stemmer = new HunspellStemmer(dictionary);
+
+ affixStream.close();
+ dictStream.close();
+ }
+
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.aff
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.aff?rev=1179458&r1=1179457&r2=1179458&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.aff (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.aff Wed Oct 5 22:02:28 2011
@@ -1,13 +1,20 @@
SET UTF-8
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
-SFX A Y 2
+SFX A Y 3
SFX A 0 e n
SFX A 0 e t
+SFX A 0 e h
SFX C Y 2
SFX C 0 d/C c
SFX C 0 c b
+SFX D Y 1
+SFX D 0 s o
+
+SFX E Y 1
+SFX E 0 d o
+
PFX B Y 1
PFX B 0 s o
\ No newline at end of file
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.dic?rev=1179458&r1=1179457&r2=1179458&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.dic (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hunspell/test.dic Wed Oct 5 22:02:28 2011
@@ -1,6 +1,9 @@
-5
+6
lucen/A
lucene
mahout/A
olr/B
-ab/C
\ No newline at end of file
+ab/C
+Apach/A
+foo/D
+Foo/E
\ No newline at end of file
Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=1179458&r1=1179457&r2=1179458&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Wed Oct 5 22:02:28 2011
@@ -105,6 +105,8 @@ Bug Fixes
* SOLR-2372: Upgrade Solr to Tika 0.10 (janhoy)
+* SOLR-2792: Allow case-insensitive Hunspell stemming (janhoy, rmuir)
+
================== 3.4.0 ==================
Upgrading from Solr 3.3
Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java?rev=1179458&r1=1179457&r2=1179458&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java Wed Oct 5 22:02:28 2011
@@ -25,21 +25,36 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.hunspell.HunspellDictionary;
import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.util.plugin.ResourceLoaderAware;
/**
* TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}.
- * Example config for British English including a custom dictionary:
+ * Example config for British English including a custom dictionary and case-insensitive matching:
* <pre class="prettyprint" >
* <filter class="solr.HunspellStemFilterFactory"
* dictionary="en_GB.dic,my_custom.dic"
- * affix="en_GB.aff"/></pre>
- * Dictionaries for many languages are available through the OpenOffice project
- * <p>See: <a href="http://wiki.services.openoffice.org/wiki/Dictionaries">OpenOffice Dictionaries</a>
+ * affix="en_GB.aff"
+ * ignoreCase="true" /></pre>
+ * Both the dictionary and affix parameters are mandatory.
+ * <br/>
+ * The ignoreCase parameter (true/false) controls whether matching ignores case: true makes matching case insensitive. The default is false.
+ * <br/>
+ * Dictionaries for many languages are available through the OpenOffice project.
+ *
+ * See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
*/
public class HunspellStemFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+ private static final String PARAM_DICTIONARY = "dictionary";
+ private static final String PARAM_AFFIX = "affix";
+ private static final String PARAM_IGNORE_CASE = "ignoreCase";
+ private static final String TRUE = "true";
+ private static final String FALSE = "false";
+
private HunspellDictionary dictionary;
+ private boolean ignoreCase = false;
/**
* Loads the hunspell dictionary and affix files defined in the configuration
@@ -48,15 +63,21 @@ public class HunspellStemFilterFactory e
*/
public void inform(ResourceLoader loader) {
assureMatchVersion();
- String dictionaryFiles[] = args.get("dictionary").split(",");
- String affixFile = args.get("affix");
+ String dictionaryFiles[] = args.get(PARAM_DICTIONARY).split(",");
+ String affixFile = args.get(PARAM_AFFIX);
+ String pic = args.get(PARAM_IGNORE_CASE);
+ if(pic != null) {
+ if(pic.equalsIgnoreCase(TRUE)) ignoreCase = true;
+ else if(pic.equalsIgnoreCase(FALSE)) ignoreCase = false;
+ else throw new SolrException(ErrorCode.UNKNOWN, "Unknown value for "+PARAM_IGNORE_CASE+": "+pic+". Must be true or false");
+ }
try {
List<InputStream> dictionaries = new ArrayList<InputStream>();
for (String file : dictionaryFiles) {
dictionaries.add(loader.openResource(file));
}
- this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion);
+ this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion, ignoreCase);
} catch (Exception e) {
throw new RuntimeException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e);
}