You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/03/03 15:23:42 UTC
svn commit: r1573570 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/analysis/
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/
Author: rmuir
Date: Mon Mar 3 14:23:41 2014
New Revision: 1573570
URL: http://svn.apache.org/r1573570
Log:
LUCENE-5485: hunspell circumfix support
Added:
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCircumfix.java
- copied unchanged from r1573569, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCircumfix.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.aff
- copied unchanged from r1573569, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.aff
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.dic
- copied unchanged from r1573569, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.dic
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1573570&r1=1573569&r2=1573570&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Mon Mar 3 14:23:41 2014
@@ -24,6 +24,8 @@ New Features
* LUCENE-5479: FacetsConfig subclass can now customize the default
per-dim facets configuration. (Rob Audenaerde via Mike McCandless)
+* LUCENE-5485: Add circumfix support to HunspellStemFilter. (Robert Muir)
+
API Changes
* LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java?rev=1573570&r1=1573569&r2=1573570&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java Mon Mar 3 14:23:41 2014
@@ -66,6 +66,7 @@ public class Dictionary {
private static final String SUFFIX_KEY = "SFX";
private static final String FLAG_KEY = "FLAG";
private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
+ private static final String CIRCUMFIX_KEY = "CIRCUMFIX";
private static final String NUM_FLAG_TYPE = "num";
private static final String UTF8_FLAG_TYPE = "UTF-8";
@@ -107,6 +108,8 @@ public class Dictionary {
boolean ignoreCase;
boolean complexPrefixes;
+ int circumfix = -1; // circumfix flag, or -1 if one is not defined
+
/**
* Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files.
@@ -240,6 +243,12 @@ public class Dictionary {
flagParsingStrategy = getFlagParsingStrategy(line);
} else if (line.equals(COMPLEXPREFIXES_KEY)) {
complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
+ } else if (line.startsWith(CIRCUMFIX_KEY)) {
+ String parts[] = line.split("\\s+");
+ if (parts.length != 2) {
+ throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
+ }
+ circumfix = flagParsingStrategy.parseFlag(parts[1]);
}
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java?rev=1573570&r1=1573569&r2=1573570&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java Mon Mar 3 14:23:41 2014
@@ -81,7 +81,7 @@ final class Stemmer {
stems.add(new CharsRef(word, 0, length));
}
}
- stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false));
+ stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false));
return stems;
}
@@ -122,9 +122,11 @@ final class Stemmer {
* @param previousWasPrefix true if the previous removal was a prefix:
* if we are removing a suffix, and it has no continuation requirements, its ok.
* but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
+ * @param circumfix true if the previous prefix removal was signed as a circumfix
+ * this means inner most suffix must also contain circumfix flag.
* @return List of stems, or empty list if no stems are found
*/
- private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix) {
+ private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix) {
// TODO: allow this stuff to be reused by tokenfilter
List<CharsRef> stems = new ArrayList<CharsRef>();
@@ -171,7 +173,7 @@ final class Stemmer {
.append(word, deAffixedStart, deAffixedLength)
.toString();
- List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, -1, recursionDepth, true);
+ List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, -1, recursionDepth, true, circumfix);
stems.addAll(stemList);
}
@@ -219,7 +221,7 @@ final class Stemmer {
dictionary.stripLookup.get(stripOrd, scratch);
String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString();
- List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, prefixFlag, recursionDepth, false);
+ List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, prefixFlag, recursionDepth, false, circumfix);
stems.addAll(stemList);
}
@@ -242,7 +244,7 @@ final class Stemmer {
* @param prefix true if we are removing a prefix (false if its a suffix)
* @return List of stems for the word, or an empty list if none are found
*/
- List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix) {
+ List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) {
segment.setLength(0);
segment.append(strippedWord, 0, length);
@@ -279,10 +281,28 @@ final class Stemmer {
continue;
}
}
+
+ // if circumfix was previously set by a prefix, we must check this suffix,
+ // to ensure it has it, and vice versa
+ if (dictionary.circumfix != -1) {
+ dictionary.flagLookup.get(append, scratch);
+ char appendFlags[] = Dictionary.decodeFlags(scratch);
+ boolean suffixCircumfix = Dictionary.hasFlag(appendFlags, (char)dictionary.circumfix);
+ if (circumfix != suffixCircumfix) {
+ continue;
+ }
+ }
stems.add(new CharsRef(strippedWord, 0, length));
}
}
}
+
+ // if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we have that flag
+ if (dictionary.circumfix != -1 && !circumfix && prefix) {
+ dictionary.flagLookup.get(append, scratch);
+ char appendFlags[] = Dictionary.decodeFlags(scratch);
+ circumfix = Dictionary.hasFlag(appendFlags, (char)dictionary.circumfix);
+ }
if (crossProduct) {
if (recursionDepth == 0) {
@@ -290,20 +310,20 @@ final class Stemmer {
// we took away the first prefix.
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
// COMPLEXPREFIXES = false: combine with another suffix
- stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes, true, true));
+ stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes, true, true, circumfix));
} else if (!dictionary.complexPrefixes) {
// we took away a suffix.
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
// COMPLEXPREFIXES = false: combine with another suffix
- stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false));
+ stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
}
} else if (recursionDepth == 1) {
if (prefix && dictionary.complexPrefixes) {
// we took away the second prefix: go look for another suffix
- stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true));
+ stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix));
} else if (prefix == false && dictionary.complexPrefixes == false) {
// we took away a prefix, then a suffix: go look for another suffix
- stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false));
+ stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
}
}
}