You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/07/16 22:18:15 UTC
svn commit: r1611175 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/analysis/
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/
Author: rmuir
Date: Wed Jul 16 20:18:14 2014
New Revision: 1611175
URL: http://svn.apache.org/r1611175
Log:
LUCENE-5826: Support proper hunspell case handling and related options
Added:
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAlternateCasing.java
- copied unchanged from r1611161, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAlternateCasing.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseSensitive.java
- copied unchanged from r1611161, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseSensitive.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java
- copied unchanged from r1611161, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestNeedAffix.java
- copied unchanged from r1611161, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestNeedAffix.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestOnlyInCompound.java
- copied unchanged from r1611161, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestOnlyInCompound.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/alternate-casing.aff
- copied unchanged from r1611161, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/alternate-casing.aff
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/alternate-casing.dic
- copied unchanged from r1611161, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/alternate-casing.dic
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/casesensitive.aff
- copied unchanged from r1611161, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/casesensitive.aff
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/casesensitive.dic
- copied unchanged from r1611161, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/casesensitive.dic
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.aff
- copied unchanged from r1611161, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.aff
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic
- copied unchanged from r1611161, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix.aff
- copied unchanged from r1611161, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix.aff
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix.dic
- copied unchanged from r1611161, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix.dic
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/onlyincompound.aff
- copied unchanged from r1611161, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/onlyincompound.aff
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/onlyincompound.dic
- copied unchanged from r1611161, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/onlyincompound.dic
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1611175&r1=1611174&r2=1611175&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed Jul 16 20:18:14 2014
@@ -21,6 +21,9 @@ New Features
* LUCENE-5806: Extend expressions grammar to support array access in variables.
Added helper class VariableContext to parse complex variable into pieces.
(Ryan Ernst)
+
+* LUCENE-5826: Support proper hunspell case handling, LANG, KEEPCASE, NEEDAFFIX,
+ and ONLYINCOMPOUND flags. (Robert Muir)
API Changes
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java?rev=1611175&r1=1611174&r2=1611175&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java Wed Jul 16 20:18:14 2014
@@ -85,6 +85,11 @@ public class Dictionary {
private static final String ICONV_KEY = "ICONV";
private static final String OCONV_KEY = "OCONV";
private static final String FULLSTRIP_KEY = "FULLSTRIP";
+ private static final String LANG_KEY = "LANG";
+ private static final String KEEPCASE_KEY = "KEEPCASE";
+ private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
+ private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
+ private static final String ONLYINCOMPOUND_KEY = "ONLYINCOMPOUND";
private static final String NUM_FLAG_TYPE = "num";
private static final String UTF8_FLAG_TYPE = "UTF-8";
@@ -140,6 +145,9 @@ public class Dictionary {
boolean twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping
int circumfix = -1; // circumfix flag, or -1 if one is not defined
+ int keepcase = -1; // keepcase flag, or -1 if one is not defined
+ int needaffix = -1; // needaffix flag, or -1 if one is not defined
+ int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined
// ignored characters (dictionary, affix, inputs)
private char[] ignore;
@@ -154,6 +162,11 @@ public class Dictionary {
// true if we can strip suffixes "down to nothing"
boolean fullStrip;
+ // language declaration of the dictionary
+ String language;
+ // true if case algorithms should use alternate (Turkish/Azeri) mapping
+ boolean alternateCasing;
+
/**
* Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files.
@@ -315,6 +328,24 @@ public class Dictionary {
throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
}
circumfix = flagParsingStrategy.parseFlag(parts[1]);
+ } else if (line.startsWith(KEEPCASE_KEY)) {
+ String parts[] = line.split("\\s+");
+ if (parts.length != 2) {
+ throw new ParseException("Illegal KEEPCASE declaration", reader.getLineNumber());
+ }
+ keepcase = flagParsingStrategy.parseFlag(parts[1]);
+ } else if (line.startsWith(NEEDAFFIX_KEY) || line.startsWith(PSEUDOROOT_KEY)) {
+ String parts[] = line.split("\\s+");
+ if (parts.length != 2) {
+ throw new ParseException("Illegal NEEDAFFIX declaration", reader.getLineNumber());
+ }
+ needaffix = flagParsingStrategy.parseFlag(parts[1]);
+ } else if (line.startsWith(ONLYINCOMPOUND_KEY)) {
+ String parts[] = line.split("\\s+");
+ if (parts.length != 2) {
+ throw new ParseException("Illegal ONLYINCOMPOUND declaration", reader.getLineNumber());
+ }
+ onlyincompound = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(IGNORE_KEY)) {
String parts[] = line.split("\\s+");
if (parts.length != 2) {
@@ -340,6 +371,9 @@ public class Dictionary {
}
} else if (line.startsWith(FULLSTRIP_KEY)) {
fullStrip = true;
+ } else if (line.startsWith(LANG_KEY)) {
+ language = line.substring(LANG_KEY.length()).trim();
+ alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
}
}
@@ -1117,7 +1151,7 @@ public class Dictionary {
if (ignoreCase && iconv == null) {
// if we have no input conversion mappings, do this on-the-fly
- ch = Character.toLowerCase(ch);
+ ch = caseFold(ch);
}
reuse.append(ch);
@@ -1131,7 +1165,7 @@ public class Dictionary {
}
if (ignoreCase) {
for (int i = 0; i < reuse.length(); i++) {
- reuse.setCharAt(i, Character.toLowerCase(reuse.charAt(i)));
+ reuse.setCharAt(i, caseFold(reuse.charAt(i)));
}
}
}
@@ -1139,6 +1173,21 @@ public class Dictionary {
return reuse;
}
+ /** folds single character (according to LANG if present) */
+ char caseFold(char c) {
+ if (alternateCasing) {
+ if (c == 'I') {
+ return 'ı';
+ } else if (c == 'İ') {
+ return 'i';
+ } else {
+ return Character.toLowerCase(c);
+ }
+ } else {
+ return Character.toLowerCase(c);
+ }
+ }
+
// TODO: this could be more efficient!
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
final FST.BytesReader bytesReader = fst.getBytesReader();
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java?rev=1611175&r1=1611174&r2=1611175&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java Wed Jul 16 20:18:14 2014
@@ -100,17 +100,104 @@ final class Stemmer {
word = scratchBuffer;
}
+ int caseType = caseOf(word, length);
+ if (caseType == UPPER_CASE) {
+ // upper: union exact, title, lower
+ caseFoldTitle(word, length);
+ caseFoldLower(titleBuffer, length);
+ List<CharsRef> list = doStem(word, length, false);
+ list.addAll(doStem(titleBuffer, length, true));
+ list.addAll(doStem(lowerBuffer, length, true));
+ return list;
+ } else if (caseType == TITLE_CASE) {
+ // title: union exact, lower
+ caseFoldLower(word, length);
+ List<CharsRef> list = doStem(word, length, false);
+ list.addAll(doStem(lowerBuffer, length, true));
+ return list;
+ } else {
+ // exact match only
+ return doStem(word, length, false);
+ }
+ }
+
+ // temporary buffers for case variants
+ private char[] lowerBuffer = new char[8];
+ private char[] titleBuffer = new char[8];
+
+ private static final int EXACT_CASE = 0;
+ private static final int TITLE_CASE = 1;
+ private static final int UPPER_CASE = 2;
+
+ /** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
+ private int caseOf(char word[], int length) {
+ if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
+ return EXACT_CASE;
+ }
+
+ // determine if we are title or lowercase (or something funky, in which its exact)
+ boolean seenUpper = false;
+ boolean seenLower = false;
+ for (int i = 1; i < length; i++) {
+ boolean v = Character.isUpperCase(word[i]);
+ seenUpper |= v;
+ seenLower |= !v;
+ }
+
+ if (!seenLower) {
+ return UPPER_CASE;
+ } else if (!seenUpper) {
+ return TITLE_CASE;
+ } else {
+ return EXACT_CASE;
+ }
+ }
+
+ /** folds titlecase variant of word to titleBuffer */
+ private void caseFoldTitle(char word[], int length) {
+ titleBuffer = ArrayUtil.grow(titleBuffer, length);
+ System.arraycopy(word, 0, titleBuffer, 0, length);
+ for (int i = 1; i < length; i++) {
+ titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
+ }
+ }
+
+ /** folds lowercase variant of word (title cased) to lowerBuffer */
+ private void caseFoldLower(char word[], int length) {
+ lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
+ System.arraycopy(word, 0, lowerBuffer, 0, length);
+ lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
+ }
+
+ private List<CharsRef> doStem(char word[], int length, boolean caseVariant) {
List<CharsRef> stems = new ArrayList<>();
IntsRef forms = dictionary.lookupWord(word, 0, length);
if (forms != null) {
- // TODO: some forms should not be added, e.g. ONLYINCOMPOUND
- // just because it exists, does not make it valid...
for (int i = 0; i < forms.length; i += formStep) {
+ boolean checkKeepCase = caseVariant && dictionary.keepcase != -1;
+ boolean checkNeedAffix = dictionary.needaffix != -1;
+ boolean checkOnlyInCompound = dictionary.onlyincompound != -1;
+ if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) {
+ dictionary.flagLookup.get(forms.ints[forms.offset+i], scratch);
+ char wordFlags[] = Dictionary.decodeFlags(scratch);
+ // we are looking for a case variant, but this word does not allow it
+ if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char)dictionary.keepcase)) {
+ continue;
+ }
+ // we can't add this form, its a pseudostem requiring an affix
+ if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char)dictionary.needaffix)) {
+ continue;
+ }
+ // we can't add this form, it only belongs inside a compound word
+ if (checkOnlyInCompound && Dictionary.hasFlag(wordFlags, (char)dictionary.onlyincompound)) {
+ continue;
+ }
+ }
stems.add(newStem(word, length, forms, i));
}
}
try {
- boolean v = stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false));
+ boolean v = stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant));
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
@@ -203,9 +290,10 @@ final class Stemmer {
* but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
* @param circumfix true if the previous prefix removal was signed as a circumfix
* this means inner most suffix must also contain circumfix flag.
+ * @param caseVariant true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed.
* @return List of stems, or empty list if no stems are found
*/
- private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix) throws IOException {
+ private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix, boolean caseVariant) throws IOException {
// TODO: allow this stuff to be reused by tokenfilter
List<CharsRef> stems = new ArrayList<>();
@@ -250,13 +338,22 @@ final class Stemmer {
final boolean compatible;
if (recursionDepth == 0) {
- compatible = true;
+ if (dictionary.onlyincompound == -1) {
+ compatible = true;
+ } else {
+ // check if affix is allowed in a non-compound word
+ dictionary.flagLookup.get(append, scratch);
+ char appendFlags[] = Dictionary.decodeFlags(scratch);
+ compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
+ }
} else if (crossProduct) {
// cross check incoming continuation class (flag of previous affix) against list.
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
assert prevFlag >= 0;
- compatible = hasCrossCheckedFlag((char)prevFlag, appendFlags, false);
+ boolean allowed = dictionary.onlyincompound == -1 ||
+ !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
+ compatible = allowed && hasCrossCheckedFlag((char)prevFlag, appendFlags, false);
} else {
compatible = false;
}
@@ -277,7 +374,7 @@ final class Stemmer {
System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);
- List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix);
+ List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix, caseVariant);
stems.addAll(stemList);
}
@@ -325,13 +422,22 @@ final class Stemmer {
final boolean compatible;
if (recursionDepth == 0) {
- compatible = true;
+ if (dictionary.onlyincompound == -1) {
+ compatible = true;
+ } else {
+ // check if affix is allowed in a non-compound word
+ dictionary.flagLookup.get(append, scratch);
+ char appendFlags[] = Dictionary.decodeFlags(scratch);
+ compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
+ }
} else if (crossProduct) {
// cross check incoming continuation class (flag of previous affix) against list.
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
assert prevFlag >= 0;
- compatible = hasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
+ boolean allowed = dictionary.onlyincompound == -1 ||
+ !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
+ compatible = allowed && hasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
} else {
compatible = false;
}
@@ -352,7 +458,7 @@ final class Stemmer {
System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
- List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix);
+ List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant);
stems.addAll(stemList);
}
@@ -399,7 +505,7 @@ final class Stemmer {
* @param prefix true if we are removing a prefix (false if its a suffix)
* @return List of stems for the word, or an empty list if none are found
*/
- List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) throws IOException {
+ List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix, boolean caseVariant) throws IOException {
// TODO: just pass this in from before, no need to decode it twice
affixReader.setPosition(8 * affix);
char flag = (char) (affixReader.readShort() & 0xffff);
@@ -439,6 +545,15 @@ final class Stemmer {
continue;
}
}
+
+ // we are looking for a case variant, but this word does not allow it
+ if (caseVariant && dictionary.keepcase != -1 && Dictionary.hasFlag(wordFlags, (char)dictionary.keepcase)) {
+ continue;
+ }
+ // we aren't decompounding (yet)
+ if (dictionary.onlyincompound != -1 && Dictionary.hasFlag(wordFlags, (char)dictionary.onlyincompound)) {
+ continue;
+ }
stems.add(newStem(strippedWord, length, forms, i));
}
}
@@ -457,20 +572,20 @@ final class Stemmer {
// we took away the first prefix.
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
// COMPLEXPREFIXES = false: combine with a suffix
- stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix));
+ stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix, caseVariant));
} else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
// we took away a suffix.
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
// COMPLEXPREFIXES = false: combine with another suffix
- stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
+ stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant));
}
} else if (recursionDepth == 1) {
if (prefix && dictionary.complexPrefixes) {
// we took away the second prefix: go look for another suffix
- stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix));
+ stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix, caseVariant));
} else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
// we took away a prefix, then a suffix: go look for another suffix
- stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
+ stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant));
}
}
}