You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/03/12 14:50:49 UTC
svn commit: r1576739 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/analysis/
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/
Author: rmuir
Date: Wed Mar 12 13:50:49 2014
New Revision: 1576739
URL: http://svn.apache.org/r1576739
Log:
LUCENE-5518: minor hunspell optimizations
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java?rev=1576739&r1=1576738&r2=1576739&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java Wed Mar 12 13:50:49 2014
@@ -27,6 +27,8 @@ import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
@@ -54,6 +56,7 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
+import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@@ -83,17 +86,16 @@ public class Dictionary {
private static final String UTF8_FLAG_TYPE = "UTF-8";
private static final String LONG_FLAG_TYPE = "long";
+ // TODO: really for suffixes we should reverse the automaton and run them backwards
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
FST<IntsRef> prefixes;
FST<IntsRef> suffixes;
- // all Patterns used by prefixes and suffixes. these are typically re-used across
+ // all condition checks used by prefixes and suffixes. these are typically re-used across
// many affix stripping rules. so these are deduplicated, to save RAM.
- // TODO: maybe don't use Pattern for the condition check...
- // TODO: when we cut over Affix to FST, just store integer index to this.
- ArrayList<Pattern> patterns = new ArrayList<Pattern>();
+ ArrayList<CharacterRunAutomaton> patterns = new ArrayList<>();
// the entries in the .dic file, mapping to their set of flags.
// the fst output is the ordinal list for flagLookup
@@ -103,7 +105,8 @@ public class Dictionary {
BytesRefHash flagLookup = new BytesRefHash();
// the list of unique strip affixes.
- BytesRefHash stripLookup = new BytesRefHash();
+ char[] stripData;
+ int[] stripOffsets;
// 8 bytes per affix
byte[] affixData = new byte[64];
@@ -118,6 +121,7 @@ public class Dictionary {
boolean ignoreCase;
boolean complexPrefixes;
+ boolean twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping
int circumfix = -1; // circumfix flag, or -1 if one is not defined
@@ -160,7 +164,6 @@ public class Dictionary {
this.needsInputCleaning = ignoreCase;
this.needsOutputCleaning = false; // set if we have an OCONV
flagLookup.add(new BytesRef()); // no flags -> ord 0
- stripLookup.add(new BytesRef()); // no strip -> ord 0
File aff = File.createTempFile("affix", "aff", tempDir);
OutputStream out = new BufferedOutputStream(new FileOutputStream(aff));
@@ -272,6 +275,14 @@ public class Dictionary {
TreeMap<String, List<Character>> prefixes = new TreeMap<String, List<Character>>();
TreeMap<String, List<Character>> suffixes = new TreeMap<String, List<Character>>();
Map<String,Integer> seenPatterns = new HashMap<String,Integer>();
+
+ // zero condition -> 0 ord
+ seenPatterns.put(".*", 0);
+ patterns.add(null);
+
+ // zero strip -> 0 ord
+ Map<String,Integer> seenStrips = new LinkedHashMap<>();
+ seenStrips.put("", 0);
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
String line = null;
@@ -283,9 +294,9 @@ public class Dictionary {
if (line.startsWith(ALIAS_KEY)) {
parseAlias(line);
} else if (line.startsWith(PREFIX_KEY)) {
- parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns);
+ parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
} else if (line.startsWith(SUFFIX_KEY)) {
- parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns);
+ parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
} else if (line.startsWith(FLAG_KEY)) {
// Assume that the FLAG line comes before any prefix or suffixes
// Store the strategy so it can be used when parsing the dic file
@@ -326,6 +337,22 @@ public class Dictionary {
this.prefixes = affixFST(prefixes);
this.suffixes = affixFST(suffixes);
+
+ int totalChars = 0;
+ for (String strip : seenStrips.keySet()) {
+ totalChars += strip.length();
+ }
+ stripData = new char[totalChars];
+ stripOffsets = new int[seenStrips.size()+1];
+ int currentOffset = 0;
+ int currentIndex = 0;
+ for (String strip : seenStrips.keySet()) {
+ stripOffsets[currentIndex++] = currentOffset;
+ strip.getChars(0, strip.length(), stripData, currentOffset);
+ currentOffset += strip.length();
+ }
+ assert currentIndex == seenStrips.size();
+ stripOffsets[currentIndex] = currentOffset;
}
private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes) throws IOException {
@@ -360,7 +387,8 @@ public class Dictionary {
String header,
LineNumberReader reader,
String conditionPattern,
- Map<String,Integer> seenPatterns) throws IOException, ParseException {
+ Map<String,Integer> seenPatterns,
+ Map<String,Integer> seenStrips) throws IOException, ParseException {
BytesRef scratch = new BytesRef();
StringBuilder sb = new StringBuilder();
@@ -399,7 +427,10 @@ public class Dictionary {
appendFlags = flagParsingStrategy.parseFlags(flagPart);
Arrays.sort(appendFlags);
+ twoStageAffix = true;
}
+
+ // TODO: add test and fix zero-affix handling!
String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
// at least the gascon affix file has this issue
@@ -411,7 +442,16 @@ public class Dictionary {
condition = condition.replace("-", "\\-");
}
- String regex = String.format(Locale.ROOT, conditionPattern, condition);
+ final String regex;
+ if (".".equals(condition)) {
+ regex = ".*"; // Zero condition is indicated by dot
+ } else if (condition.equals(strip)) {
+ regex = ".*"; // TODO: optimize this better:
+ // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
+ // but this is complicated...
+ } else {
+ regex = String.format(Locale.ROOT, conditionPattern, condition);
+ }
// deduplicate patterns
Integer patternIndex = seenPatterns.get(regex);
@@ -421,17 +461,17 @@ public class Dictionary {
throw new UnsupportedOperationException("Too many patterns, please report this to dev@lucene.apache.org");
}
seenPatterns.put(regex, patternIndex);
- Pattern pattern = Pattern.compile(regex);
+ CharacterRunAutomaton pattern = new CharacterRunAutomaton(new RegExp(regex, RegExp.NONE).toAutomaton());
patterns.add(pattern);
}
- scratch.copyChars(strip);
- int stripOrd = stripLookup.add(scratch);
- if (stripOrd < 0) {
- // already exists in our hash
- stripOrd = (-stripOrd)-1;
- } else if (stripOrd > Character.MAX_VALUE) {
- throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
+ Integer stripOrd = seenStrips.get(strip);
+ if (stripOrd == null) {
+ stripOrd = seenStrips.size();
+ seenStrips.put(strip, stripOrd);
+ if (stripOrd > Character.MAX_VALUE) {
+ throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
+ }
}
if (appendFlags == null) {
@@ -449,7 +489,7 @@ public class Dictionary {
}
affixWriter.writeShort((short)flag);
- affixWriter.writeShort((short)stripOrd);
+ affixWriter.writeShort((short)stripOrd.intValue());
// encode crossProduct into patternIndex
int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
affixWriter.writeShort((short)patternOrd);
@@ -774,6 +814,9 @@ public class Dictionary {
}
static char[] decodeFlags(BytesRef b) {
+ if (b.length == 0) {
+ return CharsRef.EMPTY_CHARS;
+ }
int len = b.length >>> 1;
char flags[] = new char[len];
int upto = 0;
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java?rev=1576739&r1=1576738&r2=1576739&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java Wed Mar 12 13:50:49 2014
@@ -22,7 +22,6 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
-import java.util.regex.Pattern;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.store.ByteArrayDataInput;
@@ -31,6 +30,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.Version;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
* Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word. It
@@ -160,7 +160,7 @@ final class Stemmer {
// TODO: allow this stuff to be reused by tokenfilter
List<CharsRef> stems = new ArrayList<CharsRef>();
- if (doPrefix) {
+ if (doPrefix && dictionary.prefixes != null) {
for (int i = length - 1; i >= 0; i--) {
IntsRef prefixes = dictionary.lookupPrefix(word, 0, i);
if (prefixes == null) {
@@ -197,12 +197,19 @@ final class Stemmer {
int deAffixedStart = i;
int deAffixedLength = length - deAffixedStart;
- dictionary.stripLookup.get(stripOrd, scratch);
- String strippedWord = new StringBuilder().append(scratch.utf8ToString())
- .append(word, deAffixedStart, deAffixedLength)
- .toString();
+ int stripStart = dictionary.stripOffsets[stripOrd];
+ int stripEnd = dictionary.stripOffsets[stripOrd+1];
+ int stripLength = stripEnd - stripStart;
- List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, -1, recursionDepth, true, circumfix);
+ if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) {
+ continue;
+ }
+
+ char strippedWord[] = new char[stripLength + deAffixedLength];
+ System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
+ System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);
+
+ List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix);
stems.addAll(stemList);
}
@@ -210,7 +217,7 @@ final class Stemmer {
}
}
- if (doSuffix) {
+ if (doSuffix && dictionary.suffixes != null) {
for (int i = 0; i < length; i++) {
IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i);
if (suffixes == null) {
@@ -246,11 +253,20 @@ final class Stemmer {
if (compatible) {
int appendLength = length - i;
int deAffixedLength = length - appendLength;
- // TODO: can we do this in-place?
- dictionary.stripLookup.get(stripOrd, scratch);
- String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString();
- List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, prefixFlag, recursionDepth, false, circumfix);
+ int stripStart = dictionary.stripOffsets[stripOrd];
+ int stripEnd = dictionary.stripOffsets[stripOrd+1];
+ int stripLength = stripEnd - stripStart;
+
+ if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
+ continue;
+ }
+
+ char strippedWord[] = new char[stripLength + deAffixedLength];
+ System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
+ System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
+
+ List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix);
stems.addAll(stemList);
}
@@ -260,6 +276,30 @@ final class Stemmer {
return stems;
}
+
+ /** checks condition of the concatenation of two strings */
+ // note: this is pretty stupid, we really should subtract strip from the condition up front and just check the stem
+ // but this is a little bit more complicated.
+ private boolean checkCondition(int condition, char c1[], int c1off, int c1len, char c2[], int c2off, int c2len) {
+ if (condition != 0) {
+ CharacterRunAutomaton pattern = dictionary.patterns.get(condition);
+ int state = pattern.getInitialState();
+ for (int i = c1off; i < c1off + c1len; i++) {
+ state = pattern.step(state, c1[i]);
+ if (state == -1) {
+ return false;
+ }
+ }
+ for (int i = c2off; i < c2off + c2len; i++) {
+ state = pattern.step(state, c2[i]);
+ if (state == -1) {
+ return false;
+ }
+ }
+ return pattern.isAccept(state);
+ }
+ return true;
+ }
/**
* Applies the affix rule to the given word, producing a list of stems if any are found
@@ -273,10 +313,7 @@ final class Stemmer {
* @param prefix true if we are removing a prefix (false if it's a suffix)
* @return List of stems for the word, or an empty list if none are found
*/
- List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) {
- segment.setLength(0);
- segment.append(strippedWord, 0, length);
-
+ List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) {
// TODO: just pass this in from before, no need to decode it twice
affixReader.setPosition(8 * affix);
char flag = (char) (affixReader.readShort() & 0xffff);
@@ -285,11 +322,6 @@ final class Stemmer {
boolean crossProduct = (condition & 1) == 1;
condition >>>= 1;
char append = (char) (affixReader.readShort() & 0xffff);
-
- Pattern pattern = dictionary.patterns.get(condition);
- if (!pattern.matcher(segment).matches()) {
- return Collections.emptyList();
- }
List<CharsRef> stems = new ArrayList<CharsRef>();
@@ -338,9 +370,9 @@ final class Stemmer {
if (prefix) {
// we took away the first prefix.
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
- // COMPLEXPREFIXES = false: combine with another suffix
- stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes, true, true, circumfix));
- } else if (!dictionary.complexPrefixes) {
+ // COMPLEXPREFIXES = false: combine with a suffix
+ stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix));
+ } else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
// we took away a suffix.
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
// COMPLEXPREFIXES = false: combine with another suffix
@@ -350,7 +382,7 @@ final class Stemmer {
if (prefix && dictionary.complexPrefixes) {
// we took away the second prefix: go look for another suffix
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix));
- } else if (prefix == false && dictionary.complexPrefixes == false) {
+ } else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
// we took away a prefix, then a suffix: go look for another suffix
stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java?rev=1576739&r1=1576738&r2=1576739&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java Wed Mar 12 13:50:49 2014
@@ -171,7 +171,7 @@ public class TestAllDictionaries extends
System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
"words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
"flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
- "strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
+ "strips=" + RamUsageEstimator.humanSizeOf(dic.stripData) + ", " +
"conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " +
"affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " +
"prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java?rev=1576739&r1=1576738&r2=1576739&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java Wed Mar 12 13:50:49 2014
@@ -187,7 +187,7 @@ public class TestAllDictionaries2 extend
System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
"words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
"flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
- "strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
+ "strips=" + RamUsageEstimator.humanSizeOf(dic.stripData) + ", " +
"conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " +
"affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " +
"prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +