Posted to commits@lucene.apache.org by dw...@apache.org on 2021/02/19 19:10:22 UTC
[lucene-solr] branch master updated: LUCENE-9790: Hunspell: avoid slow dictionary lookup if the word's hash isn't there (#2405)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 58e3b7a LUCENE-9790: Hunspell: avoid slow dictionary lookup if the word's hash isn't there (#2405)
58e3b7a is described below
commit 58e3b7a8542fa8d94a77215fdf13d575a5657546
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Fri Feb 19 20:10:06 2021 +0100
LUCENE-9790: Hunspell: avoid slow dictionary lookup if the word's hash isn't there (#2405)
---
.../lucene/analysis/hunspell/Dictionary.java | 39 ++++++++++++++--------
.../analysis/hunspell/TestAllDictionaries.java | 3 +-
.../org/apache/lucene/analysis/CharArrayMap.java | 12 +++----
.../src/java/org/apache/lucene/util/CharsRef.java | 13 ++++++--
4 files changed, 43 insertions(+), 24 deletions(-)
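[Annotation, not part of the commit email] In essence, the change puts a single-hash, Bloom-filter-style bit set in front of the FST: at build time each dictionary entry sets one bit derived from its String-style hash, and lookupWord bails out before the expensive FST traversal whenever that bit is clear. The sketch below only illustrates that idea; BitSet and the Map stand in for Lucene's FixedBitSet and FST<IntsRef>, and the class name is hypothetical — only the wordHashes/lookupWord names mirror the patch.

import java.util.BitSet;
import java.util.Map;

/** Illustrative pre-filter: one bit per String-style hash, checked before the slow lookup. */
class PrefilteredDictionary {
  private final BitSet wordHashes;           // stands in for Lucene's FixedBitSet
  private final int numBits;
  private final Map<String, int[]> words;    // placeholder for the real FST-backed dictionary

  PrefilteredDictionary(Map<String, int[]> words) {
    this.words = words;
    // roughly 10 bits per word, rounded down to a power of two, as in the patch
    numBits = Math.max(64, Integer.highestOneBit(words.size() * 10));
    wordHashes = new BitSet(numBits);
    for (String word : words.keySet()) {
      // floorMod sidesteps the Math.abs(Integer.MIN_VALUE) corner case
      wordHashes.set(Math.floorMod(word.hashCode(), numBits));
    }
  }

  int[] lookupWord(char[] word, int offset, int length) {
    int hash = new String(word, offset, length).hashCode(); // the patch computes this without allocating
    if (!wordHashes.get(Math.floorMod(hash, numBits))) {
      return null; // bit clear: the word is definitely absent, skip the slow lookup
    }
    // bit set: probably present, but may be a false positive; do the real (slow) lookup
    return words.get(new String(word, offset, length));
  }
}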
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 7b1fdf9..5a23d76 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -51,6 +51,8 @@ import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
@@ -98,6 +100,9 @@ public class Dictionary {
*/
FST<IntsRef> words;
+ /** A Bloom filter over {@link #words} to avoid unnecessary expensive FST traversals */
+ FixedBitSet wordHashes;
+
/**
* The list of unique flagsets (wordforms). theoretically huge, but practically small (for Polish
* this is 756), otherwise humans wouldn't be able to deal with it either.
@@ -249,7 +254,9 @@ public class Dictionary {
readAffixFile(affixStream, decoder, flagEnumerator);
// read dictionary entries
- IndexOutput unsorted = mergeDictionaries(tempDir, tempFileNamePrefix, dictionaries, decoder);
+ IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
+ int wordCount = mergeDictionaries(dictionaries, decoder, unsorted);
+ wordHashes = new FixedBitSet(Integer.highestOneBit(wordCount * 10));
String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator);
flagLookup = flagEnumerator.finish();
@@ -264,6 +271,11 @@ public class Dictionary {
/** Looks up Hunspell word forms from the dictionary */
IntsRef lookupWord(char[] word, int offset, int length) {
+ int hash = CharsRef.stringHashCode(word, offset, length);
+ if (!wordHashes.get(Math.abs(hash) % wordHashes.length())) {
+ return null;
+ }
+
return lookup(words, word, offset, length);
}
@@ -1015,15 +1027,12 @@ public class Dictionary {
}
}
- private IndexOutput mergeDictionaries(
- Directory tempDir,
- String tempFileNamePrefix,
- List<InputStream> dictionaries,
- CharsetDecoder decoder)
+ private int mergeDictionaries(
+ List<InputStream> dictionaries, CharsetDecoder decoder, IndexOutput output)
throws IOException {
StringBuilder sb = new StringBuilder();
- IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
- try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
+ int wordCount = 0;
+ try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) {
for (InputStream dictionary : dictionaries) {
BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
lines.readLine(); // first line is number of entries (approximately, sometimes)
@@ -1045,16 +1054,17 @@ public class Dictionary {
}
}
- writeNormalizedWordEntry(sb, writer, line);
+ wordCount += writeNormalizedWordEntry(sb, writer, line);
}
}
- CodecUtil.writeFooter(unsorted);
+ CodecUtil.writeFooter(output);
}
- return unsorted;
+ return wordCount;
}
- private void writeNormalizedWordEntry(
- StringBuilder reuse, ByteSequencesWriter writer, String line) throws IOException {
+ /** @return the number of word entries written */
+ private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line)
+ throws IOException {
int flagSep = line.indexOf(FLAG_SEPARATOR);
int morphSep = line.indexOf(MORPH_SEPARATOR);
assert morphSep > 0;
@@ -1078,7 +1088,9 @@ public class Dictionary {
WordCase wordCase = WordCase.caseOf(written, sep);
if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
+ return 2;
}
+ return 1;
}
private void addHiddenCapitalizedWord(
@@ -1221,6 +1233,7 @@ public class Dictionary {
}
}
+ wordHashes.set(Math.abs(entry.hashCode()) % wordHashes.length());
grouper.add(entry, wordForm, morphDataID);
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
index acef45b..6fac33d 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
@@ -160,7 +160,8 @@ public class TestAllDictionaries extends LuceneTestCase {
try {
Dictionary dic = loadDictionary(aff);
totalMemory.addAndGet(RamUsageTester.sizeOf(dic));
- totalWords.addAndGet(RamUsageTester.sizeOf(dic.words));
+ totalWords.addAndGet(
+ RamUsageTester.sizeOf(dic.words) + RamUsageTester.sizeOf(dic.wordHashes));
System.out.println(aff + "\t" + memoryUsageSummary(dic));
} catch (Throwable e) {
failures.add(aff);
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java b/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java
index ea94f96..eace7d0 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java
@@ -22,6 +22,7 @@ import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
+import org.apache.lucene.util.CharsRef;
/**
* A simple class that stores key Strings as char[]'s in a hash table. Note that this is not a
@@ -266,20 +267,17 @@ public class CharArrayMap<V> extends AbstractMap<Object, V> {
private int getHashCode(char[] text, int offset, int len) {
if (text == null) throw new NullPointerException();
- int code = 0;
- final int stop = offset + len;
if (ignoreCase) {
+ int stop = offset + len;
+ int code = 0;
for (int i = offset; i < stop; ) {
final int codePointAt = Character.codePointAt(text, i, stop);
code = code * 31 + Character.toLowerCase(codePointAt);
i += Character.charCount(codePointAt);
}
- } else {
- for (int i = offset; i < stop; i++) {
- code = code * 31 + text[i];
- }
+ return code;
}
- return code;
+ return CharsRef.stringHashCode(text, offset, len);
}
private int getHashCode(CharSequence text) {
diff --git a/lucene/core/src/java/org/apache/lucene/util/CharsRef.java b/lucene/core/src/java/org/apache/lucene/util/CharsRef.java
index cf1f969..395bd86 100644
--- a/lucene/core/src/java/org/apache/lucene/util/CharsRef.java
+++ b/lucene/core/src/java/org/apache/lucene/util/CharsRef.java
@@ -74,11 +74,18 @@ public final class CharsRef implements Comparable<CharsRef>, CharSequence, Clone
@Override
public int hashCode() {
- final int prime = 31;
+ return stringHashCode(chars, offset, length);
+ }
+
+ /**
+ * @return the hash code of the given char sub-array, calculated by {@link String#hashCode()}
+ * specification
+ */
+ public static int stringHashCode(char[] chars, int offset, int length) {
+ int end = offset + length;
int result = 0;
- final int end = offset + length;
for (int i = offset; i < end; i++) {
- result = prime * result + chars[i];
+ result = 31 * result + chars[i];
}
return result;
}