You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by br...@apache.org on 2019/12/13 13:44:01 UTC
[lucene-solr] branch master updated: LUCENE-9089: FST Builder
renamed FSTCompiler with fluent-style Builder.
This is an automated email from the ASF dual-hosted git repository.
broustant pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 1812b36 LUCENE-9089: FST Builder renamed FSTCompiler with fluent-style Builder.
1812b36 is described below
commit 1812b367abad0fe9f123fd8f9ffb11c7dc155404
Author: Bruno Roustant <br...@salesforce.com>
AuthorDate: Tue Dec 10 16:41:43 2019 +0100
LUCENE-9089: FST Builder renamed FSTCompiler with fluent-style Builder.
Closes #1070
---
lucene/CHANGES.txt | 3 +
lucene/MIGRATE.txt | 5 +
.../analysis/charfilter/NormalizeCharMap.java | 7 +-
.../lucene/analysis/hunspell/Dictionary.java | 22 +-
.../miscellaneous/StemmerOverrideFilter.java | 7 +-
.../apache/lucene/analysis/synonym/SynonymMap.java | 9 +-
.../lucene/analysis/hunspell/TestDictionary.java | 14 +-
.../lucene/analysis/ja/dict/UserDictionary.java | 8 +-
.../ja/util/TokenInfoDictionaryBuilder.java | 8 +-
.../lucene/analysis/ko/dict/UserDictionary.java | 8 +-
.../ko/util/TokenInfoDictionaryBuilder.java | 8 +-
.../BooleanPerceptronClassifier.java | 8 +-
.../blockterms/VariableGapTermsIndexWriter.java | 12 +-
.../blocktreeords/OrdsBlockTreeTermsWriter.java | 16 +-
.../lucene/codecs/memory/FSTOrdTermsWriter.java | 10 +-
.../lucene/codecs/memory/FSTTermsWriter.java | 10 +-
.../codecs/simpletext/SimpleTextFieldsReader.java | 12 +-
.../lucene/codecs/uniformsplit/FSTDictionary.java | 9 +-
.../codecs/blocktree/BlockTreeTermsWriter.java | 16 +-
.../src/java/org/apache/lucene/util/fst/FST.java | 134 +++++-----
.../util/fst/{Builder.java => FSTCompiler.java} | 289 ++++++++++++---------
.../java/org/apache/lucene/util/fst/NodeHash.java | 16 +-
.../test/org/apache/lucene/util/fst/Test2BFST.java | 35 ++-
...ddressing.java => TestFSTDirectAddressing.java} | 53 ++--
.../test/org/apache/lucene/util/fst/TestFSTs.java | 158 +++++------
.../test/org/apache/lucene/util/fst/TestUtil.java | 10 +-
.../org/apache/lucene/util/fst/ListOfOutputs.java | 4 +-
.../lucene/util/fst/UpToTwoPositiveIntOutputs.java | 4 +-
.../org/apache/lucene/util/fst/TestFSTsMisc.java | 34 +--
.../idversion/VersionBlockTreeTermsWriter.java | 17 +-
.../suggest/analyzing/AnalyzingSuggester.java | 10 +-
.../suggest/analyzing/FreeTextSuggester.java | 8 +-
.../suggest/document/NRTSuggesterBuilder.java | 14 +-
.../search/suggest/fst/FSTCompletionBuilder.java | 13 +-
.../search/suggest/fst/WFSTCompletionLookup.java | 8 +-
.../java/org/apache/lucene/util/fst/FSTTester.java | 29 +--
36 files changed, 546 insertions(+), 482 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index ccc7e91..6f67e6e 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -38,6 +38,9 @@ API Changes
* LUCENE-8905: Better defence against malformed arguments in TopDocsCollector
(Atri Sharma)
+* LUCENE-9089: FST Builder renamed FSTCompiler with fluent-style Builder.
+ (Bruno Roustant)
+
Improvements
* LUCENE-8757: When provided with an ExecutorService to run queries across
diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt
index a4c181b..f2a3e96 100644
--- a/lucene/MIGRATE.txt
+++ b/lucene/MIGRATE.txt
@@ -1,5 +1,10 @@
# Apache Lucene Migration Guide
+## o.a.l.util.fst.Builder is renamed FSTCompiler with fluent-style Builder (LUCENE-9089) ##
+
+Simply use FSTCompiler instead of the previous Builder. Use either the simple constructor with default settings, or
+the FSTCompiler.Builder to tune and tweak any parameter.
+
## Kuromoji user dictionary now forbids illegal segmentation (LUCENE-8933) ##
User dictionary now strictly validates if the (concatenated) segment is the same as the surface form. This change avoids
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java
index b3efcf7..a217b4c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java
@@ -25,6 +25,7 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
@@ -106,13 +107,13 @@ public class NormalizeCharMap {
final FST<CharsRef> map;
try {
final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
- final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
+ final FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
- builder.add(Util.toUTF16(ent.getKey(), scratch),
+ fstCompiler.add(Util.toUTF16(ent.getKey(), scratch),
new CharsRef(ent.getValue()));
}
- map = builder.finish();
+ map = fstCompiler.compile();
pendingPairs.clear();
} catch (IOException ioe) {
// Bogus FST IOExceptions!! (will never happen)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 443f010..ff4a800 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -64,7 +64,7 @@ import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntSequenceOutputs;
@@ -231,9 +231,9 @@ public class Dictionary {
// read dictionary entries
IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
- Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
- readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, b);
- words = b.finish();
+ FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, o);
+ readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, fstCompiler);
+ words = fstCompiler.compile();
aliases = null; // no longer needed
morphAliases = null; // no longer needed
success = true;
@@ -414,7 +414,7 @@ public class Dictionary {
private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException {
IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
- Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
+ FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
IntsRefBuilder scratch = new IntsRefBuilder();
for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) {
Util.toUTF32(entry.getKey(), scratch);
@@ -423,9 +423,9 @@ public class Dictionary {
for (Integer c : entries) {
output.ints[output.length++] = c;
}
- builder.add(scratch.get(), output);
+ fstCompiler.add(scratch.get(), output);
}
- return builder.finish();
+ return fstCompiler.compile();
}
static String escapeDash(String re) {
@@ -608,14 +608,14 @@ public class Dictionary {
}
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
- Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
+ FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
IntsRefBuilder scratchInts = new IntsRefBuilder();
for (Map.Entry<String,String> entry : mappings.entrySet()) {
Util.toUTF16(entry.getKey(), scratchInts);
- builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
+ fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
}
- return builder.finish();
+ return fstCompiler.compile();
}
/** pattern accepts optional BOM + SET + any whitespace */
@@ -776,7 +776,7 @@ public class Dictionary {
* @param decoder CharsetDecoder used to decode the contents of the file
* @throws IOException Can be thrown while reading from the file
*/
- private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
+ private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, FSTCompiler<IntsRef> words) throws IOException {
BytesRefBuilder flagsScratch = new BytesRefBuilder();
IntsRefBuilder scratchInts = new IntsRefBuilder();
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java
index 078865f..7a061d3 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java
@@ -35,6 +35,7 @@ import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
+import org.apache.lucene.util.fst.FSTCompiler;
/**
* Provides the ability to override any {@link KeywordAttribute} aware stemmer
@@ -203,7 +204,7 @@ public final class StemmerOverrideFilter extends TokenFilter {
*/
public StemmerOverrideMap build() throws IOException {
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
- org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>(
+ FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(
FST.INPUT_TYPE.BYTE4, outputs);
final int[] sort = hash.sort();
IntsRefBuilder intsSpare = new IntsRefBuilder();
@@ -213,9 +214,9 @@ public final class StemmerOverrideFilter extends TokenFilter {
int id = sort[i];
BytesRef bytesRef = hash.get(id, spare);
intsSpare.copyUTF8Bytes(bytesRef);
- builder.add(intsSpare.get(), new BytesRef(outputValues.get(id)));
+ fstCompiler.add(intsSpare.get(), new BytesRef(outputValues.get(id)));
}
- return new StemmerOverrideMap(builder.finish(), ignoreCase);
+ return new StemmerOverrideMap(fstCompiler.compile(), ignoreCase);
}
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
index 97db1de..ed64e0d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
@@ -39,6 +39,7 @@ import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Util;
/**
@@ -213,8 +214,8 @@ public class SynonymMap {
public SynonymMap build() throws IOException {
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
// TODO: are we using the best sharing options?
- org.apache.lucene.util.fst.Builder<BytesRef> builder =
- new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
+ FSTCompiler<BytesRef> fstCompiler =
+ new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
BytesRefBuilder scratch = new BytesRefBuilder();
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
@@ -278,10 +279,10 @@ public class SynonymMap {
scratch.setLength(scratchOutput.getPosition());
//System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
- builder.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
+ fstCompiler.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
}
- FST<BytesRef> fst = builder.finish();
+ FST<BytesRef> fst = fstCompiler.compile();
return new SynonymMap(fst, words, maxHorizontalContext);
}
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
index b7312cb..908b027 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -30,7 +30,7 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
@@ -196,26 +196,26 @@ public class TestDictionary extends LuceneTestCase {
public void testReplacements() throws Exception {
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
- Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
+ FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
IntsRefBuilder scratchInts = new IntsRefBuilder();
// a -> b
Util.toUTF16("a", scratchInts);
- builder.add(scratchInts.get(), new CharsRef("b"));
+ fstCompiler.add(scratchInts.get(), new CharsRef("b"));
// ab -> c
Util.toUTF16("ab", scratchInts);
- builder.add(scratchInts.get(), new CharsRef("c"));
+ fstCompiler.add(scratchInts.get(), new CharsRef("c"));
// c -> de
Util.toUTF16("c", scratchInts);
- builder.add(scratchInts.get(), new CharsRef("de"));
+ fstCompiler.add(scratchInts.get(), new CharsRef("de"));
// def -> gh
Util.toUTF16("def", scratchInts);
- builder.add(scratchInts.get(), new CharsRef("gh"));
+ fstCompiler.add(scratchInts.get(), new CharsRef("gh"));
- FST<CharsRef> fst = builder.finish();
+ FST<CharsRef> fst = fstCompiler.compile();
StringBuilder sb = new StringBuilder("atestanother");
Dictionary.applyMappings(fst, sb);
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
index fef24f9..120fbb5 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
@@ -29,7 +29,7 @@ import java.util.TreeMap;
import org.apache.lucene.analysis.ja.util.CSVUtil;
import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@@ -99,7 +99,7 @@ public final class UserDictionary implements Dictionary {
List<int[]> segmentations = new ArrayList<>(featureEntries.size());
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
- Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
+ FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = 0;
@@ -136,11 +136,11 @@ public final class UserDictionary implements Dictionary {
for (int i = 0; i < token.length(); i++) {
scratch.setIntAt(i, (int) token.charAt(i));
}
- fstBuilder.add(scratch.get(), ord);
+ fstCompiler.add(scratch.get(), ord);
segmentations.add(wordIdAndLength);
ord++;
}
- this.fst = new TokenInfoFST(fstBuilder.finish(), false);
+ this.fst = new TokenInfoFST(fstCompiler.compile(), false);
this.data = data.toArray(new String[data.size()]);
this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
}
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
index 4bb8d59..8f1befd 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
@@ -31,7 +31,7 @@ import java.util.stream.Stream;
import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@@ -97,7 +97,7 @@ class TokenInfoDictionaryBuilder {
lines.sort(Comparator.comparing(entry -> entry[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
- Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
+ FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = -1; // first ord will be 0
String lastValue = null;
@@ -120,12 +120,12 @@ class TokenInfoDictionaryBuilder {
for (int i = 0; i < token.length(); i++) {
scratch.setIntAt(i, (int) token.charAt(i));
}
- fstBuilder.add(scratch.get(), ord);
+ fstCompiler.add(scratch.get(), ord);
}
dictionary.addMapping((int) ord, offset);
offset = next;
}
- dictionary.setFST(fstBuilder.finish());
+ dictionary.setFST(fstCompiler.compile());
return dictionary;
}
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java
index 186990e..931ce48 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java
@@ -25,7 +25,7 @@ import java.util.List;
import org.apache.lucene.analysis.ko.POS;
import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@@ -83,7 +83,7 @@ public final class UserDictionary implements Dictionary {
entries.sort(Comparator.comparing(e -> e.split("\\s+")[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
- Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
+ FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRefBuilder scratch = new IntsRefBuilder();
String lastToken = null;
@@ -129,11 +129,11 @@ public final class UserDictionary implements Dictionary {
for (int i = 0; i < token.length(); i++) {
scratch.setIntAt(i, token.charAt(i));
}
- fstBuilder.add(scratch.get(), ord);
+ fstCompiler.add(scratch.get(), ord);
lastToken = token;
ord ++;
}
- this.fst = new TokenInfoFST(fstBuilder.finish());
+ this.fst = new TokenInfoFST(fstCompiler.compile());
this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
this.rightIds = new short[rightIds.size()];
for (int i = 0; i < rightIds.size(); i++) {
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
index 4f4f0b7..0c6d5f1 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
@@ -30,7 +30,7 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@@ -90,7 +90,7 @@ class TokenInfoDictionaryBuilder {
lines.sort(Comparator.comparing(left -> left[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
- Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
+ FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = -1; // first ord will be 0
String lastValue = null;
@@ -116,12 +116,12 @@ class TokenInfoDictionaryBuilder {
for (int i = 0; i < surfaceForm.length(); i++) {
scratch.setIntAt(i, surfaceForm.charAt(i));
}
- fstBuilder.add(scratch.get(), ord);
+ fstCompiler.add(scratch.get(), ord);
}
dictionary.addMapping((int) ord, offset);
offset = next;
}
- dictionary.setFST(fstBuilder.finish());
+ dictionary.setFST(fstCompiler.compile());
return dictionary;
}
}
diff --git a/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java
index 0008375..87c9064 100644
--- a/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java
+++ b/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java
@@ -41,7 +41,7 @@ import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
@@ -183,15 +183,15 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
private void updateFST(SortedMap<String, Double> weights) throws IOException {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
- Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
BytesRefBuilder scratchBytes = new BytesRefBuilder();
IntsRefBuilder scratchInts = new IntsRefBuilder();
for (Map.Entry<String, Double> entry : weights.entrySet()) {
scratchBytes.copyChars(entry.getKey());
- fstBuilder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry
+ fstCompiler.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry
.getValue().longValue());
}
- fst = fstBuilder.finish();
+ fst = fstCompiler.compile();
}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java
index 47c6e76..b878505 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java
@@ -33,7 +33,7 @@ import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
@@ -219,7 +219,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
}
private class FSTFieldWriter extends FieldWriter {
- private final Builder<Long> fstBuilder;
+ private final FSTCompiler<Long> fstCompiler;
private final PositiveIntOutputs fstOutputs;
private final long startTermsFilePointer;
@@ -233,12 +233,12 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
this.fieldInfo = fieldInfo;
fstOutputs = PositiveIntOutputs.getSingleton();
- fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
+ fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
indexStart = out.getFilePointer();
////System.out.println("VGW: field=" + fieldInfo.name);
// Always put empty string in
- fstBuilder.add(new IntsRef(), termsFilePointer);
+ fstCompiler.add(new IntsRef(), termsFilePointer);
startTermsFilePointer = termsFilePointer;
}
@@ -269,7 +269,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
final int lengthSave = text.length;
text.length = indexedTermPrefixLength(lastTerm.get(), text);
try {
- fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
+ fstCompiler.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
} finally {
text.length = lengthSave;
}
@@ -278,7 +278,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
@Override
public void finish(long termsFilePointer) throws IOException {
- fst = fstBuilder.finish();
+ fst = fstCompiler.compile();
if (fst != null) {
fst.save(out);
}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java
index 769c732..b305e61 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java
@@ -45,7 +45,7 @@ import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;
@@ -361,16 +361,14 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
}
}
- final Builder<Output> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
- 0, 0, true, false, Integer.MAX_VALUE,
- FST_OUTPUTS, true, 15);
+ final FSTCompiler<Output> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS).shouldShareNonSingletonNodes(false).build();
//if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
//}
//indexBuilder.DEBUG = false;
final byte[] bytes = scratchBytes.toArrayCopy();
assert bytes.length > 0;
- indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef),
+ fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef),
FST_OUTPUTS.newOutput(new BytesRef(bytes, 0, bytes.length),
0, Long.MAX_VALUE-(sumTotalTermCount-1)));
scratchBytes.reset();
@@ -381,7 +379,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
for(PendingBlock block : blocks) {
if (block.subIndices != null) {
for(SubIndex subIndex : block.subIndices) {
- append(indexBuilder, subIndex.index, termOrdOffset + subIndex.termOrdStart, scratchIntsRef);
+ append(fstCompiler, subIndex.index, termOrdOffset + subIndex.termOrdStart, scratchIntsRef);
}
block.subIndices = null;
}
@@ -391,7 +389,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
assert sumTotalTermCount == totFloorTermCount;
- index = indexBuilder.finish();
+ index = fstCompiler.compile();
assert subIndices == null;
/*
@@ -405,7 +403,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
// TODO: maybe we could add bulk-add method to
// Builder? Takes FST and unions it w/ current
// FST.
- private void append(Builder<Output> builder, FST<Output> subIndex, long termOrdOffset, IntsRefBuilder scratchIntsRef) throws IOException {
+ private void append(FSTCompiler<Output> fstCompiler, FST<Output> subIndex, long termOrdOffset, IntsRefBuilder scratchIntsRef) throws IOException {
final BytesRefFSTEnum<Output> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
BytesRefFSTEnum.InputOutput<Output> indexEnt;
while ((indexEnt = subIndexEnum.next()) != null) {
@@ -416,7 +414,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
//long blockTermCount = output.endOrd - output.startOrd + 1;
Output newOutput = FST_OUTPUTS.newOutput(output.bytes, termOrdOffset+output.startOrd, output.endOrd-termOrdOffset);
//System.out.println(" append sub=" + indexEnt.input + " output=" + indexEnt.output + " termOrdOffset=" + termOrdOffset + " blockTermCount=" + blockTermCount + " newOutput=" + newOutput + " endOrd=" + (termOrdOffset+Long.MAX_VALUE-output.endOrd));
- builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), newOutput);
+ fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), newOutput);
}
}
}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java
index 0077045..a31a2f9 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java
@@ -41,7 +41,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
@@ -287,7 +287,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
}
final class TermsWriter {
- private final Builder<Long> builder;
+ private final FSTCompiler<Long> fstCompiler;
private final PositiveIntOutputs outputs;
private final FieldInfo fieldInfo;
private final int longsSize;
@@ -311,7 +311,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
this.fieldInfo = fieldInfo;
this.longsSize = postingsWriter.setField(fieldInfo);
this.outputs = PositiveIntOutputs.getSingleton();
- this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
this.lastBlockStatsFP = 0;
this.lastBlockMetaLongsFP = 0;
@@ -346,7 +346,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
}
metaLongsOut.writeVLong(metaBytesOut.size() - lastMetaBytesFP);
- builder.add(Util.toIntsRef(text, scratchTerm), numTerms);
+ fstCompiler.add(Util.toIntsRef(text, scratchTerm), numTerms);
numTerms++;
lastMetaBytesFP = metaBytesOut.size();
@@ -365,7 +365,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
metadata.statsOut = statsOut;
metadata.metaLongsOut = metaLongsOut;
metadata.metaBytesOut = metaBytesOut;
- metadata.dict = builder.finish();
+ metadata.dict = fstCompiler.compile();
fields.add(metadata);
}
}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java
index 22eff0b..2ef1565 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java
@@ -41,7 +41,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;
@@ -247,7 +247,7 @@ public class FSTTermsWriter extends FieldsConsumer {
}
final class TermsWriter {
- private final Builder<FSTTermOutputs.TermData> builder;
+ private final FSTCompiler<FSTTermOutputs.TermData> fstCompiler;
private final FSTTermOutputs outputs;
private final FieldInfo fieldInfo;
private final int longsSize;
@@ -261,7 +261,7 @@ public class FSTTermsWriter extends FieldsConsumer {
this.fieldInfo = fieldInfo;
this.longsSize = postingsWriter.setField(fieldInfo);
this.outputs = new FSTTermOutputs(fieldInfo, longsSize);
- this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
}
public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
@@ -276,14 +276,14 @@ public class FSTTermsWriter extends FieldsConsumer {
meta.bytes = metaWriter.toArrayCopy();
metaWriter.reset();
}
- builder.add(Util.toIntsRef(text, scratchTerm), meta);
+ fstCompiler.add(Util.toIntsRef(text, scratchTerm), meta);
numTerms++;
}
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
// save FST dict
if (numTerms > 0) {
- final FST<FSTTermOutputs.TermData> fst = builder.finish();
+ final FST<FSTTermOutputs.TermData> fst = fstCompiler.compile();
fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst));
}
}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java
index 1dec0c8..a5464fc 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java
@@ -52,7 +52,7 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
@@ -539,11 +539,11 @@ class SimpleTextFieldsReader extends FieldsProducer {
private void loadTerms() throws IOException {
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton();
- final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
+ final FSTCompiler<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstCompiler;
final PairOutputs<Long,Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs,
outputsInner);
- b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
IndexInput in = SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart);
final BytesRefBuilder lastTerm = new BytesRefBuilder();
@@ -556,7 +556,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
SimpleTextUtil.readLine(in, scratch);
if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) {
if (lastDocsStart != -1) {
- b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
+ fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
outputs.newPair(lastDocsStart,
outputsInner.newPair((long) docFreq, totalTermFreq)));
sumTotalTermFreq += totalTermFreq;
@@ -574,7 +574,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1;
} else if (StringHelper.startsWith(scratch.get(), TERM)) {
if (lastDocsStart != -1) {
- b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
+ fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
outputsInner.newPair((long) docFreq, totalTermFreq)));
}
lastDocsStart = in.getFilePointer();
@@ -589,7 +589,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
}
}
docCount = visitedDocs.cardinality();
- fst = b.finish();
+ fst = fstCompiler.compile();
/*
PrintStream ps = new PrintStream("out.dot");
fst.toDot(ps);
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java
index 3f173ac..2ffb687 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java
@@ -30,6 +30,7 @@ import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
@@ -202,19 +203,19 @@ public class FSTDictionary implements IndexDictionary {
*/
public static class Builder implements IndexDictionary.Builder {
- protected final org.apache.lucene.util.fst.Builder<Long> fstBuilder;
+ protected final FSTCompiler<Long> fstCompiler;
protected final IntsRefBuilder scratchInts;
public Builder() {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
- fstBuilder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
scratchInts = new IntsRefBuilder();
}
@Override
public void add(BytesRef blockKey, long blockFilePointer) {
try {
- fstBuilder.add(Util.toIntsRef(blockKey, scratchInts), blockFilePointer);
+ fstCompiler.add(Util.toIntsRef(blockKey, scratchInts), blockFilePointer);
} catch (IOException e) {
// Should never happen.
throw new RuntimeException(e);
@@ -224,7 +225,7 @@ public class FSTDictionary implements IndexDictionary {
@Override
public FSTDictionary build() {
try {
- return new FSTDictionary(fstBuilder.finish());
+ return new FSTDictionary(fstCompiler.compile());
} catch (IOException e) {
// Should never happen.
throw new RuntimeException(e);
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
index 3059d5a..dde6810 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
@@ -44,7 +44,7 @@ import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
@@ -454,29 +454,27 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
}
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
- final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
- 0, 0, true, false, Integer.MAX_VALUE,
- outputs, true, 15);
+ final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).shouldShareNonSingletonNodes(false).build();
//if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
//}
//indexBuilder.DEBUG = false;
final byte[] bytes = scratchBytes.toArrayCopy();
assert bytes.length > 0;
- indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
+ fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
scratchBytes.reset();
// Copy over index for all sub-blocks
for(PendingBlock block : blocks) {
if (block.subIndices != null) {
for(FST<BytesRef> subIndex : block.subIndices) {
- append(indexBuilder, subIndex, scratchIntsRef);
+ append(fstCompiler, subIndex, scratchIntsRef);
}
block.subIndices = null;
}
}
- index = indexBuilder.finish();
+ index = fstCompiler.compile();
assert subIndices == null;
@@ -491,14 +489,14 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// TODO: maybe we could add bulk-add method to
// Builder? Takes FST and unions it w/ current
// FST.
- private void append(Builder<BytesRef> builder, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
+ private void append(FSTCompiler<BytesRef> fstCompiler, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
BytesRefFSTEnum.InputOutput<BytesRef> indexEnt;
while((indexEnt = subIndexEnum.next()) != null) {
//if (DEBUG) {
// System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
//}
- builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
+ fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
}
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
index 46d4b78..571c1e5e 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
@@ -605,7 +605,7 @@ public final class FST<T> implements Accountable {
// serializes new node by appending its bytes to the end
// of the current byte[]
- long addNode(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn) throws IOException {
+ long addNode(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
T NO_OUTPUT = outputs.getNoOutput();
//System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs);
@@ -616,28 +616,28 @@ public final class FST<T> implements Accountable {
return NON_FINAL_END_NODE;
}
}
- final long startAddress = builder.bytes.getPosition();
+ final long startAddress = fstCompiler.bytes.getPosition();
//System.out.println(" startAddr=" + startAddress);
- final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(builder, nodeIn);
+ final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(fstCompiler, nodeIn);
if (doFixedLengthArcs) {
//System.out.println(" fixed length arcs");
- if (builder.numBytesPerArc.length < nodeIn.numArcs) {
- builder.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, Integer.BYTES)];
- builder.numLabelBytesPerArc = new int[builder.numBytesPerArc.length];
+ if (fstCompiler.numBytesPerArc.length < nodeIn.numArcs) {
+ fstCompiler.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, Integer.BYTES)];
+ fstCompiler.numLabelBytesPerArc = new int[fstCompiler.numBytesPerArc.length];
}
}
- builder.arcCount += nodeIn.numArcs;
+ fstCompiler.arcCount += nodeIn.numArcs;
final int lastArc = nodeIn.numArcs-1;
- long lastArcStart = builder.bytes.getPosition();
+ long lastArcStart = fstCompiler.bytes.getPosition();
int maxBytesPerArc = 0;
int maxBytesPerArcWithoutLabel = 0;
for(int arcIdx=0; arcIdx < nodeIn.numArcs; arcIdx++) {
- final Builder.Arc<T> arc = nodeIn.arcs[arcIdx];
- final Builder.CompiledNode target = (Builder.CompiledNode) arc.target;
+ final FSTCompiler.Arc<T> arc = nodeIn.arcs[arcIdx];
+ final FSTCompiler.CompiledNode target = (FSTCompiler.CompiledNode) arc.target;
int flags = 0;
//System.out.println(" arc " + arcIdx + " label=" + arc.label + " -> target=" + target.node);
@@ -645,7 +645,7 @@ public final class FST<T> implements Accountable {
flags += BIT_LAST_ARC;
}
- if (builder.lastFrozenNode == target.node && !doFixedLengthArcs) {
+ if (fstCompiler.lastFrozenNode == target.node && !doFixedLengthArcs) {
// TODO: for better perf (but more RAM used) we
// could avoid this except when arc is "near" the
// last arc:
@@ -671,36 +671,36 @@ public final class FST<T> implements Accountable {
flags += BIT_ARC_HAS_OUTPUT;
}
- builder.bytes.writeByte((byte) flags);
- long labelStart = builder.bytes.getPosition();
- writeLabel(builder.bytes, arc.label);
- int numLabelBytes = (int) (builder.bytes.getPosition() - labelStart);
+ fstCompiler.bytes.writeByte((byte) flags);
+ long labelStart = fstCompiler.bytes.getPosition();
+ writeLabel(fstCompiler.bytes, arc.label);
+ int numLabelBytes = (int) (fstCompiler.bytes.getPosition() - labelStart);
// System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + bytes.getPosition() + " output=" + outputs.outputToString(arc.output));
if (arc.output != NO_OUTPUT) {
- outputs.write(arc.output, builder.bytes);
+ outputs.write(arc.output, fstCompiler.bytes);
//System.out.println(" write output");
}
if (arc.nextFinalOutput != NO_OUTPUT) {
//System.out.println(" write final output");
- outputs.writeFinalOutput(arc.nextFinalOutput, builder.bytes);
+ outputs.writeFinalOutput(arc.nextFinalOutput, fstCompiler.bytes);
}
if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) {
assert target.node > 0;
//System.out.println(" write target");
- builder.bytes.writeVLong(target.node);
+ fstCompiler.bytes.writeVLong(target.node);
}
// just write the arcs "like normal" on first pass, but record how many bytes each one took
// and max byte size:
if (doFixedLengthArcs) {
- int numArcBytes = (int) (builder.bytes.getPosition() - lastArcStart);
- builder.numBytesPerArc[arcIdx] = numArcBytes;
- builder.numLabelBytesPerArc[arcIdx] = numLabelBytes;
- lastArcStart = builder.bytes.getPosition();
+ int numArcBytes = (int) (fstCompiler.bytes.getPosition() - lastArcStart);
+ fstCompiler.numBytesPerArc[arcIdx] = numArcBytes;
+ fstCompiler.numLabelBytesPerArc[arcIdx] = numLabelBytes;
+ lastArcStart = fstCompiler.bytes.getPosition();
maxBytesPerArc = Math.max(maxBytesPerArc, numArcBytes);
maxBytesPerArcWithoutLabel = Math.max(maxBytesPerArcWithoutLabel, numArcBytes - numLabelBytes);
//System.out.println(" arcBytes=" + numArcBytes + " labelBytes=" + numLabelBytes);
@@ -733,18 +733,18 @@ public final class FST<T> implements Accountable {
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
assert labelRange > 0;
- if (shouldExpandNodeWithDirectAddressing(builder, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
- writeNodeForDirectAddressing(builder, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
- builder.directAddressingNodeCount++;
+ if (shouldExpandNodeWithDirectAddressing(fstCompiler, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
+ writeNodeForDirectAddressing(fstCompiler, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
+ fstCompiler.directAddressingNodeCount++;
} else {
- writeNodeForBinarySearch(builder, nodeIn, startAddress, maxBytesPerArc);
- builder.binarySearchNodeCount++;
+ writeNodeForBinarySearch(fstCompiler, nodeIn, startAddress, maxBytesPerArc);
+ fstCompiler.binarySearchNodeCount++;
}
}
- final long thisNodeAddress = builder.bytes.getPosition()-1;
- builder.bytes.reverse(startAddress, thisNodeAddress);
- builder.nodeCount++;
+ final long thisNodeAddress = fstCompiler.bytes.getPosition()-1;
+ fstCompiler.bytes.reverse(startAddress, thisNodeAddress);
+ fstCompiler.nodeCount++;
return thisNodeAddress;
}
@@ -757,8 +757,8 @@ public final class FST<T> implements Accountable {
* of bytes, but they allow either binary search or direct addressing on the arcs (instead of linear
* scan) on lookup by arc label.
*/
- private boolean shouldExpandNodeWithFixedLengthArcs(Builder<T> builder, Builder.UnCompiledNode<T> node) {
- return builder.allowFixedLengthArcs &&
+ private boolean shouldExpandNodeWithFixedLengthArcs(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> node) {
+ return fstCompiler.allowFixedLengthArcs &&
((node.depth <= FIXED_LENGTH_ARC_SHALLOW_DEPTH && node.numArcs >= FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) ||
node.numArcs >= FIXED_LENGTH_ARC_DEEP_NUM_ARCS);
}
@@ -769,18 +769,18 @@ public final class FST<T> implements Accountable {
* Prefer direct addressing for performance if it does not oversize binary search byte size too much,
* so that the arcs can be directly addressed by label.
*
- * @see Builder#getDirectAddressingMaxOversizingFactor()
+ * @see FSTCompiler#getDirectAddressingMaxOversizingFactor()
*/
- private boolean shouldExpandNodeWithDirectAddressing(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn,
+ private boolean shouldExpandNodeWithDirectAddressing(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn,
int numBytesPerArc, int maxBytesPerArcWithoutLabel, int labelRange) {
// Anticipate precisely the size of the encodings.
int sizeForBinarySearch = numBytesPerArc * nodeIn.numArcs;
- int sizeForDirectAddressing = getNumPresenceBytes(labelRange) + builder.numLabelBytesPerArc[0]
+ int sizeForDirectAddressing = getNumPresenceBytes(labelRange) + fstCompiler.numLabelBytesPerArc[0]
+ maxBytesPerArcWithoutLabel * nodeIn.numArcs;
// Determine the allowed oversize compared to binary search.
// This is defined by a parameter of FST Builder (default 1: no oversize).
- int allowedOversize = (int) (sizeForBinarySearch * builder.getDirectAddressingMaxOversizingFactor());
+ int allowedOversize = (int) (sizeForBinarySearch * fstCompiler.getDirectAddressingMaxOversizingFactor());
int expansionCost = sizeForDirectAddressing - allowedOversize;
// Select direct addressing if either:
@@ -790,46 +790,46 @@ public final class FST<T> implements Accountable {
// In this case, decrement the credit by the oversize.
// In addition, do not try to oversize to a clearly too large node size
// (this is the DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR parameter).
- if (expansionCost <= 0 || (builder.directAddressingExpansionCredit >= expansionCost
+ if (expansionCost <= 0 || (fstCompiler.directAddressingExpansionCredit >= expansionCost
&& sizeForDirectAddressing <= allowedOversize * DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR)) {
- builder.directAddressingExpansionCredit -= expansionCost;
+ fstCompiler.directAddressingExpansionCredit -= expansionCost;
return true;
}
return false;
}
- private void writeNodeForBinarySearch(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArc) {
+ private void writeNodeForBinarySearch(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArc) {
// Build the header in a buffer.
// It is a false/special arc which is in fact a node header with node flags followed by node metadata.
- builder.fixedLengthArcsBuffer
+ fstCompiler.fixedLengthArcsBuffer
.resetPosition()
.writeByte(ARCS_FOR_BINARY_SEARCH)
.writeVInt(nodeIn.numArcs)
.writeVInt(maxBytesPerArc);
- int headerLen = builder.fixedLengthArcsBuffer.getPosition();
+ int headerLen = fstCompiler.fixedLengthArcsBuffer.getPosition();
// Expand the arcs in place, backwards.
- long srcPos = builder.bytes.getPosition();
+ long srcPos = fstCompiler.bytes.getPosition();
long destPos = startAddress + headerLen + nodeIn.numArcs * maxBytesPerArc;
assert destPos >= srcPos;
if (destPos > srcPos) {
- builder.bytes.skipBytes((int) (destPos - srcPos));
+ fstCompiler.bytes.skipBytes((int) (destPos - srcPos));
for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) {
destPos -= maxBytesPerArc;
- int arcLen = builder.numBytesPerArc[arcIdx];
+ int arcLen = fstCompiler.numBytesPerArc[arcIdx];
srcPos -= arcLen;
if (srcPos != destPos) {
assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " arcLen=" + arcLen + " nodeIn.numArcs=" + nodeIn.numArcs;
- builder.bytes.copyBytes(srcPos, destPos, arcLen);
+ fstCompiler.bytes.copyBytes(srcPos, destPos, arcLen);
}
}
}
// Write the header.
- builder.bytes.writeBytes(startAddress, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
+ fstCompiler.bytes.writeBytes(startAddress, fstCompiler.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
}
- private void writeNodeForDirectAddressing(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArcWithoutLabel, int labelRange) {
+ private void writeNodeForDirectAddressing(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArcWithoutLabel, int labelRange) {
// Expand the arcs backwards in a buffer because we remove the labels.
// So the obtained arcs might occupy less space. This is the reason why this
// whole method is more complex.
@@ -837,64 +837,64 @@ public final class FST<T> implements Accountable {
// the presence bits, and the first label. Keep the first label.
int headerMaxLen = 11;
int numPresenceBytes = getNumPresenceBytes(labelRange);
- long srcPos = builder.bytes.getPosition();
- int totalArcBytes = builder.numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
+ long srcPos = fstCompiler.bytes.getPosition();
+ int totalArcBytes = fstCompiler.numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes;
- byte[] buffer = builder.fixedLengthArcsBuffer.ensureCapacity(bufferOffset).getBytes();
+ byte[] buffer = fstCompiler.fixedLengthArcsBuffer.ensureCapacity(bufferOffset).getBytes();
// Copy the arcs to the buffer, dropping all labels except first one.
for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) {
bufferOffset -= maxBytesPerArcWithoutLabel;
- int srcArcLen = builder.numBytesPerArc[arcIdx];
+ int srcArcLen = fstCompiler.numBytesPerArc[arcIdx];
srcPos -= srcArcLen;
- int labelLen = builder.numLabelBytesPerArc[arcIdx];
+ int labelLen = fstCompiler.numLabelBytesPerArc[arcIdx];
// Copy the flags.
- builder.bytes.copyBytes(srcPos, buffer, bufferOffset, 1);
+ fstCompiler.bytes.copyBytes(srcPos, buffer, bufferOffset, 1);
// Skip the label, copy the remaining.
int remainingArcLen = srcArcLen - 1 - labelLen;
if (remainingArcLen != 0) {
- builder.bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen);
+ fstCompiler.bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen);
}
if (arcIdx == 0) {
// Copy the label of the first arc only.
bufferOffset -= labelLen;
- builder.bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen);
+ fstCompiler.bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen);
}
}
assert bufferOffset == headerMaxLen + numPresenceBytes;
// Build the header in the buffer.
// It is a false/special arc which is in fact a node header with node flags followed by node metadata.
- builder.fixedLengthArcsBuffer
+ fstCompiler.fixedLengthArcsBuffer
.resetPosition()
.writeByte(ARCS_FOR_DIRECT_ADDRESSING)
.writeVInt(labelRange) // labelRange instead of numArcs.
.writeVInt(maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc.
- int headerLen = builder.fixedLengthArcsBuffer.getPosition();
+ int headerLen = fstCompiler.fixedLengthArcsBuffer.getPosition();
// Prepare the builder byte store. Enlarge or truncate if needed.
long nodeEnd = startAddress + headerLen + numPresenceBytes + totalArcBytes;
- long currentPosition = builder.bytes.getPosition();
+ long currentPosition = fstCompiler.bytes.getPosition();
if (nodeEnd >= currentPosition) {
- builder.bytes.skipBytes((int) (nodeEnd - currentPosition));
+ fstCompiler.bytes.skipBytes((int) (nodeEnd - currentPosition));
} else {
- builder.bytes.truncate(nodeEnd);
+ fstCompiler.bytes.truncate(nodeEnd);
}
- assert builder.bytes.getPosition() == nodeEnd;
+ assert fstCompiler.bytes.getPosition() == nodeEnd;
// Write the header.
long writeOffset = startAddress;
- builder.bytes.writeBytes(writeOffset, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
+ fstCompiler.bytes.writeBytes(writeOffset, fstCompiler.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
writeOffset += headerLen;
// Write the presence bits
- writePresenceBits(builder, nodeIn, writeOffset, numPresenceBytes);
+ writePresenceBits(fstCompiler, nodeIn, writeOffset, numPresenceBytes);
writeOffset += numPresenceBytes;
// Write the first label and the arcs.
- builder.bytes.writeBytes(writeOffset, builder.fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);
+ fstCompiler.bytes.writeBytes(writeOffset, fstCompiler.fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);
}
- private void writePresenceBits(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long dest, int numPresenceBytes) {
+ private void writePresenceBits(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn, long dest, int numPresenceBytes) {
long bytePos = dest;
byte presenceBits = 1; // The first arc is always present.
int presenceIndex = 0;
@@ -904,7 +904,7 @@ public final class FST<T> implements Accountable {
assert label > previousLabel;
presenceIndex += label - previousLabel;
while (presenceIndex >= Byte.SIZE) {
- builder.bytes.writeByte(bytePos++, presenceBits);
+ fstCompiler.bytes.writeByte(bytePos++, presenceBits);
presenceBits = 0;
presenceIndex -= Byte.SIZE;
}
@@ -915,7 +915,7 @@ public final class FST<T> implements Accountable {
assert presenceIndex == (nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label) % 8;
assert presenceBits != 0; // The last byte is not 0.
assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present.
- builder.bytes.writeByte(bytePos++, presenceBits);
+ fstCompiler.bytes.writeByte(bytePos++, presenceBits);
assert bytePos - dest == numPresenceBytes;
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java
similarity index 74%
rename from lucene/core/src/java/org/apache/lucene/util/fst/Builder.java
rename to lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java
index a35c4d7..556e5c2 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java
@@ -49,31 +49,9 @@ import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
* @lucene.experimental
*/
-public class Builder<T> {
+public class FSTCompiler<T> {
- /**
- * Default oversizing factor used to decide whether to encode a node with direct addressing or binary search.
- * Default is 1: ensure no oversizing on average.
- * <p>
- * This factor does not determine whether to encode a node with a list of variable length arcs or with
- * fixed length arcs. It only determines the effective encoding of a node that is already known to be
- * encoded with fixed length arcs.
- * See {@code FST.shouldExpandNodeWithFixedLengthArcs()}
- * and {@code FST.shouldExpandNodeWithDirectAddressing()}.
- * <p>
- * For English words we measured 217K nodes, only 3.27% nodes are encoded with fixed length arcs,
- * and 99.99% of them with direct addressing. Overall FST memory reduced by 1.67%.
- * <p>
- * For worst case we measured 168K nodes, 50% of them are encoded with fixed length arcs,
- * and 14% of them with direct encoding. Overall FST memory reduced by 0.8%.
- * <p>
- * Use {@code TestFstDirectAddressing.main()}
- * and {@code TestFstDirectAddressing.testWorstCaseForDirectAddressing()}
- * to evaluate a change.
- *
- * @see #setDirectAddressingMaxOversizingFactor
- */
- static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1.0f;
+ static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1f;
private final NodeHash<T> dedupHash;
final FST<T> fst;
@@ -117,75 +95,29 @@ public class Builder<T> {
long binarySearchNodeCount;
long directAddressingNodeCount;
- boolean allowFixedLengthArcs;
- float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR;
+ final boolean allowFixedLengthArcs;
+ final float directAddressingMaxOversizingFactor;
long directAddressingExpansionCredit;
- BytesStore bytes;
+ final BytesStore bytes;
/**
- * Instantiates an FST/FSA builder without any pruning. A shortcut to {@link
- * #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs, boolean, int)} with
- * pruning options turned off.
+ * Instantiates an FST/FSA builder with default settings and pruning options turned off.
+ * For more tuning and tweaking, see {@link Builder}.
*/
- public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
- this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
+ public FSTCompiler(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
+ this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, 1f);
}
- /**
- * Instantiates an FST/FSA builder with all the possible tuning and construction
- * tweaks. Read parameter documentation carefully.
- *
- * @param inputType
- * The input type (transition labels). Can be anything from {@link INPUT_TYPE}
- * enumeration. Shorter types will consume less memory. Strings (character sequences) are
- * represented as {@link INPUT_TYPE#BYTE4} (full unicode codepoints).
- *
- * @param minSuffixCount1
- * If pruning the input graph during construction, this threshold is used for telling
- * if a node is kept or pruned. If transition_count(node) >= minSuffixCount1, the node
- * is kept.
- *
- * @param minSuffixCount2
- * (Note: only Mike McCandless knows what this one is really doing...)
- *
- * @param doShareSuffix
- * If <code>true</code>, the shared suffixes will be compacted into unique paths.
- * This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to
- * <code>false</code> creates a single suffix path for all input sequences. This will result in a larger
- * FST, but requires substantially less memory and CPU during building.
- *
- * @param doShareNonSingletonNodes
- * Only used if doShareSuffix is true. Set this to
- * true to ensure FST is fully minimal, at cost of more
- * CPU and more RAM during building.
- *
- * @param shareMaxTailLength
- * Only used if doShareSuffix is true. Set this to
- * Integer.MAX_VALUE to ensure FST is fully minimal, at cost of more
- * CPU and more RAM during building.
- *
- * @param outputs The output type for each input sequence. Applies only if building an FST. For
- * FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
- * singleton output object.
- *
- * @param allowFixedLengthArcs Pass false to disable the fixed length arc optimization (binary search or
- * direct addressing) while building the FST; this will make the resulting FST smaller but slower to
- * traverse.
- *
- * @param bytesPageBits How many bits wide to make each
- * byte[] block in the BytesStore; if you know the FST
- * will be large then make this larger. For example 15
- * bits = 32768 byte pages.
- */
- public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
- boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
- boolean allowFixedLengthArcs, int bytesPageBits) {
+ private FSTCompiler(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
+ boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
+ boolean allowFixedLengthArcs, int bytesPageBits, float directAddressingMaxOversizingFactor) {
this.minSuffixCount1 = minSuffixCount1;
this.minSuffixCount2 = minSuffixCount2;
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
this.shareMaxTailLength = shareMaxTailLength;
this.allowFixedLengthArcs = allowFixedLengthArcs;
+ this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor;
fst = new FST<>(inputType, outputs, bytesPageBits);
bytes = fst.bytes;
assert bytes != null;
@@ -205,22 +137,145 @@ public class Builder<T> {
}
/**
- * Overrides the default the maximum oversizing of fixed array allowed to enable direct addressing
- * of arcs instead of binary search.
+ * Fluent-style builder for {@link FSTCompiler}.
* <p>
- * Setting this factor to a negative value (e.g. -1) effectively disables direct addressing,
- * only binary search nodes will be created.
- *
- * @see #DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR
+ * Creates an FST/FSA builder with all the possible tuning and construction tweaks.
+ * Read parameter documentation carefully.
*/
- public Builder<T> setDirectAddressingMaxOversizingFactor(float factor) {
- directAddressingMaxOversizingFactor = factor;
- return this;
+ public static class Builder<T> {
+
+ private final INPUT_TYPE inputType;
+ private final Outputs<T> outputs;
+ private int minSuffixCount1;
+ private int minSuffixCount2;
+ private boolean shouldShareSuffix = true;
+ private boolean shouldShareNonSingletonNodes = true;
+ private int shareMaxTailLength = Integer.MAX_VALUE;
+ private boolean allowFixedLengthArcs = true;
+ private int bytesPageBits = 15;
+ private float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR;
+
+ /**
+ * @param inputType The input type (transition labels). Can be anything from {@link INPUT_TYPE}
+ * enumeration. Shorter types will consume less memory. Strings (character sequences) are
+ * represented as {@link INPUT_TYPE#BYTE4} (full unicode codepoints).
+ * @param outputs The output type for each input sequence. Applies only if building an FST. For
+ * FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
+ * singleton output object.
+ */
+ public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
+ this.inputType = inputType;
+ this.outputs = outputs;
+ }
+
+ /**
+ * If pruning the input graph during construction, this threshold is used for telling if a node is kept
+ * or pruned. If transition_count(node) >= minSuffixCount1, the node is kept.
+ * <p>
+ * Default = 0.
+ */
+ public Builder<T> minSuffixCount1(int minSuffixCount1) {
+ this.minSuffixCount1 = minSuffixCount1;
+ return this;
+ }
+
+ /**
+ * Better pruning: we prune a node (and all following nodes) if the prior node has fewer than this
+ * number of terms going through it.
+ * <p>
+ * Default = 0.
+ */
+ public Builder<T> minSuffixCount2(int minSuffixCount2) {
+ this.minSuffixCount2 = minSuffixCount2;
+ return this;
+ }
+
+ /**
+ * If {@code true}, the shared suffixes will be compacted into unique paths.
+ * This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to
+ * {@code false} creates a single suffix path for all input sequences. This will result in a larger
+ * FST, but requires substantially less memory and CPU during building.
+ * <p>
+ * Default = {@code true}.
+ */
+ public Builder<T> shouldShareSuffix(boolean shouldShareSuffix) {
+ this.shouldShareSuffix = shouldShareSuffix;
+ return this;
+ }
+
+ /**
+ * Only used if {@code shouldShareSuffix} is true. Set this to true to ensure FST is fully minimal,
+ * at cost of more CPU and more RAM during building.
+ * <p>
+ * Default = {@code true}.
+ */
+ public Builder<T> shouldShareNonSingletonNodes(boolean shouldShareNonSingletonNodes) {
+ this.shouldShareNonSingletonNodes = shouldShareNonSingletonNodes;
+ return this;
+ }
+
+ /**
+ * Only used if {@code shouldShareSuffix} is true. Set this to Integer.MAX_VALUE to ensure FST is
+ * fully minimal, at cost of more CPU and more RAM during building.
+ * <p>
+ * Default = {@link Integer#MAX_VALUE}.
+ */
+ public Builder<T> shareMaxTailLength(int shareMaxTailLength) {
+ this.shareMaxTailLength = shareMaxTailLength;
+ return this;
+ }
+
+ /**
+ * Pass {@code false} to disable the fixed length arc optimization (binary search or direct addressing)
+ * while building the FST; this will make the resulting FST smaller but slower to traverse.
+ * <p>
+ * Default = {@code true}.
+ */
+ public Builder<T> allowFixedLengthArcs(boolean allowFixedLengthArcs) {
+ this.allowFixedLengthArcs = allowFixedLengthArcs;
+ return this;
+ }
+
+ /**
+ * How many bits wide to make each byte[] block in the BytesStore; if you know the FST
+ * will be large then make this larger. For example 15 bits = 32768 byte pages.
+ * <p>
+ * Default = 15.
+ */
+ public Builder<T> bytesPageBits(int bytesPageBits) {
+ this.bytesPageBits = bytesPageBits;
+ return this;
+ }
+
+ /**
+ * Overrides the default maximum oversizing of fixed array allowed to enable direct addressing
+ * of arcs instead of binary search.
+ * <p>
+ * Setting this factor to a negative value (e.g. -1) effectively disables direct addressing,
+ * only binary search nodes will be created.
+ * <p>
+ * This factor does not determine whether to encode a node with a list of variable length arcs or with
+ * fixed length arcs. It only determines the effective encoding of a node that is already known to be
+ * encoded with fixed length arcs.
+ * <p>
+ * Default = 1.
+ */
+ public Builder<T> directAddressingMaxOversizingFactor(float factor) {
+ this.directAddressingMaxOversizingFactor = factor;
+ return this;
+ }
+
+ /**
+ * Creates a new {@link FSTCompiler}.
+ */
+ public FSTCompiler<T> build() {
+ FSTCompiler<T> fstCompiler = new FSTCompiler<>(inputType, minSuffixCount1, minSuffixCount2, shouldShareSuffix,
+ shouldShareNonSingletonNodes, shareMaxTailLength, outputs, allowFixedLengthArcs, bytesPageBits,
+ directAddressingMaxOversizingFactor);
+ return fstCompiler;
+ }
}
- /**
- * @see #setDirectAddressingMaxOversizingFactor(float)
- */
public float getDirectAddressingMaxOversizingFactor() {
return directAddressingMaxOversizingFactor;
}
@@ -514,7 +569,7 @@ public class Builder<T> {
/** Returns final FST. NOTE: this will return null if
* nothing is accepted by the FST. */
- public FST<T> finish() throws IOException {
+ public FST<T> compile() throws IOException {
final UnCompiledNode<T> root = frontier[0];
@@ -554,19 +609,19 @@ public class Builder<T> {
}
/** Expert: holds a pending (seen but not yet serialized) arc. */
- public static class Arc<T> {
- public int label; // really an "unsigned" byte
- public Node target;
- public boolean isFinal;
- public T output;
- public T nextFinalOutput;
+ static class Arc<T> {
+ int label; // really an "unsigned" byte
+ Node target;
+ boolean isFinal;
+ T output;
+ T nextFinalOutput;
}
// NOTE: not many instances of Node or CompiledNode are in
// memory while the FST is being built; it's only the
// current "frontier":
- static interface Node {
+ interface Node {
boolean isCompiled();
}
@@ -583,20 +638,20 @@ public class Builder<T> {
}
/** Expert: holds a pending (seen but not yet serialized) Node. */
- public static final class UnCompiledNode<T> implements Node {
- final Builder<T> owner;
- public int numArcs;
- public Arc<T>[] arcs;
+ static final class UnCompiledNode<T> implements Node {
+ final FSTCompiler<T> owner;
+ int numArcs;
+ Arc<T>[] arcs;
// TODO: instead of recording isFinal/output on the
// node, maybe we should use -1 arc to mean "end" (like
// we do when reading the FST). Would simplify much
// code here...
- public T output;
- public boolean isFinal;
- public long inputCount;
+ T output;
+ boolean isFinal;
+ long inputCount;
/** This node's depth, starting from the automaton root. */
- public final int depth;
+ final int depth;
/**
* @param depth
@@ -605,7 +660,7 @@ public class Builder<T> {
* fanout size).
*/
@SuppressWarnings({"rawtypes","unchecked"})
- public UnCompiledNode(Builder<T> owner, int depth) {
+ UnCompiledNode(FSTCompiler<T> owner, int depth) {
this.owner = owner;
arcs = (Arc<T>[]) new Arc[1];
arcs[0] = new Arc<>();
@@ -618,7 +673,7 @@ public class Builder<T> {
return false;
}
- public void clear() {
+ void clear() {
numArcs = 0;
isFinal = false;
output = owner.NO_OUTPUT;
@@ -628,13 +683,13 @@ public class Builder<T> {
// for nodes on the frontier (even when reused).
}
- public T getLastOutput(int labelToMatch) {
+ T getLastOutput(int labelToMatch) {
assert numArcs > 0;
assert arcs[numArcs-1].label == labelToMatch;
return arcs[numArcs-1].output;
}
- public void addArc(int label, Node target) {
+ void addArc(int label, Node target) {
assert label >= 0;
assert numArcs == 0 || label > arcs[numArcs-1].label: "arc[numArcs-1].label=" + arcs[numArcs-1].label + " new label=" + label + " numArcs=" + numArcs;
if (numArcs == arcs.length) {
@@ -651,7 +706,7 @@ public class Builder<T> {
arc.isFinal = false;
}
- public void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) {
+ void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) {
assert numArcs > 0;
final Arc<T> arc = arcs[numArcs-1];
assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch;
@@ -661,14 +716,14 @@ public class Builder<T> {
arc.isFinal = isFinal;
}
- public void deleteLast(int label, Node target) {
+ void deleteLast(int label, Node target) {
assert numArcs > 0;
assert label == arcs[numArcs-1].label;
assert target == arcs[numArcs-1].target;
numArcs--;
}
- public void setLastOutput(int labelToMatch, T newOutput) {
+ void setLastOutput(int labelToMatch, T newOutput) {
assert owner.validOutput(newOutput);
assert numArcs > 0;
final Arc<T> arc = arcs[numArcs-1];
@@ -677,7 +732,7 @@ public class Builder<T> {
}
// pushes an output prefix forward onto all arcs
- public void prependOutput(T outputPrefix) {
+ void prependOutput(T outputPrefix) {
assert owner.validOutput(outputPrefix);
for(int arcIdx=0;arcIdx<numArcs;arcIdx++) {
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java
index af152fe..9fcf5f5 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java
@@ -39,7 +39,7 @@ final class NodeHash<T> {
this.in = in;
}
- private boolean nodesEqual(Builder.UnCompiledNode<T> node, long address) throws IOException {
+ private boolean nodesEqual(FSTCompiler.UnCompiledNode<T> node, long address) throws IOException {
fst.readFirstRealTargetArc(address, scratchArc, in);
// Fail fast for a node with fixed length arcs.
@@ -58,10 +58,10 @@ final class NodeHash<T> {
}
for(int arcUpto=0; arcUpto < node.numArcs; arcUpto++) {
- final Builder.Arc<T> arc = node.arcs[arcUpto];
+ final FSTCompiler.Arc<T> arc = node.arcs[arcUpto];
if (arc.label != scratchArc.label() ||
!arc.output.equals(scratchArc.output()) ||
- ((Builder.CompiledNode) arc.target).node != scratchArc.target() ||
+ ((FSTCompiler.CompiledNode) arc.target).node != scratchArc.target() ||
!arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) ||
arc.isFinal != scratchArc.isFinal()) {
return false;
@@ -82,16 +82,16 @@ final class NodeHash<T> {
// hash code for an unfrozen node. This must be identical
// to the frozen case (below)!!
- private long hash(Builder.UnCompiledNode<T> node) {
+ private long hash(FSTCompiler.UnCompiledNode<T> node) {
final int PRIME = 31;
//System.out.println("hash unfrozen");
long h = 0;
// TODO: maybe if number of arcs is high we can safely subsample?
for (int arcIdx=0; arcIdx < node.numArcs; arcIdx++) {
- final Builder.Arc<T> arc = node.arcs[arcIdx];
+ final FSTCompiler.Arc<T> arc = node.arcs[arcIdx];
//System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal);
h = PRIME * h + arc.label;
- long n = ((Builder.CompiledNode) arc.target).node;
+ long n = ((FSTCompiler.CompiledNode) arc.target).node;
h = PRIME * h + (int) (n^(n>>32));
h = PRIME * h + arc.output.hashCode();
h = PRIME * h + arc.nextFinalOutput.hashCode();
@@ -127,7 +127,7 @@ final class NodeHash<T> {
return h & Long.MAX_VALUE;
}
- public long add(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn) throws IOException {
+ public long add(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
//System.out.println("hash: add count=" + count + " vs " + table.size() + " mask=" + mask);
final long h = hash(nodeIn);
long pos = h & mask;
@@ -136,7 +136,7 @@ final class NodeHash<T> {
final long v = table.get(pos);
if (v == 0) {
// freeze & add
- final long node = fst.addNode(builder, nodeIn);
+ final long node = fst.addNode(fstCompiler, nodeIn);
//System.out.println(" now freeze node=" + node);
assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
count++;
diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
index cc093b3..9ee6947 100644
--- a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
+++ b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
@@ -54,8 +54,7 @@ public class Test2BFST extends LuceneTestCase {
System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
Outputs<Object> outputs = NoOutputs.getSingleton();
Object NO_OUTPUT = outputs.getNoOutput();
- final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
- true, 15);
+ final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
int count = 0;
Random r = new Random(seed);
@@ -66,21 +65,21 @@ public class Test2BFST extends LuceneTestCase {
for(int i=10;i<ints2.length;i++) {
ints2[i] = r.nextInt(256);
}
- b.add(input2, NO_OUTPUT);
+ fstCompiler.add(input2, NO_OUTPUT);
count++;
if (count % 100000 == 0) {
- System.out.println(count + ": " + b.fstRamBytesUsed() + " bytes; " + b.getNodeCount() + " nodes");
+ System.out.println(count + ": " + fstCompiler.fstRamBytesUsed() + " bytes; " + fstCompiler.getNodeCount() + " nodes");
}
- if (b.getNodeCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) {
+ if (fstCompiler.getNodeCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) {
break;
}
nextInput(r, ints2);
}
- FST<Object> fst = b.finish();
+ FST<Object> fst = fstCompiler.compile();
for(int verify=0;verify<2;verify++) {
- System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
+ System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]");
Arrays.fill(ints2, 0);
r = new Random(seed);
@@ -136,8 +135,7 @@ public class Test2BFST extends LuceneTestCase {
{
System.out.println("\nTEST: 3 GB size; outputs=bytes");
Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton();
- final Builder<BytesRef> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
- true, 15);
+ final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
byte[] outputBytes = new byte[20];
BytesRef output = new BytesRef(outputBytes);
@@ -147,10 +145,10 @@ public class Test2BFST extends LuceneTestCase {
while(true) {
r.nextBytes(outputBytes);
//System.out.println("add: " + input + " -> " + output);
- b.add(input, BytesRef.deepCopyOf(output));
+ fstCompiler.add(input, BytesRef.deepCopyOf(output));
count++;
if (count % 10000 == 0) {
- long size = b.fstRamBytesUsed();
+ long size = fstCompiler.fstRamBytesUsed();
if (count % 1000000 == 0) {
System.out.println(count + "...: " + size + " bytes");
}
@@ -161,10 +159,10 @@ public class Test2BFST extends LuceneTestCase {
nextInput(r, ints);
}
- FST<BytesRef> fst = b.finish();
+ FST<BytesRef> fst = fstCompiler.compile();
for(int verify=0;verify<2;verify++) {
- System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
+ System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]");
r = new Random(seed);
Arrays.fill(ints, 0);
@@ -216,8 +214,7 @@ public class Test2BFST extends LuceneTestCase {
{
System.out.println("\nTEST: 3 GB size; outputs=long");
Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
- final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
- true, 15);
+ final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
long output = 1;
@@ -226,11 +223,11 @@ public class Test2BFST extends LuceneTestCase {
Random r = new Random(seed);
while(true) {
//System.out.println("add: " + input + " -> " + output);
- b.add(input, output);
+ fstCompiler.add(input, output);
output += 1+r.nextInt(10);
count++;
if (count % 10000 == 0) {
- long size = b.fstRamBytesUsed();
+ long size = fstCompiler.fstRamBytesUsed();
if (count % 1000000 == 0) {
System.out.println(count + "...: " + size + " bytes");
}
@@ -241,11 +238,11 @@ public class Test2BFST extends LuceneTestCase {
nextInput(r, ints);
}
- FST<Long> fst = b.finish();
+ FST<Long> fst = fstCompiler.compile();
for(int verify=0;verify<2;verify++) {
- System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
+ System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]");
Arrays.fill(ints, 0);
diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFstDirectAddressing.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDirectAddressing.java
similarity index 77%
rename from lucene/core/src/test/org/apache/lucene/util/fst/TestFstDirectAddressing.java
rename to lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDirectAddressing.java
index a1e18a3..c6cfa13 100644
--- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFstDirectAddressing.java
+++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDirectAddressing.java
@@ -35,7 +35,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
-public class TestFstDirectAddressing extends LuceneTestCase {
+public class TestFSTDirectAddressing extends LuceneTestCase {
public void testDenseWithGap() throws Exception {
List<String> words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm");
@@ -86,13 +86,13 @@ public class TestFstDirectAddressing extends LuceneTestCase {
Collections.sort(wordList);
// Disable direct addressing and measure the FST size.
- Builder<Object> builder = createBuilder(-1f);
- FST<Object> fst = buildFST(wordList, builder);
+ FSTCompiler<Object> fstCompiler = createFSTCompiler(-1f);
+ FST<Object> fst = buildFST(wordList, fstCompiler);
long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();
// Enable direct addressing and measure the FST size.
- builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
- fst = buildFST(wordList, builder);
+ fstCompiler = createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
+ fst = buildFST(wordList, fstCompiler);
long ramBytesUsed = fst.ramBytesUsed();
// Compute the size increase in percents.
@@ -107,42 +107,43 @@ public class TestFstDirectAddressing extends LuceneTestCase {
directAddressingMemoryIncreasePercent < MEMORY_INCREASE_LIMIT_PERCENT);
}
- private static void printStats(Builder<Object> builder, long ramBytesUsed, double directAddressingMemoryIncreasePercent) {
- System.out.println("directAddressingMaxOversizingFactor = " + builder.getDirectAddressingMaxOversizingFactor());
+ private static void printStats(FSTCompiler<Object> fstCompiler, long ramBytesUsed, double directAddressingMemoryIncreasePercent) {
+ System.out.println("directAddressingMaxOversizingFactor = " + fstCompiler.getDirectAddressingMaxOversizingFactor());
System.out.println("ramBytesUsed = "
+ String.format(Locale.ENGLISH, "%.2f MB", ramBytesUsed / 1024d / 1024d)
+ String.format(Locale.ENGLISH, " (%.2f %% increase with direct addressing)", directAddressingMemoryIncreasePercent));
- System.out.println("num nodes = " + builder.nodeCount);
- long fixedLengthArcNodeCount = builder.directAddressingNodeCount + builder.binarySearchNodeCount;
+ System.out.println("num nodes = " + fstCompiler.nodeCount);
+ long fixedLengthArcNodeCount = fstCompiler.directAddressingNodeCount + fstCompiler.binarySearchNodeCount;
System.out.println("num fixed-length-arc nodes = " + fixedLengthArcNodeCount
+ String.format(Locale.ENGLISH, " (%.2f %% of all nodes)",
- ((double) fixedLengthArcNodeCount / builder.nodeCount * 100)));
- System.out.println("num binary-search nodes = " + (builder.binarySearchNodeCount)
+ ((double) fixedLengthArcNodeCount / fstCompiler.nodeCount * 100)));
+ System.out.println("num binary-search nodes = " + (fstCompiler.binarySearchNodeCount)
+ String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
- ((double) (builder.binarySearchNodeCount) / fixedLengthArcNodeCount * 100)));
- System.out.println("num direct-addressing nodes = " + (builder.directAddressingNodeCount)
+ ((double) (fstCompiler.binarySearchNodeCount) / fixedLengthArcNodeCount * 100)));
+ System.out.println("num direct-addressing nodes = " + (fstCompiler.directAddressingNodeCount)
+ String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
- ((double) (builder.directAddressingNodeCount) / fixedLengthArcNodeCount * 100)));
+ ((double) (fstCompiler.directAddressingNodeCount) / fixedLengthArcNodeCount * 100)));
}
- private static Builder<Object> createBuilder(float directAddressingMaxOversizingFactor) {
- return new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, NoOutputs.getSingleton(), true, 15)
- .setDirectAddressingMaxOversizingFactor(directAddressingMaxOversizingFactor);
+ private static FSTCompiler<Object> createFSTCompiler(float directAddressingMaxOversizingFactor) {
+ return new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, NoOutputs.getSingleton())
+ .directAddressingMaxOversizingFactor(directAddressingMaxOversizingFactor)
+ .build();
}
private FST<Object> buildFST(List<BytesRef> entries) throws Exception {
- return buildFST(entries, createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR));
+ return buildFST(entries, createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR));
}
- private static FST<Object> buildFST(List<BytesRef> entries, Builder<Object> builder) throws Exception {
+ private static FST<Object> buildFST(List<BytesRef> entries, FSTCompiler<Object> fstCompiler) throws Exception {
BytesRef last = null;
for (BytesRef entry : entries) {
if (entry.equals(last) == false) {
- builder.add(Util.toIntsRef(entry, new IntsRefBuilder()), NoOutputs.getSingleton().getNoOutput());
+ fstCompiler.add(Util.toIntsRef(entry, new IntsRefBuilder()), NoOutputs.getSingleton().getNoOutput());
}
last = entry;
}
- return builder.finish();
+ return fstCompiler.compile();
}
public static void main(String... args) throws Exception {
@@ -195,18 +196,18 @@ public class TestFstDirectAddressing extends LuceneTestCase {
Collections.sort(wordList);
// Disable direct addressing and measure the FST size.
- Builder<Object> builder = createBuilder(-1f);
- FST<Object> fst = buildFST(wordList, builder);
+ FSTCompiler<Object> fstCompiler = createFSTCompiler(-1f);
+ FST<Object> fst = buildFST(wordList, fstCompiler);
long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();
// Enable direct addressing and measure the FST size.
- builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
- fst = buildFST(wordList, builder);
+ fstCompiler = createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
+ fst = buildFST(wordList, fstCompiler);
long ramBytesUsed = fst.ramBytesUsed();
// Compute the size increase in percents.
double directAddressingMemoryIncreasePercent = ((double) ramBytesUsed / ramBytesUsedNoDirectAddressing - 1) * 100;
- printStats(builder, ramBytesUsed, directAddressingMemoryIncreasePercent);
+ printStats(fstCompiler, ramBytesUsed, directAddressingMemoryIncreasePercent);
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
index 316cbdf..72c09e9 100644
--- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
+++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
@@ -327,7 +327,7 @@ public class TestFSTs extends LuceneTestCase {
writer.close();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
- Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
+ FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
boolean storeOrd = random().nextBoolean();
if (VERBOSE) {
@@ -373,15 +373,15 @@ public class TestFSTs extends LuceneTestCase {
} else {
output = termsEnum.docFreq();
}
- builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
+ fstCompiler.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
ord++;
if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
System.out.println(ord + " terms...");
}
}
- FST<Long> fst = builder.finish();
+ FST<Long> fst = fstCompiler.compile();
if (VERBOSE) {
- System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
+ System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fstCompiler.getNodeCount() + " nodes; " + fstCompiler.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
}
if (ord > 0) {
@@ -460,7 +460,7 @@ public class TestFSTs extends LuceneTestCase {
private final Path wordsFileIn;
private int inputMode;
private final Outputs<T> outputs;
- private final Builder<T> builder;
+ private final FSTCompiler<T> fstCompiler;
public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean noArcArrays) {
this.dirOut = dirOut;
@@ -468,7 +468,11 @@ public class TestFSTs extends LuceneTestCase {
this.inputMode = inputMode;
this.outputs = outputs;
- builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15);
+ fstCompiler = new FSTCompiler.Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs)
+ .minSuffixCount2(prune)
+ .shouldShareSuffix(prune == 0)
+ .allowFixedLengthArcs(!noArcArrays)
+ .build();
}
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
@@ -486,7 +490,7 @@ public class TestFSTs extends LuceneTestCase {
break;
}
toIntsRef(w, inputMode, intsRef);
- builder.add(intsRef.get(),
+ fstCompiler.add(intsRef.get(),
getOutput(intsRef.get(), ord));
ord++;
@@ -503,8 +507,8 @@ public class TestFSTs extends LuceneTestCase {
long tMid = System.currentTimeMillis();
System.out.println(((tMid-tStart) / 1000.0) + " sec to add all terms");
- assert builder.getTermCount() == ord;
- FST<T> fst = builder.finish();
+ assert fstCompiler.getTermCount() == ord;
+ FST<T> fst = fstCompiler.compile();
long tEnd = System.currentTimeMillis();
System.out.println(((tEnd-tMid) / 1000.0) + " sec to finish/pack");
if (fst == null) {
@@ -516,8 +520,8 @@ public class TestFSTs extends LuceneTestCase {
return;
}
- System.out.println(ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs; tot size " + fst.ramBytesUsed());
- if (builder.getNodeCount() < 100) {
+ System.out.println(ord + " terms; " + fstCompiler.getNodeCount() + " nodes; " + fstCompiler.getArcCount() + " arcs; tot size " + fst.ramBytesUsed());
+ if (fstCompiler.getNodeCount() < 100) {
Writer w = Files.newBufferedWriter(Paths.get("out.dot"), StandardCharsets.UTF_8);
Util.toDot(fst, w, false, false);
w.close();
@@ -717,9 +721,9 @@ public class TestFSTs extends LuceneTestCase {
public void testSingleString() throws Exception {
final Outputs<Object> outputs = NoOutputs.getSingleton();
- final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
- b.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRefBuilder()), outputs.getNoOutput());
- final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(b.finish());
+ final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
+ fstCompiler.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRefBuilder()), outputs.getNoOutput());
+ final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(fstCompiler.compile());
assertNull(fstEnum.seekFloor(new BytesRef("foo")));
assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
}
@@ -728,12 +732,12 @@ public class TestFSTs extends LuceneTestCase {
public void testDuplicateFSAString() throws Exception {
String str = "foobar";
final Outputs<Object> outputs = NoOutputs.getSingleton();
- final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder ints = new IntsRefBuilder();
for(int i=0; i<10; i++) {
- b.add(Util.toIntsRef(new BytesRef(str), ints), outputs.getNoOutput());
+ fstCompiler.add(Util.toIntsRef(new BytesRef(str), ints), outputs.getNoOutput());
}
- FST<Object> fst = b.finish();
+ FST<Object> fst = fstCompiler.compile();
// count the input paths
int count = 0;
@@ -797,17 +801,17 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
// Build an FST mapping BytesRef -> Long
- final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final BytesRef a = new BytesRef("a");
final BytesRef b = new BytesRef("b");
final BytesRef c = new BytesRef("c");
- builder.add(Util.toIntsRef(a, new IntsRefBuilder()), 17L);
- builder.add(Util.toIntsRef(b, new IntsRefBuilder()), 42L);
- builder.add(Util.toIntsRef(c, new IntsRefBuilder()), 13824324872317238L);
+ fstCompiler.add(Util.toIntsRef(a, new IntsRefBuilder()), 17L);
+ fstCompiler.add(Util.toIntsRef(b, new IntsRefBuilder()), 42L);
+ fstCompiler.add(Util.toIntsRef(c, new IntsRefBuilder()), 13824324872317238L);
- final FST<Long> fst = builder.finish();
+ final FST<Long> fst = fstCompiler.compile();
assertEquals(13824324872317238L, (long) Util.get(fst, c));
assertEquals(42, (long) Util.get(fst, b));
@@ -1035,7 +1039,7 @@ public class TestFSTs extends LuceneTestCase {
FST<Object> compile(String[] lines) throws IOException {
final NoOutputs outputs = NoOutputs.getSingleton();
final Object nothing = outputs.getNoOutput();
- final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
int line = 0;
final BytesRefBuilder term = new BytesRefBuilder();
@@ -1046,10 +1050,10 @@ public class TestFSTs extends LuceneTestCase {
break;
}
term.copyChars(w);
- b.add(Util.toIntsRef(term.get(), scratchIntsRef), nothing);
+ fstCompiler.add(Util.toIntsRef(term.get(), scratchIntsRef), nothing);
}
- return b.finish();
+ return fstCompiler.compile();
}
void generate(ArrayList<String> out, StringBuilder b, char from, char to,
@@ -1110,10 +1114,10 @@ public class TestFSTs extends LuceneTestCase {
public void testFinalOutputOnEndState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
- final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
- builder.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
- builder.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
- final FST<Long> fst = builder.finish();
+ final FSTCompiler<Long> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).minSuffixCount1(2).build();
+ fstCompiler.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
+ fstCompiler.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
+ final FST<Long> fst = fstCompiler.compile();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
StringWriter w = new StringWriter();
Util.toDot(fst, w, false, false);
@@ -1124,10 +1128,10 @@ public class TestFSTs extends LuceneTestCase {
public void testInternalFinalState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
- final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
- builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput());
- builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput());
- final FST<Long> fst = builder.finish();
+ final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
+ fstCompiler.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput());
+ fstCompiler.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput());
+ final FST<Long> fst = fstCompiler.compile();
StringWriter w = new StringWriter();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
Util.toDot(fst, w, false, false);
@@ -1145,20 +1149,20 @@ public class TestFSTs extends LuceneTestCase {
public void testNonFinalStopNode() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Long nothing = outputs.getNoOutput();
- final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
//final FST<Long> fst = new FST<>(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT, 15);
- final FST<Long> fst = b.fst;
+ final FST<Long> fst = fstCompiler.fst;
- final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<>(b, 0);
+ final FSTCompiler.UnCompiledNode<Long> rootNode = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0);
// Add final stop node
{
- final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<>(b, 0);
+ final FSTCompiler.UnCompiledNode<Long> node = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0);
node.isFinal = true;
rootNode.addArc('a', node);
- final Builder.CompiledNode frozen = new Builder.CompiledNode();
- frozen.node = fst.addNode(b, node);
+ final FSTCompiler.CompiledNode frozen = new FSTCompiler.CompiledNode();
+ frozen.node = fst.addNode(fstCompiler, node);
rootNode.arcs[0].nextFinalOutput = 17L;
rootNode.arcs[0].isFinal = true;
rootNode.arcs[0].output = nothing;
@@ -1167,16 +1171,16 @@ public class TestFSTs extends LuceneTestCase {
// Add non-final stop node
{
- final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<>(b, 0);
+ final FSTCompiler.UnCompiledNode<Long> node = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0);
rootNode.addArc('b', node);
- final Builder.CompiledNode frozen = new Builder.CompiledNode();
- frozen.node = fst.addNode(b, node);
+ final FSTCompiler.CompiledNode frozen = new FSTCompiler.CompiledNode();
+ frozen.node = fst.addNode(fstCompiler, node);
rootNode.arcs[1].nextFinalOutput = nothing;
rootNode.arcs[1].output = 42L;
rootNode.arcs[1].target = frozen;
}
- fst.finish(fst.addNode(b, rootNode));
+ fst.finish(fst.addNode(fstCompiler, rootNode));
StringWriter w = new StringWriter();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
@@ -1225,13 +1229,13 @@ public class TestFSTs extends LuceneTestCase {
public void testShortestPaths() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
- final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
- builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
- builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
- builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
- final FST<Long> fst = builder.finish();
+ fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
+ fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
+ fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
+ final FST<Long> fst = fstCompiler.compile();
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
//w.close();
@@ -1256,16 +1260,16 @@ public class TestFSTs extends LuceneTestCase {
public void testRejectNoLimits() throws IOException {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
- final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
+ final FSTCompiler<Long> fstCompiler = new FSTCompiler<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
- builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
- builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
- builder.add(Util.toIntsRef(new BytesRef("adcd"), scratch), 17L);
- builder.add(Util.toIntsRef(new BytesRef("adcde"), scratch), 17L);
+ fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
+ fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
+ fstCompiler.add(Util.toIntsRef(new BytesRef("adcd"), scratch), 17L);
+ fstCompiler.add(Util.toIntsRef(new BytesRef("adcde"), scratch), 17L);
- builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
- final FST<Long> fst = builder.finish();
+ fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
+ final FST<Long> fst = fstCompiler.compile();
final AtomicInteger rejectCount = new AtomicInteger();
Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, 2, 6, minLongComparator) {
@Override
@@ -1320,13 +1324,13 @@ public class TestFSTs extends LuceneTestCase {
PositiveIntOutputs.getSingleton() // output
);
- final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ final FSTCompiler<Pair<Long,Long>> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
- builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L));
- builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L));
- builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L));
- final FST<Pair<Long,Long>> fst = builder.finish();
+ fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L));
+ fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L));
+ fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L));
+ final FST<Pair<Long,Long>> fst = fstCompiler.compile();
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
//w.close();
@@ -1361,7 +1365,7 @@ public class TestFSTs extends LuceneTestCase {
final TreeSet<String> allPrefixes = new TreeSet<>();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
- final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
for (int i = 0; i < numWords; i++) {
@@ -1382,10 +1386,10 @@ public class TestFSTs extends LuceneTestCase {
for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
//System.out.println("add: " + e);
- builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), e.getValue());
+ fstCompiler.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), e.getValue());
}
- final FST<Long> fst = builder.finish();
+ final FST<Long> fst = fstCompiler.compile();
//System.out.println("SAVE out.dot");
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
@@ -1479,7 +1483,7 @@ public class TestFSTs extends LuceneTestCase {
PositiveIntOutputs.getSingleton(), // weight
PositiveIntOutputs.getSingleton() // output
);
- final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ final FSTCompiler<Pair<Long,Long>> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
Random random = random();
@@ -1504,10 +1508,10 @@ public class TestFSTs extends LuceneTestCase {
//System.out.println("add: " + e);
long weight = e.getValue().a;
long output = e.getValue().b;
- builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), outputs.newPair(weight, output));
+ fstCompiler.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), outputs.newPair(weight, output));
}
- final FST<Pair<Long,Long>> fst = builder.finish();
+ final FST<Pair<Long,Long>> fst = fstCompiler.compile();
//System.out.println("SAVE out.dot");
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
@@ -1563,7 +1567,7 @@ public class TestFSTs extends LuceneTestCase {
public void testLargeOutputsOnArrayArcs() throws Exception {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
- final Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final byte[] bytes = new byte[300];
final IntsRefBuilder input = new IntsRefBuilder();
@@ -1572,10 +1576,10 @@ public class TestFSTs extends LuceneTestCase {
for(int arc=0;arc<6;arc++) {
input.setIntAt(0, arc);
output.bytes[0] = (byte) arc;
- builder.add(input.get(), BytesRef.deepCopyOf(output));
+ fstCompiler.add(input.get(), BytesRef.deepCopyOf(output));
}
- final FST<BytesRef> fst = builder.finish();
+ final FST<BytesRef> fst = fstCompiler.compile();
for(int arc=0;arc<6;arc++) {
input.setIntAt(0, arc);
final BytesRef result = Util.get(fst, input.get());
@@ -1608,15 +1612,15 @@ public class TestFSTs extends LuceneTestCase {
Collections.sort(termsList);
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
- Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder input = new IntsRefBuilder();
for(BytesRef term : termsList) {
Util.toIntsRef(term, input);
- builder.add(input.get(), term);
+ fstCompiler.add(input.get(), term);
}
- FST<BytesRef> fst = builder.finish();
+ FST<BytesRef> fst = fstCompiler.compile();
Arc<BytesRef> arc = new FST.Arc<>();
fst.getFirstArc(arc);
@@ -1638,17 +1642,17 @@ public class TestFSTs extends LuceneTestCase {
public void testSimpleDepth() throws Exception {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
- Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
BytesRef ab = new BytesRef("ab");
BytesRef ac = new BytesRef("ac");
BytesRef bd = new BytesRef("bd");
- builder.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L);
- builder.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L);
- builder.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L);
+ fstCompiler.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L);
+ fstCompiler.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L);
+ fstCompiler.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L);
- FST<Long> fst = builder.finish();
+ FST<Long> fst = fstCompiler.compile();
assertEquals(3, (long) Util.get(fst, ab));
assertEquals(5, (long) Util.get(fst, ac));
diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestUtil.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestUtil.java
index 2793e7e..a14be3c 100644
--- a/lucene/core/src/test/org/apache/lucene/util/fst/TestUtil.java
+++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestUtil.java
@@ -83,15 +83,17 @@ public class TestUtil extends LuceneTestCase {
private FST<Object> buildFST(List<String> words, boolean allowArrayArcs, boolean allowDirectAddressing) throws Exception {
final Outputs<Object> outputs = NoOutputs.getSingleton();
- final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, allowArrayArcs, 15);
+ final FSTCompiler.Builder<Object> builder = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
+ .allowFixedLengthArcs(allowArrayArcs);
if (!allowDirectAddressing) {
- b.setDirectAddressingMaxOversizingFactor(-1f);
+ builder.directAddressingMaxOversizingFactor(-1f);
}
+ final FSTCompiler<Object> fstCompiler = builder.build();
for (String word : words) {
- b.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
+ fstCompiler.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
}
- return b.finish();
+ return fstCompiler.compile();
}
private List<String> createRandomDictionary(int width, int depth) {
diff --git a/lucene/misc/src/java/org/apache/lucene/util/fst/ListOfOutputs.java b/lucene/misc/src/java/org/apache/lucene/util/fst/ListOfOutputs.java
index aa08344..da5ce4d 100644
--- a/lucene/misc/src/java/org/apache/lucene/util/fst/ListOfOutputs.java
+++ b/lucene/misc/src/java/org/apache/lucene/util/fst/ListOfOutputs.java
@@ -30,7 +30,7 @@ import org.apache.lucene.util.RamUsageEstimator;
* more of its output values. You can use this when a single
* input may need to map to more than one output,
* maintaining order: pass the same input with a different
- * output by calling {@link Builder#add(IntsRef,Object)} multiple
+ * output by calling {@link FSTCompiler#add(IntsRef,Object)} multiple
* times. The builder will then combine the outputs using
* the {@link Outputs#merge(Object,Object)} method.
*
@@ -41,7 +41,7 @@ import org.apache.lucene.util.RamUsageEstimator;
* <p>NOTE: the only way to create multiple outputs is to
* add the same input to the FST multiple times in a row. This is
* how the FST maps a single input to multiple outputs (e.g. you
- * cannot pass a List<Object> to {@link Builder#add}). If
+ * cannot pass a List<Object> to {@link FSTCompiler#add}). If
* your outputs are longs, and you need at most 2, then use
* {@link UpToTwoPositiveIntOutputs} instead since it stores
* the outputs more compactly (by stealing a bit from each
diff --git a/lucene/misc/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java b/lucene/misc/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java
index 327de1f..af1766e 100644
--- a/lucene/misc/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java
+++ b/lucene/misc/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java
@@ -35,14 +35,14 @@ import org.apache.lucene.util.SuppressForbidden;
* <p>NOTE: the only way to create a TwoLongs output is to
* add the same input to the FST twice in a row. This is
* how the FST maps a single input to two outputs (e.g. you
- * cannot pass a TwoLongs to {@link Builder#add}. If you
+ * cannot pass a TwoLongs to {@link FSTCompiler#add}. If you
* need more than two then use {@link ListOfOutputs}, but if
* you only have at most 2 then this implementation will
* require fewer bytes as it steals one bit from each long
* value.
*
* <p>NOTE: the resulting FST is not guaranteed to be minimal!
- * See {@link Builder}.
+ * See {@link FSTCompiler}.
*
* @lucene.experimental
*/
diff --git a/lucene/misc/src/test/org/apache/lucene/util/fst/TestFSTsMisc.java b/lucene/misc/src/test/org/apache/lucene/util/fst/TestFSTsMisc.java
index 91a8001..9911182 100644
--- a/lucene/misc/src/test/org/apache/lucene/util/fst/TestFSTsMisc.java
+++ b/lucene/misc/src/test/org/apache/lucene/util/fst/TestFSTsMisc.java
@@ -164,16 +164,16 @@ public class TestFSTsMisc extends LuceneTestCase {
public void testListOfOutputs() throws Exception {
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
ListOfOutputs<Long> outputs = new ListOfOutputs<>(_outputs);
- final Builder<Object> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
// Add the same input more than once and the outputs
// are merged:
- builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
- builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
- builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
- builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 17L);
- final FST<Object> fst = builder.finish();
+ fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
+ fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
+ fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
+ fstCompiler.add(Util.toIntsRef(new BytesRef("b"), scratch), 17L);
+ final FST<Object> fst = fstCompiler.compile();
Object output = Util.get(fst, new BytesRef("a"));
assertNotNull(output);
@@ -193,20 +193,20 @@ public class TestFSTsMisc extends LuceneTestCase {
public void testListOfOutputsEmptyString() throws Exception {
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
ListOfOutputs<Long> outputs = new ListOfOutputs<>(_outputs);
- final Builder<Object> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
- builder.add(scratch.get(), 0L);
- builder.add(scratch.get(), 1L);
- builder.add(scratch.get(), 17L);
- builder.add(scratch.get(), 1L);
-
- builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
- builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
- builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
- builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 0L);
+ fstCompiler.add(scratch.get(), 0L);
+ fstCompiler.add(scratch.get(), 1L);
+ fstCompiler.add(scratch.get(), 17L);
+ fstCompiler.add(scratch.get(), 1L);
+
+ fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
+ fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
+ fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
+ fstCompiler.add(Util.toIntsRef(new BytesRef("b"), scratch), 0L);
- final FST<Object> fst = builder.finish();
+ final FST<Object> fst = fstCompiler.compile();
Object output = Util.get(fst, new BytesRef(""));
assertNotNull(output);
diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java
index 3f0c2bd..9e2f754 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java
@@ -43,7 +43,7 @@ import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
@@ -350,29 +350,28 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
}
}
- final Builder<Pair<BytesRef,Long>> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
- 0, 0, true, false, Integer.MAX_VALUE,
- FST_OUTPUTS, true, 15);
+ final FSTCompiler<Pair<BytesRef,Long>> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS)
+ .shouldShareNonSingletonNodes(false).build();
//if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
//}
//indexBuilder.DEBUG = false;
final byte[] bytes = scratchBytes.toArrayCopy();
assert bytes.length > 0;
- indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), FST_OUTPUTS.newPair(new BytesRef(bytes, 0, bytes.length), Long.MAX_VALUE - maxVersionIndex));
+ fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef), FST_OUTPUTS.newPair(new BytesRef(bytes, 0, bytes.length), Long.MAX_VALUE - maxVersionIndex));
scratchBytes.reset();
// Copy over index for all sub-blocks
for(PendingBlock block : blocks) {
if (block.subIndices != null) {
for(FST<Pair<BytesRef,Long>> subIndex : block.subIndices) {
- append(indexBuilder, subIndex, scratchIntsRef);
+ append(fstCompiler, subIndex, scratchIntsRef);
}
block.subIndices = null;
}
}
- index = indexBuilder.finish();
+ index = fstCompiler.compile();
assert subIndices == null;
@@ -387,14 +386,14 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
// TODO: maybe we could add bulk-add method to
// Builder? Takes FST and unions it w/ current
// FST.
- private void append(Builder<Pair<BytesRef,Long>> builder, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
+ private void append(FSTCompiler<Pair<BytesRef,Long>> fstCompiler, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
final BytesRefFSTEnum<Pair<BytesRef,Long>> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> indexEnt;
while((indexEnt = subIndexEnum.next()) != null) {
//if (DEBUG) {
// System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
//}
- builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
+ fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
}
}
}
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
index 6ca21e7..a03ae22 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
@@ -52,7 +52,7 @@ import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.FST;
@@ -496,7 +496,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
PairOutputs<Long,BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
- Builder<Pair<Long,BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ FSTCompiler<Pair<Long,BytesRef>> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
// Build FST:
BytesRefBuilder previousAnalyzed = null;
@@ -570,7 +570,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
Util.toIntsRef(analyzed.get(), scratchInts);
//System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
if (!hasPayloads) {
- builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
+ fstCompiler.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
} else {
int payloadOffset = input.getPosition() + surface.length;
int payloadLength = bytes.length - payloadOffset;
@@ -579,10 +579,10 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
br.bytes[surface.length] = PAYLOAD_SEP;
System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
br.length = br.bytes.length;
- builder.add(scratchInts.get(), outputs.newPair(cost, br));
+ fstCompiler.add(scratchInts.get(), outputs.newPair(cost, br));
}
}
- fst = builder.finish();
+ fst = fstCompiler.compile();
//Util.dotToFile(fst, "/tmp/suggest.dot");
} finally {
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java
index 10e9bb4..81c079e 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java
@@ -66,7 +66,7 @@ import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
@@ -304,7 +304,7 @@ public class FreeTextSuggester extends Lookup implements Accountable {
TermsEnum termsEnum = terms.iterator();
Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
- Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder scratchInts = new IntsRefBuilder();
while (true) {
@@ -320,10 +320,10 @@ public class FreeTextSuggester extends Lookup implements Accountable {
totTokens += termsEnum.totalTermFreq();
}
- builder.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
+ fstCompiler.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
}
- fst = builder.finish();
+ fst = fstCompiler.compile();
if (fst == null) {
throw new IllegalArgumentException("need at least one suggestion");
}
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java
index 5133855..34b1508 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java
@@ -25,7 +25,7 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
@@ -53,7 +53,7 @@ final class NRTSuggesterBuilder {
public static final int END_BYTE = 0x0;
private final PairOutputs<Long, BytesRef> outputs;
- private final Builder<PairOutputs.Pair<Long, BytesRef>> builder;
+ private final FSTCompiler<PairOutputs.Pair<Long, BytesRef>> fstCompiler;
private final IntsRefBuilder scratchInts = new IntsRefBuilder();
private final BytesRefBuilder analyzed = new BytesRefBuilder();
private final PriorityQueue<Entry> entries;
@@ -70,7 +70,7 @@ final class NRTSuggesterBuilder {
this.endByte = END_BYTE;
this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
this.entries = new PriorityQueue<>();
- this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
}
/**
@@ -108,7 +108,7 @@ final class NRTSuggesterBuilder {
}
analyzed.setByteAt(analyzed.length() - 1, (byte) numArcs++);
Util.toIntsRef(analyzed.get(), scratchInts);
- builder.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
+ fstCompiler.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
}
maxAnalyzedPathsPerOutput = Math.max(maxAnalyzedPathsPerOutput, entries.size());
entries.clear();
@@ -119,11 +119,11 @@ final class NRTSuggesterBuilder {
* {@link NRTSuggester#load(IndexInput, CompletionPostingsFormat.FSTLoadMode)})}
*/
public boolean store(DataOutput output) throws IOException {
- final FST<PairOutputs.Pair<Long, BytesRef>> build = builder.finish();
- if (build == null) {
+ final FST<PairOutputs.Pair<Long, BytesRef>> fst = fstCompiler.compile();
+ if (fst == null) {
return false;
}
- build.save(output);
+ fst.save(output);
/* write some more meta-info */
assert maxAnalyzedPathsPerOutput > 0;
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java
index 3d20412..e3237db 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java
@@ -169,7 +169,7 @@ public class FSTCompletionBuilder {
* @param shareMaxTailLength
* Max shared suffix sharing length.
*
- * See the description of this parameter in {@link Builder}'s constructor.
+ * See the description of this parameter in {@link org.apache.lucene.util.fst.FSTCompiler.Builder}.
* In general, for very large inputs you'll want to construct a non-minimal
* automaton which will be larger, but the construction will take far less ram.
* For minimal automata, set it to {@link Integer#MAX_VALUE}.
@@ -234,10 +234,9 @@ public class FSTCompletionBuilder {
// Build the automaton.
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Object empty = outputs.getNoOutput();
- final Builder<Object> builder = new Builder<>(
- FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
- shareMaxTailLength, outputs, true, 15);
-
+ final FSTCompiler<Object> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
+ .shareMaxTailLength(shareMaxTailLength).build();
+
BytesRefBuilder scratch = new BytesRefBuilder();
BytesRef entry;
final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
@@ -246,11 +245,11 @@ public class FSTCompletionBuilder {
while((entry = iter.next()) != null) {
count++;
if (scratch.get().compareTo(entry) != 0) {
- builder.add(Util.toIntsRef(entry, scratchIntsRef), empty);
+ fstCompiler.add(Util.toIntsRef(entry, scratchIntsRef), empty);
scratch.copyBytes(entry);
}
}
- return count == 0 ? null : builder.finish();
+ return count == 0 ? null : fstCompiler.compile();
}
}
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
index cb87492..9c8ebb8 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
@@ -40,7 +40,7 @@ import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
@@ -116,7 +116,7 @@ public class WFSTCompletionLookup extends Lookup implements Accountable {
IntsRefBuilder scratchInts = new IntsRefBuilder();
BytesRefBuilder previous = null;
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
- Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
while ((scratch = iter.next()) != null) {
long cost = iter.weight();
@@ -127,11 +127,11 @@ public class WFSTCompletionLookup extends Lookup implements Accountable {
// added
}
Util.toIntsRef(scratch, scratchInts);
- builder.add(scratchInts.get(), cost);
+ fstCompiler.add(scratchInts.get(), cost);
previous.copyBytes(scratch);
count++;
}
- fst = builder.finish();
+ fst = fstCompiler.compile();
}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java
index 590841d..19111d1 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java
@@ -272,27 +272,26 @@ public class FSTTester<T> {
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
}
- final Builder<T> builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
- prune1, prune2,
- prune1==0 && prune2==0,
- allowRandomSuffixSharing ? random.nextBoolean() : true,
- allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
- outputs,
- true,
- 15);
+ final FSTCompiler<T> fstCompiler = new FSTCompiler.Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs)
+ .minSuffixCount1(prune1)
+ .minSuffixCount2(prune2)
+ .shouldShareSuffix(prune1==0 && prune2==0)
+ .shouldShareNonSingletonNodes(allowRandomSuffixSharing ? random.nextBoolean() : true)
+ .shareMaxTailLength(allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE)
+ .build();
for(InputOutput<T> pair : pairs) {
if (pair.output instanceof List) {
@SuppressWarnings("unchecked") List<Long> longValues = (List<Long>) pair.output;
- @SuppressWarnings("unchecked") final Builder<Object> builderObject = (Builder<Object>) builder;
+ @SuppressWarnings("unchecked") final FSTCompiler<Object> fstCompilerObject = (FSTCompiler<Object>) fstCompiler;
for(Long value : longValues) {
- builderObject.add(pair.input, value);
+ fstCompilerObject.add(pair.input, value);
}
} else {
- builder.add(pair.input, pair.output);
+ fstCompiler.add(pair.input, pair.output);
}
}
- FST<T> fst = builder.finish();
+ FST<T> fst = fstCompiler.compile();
if (random.nextBoolean() && fst != null) {
IOContext context = LuceneTestCase.newIOContext(random);
@@ -320,7 +319,7 @@ public class FSTTester<T> {
if (fst == null) {
System.out.println(" fst has 0 nodes (fully pruned)");
} else {
- System.out.println(" fst has " + builder.getNodeCount() + " nodes and " + builder.getArcCount() + " arcs");
+ System.out.println(" fst has " + fstCompiler.getNodeCount() + " nodes and " + fstCompiler.getArcCount() + " arcs");
}
}
@@ -330,8 +329,8 @@ public class FSTTester<T> {
verifyPruned(inputMode, fst, prune1, prune2);
}
- nodeCount = builder.getNodeCount();
- arcCount = builder.getArcCount();
+ nodeCount = fstCompiler.getNodeCount();
+ arcCount = fstCompiler.getArcCount();
return fst;
}