You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2019/10/14 18:21:12 UTC
[lucene-solr] branch branch_8_3 updated: LUCENE-8920: Disable
direct addressing of arcs. (#950)
This is an automated email from the ASF dual-hosted git repository.
jpountz pushed a commit to branch branch_8_3
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8_3 by this push:
new f023961 LUCENE-8920: Disable direct addressing of arcs. (#950)
f023961 is described below
commit f023961ded315077921a30ac032763ec60e6bb26
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Mon Oct 14 18:30:36 2019 +0200
LUCENE-8920: Disable direct addressing of arcs. (#950)
---
.../analysis/ja/dict/TokenInfoDictionary$fst.dat | Bin 1698570 -> 1698570 bytes
.../analysis/ko/dict/TokenInfoDictionary$fst.dat | Bin 5641400 -> 5640903 bytes
.../java/org/apache/lucene/util/fst/Builder.java | 3 -
.../src/java/org/apache/lucene/util/fst/FST.java | 110 +++------------------
4 files changed, 11 insertions(+), 102 deletions(-)
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat
index 9328c53..c06fd4a 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat differ
diff --git a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat
index 4bacb9b..fa0cb32 100644
Binary files a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat differ
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java
index bb9a682..c54b144 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java
@@ -50,9 +50,6 @@ import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
public class Builder<T> {
- // The amount of Arc array oversizing used to enable direct addressing of Arcs by their labels
- static final int DIRECT_ARC_LOAD_FACTOR = 4;
-
private final NodeHash<T> dedupHash;
final FST<T> fst;
private final T NO_OUTPUT;
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
index 26f5e51..e0692b4 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
@@ -88,8 +88,6 @@ public final class FST<T> implements Accountable {
// this means either of these things in different contexts
// in the midst of a direct array:
private static final byte BIT_MISSING_ARC = 1 << 6;
- // at the start of a direct array:
- private static final byte ARCS_AS_ARRAY_WITH_GAPS = BIT_MISSING_ARC;
/**
* @see #shouldExpand(Builder, Builder.UnCompiledNode)
@@ -109,7 +107,7 @@ public final class FST<T> implements Accountable {
// Increment version to change it
private static final String FILE_FORMAT_NAME = "FST";
private static final int VERSION_START = 6;
- private static final int VERSION_CURRENT = 7;
+ private static final int VERSION_CURRENT = VERSION_START;
// Never serialized; just used to represent the virtual
// final node w/ no arcs:
@@ -645,35 +643,19 @@ public final class FST<T> implements Accountable {
assert maxBytesPerArc > 0;
// 2nd pass just "expands" all arcs to take up a fixed byte size
- // If more than (1 / DIRECT_ARC_LOAD_FACTOR) of the "slots" would be occupied, write an arc
- // array that may have holes in it so that we can address the arcs directly by label without
- // binary search
- int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
- boolean writeDirectly = labelRange > 0 && labelRange < Builder.DIRECT_ARC_LOAD_FACTOR * nodeIn.numArcs;
-
- //System.out.println("write int @pos=" + (fixedArrayStart-4) + " numArcs=" + nodeIn.numArcs);
// create the header
// TODO: clean this up: or just rewind+reuse and deal with it
byte header[] = new byte[MAX_HEADER_SIZE];
ByteArrayDataOutput bad = new ByteArrayDataOutput(header);
// write a "false" first arc:
- if (writeDirectly) {
- bad.writeByte(ARCS_AS_ARRAY_WITH_GAPS);
- bad.writeVInt(labelRange);
- } else {
- bad.writeByte(ARCS_AS_ARRAY_PACKED);
- bad.writeVInt(nodeIn.numArcs);
- }
+ bad.writeByte(ARCS_AS_ARRAY_PACKED);
+ bad.writeVInt(nodeIn.numArcs);
bad.writeVInt(maxBytesPerArc);
int headerLen = bad.getPosition();
final long fixedArrayStart = startAddress + headerLen;
- if (writeDirectly) {
- writeArrayWithGaps(builder, nodeIn, fixedArrayStart, maxBytesPerArc, labelRange);
- } else {
- writeArrayPacked(builder, nodeIn, fixedArrayStart, maxBytesPerArc);
- }
+ writeArrayPacked(builder, nodeIn, fixedArrayStart, maxBytesPerArc);
// now write the header
builder.bytes.writeBytes(startAddress, header, 0, headerLen);
@@ -707,45 +689,7 @@ public final class FST<T> implements Accountable {
}
}
- private void writeArrayWithGaps(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long fixedArrayStart, int maxBytesPerArc, int labelRange) {
- // expand the arcs in place, backwards
- long srcPos = builder.bytes.getPosition();
- long destPos = fixedArrayStart + labelRange * maxBytesPerArc;
- // if destPos == srcPos it means all the arcs were the same length, and the array of them is *already* direct
- assert destPos >= srcPos;
- if (destPos > srcPos) {
- builder.bytes.skipBytes((int) (destPos - srcPos));
- int arcIdx = nodeIn.numArcs - 1;
- int firstLabel = nodeIn.arcs[0].label;
- int nextLabel = nodeIn.arcs[arcIdx].label;
- for (int directArcIdx = labelRange - 1; directArcIdx >= 0; directArcIdx--) {
- destPos -= maxBytesPerArc;
- if (directArcIdx == nextLabel - firstLabel) {
- int arcLen = builder.reusedBytesPerArc[arcIdx];
- srcPos -= arcLen;
- //System.out.println(" direct pack idx=" + directArcIdx + " arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos + " label=" + nextLabel);
- if (srcPos != destPos) {
- //System.out.println(" copy len=" + builder.reusedBytesPerArc[arcIdx]);
- assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " reusedBytesPerArc[arcIdx]=" + builder.reusedBytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs;
- builder.bytes.copyBytes(srcPos, destPos, arcLen);
- if (arcIdx == 0) {
- break;
- }
- }
- --arcIdx;
- nextLabel = nodeIn.arcs[arcIdx].label;
- } else {
- assert directArcIdx > arcIdx;
- // mark this as a missing arc
- //System.out.println(" direct pack idx=" + directArcIdx + " no arc");
- builder.bytes.writeByte(destPos, BIT_MISSING_ARC);
- }
- }
- }
- }
-
- /** Fills virtual 'start' arc, ie, an empty incoming arc to
- * the FST's start node */
+ /** Fills virtual 'start' arc, ie, an empty incoming arc to the FST's start node */
public Arc<T> getFirstArc(Arc<T> arc) {
T NO_OUTPUT = outputs.getNoOutput();
@@ -786,18 +730,13 @@ public final class FST<T> implements Accountable {
} else {
in.setPosition(follow.target);
final byte b = in.readByte();
- if (b == ARCS_AS_ARRAY_PACKED || b == ARCS_AS_ARRAY_WITH_GAPS) {
+ if (b == ARCS_AS_ARRAY_PACKED) {
// array: jump straight to end
arc.numArcs = in.readVInt();
arc.bytesPerArc = in.readVInt();
//System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc);
arc.posArcsStart = in.getPosition();
- if (b == ARCS_AS_ARRAY_WITH_GAPS) {
- arc.arcIdx = Integer.MIN_VALUE;
- arc.nextArc = arc.posArcsStart - (arc.numArcs - 1) * arc.bytesPerArc;
- } else {
- arc.arcIdx = arc.numArcs - 2;
- }
+ arc.arcIdx = arc.numArcs - 2;
} else {
arc.flags = b;
// non-array: linear scan
@@ -868,7 +807,7 @@ public final class FST<T> implements Accountable {
//System.out.println(" flags=" + arc.flags);
byte flags = in.readByte();
- if (flags == ARCS_AS_ARRAY_PACKED || flags == ARCS_AS_ARRAY_WITH_GAPS) {
+ if (flags == ARCS_AS_ARRAY_PACKED) {
//System.out.println(" fixedArray");
// this is first arc in a fixed-array
arc.numArcs = in.readVInt();
@@ -901,7 +840,7 @@ public final class FST<T> implements Accountable {
} else {
in.setPosition(follow.target);
byte flags = in.readByte();
- return flags == ARCS_AS_ARRAY_PACKED || flags == ARCS_AS_ARRAY_WITH_GAPS;
+ return flags == ARCS_AS_ARRAY_PACKED;
}
}
@@ -931,7 +870,7 @@ public final class FST<T> implements Accountable {
in.setPosition(pos);
final byte flags = in.readByte();
- if (flags == ARCS_AS_ARRAY_PACKED || flags == ARCS_AS_ARRAY_WITH_GAPS) {
+ if (flags == ARCS_AS_ARRAY_PACKED) {
//System.out.println(" nextArc fixed array");
in.readVInt();
@@ -1140,34 +1079,7 @@ public final class FST<T> implements Accountable {
// System.out.println("fta label=" + (char) labelToMatch);
byte flags = in.readByte();
- if (flags == ARCS_AS_ARRAY_WITH_GAPS) {
- arc.numArcs = in.readVInt();
- arc.bytesPerArc = in.readVInt();
- arc.posArcsStart = in.getPosition();
-
- // Array is direct; address by label
- in.skipBytes(1);
- int firstLabel = readLabel(in);
- int arcPos = labelToMatch - firstLabel;
- if (arcPos == 0) {
- arc.nextArc = arc.posArcsStart;
- } else if (arcPos > 0) {
- if (arcPos >= arc.numArcs) {
- return null;
- }
- in.setPosition(arc.posArcsStart - arc.bytesPerArc * arcPos);
- flags = in.readByte();
- if (flag(flags, BIT_MISSING_ARC)) {
- return null;
- }
- // point to flags that we just read
- arc.nextArc = in.getPosition() + 1;
- } else {
- return null;
- }
- arc.arcIdx = Integer.MIN_VALUE;
- return readNextRealArc(arc, in);
- } else if (flags == ARCS_AS_ARRAY_PACKED) {
+ if (flags == ARCS_AS_ARRAY_PACKED) {
arc.numArcs = in.readVInt();
arc.bytesPerArc = in.readVInt();
arc.posArcsStart = in.getPosition();