You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2019/10/14 18:21:12 UTC

[lucene-solr] branch branch_8_3 updated: LUCENE-8920: Disable direct addressing of arcs. (#950)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_8_3
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8_3 by this push:
     new f023961  LUCENE-8920: Disable direct addressing of arcs. (#950)
f023961 is described below

commit f023961ded315077921a30ac032763ec60e6bb26
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Mon Oct 14 18:30:36 2019 +0200

    LUCENE-8920: Disable direct addressing of arcs. (#950)
---
 .../analysis/ja/dict/TokenInfoDictionary$fst.dat   | Bin 1698570 -> 1698570 bytes
 .../analysis/ko/dict/TokenInfoDictionary$fst.dat   | Bin 5641400 -> 5640903 bytes
 .../java/org/apache/lucene/util/fst/Builder.java   |   3 -
 .../src/java/org/apache/lucene/util/fst/FST.java   | 110 +++------------------
 4 files changed, 11 insertions(+), 102 deletions(-)

diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat
index 9328c53..c06fd4a 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat differ
diff --git a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat
index 4bacb9b..fa0cb32 100644
Binary files a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat differ
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java
index bb9a682..c54b144 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java
@@ -50,9 +50,6 @@ import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
 
 public class Builder<T> {
 
-  // The amount of Arc array oversizing used to enable direct addressing of Arcs by their labels
-  static final int DIRECT_ARC_LOAD_FACTOR = 4;
-
   private final NodeHash<T> dedupHash;
   final FST<T> fst;
   private final T NO_OUTPUT;
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
index 26f5e51..e0692b4 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
@@ -88,8 +88,6 @@ public final class FST<T> implements Accountable {
   // this means either of these things in different contexts
   // in the midst of a direct array:
   private static final byte BIT_MISSING_ARC = 1 << 6;
-  // at the start of a direct array:
-  private static final byte ARCS_AS_ARRAY_WITH_GAPS = BIT_MISSING_ARC;
 
   /**
    * @see #shouldExpand(Builder, Builder.UnCompiledNode)
@@ -109,7 +107,7 @@ public final class FST<T> implements Accountable {
   // Increment version to change it
   private static final String FILE_FORMAT_NAME = "FST";
   private static final int VERSION_START = 6;
-  private static final int VERSION_CURRENT = 7;
+  private static final int VERSION_CURRENT = VERSION_START;
 
   // Never serialized; just used to represent the virtual
   // final node w/ no arcs:
@@ -645,35 +643,19 @@ public final class FST<T> implements Accountable {
       assert maxBytesPerArc > 0;
       // 2nd pass just "expands" all arcs to take up a fixed byte size
 
-      // If more than (1 / DIRECT_ARC_LOAD_FACTOR) of the "slots" would be occupied, write an arc
-      // array that may have holes in it so that we can address the arcs directly by label without
-      // binary search
-      int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
-      boolean writeDirectly = labelRange > 0 && labelRange < Builder.DIRECT_ARC_LOAD_FACTOR * nodeIn.numArcs;
-
-      //System.out.println("write int @pos=" + (fixedArrayStart-4) + " numArcs=" + nodeIn.numArcs);
       // create the header
       // TODO: clean this up: or just rewind+reuse and deal with it
       byte header[] = new byte[MAX_HEADER_SIZE]; 
       ByteArrayDataOutput bad = new ByteArrayDataOutput(header);
       // write a "false" first arc:
-      if (writeDirectly) {
-        bad.writeByte(ARCS_AS_ARRAY_WITH_GAPS);
-        bad.writeVInt(labelRange);
-      } else {
-        bad.writeByte(ARCS_AS_ARRAY_PACKED);
-        bad.writeVInt(nodeIn.numArcs);
-      }
+      bad.writeByte(ARCS_AS_ARRAY_PACKED);
+      bad.writeVInt(nodeIn.numArcs);
       bad.writeVInt(maxBytesPerArc);
       int headerLen = bad.getPosition();
       
       final long fixedArrayStart = startAddress + headerLen;
 
-      if (writeDirectly) {
-        writeArrayWithGaps(builder, nodeIn, fixedArrayStart, maxBytesPerArc, labelRange);
-      } else {
-        writeArrayPacked(builder, nodeIn, fixedArrayStart, maxBytesPerArc);
-      }
+      writeArrayPacked(builder, nodeIn, fixedArrayStart, maxBytesPerArc);
       
       // now write the header
       builder.bytes.writeBytes(startAddress, header, 0, headerLen);
@@ -707,45 +689,7 @@ public final class FST<T> implements Accountable {
     }
   }
 
-  private void writeArrayWithGaps(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long fixedArrayStart, int maxBytesPerArc, int labelRange) {
-    // expand the arcs in place, backwards
-    long srcPos = builder.bytes.getPosition();
-    long destPos = fixedArrayStart + labelRange * maxBytesPerArc;
-    // if destPos == srcPos it means all the arcs were the same length, and the array of them is *already* direct
-    assert destPos >= srcPos;
-    if (destPos > srcPos) {
-      builder.bytes.skipBytes((int) (destPos - srcPos));
-      int arcIdx = nodeIn.numArcs - 1;
-      int firstLabel = nodeIn.arcs[0].label;
-      int nextLabel = nodeIn.arcs[arcIdx].label;
-      for (int directArcIdx = labelRange - 1; directArcIdx >= 0; directArcIdx--) {
-        destPos -= maxBytesPerArc;
-        if (directArcIdx == nextLabel - firstLabel) {
-          int arcLen = builder.reusedBytesPerArc[arcIdx];
-          srcPos -= arcLen;
-          //System.out.println("  direct pack idx=" + directArcIdx + " arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos + " label=" + nextLabel);
-          if (srcPos != destPos) {
-            //System.out.println("  copy len=" + builder.reusedBytesPerArc[arcIdx]);
-            assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " reusedBytesPerArc[arcIdx]=" + builder.reusedBytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs;
-            builder.bytes.copyBytes(srcPos, destPos, arcLen);
-            if (arcIdx == 0) {
-              break;
-            }
-          }
-          --arcIdx;
-          nextLabel = nodeIn.arcs[arcIdx].label;
-        } else {
-          assert directArcIdx > arcIdx;
-          // mark this as a missing arc
-          //System.out.println("  direct pack idx=" + directArcIdx + " no arc");
-          builder.bytes.writeByte(destPos, BIT_MISSING_ARC);
-        }
-      }
-    }
-  }
-
-  /** Fills virtual 'start' arc, ie, an empty incoming arc to
-   *  the FST's start node */
+  /** Fills virtual 'start' arc, ie, an empty incoming arc to the FST's start node */
   public Arc<T> getFirstArc(Arc<T> arc) {
     T NO_OUTPUT = outputs.getNoOutput();
 
@@ -786,18 +730,13 @@ public final class FST<T> implements Accountable {
     } else {
       in.setPosition(follow.target);
       final byte b = in.readByte();
-      if (b == ARCS_AS_ARRAY_PACKED || b == ARCS_AS_ARRAY_WITH_GAPS) {
+      if (b == ARCS_AS_ARRAY_PACKED) {
         // array: jump straight to end
         arc.numArcs = in.readVInt();
         arc.bytesPerArc = in.readVInt();
         //System.out.println("  array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc);
         arc.posArcsStart = in.getPosition();
-        if (b == ARCS_AS_ARRAY_WITH_GAPS) {
-          arc.arcIdx = Integer.MIN_VALUE;
-          arc.nextArc = arc.posArcsStart - (arc.numArcs - 1) * arc.bytesPerArc;
-        } else {
-          arc.arcIdx = arc.numArcs - 2;
-        }
+        arc.arcIdx = arc.numArcs - 2;
       } else {
         arc.flags = b;
         // non-array: linear scan
@@ -868,7 +807,7 @@ public final class FST<T> implements Accountable {
     //System.out.println("   flags=" + arc.flags);
 
     byte flags = in.readByte();
-    if (flags == ARCS_AS_ARRAY_PACKED || flags == ARCS_AS_ARRAY_WITH_GAPS) {
+    if (flags == ARCS_AS_ARRAY_PACKED) {
       //System.out.println("  fixedArray");
       // this is first arc in a fixed-array
       arc.numArcs = in.readVInt();
@@ -901,7 +840,7 @@ public final class FST<T> implements Accountable {
     } else {
       in.setPosition(follow.target);
       byte flags = in.readByte();
-      return flags == ARCS_AS_ARRAY_PACKED || flags == ARCS_AS_ARRAY_WITH_GAPS;
+      return flags == ARCS_AS_ARRAY_PACKED;
     }
   }
 
@@ -931,7 +870,7 @@ public final class FST<T> implements Accountable {
       in.setPosition(pos);
 
       final byte flags = in.readByte();
-      if (flags == ARCS_AS_ARRAY_PACKED || flags == ARCS_AS_ARRAY_WITH_GAPS) {
+      if (flags == ARCS_AS_ARRAY_PACKED) {
         //System.out.println("    nextArc fixed array");
         in.readVInt();
 
@@ -1140,34 +1079,7 @@ public final class FST<T> implements Accountable {
     // System.out.println("fta label=" + (char) labelToMatch);
 
     byte flags = in.readByte();
-    if (flags == ARCS_AS_ARRAY_WITH_GAPS) {
-      arc.numArcs = in.readVInt();
-      arc.bytesPerArc = in.readVInt();
-      arc.posArcsStart = in.getPosition();
-
-      // Array is direct; address by label
-      in.skipBytes(1);
-      int firstLabel = readLabel(in);
-      int arcPos = labelToMatch - firstLabel;
-      if (arcPos == 0) {
-        arc.nextArc = arc.posArcsStart;
-      } else if (arcPos > 0) {
-        if (arcPos >= arc.numArcs) {
-          return null;
-        }
-        in.setPosition(arc.posArcsStart - arc.bytesPerArc * arcPos);
-        flags = in.readByte();
-        if (flag(flags, BIT_MISSING_ARC)) {
-          return null;
-        }
-        // point to flags that we just read
-        arc.nextArc = in.getPosition() + 1;
-      } else {
-        return null;
-      }
-      arc.arcIdx = Integer.MIN_VALUE;
-      return readNextRealArc(arc, in);
-    } else if (flags == ARCS_AS_ARRAY_PACKED) {
+    if (flags == ARCS_AS_ARRAY_PACKED) {
       arc.numArcs = in.readVInt();
       arc.bytesPerArc = in.readVInt();
       arc.posArcsStart = in.getPosition();