You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by mh...@apache.org on 2013/09/24 20:33:24 UTC
[48/50] [abbrv] git commit: fixed fst errors
fixed fst errors
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/b7ca14a7
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/b7ca14a7
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/b7ca14a7
Branch: refs/heads/branch_4x
Commit: b7ca14a77922a0e4d216781f74b9c1379f350698
Parents: 96a95e3
Author: James Blair <jm...@gmail.com>
Authored: Tue Aug 13 15:49:10 2013 -0400
Committer: Paul Irwin <pa...@gmail.com>
Committed: Tue Aug 13 16:18:11 2013 -0400
----------------------------------------------------------------------
src/core/Util/Fst/FST.cs | 478 +++++++++++++++++++++++++++++++++++++-----
1 file changed, 423 insertions(+), 55 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b7ca14a7/src/core/Util/Fst/FST.cs
----------------------------------------------------------------------
diff --git a/src/core/Util/Fst/FST.cs b/src/core/Util/Fst/FST.cs
index c6cbcd6..70f4e8e 100644
--- a/src/core/Util/Fst/FST.cs
+++ b/src/core/Util/Fst/FST.cs
@@ -9,13 +9,15 @@ using Lucene.Net.Codecs;
namespace Lucene.Net.Util.Fst
{
- public class FST<T> : FST
+ public sealed class FST<T> : FST
{
private readonly INPUT_TYPE inputType;
public INPUT_TYPE InputType { get { return inputType; } }
private int[] bytesPerArc = new int[0];
+ // if non-null, this FST accepts the empty string and
+ // produces this output
private T emptyOutput;
public T EmptyOutput
{
@@ -37,18 +39,24 @@ namespace Lucene.Net.Util.Fst
private readonly Outputs<T> outputs;
public Outputs<T> Outputs { get { return outputs; } }
+ // Used for the BIT_TARGET_NEXT optimization (whereby
+ // instead of storing the address of the target node for
+ // a given arc, we mark a single bit noting that the next
+ // node in the byte[] is the target node):
private long lastFrozenNode;
private readonly T NO_OUTPUT;
internal long NodeCount { get; set; }
-
public long ArcCount { get; set; }
public long ArcWithOutputCount { get; set; }
private readonly bool packed;
private PackedInts.IReader nodeRefToAddress;
+ /// <summary>
+ /// If arc has this label then that arc is final/accepted
+ /// </summary>
public const int END_LABEL = -1;
private readonly bool allowArrayArcs;
@@ -63,10 +71,15 @@ namespace Lucene.Net.Util.Fst
private GrowableWriter NodeAddress;
+ // TODO: we could be smarter here, and prune periodically
+ // as we go; high in-count nodes will "usually" become
+ // clear early on:
private GrowableWriter InCounts;
private readonly int Version;
+ // make a new empty FST, for building; Builder invokes
+ // this ctor
internal FST(INPUT_TYPE inputType, Outputs<T> outputs, bool willPackFST, float acceptableOverheadRatio,
bool allowArrayArcs, int bytesPageBits)
{
@@ -75,6 +88,8 @@ namespace Lucene.Net.Util.Fst
this.allowArrayArcs = allowArrayArcs;
Version = VERSION_CURRENT;
bytes = new BytesStore(bytesPageBits);
+ // pad: ensure no node gets address 0 which is reserved to mean
+ // the stop state w/ no arcs
bytes.WriteByte((Byte)0);
NO_OUTPUT = outputs.GetNoOutput();
if (willPackFST)
@@ -95,31 +110,53 @@ namespace Lucene.Net.Util.Fst
public static readonly int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28;
+ /// <summary>
+ /// Load a previously saved FST.
+ /// </summary>
+ /// <param name="input"></param>
+ /// <param name="outputs"></param>
public FST(DataInput input, Outputs<T> outputs)
: this(input, outputs, DEFAULT_MAX_BLOCK_BITS)
{
}
+ /// <summary>
+ /// Load a previously saved FST; maxBlockBits allows you to
+ /// control the size of the byte[] pages used to hold the FST bytes.
+ /// </summary>
+ /// <param name="dataInput"></param>
+ /// <param name="outputs"></param>
+ /// <param name="maxBlockBits"></param>
public FST(DataInput dataInput, Outputs<T> outputs, int maxBlockBits)
{
this.outputs = outputs;
- if (maxBlockBits < 1 || maxBlockBits > 30) throw new ArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits, "maxBlockBits");
- Version = CodecUtil.CheckHeader(dataInput, FILE_FORMAT_NAME, VERSIONpacked, VERSION_VINT_TARGET);
+ if (maxBlockBits < 1 || maxBlockBits > 30)
+ throw new ArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits, "maxBlockBits");
+
+ // NOTE: only reads most recent format; we don't have
+ // back-compat promise for FSTs (they are experimental):
+ Version = CodecUtil.CheckHeader(dataInput, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_VINT_TARGET);
packed = dataInput.ReadByte() == 1;
if (dataInput.ReadByte() == 1)
{
+ // accepts empty string
+ // 1 KB blocks:
var emptyBytes = new BytesStore(10);
int numBytes = dataInput.ReadVInt();
emptyBytes.CopyBytes(dataInput, numBytes);
+ // De-serialize empty-string output:
FST.BytesReader reader;
if (packed)
reader = emptyBytes.GetForwardReader();
else
{
reader = emptyBytes.GetReverseReader();
+ // NoOutputs uses 0 bytes when writing its output,
+ // so we have to check here else BytesStore gets
+ // angry:
if (numBytes > 0)
reader.Position = numBytes - 1;
}
@@ -159,9 +196,16 @@ namespace Lucene.Net.Util.Fst
CacheRootArcs();
+ // NOTE: bogus because this is only used during
+ // building; we need to break out mutable FST from
+ // immutable
allowArrayArcs = false;
}
+ /// <summary>
+ /// Returns bytes used to represent the FST.
+ /// </summary>
+ /// <returns></returns>
public long SizeInBytes()
{
var size = bytes.GetPosition();
@@ -197,6 +241,7 @@ namespace Lucene.Net.Util.Fst
return node;
}
+ // Caches first 128 labels
private void CacheRootArcs()
{
cachedRootArcs = new Arc<T>[0x80];
@@ -238,6 +283,8 @@ namespace Lucene.Net.Util.Fst
else
output.WriteByte((sbyte)0);
+ // TODO: really we should encode this as an arc, arriving
+ // to the root node, instead of special casing here:
if (emptyOutput != null)
{
// Accepts empty string
@@ -298,10 +345,14 @@ namespace Lucene.Net.Util.Fst
bytes.WriteTo(output);
}
- public void Save(FileStream fileStream)
+ /// <summary>
+ /// Writes an automaton to a file
+ /// </summary>
+ /// <param name="file"></param>
+ public void Save(FileInfo fileInfo)
{
var success = false;
- var bs = new BufferedStream(fileStream);
+ var bs = new BufferedStream(new FileStream(fileInfo.FullName));
try
{
Save(new OutputStreamDataOutput(bs));
@@ -316,9 +367,16 @@ namespace Lucene.Net.Util.Fst
}
}
- public static FST<TMethod> Read<TMethod>(FileStream fileStream, Outputs<TMethod> outputs) where TMethod : class
+ /// <summary>
+ /// Reads an automaton from a file
+ /// </summary>
+ /// <typeparam name="TMethod"></typeparam>
+ /// <param name="fileInfo"></param>
+ /// <param name="outputs"></param>
+ /// <returns></returns>
+ public static FST<TMethod> Read<TMethod>(FileStream fileInfo, Outputs<TMethod> outputs) where TMethod : class
{
- var bs = new BufferedStream(fileStream);
+ var bs = new BufferedStream(new FileStream(fileInfo));
var success = false;
try
{
@@ -373,11 +431,20 @@ namespace Lucene.Net.Util.Fst
return v;
}
+ /// <summary>
+ /// Returns true if the node at this address
+ /// has any outgoing arcs
+ /// </summary>
+ /// <typeparam name="TMethod"></typeparam>
+ /// <param name="arc"></param>
+ /// <returns></returns>
public static bool TargetHasArcs<TMethod>(Arc<TMethod> arc)
{
return arc.Target > 0;
}
+ // Serializes new node by appending its bytes to the end
+ // of the current byte[]
internal long AddNode(Builder<T>.UnCompiledNode<T> nodeIn)
{
if (nodeIn.NumArcs == 0)
@@ -408,6 +475,9 @@ namespace Lucene.Net.Util.Fst
flags += BIT_LAST_ARC;
if (lastFrozenNode == target.Node && !doFixedArray)
+ // TODO: for better perf (but more RAM used) we
+ // could avoid this except when arc is "near" the
+ // last arc:
flags += BIT_TARGET_NEXT;
if (arc.IsFinal)
@@ -418,7 +488,6 @@ namespace Lucene.Net.Util.Fst
}
else
{
- // TODO: Assert is correct here?
//Debug.Assert(arc.NextFinalOutput == NO_OUTPUT);
}
@@ -453,6 +522,9 @@ namespace Lucene.Net.Util.Fst
bytes.WriteVLong(target.Node);
}
+ // just write the arcs "like normal" on first pass,
+ // but record how many bytes each one took, and max
+ // byte size:
if (doFixedArray)
{
bytesPerArc[arcIdx] = (int)(bytes.GetPosition() - lastArcStart);
@@ -461,14 +533,36 @@ namespace Lucene.Net.Util.Fst
}
}
+ // TODO: try to avoid wasteful cases: disable doFixedArray in that case
+ /*
+ *
+ * LUCENE-4682: what is a fair heuristic here?
+ * It could involve some of these:
+ * 1. how "busy" the node is: nodeIn.inputCount relative to frontier[0].inputCount?
+ * 2. how much binSearch saves over scan: nodeIn.numArcs
+ * 3. waste: numBytes vs numBytesExpanded
+ *
+ * the one below just looks at #3
+ if (doFixedArray) {
+ // rough heuristic: make this 1.25 "waste factor" a parameter to the phd ctor????
+ int numBytes = lastArcStart - startAddress;
+ int numBytesExpanded = maxBytesPerArc * nodeIn.numArcs;
+ if (numBytesExpanded > numBytes*1.25) {
+ doFixedArray = false;
+ }
+ }
+ */
+
if (doFixedArray)
{
var MAX_HEADER_SIZE = 11;
- // TODO: assert correct here?
- Debug.Assert(maxBytesPerArc > 0);
+ // assert maxBytesPerArc > 0;
+ // create the header
+ // TODO: clean this up: or just rewind+reuse and deal with it
var header = new byte[MAX_HEADER_SIZE];
var bad = new ByteArrayDataOutput(header);
+ // write a "false" first arc:
bad.WriteByte(ARCS_AS_FIXED_ARRAY);
bad.WriteVInt(nodeIn.NumArcs);
bad.WriteVInt(maxBytesPerArc);
@@ -476,10 +570,10 @@ namespace Lucene.Net.Util.Fst
long fixedArrayStart = startAddress + headerLen;
+ // expand the arcs in place, backwards
var srcPos = bytes.GetPosition();
var destPos = fixedArrayStart + nodeIn.NumArcs * maxBytesPerArc;
- // TODO: assert correct here?
- Debug.Assert(destPos >= srcPos);
+ // assert destPos >= srcPos
if (destPos > srcPos)
{
bytes.SkipBytes((int)(destPos - srcPos));
@@ -517,6 +611,7 @@ namespace Lucene.Net.Util.Fst
long node;
if (NodeAddress != null)
{
+ // Nodes are addressed by 1+ord:
if ((int)NodeCount == NodeAddress.Size())
{
NodeAddress =
@@ -536,6 +631,12 @@ namespace Lucene.Net.Util.Fst
return node;
}
+ /// <summary>
+ /// Fills virtual 'start' arc, ie, an empty incoming arc
+ /// to the FST's start node
+ /// </summary>
+ /// <param name="arc"></param>
+ /// <returns></returns>
public Arc<T> GetFirstArc(Arc<T> arc)
{
if (EmptyOutput != null)
@@ -550,15 +651,26 @@ namespace Lucene.Net.Util.Fst
}
arc.Output = NO_OUTPUT;
+ // If there are no nodes, ie, the FST only accepts the
+ // empty string, then startNode is 0
+ arc.Target = startNode;
return arc;
}
+ /// <summary>
+ /// Follows the <code>follow</code> arc and reads the last
+ /// arc of its target; this changes the provided
+ /// <code>arc</code> (2nd arg) in-place and returns it
+ /// </summary>
+ /// <param name="follow"></param>
+ /// <param name="arc"></param>
+ /// <param name="input"></param>
+ /// <returns>Returns the second argument</returns>
public Arc<T> ReadLastTargetArc(Arc<T> follow, Arc<T> arc, FST.BytesReader input)
{
if (!TargetHasArcs(follow))
{
- // TODO: assert correct here?
- Debug.Assert(follow.IsFinal());
+ // assert follow.isFinal();
arc.Label = END_LABEL;
arc.Target = FINAL_END_NODE;
arc.Output = follow.NextFinalOutput;
@@ -572,6 +684,7 @@ namespace Lucene.Net.Util.Fst
var b = input.ReadByte();
if (b == ARCS_AS_FIXED_ARRAY)
{
+ // array: jump straight to end
arc.NumArcs = input.ReadVInt();
if (packed || Version >= VERSION_VINT_TARGET)
arc.BytesPerArc = input.ReadVInt();
@@ -584,15 +697,21 @@ namespace Lucene.Net.Util.Fst
else
{
arc.Flags = (sbyte)b;
+ // non-array: linear scan
arc.BytesPerArc = 0;
while (!arc.IsLast())
{
+ // skip this arc:
ReadLabel(input);
if (arc.Flag(BIT_ARC_HAS_OUTPUT))
+ {
Outputs.Read(input);
+ }
if (arc.Flag(BIT_ARC_HAS_FINAL_OUTPUT))
+ {
Outputs.ReadFinalOutput(input);
+ }
if (arc.Flag(BIT_STOP_NODE))
{
}
@@ -606,14 +725,13 @@ namespace Lucene.Net.Util.Fst
arc.Flags = (sbyte)input.ReadByte();
}
-
+ // Undo the byte flags we read:
input.SkipBytes(-1);
arc.NextArc = input.Position;
}
ReadNextRealArc(arc, input);
- // TODO: assert is correct here?
- //Debug.Assert(arc.IsLast());
+ // assert arc.isLast();
return arc;
}
}
@@ -629,10 +747,20 @@ namespace Lucene.Net.Util.Fst
return target;
}
+ /// <summary>
+ /// Follow the <code>follow</code> arc and read the first arc of its target;
+ /// this changes the provided <code>arc</code> (2nd arg) in-place and returns it.
+ /// </summary>
+ /// <param name="follow"></param>
+ /// <param name="arc"></param>
+ /// <param name="input"></param>
+ /// <returns>Returns the second argument (<code>arc</code>)</returns>
public Arc<T> ReadFirstTargetArc(Arc<T> follow, Arc<T> arc, FST.BytesReader input)
{
+ // int pos = address;
if (follow.IsFinal())
{
+ // Insert "fake" final first arc:
arc.Label = END_LABEL;
arc.Output = follow.NextFinalOutput;
arc.Flags = (sbyte)BIT_FINAL_ARC;
@@ -641,6 +769,7 @@ namespace Lucene.Net.Util.Fst
else
{
arc.Node = follow.Target;
+ // NOTE: nextArc is a node (not an address!) in this case:
arc.NextArc = follow.Target;
}
arc.Target = FINAL_END_NODE;
@@ -679,6 +808,13 @@ namespace Lucene.Net.Util.Fst
return ReadNextRealArc(arc, input);
}
+ /// <summary>
+ /// Checks if <code>arc</code>'s target state is in expanded (or vector) format.
+ /// </summary>
+ /// <param name="follow"></param>
+ /// <param name="input"></param>
+ /// <returns>Returns <code>true</code> if <code>arc</code> points to a state
+ /// in an expanded array format.</returns>
internal bool IsExpandedTarget(Arc<T> follow, FST.BytesReader input)
{
if (!TargetHasArcs(follow))
@@ -690,10 +826,17 @@ namespace Lucene.Net.Util.Fst
}
}
+ /// <summary>
+ /// In-place read; returns the arc.
+ /// </summary>
+ /// <param name="arc"></param>
+ /// <param name="input"></param>
+ /// <returns></returns>
public Arc<T> ReadNextArc(Arc<T> arc, FST.BytesReader input)
{
if (arc.Label == END_LABEL)
{
+ // This was a fake isnerted "final" arc
if (arc.NextArc <= 0)
throw new ArgumentException("cannot readNextArc when arc.isLast()=true");
return ReadFirstRealTargetArc(arc.NextArc, arc, input);
@@ -704,10 +847,17 @@ namespace Lucene.Net.Util.Fst
}
}
+ /// <summary>
+ /// Peeks at next arc's label; does not alter arc. Do
+ /// not call this if arc.IsLast()!
+ /// </summary>
+ /// <param name="arc"></param>
+ /// <param name="input"></param>
+ /// <returns></returns>
public int ReadNextArcLabel(Arc<T> arc, FST.BytesReader input)
{
- // TODO: assert correct here?
- Debug.Assert(!arc.IsLast());
+ if (arc.IsLast)
+ throw new ArgumentException("cannot readNextArc when arc.isLast()=true");
if (arc.Label == END_LABEL)
{
@@ -719,6 +869,7 @@ namespace Lucene.Net.Util.Fst
{
input.ReadVInt();
+ // Skip bytesPerArc:
if (packed || Version >= VERSION_VINT_TARGET)
input.ReadVInt();
else
@@ -726,60 +877,101 @@ namespace Lucene.Net.Util.Fst
}
else
{
+ input.Position = pos;
+ }
+ }
+ else
+ {
+ if (arc.BytesPerArc != 0)
+ {
+ // arcs are at fixed entries
+ input.Position = arc.PosArcsStart;
+ input.SkipBytes((1 + arc.ArcIdx) * arc.BytesPerArc);
+ }
+ else
+ {
+ // arcs are packed
input.Position = arc.NextArc;
}
}
+ // skip flags
input.ReadByte();
return ReadLabel(input);
}
+ /// <summary>
+ /// Never returns null, but you should never call this if
+ /// arc.IsLast() is true.
+ /// </summary>
+ /// <param name="arc"></param>
+ /// <param name="input"></param>
+ /// <returns></returns>
public Arc<T> ReadNextRealArc(Arc<T> arc, FST.BytesReader input)
{
+ // TODO: can't assert this because we call from readFirstArc
+ // assert !flag(arc.flags, BIT_LAST_ARC);
+
+ // this is a continuing arc in a fixed array
if (arc.BytesPerArc != 0)
{
+ // arcs are at fixed entries
arc.ArcIdx++;
- // TODO: assert correct here?
- Debug.Assert(arc.ArcIdx < arc.NumArcs);
+ // assert arc.arcIdx < arc.numArcs;
input.Position = arc.PosArcsStart;
input.SkipBytes(arc.ArcIdx * arc.BytesPerArc);
}
else
{
+ // arcs are packed
input.Position = arc.NextArc;
}
arc.Flags = (sbyte)input.ReadByte();
arc.Label = ReadLabel(input);
if (arc.Flag(BIT_ARC_HAS_OUTPUT))
+ {
arc.Output = Outputs.Read(input);
+ }
else
+ {
arc.Output = Outputs.GetNoOutput();
+ }
if (arc.Flag(BIT_ARC_HAS_FINAL_OUTPUT))
+ {
arc.NextFinalOutput = Outputs.ReadFinalOutput(input);
+ }
else
+ {
arc.NextFinalOutput = Outputs.GetNoOutput();
+ }
if (arc.Flag(BIT_STOP_NODE))
{
if (arc.Flag(BIT_FINAL_ARC))
+ {
arc.Target = FINAL_END_NODE;
+ }
else
+ {
arc.Target = NON_FINAL_END_NODE;
-
+ }
arc.NextArc = input.Position;
}
else if (arc.Flag(BIT_TARGET_NEXT))
{
arc.NextArc = input.Position;
+ // TODO: would be nice to make this lazy -- maybe
+ // caller doesn't need the target and is scanning arcs...
if (NodeAddress == null)
{
if (!arc.Flag(BIT_LAST_ARC))
{
if (arc.BytesPerArc == 0)
{
+ // must scan
SeekToNextNode(input);
}
else
@@ -793,8 +985,7 @@ namespace Lucene.Net.Util.Fst
else
{
arc.Target = arc.Node - 1;
- // TODO: assert correct here?
- Debug.Assert(arc.Target > 0);
+ // assert arc.target > 0
}
}
else
@@ -805,14 +996,17 @@ namespace Lucene.Net.Util.Fst
var code = input.ReadVLong();
if (arc.Flag(BIT_TARGET_DELTA))
{
+ // Address is delta-coded from current address:
arc.Target = pos + code;
}
else if (code < nodeRefToAddress.Size())
{
+ // Deref
arc.Target = nodeRefToAddress.Get((int)code);
}
else
{
+ // Absolute
arc.Target = code;
}
}
@@ -825,10 +1019,22 @@ namespace Lucene.Net.Util.Fst
return arc;
}
+ // TODO: could we somehow [partially] tableize arc lookups
+ // look automaton?
+
+ /// <summary>
+ /// Finds an arc leaving the incoming arc, replacing the arc in place.
+ /// This returns null if the arc was not found, else the incoming arc.
+ /// </summary>
+ /// <param name="labelToMatch"></param>
+ /// <param name="follow"></param>
+ /// <param name="arc"></param>
+ /// <param name="input"></param>
+ /// <returns></returns>
public Arc<T> FindTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, FST.BytesReader input)
{
- // TODO: appropriate error message
- if (cachedRootArcs == null) throw new InvalidOperationException("cachedRootArcs cannot be null");
+ if (cachedRootArcs == null)
+ throw new InvalidOperationException("cachedRootArcs cannot be null");
if (labelToMatch == END_LABEL)
{
@@ -841,6 +1047,7 @@ namespace Lucene.Net.Util.Fst
else
{
arc.Flags = 0;
+ // NOTE: nextArc is a node (not an address!) in this case:
arc.NextArc = follow.Target;
arc.Node = follow.Target;
}
@@ -854,6 +1061,7 @@ namespace Lucene.Net.Util.Fst
}
}
+ // Short-circuit if this arc is in the root arc cache:
if (follow.Target == startNode && labelToMatch < cachedRootArcs.Length)
{
var result = cachedRootArcs[labelToMatch];
@@ -867,7 +1075,9 @@ namespace Lucene.Net.Util.Fst
}
if (!TargetHasArcs(follow))
+ {
return null;
+ }
input.Position = GetNodeAddress(follow.Target);
@@ -875,6 +1085,7 @@ namespace Lucene.Net.Util.Fst
if (input.ReadByte() == ARCS_AS_FIXED_ARRAY)
{
+ // Arcs are full array; do binary search:
arc.NumArcs = input.ReadVInt();
if (packed || Version >= VERSION_VINT_TARGET)
arc.BytesPerArc = input.ReadVInt();
@@ -903,6 +1114,7 @@ namespace Lucene.Net.Util.Fst
return null;
}
+ // Linear scan
ReadFirstRealTargetArc(follow.Target, arc, input);
while (true)
@@ -938,8 +1150,6 @@ namespace Lucene.Net.Util.Fst
if (Flag(flags, BIT_LAST_ARC))
return;
-
-
}
}
@@ -949,6 +1159,20 @@ namespace Lucene.Net.Util.Fst
return 1 + NodeCount;
}
+
+ /// <summary>
+ /// Nodes will be expanded if their depth (distance from the root node) is
+ /// <= this value and their number of arcs is >=
+ /// (FIXED_ARRAY_NUM_ARCS_SHALLOW).
+ ///
+ /// Fixed array consumes more RAM but enables binary search on the arcs
+ /// (instead of a linear scan) on lookup by arc label.
+ /// </summary>
+ /// <param name="node"></param>
+ /// <returns><code>true</code> if <code>node</code> should be stored
+ /// in an expanded (array) form.</returns>
+ /// <see cref="FIXED_ARRAY_NUM_ARCS_DEEP"/>
+ /// <see cref="Builder.UnCompiledNode"/>
private bool ShouldExpand(Builder<T>.UnCompiledNode<T> node)
{
return allowArrayArcs &&
@@ -956,11 +1180,22 @@ namespace Lucene.Net.Util.Fst
node.NumArcs >= FIXED_ARRAY_NUM_ARCS_DEEP);
}
+ /// <summary>
+ /// Returns a BytesReader for this FST, positioned at position 0.
+ /// </summary>
+ /// <returns></returns>
public FST.BytesReader GetBytesReader()
{
- return packed ?
- bytes.GetForwardReader() :
- bytes.GetReverseReader();
+ BytesReader input;
+ if (packed)
+ {
+ input = bytes.GetForwardReader();
+ }
+ else
+ {
+ input = bytes.GetReverseReader();
+ }
+ return input;
}
@@ -977,6 +1212,7 @@ namespace Lucene.Net.Util.Fst
// public abstract void SkipBytes(int count);
//}
+ // Creates a packed FST
private FST(INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits)
{
Version = VERSION_CURRENT;
@@ -989,9 +1225,42 @@ namespace Lucene.Net.Util.Fst
allowArrayArcs = false;
}
+ /// <summary>
+ /// Expert: creates an FST by packing this one. This
+ /// process requires substantial additional RAM (currently
+ /// up to ~8 bytes per node depending on
+ /// <code>acceptableOverheadRatio</code>, but then should
+ /// produce a smaller FST.
+ ///
+ /// The implementation of this method uses ideas from
+ /// http://www.cs.put.poznan.pl/dweiss/site/publications/download/fsacomp.pdf
+ /// Smaller representation
+ /// which describes this techniques to reduce the size of a FST.
+ /// However, this is not a strict implementation of the
+ /// algorithms described in this paper.
+ /// </summary>
+ /// <param name="minInCountDeref"></param>
+ /// <param name="maxDerefNodes"></param>
+ /// <param name="acceptableOverheadRatio"></param>
+ /// <returns></returns>
internal FST<T> Pack(int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio)
{
- if (NodeAddress == null) throw new ArgumentException("this FST was not built with willPackFST=true");
+ // NOTE: maxDerefNodes is intentionally int: we cannot
+ // support > 2.1B deref nodes
+
+ // TODO: other things to try
+ // - renumber the nodes to get more next / better locality?
+ // - allow multiple input labels on an arc, so
+ // singular chain of inputs can take one arc (on
+ // wikipedia terms this could save another ~6%)
+ // - in the ord case, the output '1' is presumably
+ // very common (after NO_OUTPUT)... maybe use a bit
+ // for it..?
+ // - use spare bits in flags.... for top few labels /
+ // outputs / targets
+
+ if (NodeAddress == null)
+ throw new ArgumentException("this FST was not built with willPackFST=true");
var arc = new Arc<T>();
@@ -999,8 +1268,10 @@ namespace Lucene.Net.Util.Fst
var topN = Math.Min(maxDerefNodes, InCounts.Size());
+ // Find top nodes with highest number of incoming arcs:
var q = new NodeQueue(topN);
+ // TODO: we could use more RAM efficent solution algo here...
NodeAndInCount bottom = null;
for (var node = 0; node < InCounts.Size(); node++)
{
@@ -1021,6 +1292,7 @@ namespace Lucene.Net.Util.Fst
// Free up RAM
InCounts = null;
+
var topNodeMap = new HashMap<long, long>();
for (var downTo = q.Size - 1; downTo >= 0; downTo--)
{
@@ -1028,9 +1300,11 @@ namespace Lucene.Net.Util.Fst
topNodeMap.Add(n.Node, downTo);
}
+ // +1 because node ords start at 1 (0 is reserved as stop node):
var newNodeAddress = new GrowableWriter(PackedInts.BitsRequired(bytes.GetPosition()),
(int)(1 + NodeCount), acceptableOverheadRatio);
+ // Fill initial coarse guess:
for (var node = 1; node <= NodeCount; node++)
newNodeAddress.Set(node, 1 + bytes.GetPosition() - NodeAddress.Get(node));
@@ -1041,16 +1315,19 @@ namespace Lucene.Net.Util.Fst
FST<T> fst;
+ // Iterate until we converge:
while (true)
{
var changed = false;
+ // for assert:
var negDelta = false;
fst = new FST<T>(InputType, Outputs, bytes.GetBlockBits());
var writer = fst.bytes;
+ // Skip 0 byte since 0 is reserved target:
writer.WriteByte((sbyte)0);
fst.ArcWithOutputCount = 0;
@@ -1063,6 +1340,9 @@ namespace Lucene.Net.Util.Fst
long addressError = 0;
+ // Since we re-reverse the bytes, we now write the
+ // nodes backwards, so taht BIT_TARGET_NEXT is
+ // unchanged:
for (var node = (int)NodeCount; node >= 1; node--)
{
fst.NodeCount++;
@@ -1081,11 +1361,15 @@ namespace Lucene.Net.Util.Fst
var retry = false;
+ // for assert:
var anyNegDelta = false;
// in Java Lucene there is a label 'writeNode:'
- // writeNode:
- while (true)
+ //writeNode:
+
+ // Retry loop: possibly iterate more than once, if
+ // this is an array'd node and bytesPerArc changes
+ while (true) // retry writing this node
{
ReadFirstRealTargetArc(node, arc, r);
@@ -1127,8 +1411,7 @@ namespace Lucene.Net.Util.Fst
}
else
{
- // TODO: assert is correct here?
- //Debug.Assert(arc.NextFinalOutput == NO_OUTPUT);
+ // assert arc.nextFinalOutput == NO_OUTPUT;
}
if (!TargetHasArcs(arc))
@@ -1148,8 +1431,7 @@ namespace Lucene.Net.Util.Fst
else
absPtr = topNodeMap.Count + newNodeAddress.Get((int)arc.Target) + addressError;
- var delta = newNodeAddress.Get((int)arc.Target) + addressError - writer.GetPosition() -
- 2;
+ var delta = newNodeAddress.Get((int)arc.Target) + addressError - writer.GetPosition() - 2;
if (delta < 0)
{
anyNegDelta = true;
@@ -1157,15 +1439,16 @@ namespace Lucene.Net.Util.Fst
}
if (delta < absPtr)
+ {
flags += (sbyte)BIT_TARGET_DELTA;
+ }
}
else
{
absPtr = 0;
}
- // TODO: assert correct here?
- Debug.Assert(flags != ARCS_AS_FIXED_ARRAY);
+ // assert flags != ARCS_AS_FIXED_ARRAY
writer.WriteByte(flags);
fst.WriteLabel(writer, arc.Label);
@@ -1212,6 +1495,13 @@ namespace Lucene.Net.Util.Fst
{
var arcBytes = (int)(writer.GetPosition() - arcStartPos);
maxBytesPerArc = Math.Max(maxBytesPerArc, arcBytes);
+ // NOTE: this may in fact go "backwards", if
+ // somehow (rarely, possibly never) we use
+ // more bytesPerArc in this rewrite than the
+ // incoming FST did... but in this case we
+ // will retry (below) so it's OK to ovewrite
+ // bytes:
+ //wasted += bytesPerArc - arcBytes;
writer.SkipBytes((int)(arcStartPos + bytesPerArc - writer.GetPosition()));
}
@@ -1224,7 +1514,10 @@ namespace Lucene.Net.Util.Fst
if (useArcArray)
{
if (maxBytesPerArc == bytesPerArc || (retry && maxBytesPerArc <= bytesPerArc))
+ {
+ // converged
break;
+ }
}
else
{
@@ -1246,8 +1539,13 @@ namespace Lucene.Net.Util.Fst
if (!changed)
{
- // TODO: assert correct here?
- Debug.Assert(!negDelta);
+ // We don't renumber the nodes (just reverse their
+ // order) so nodes should only point forward to
+ // other nodes because we only produce acyclic FSTs
+ // w/ nodes only pointing "forwards":
+ // assert != negDelta
+
+ // Converged!
break;
}
}
@@ -1266,12 +1564,13 @@ namespace Lucene.Net.Util.Fst
fst.startNode = newNodeAddress.Get((int)startNode);
if (emptyOutput != null)
+ {
fst.emptyOutput = emptyOutput;
+ }
- // TODO: assert correct here?
- Debug.Assert(fst.NodeCount == NodeCount, "fst.NodeCount=" + fst.NodeCount + " NodeCount=" + NodeCount);
- Debug.Assert(fst.ArcCount == ArcCount);
- Debug.Assert(fst.ArcWithOutputCount == ArcWithOutputCount, "fst.ArcWithOutputCount=" + fst.ArcWithOutputCount + " ArcWithOutputCount=" + ArcWithOutputCount);
+ //assert fst.nodeCount == nodeCount: "fst.nodeCount=" + fst.nodeCount + " nodeCount=" + nodeCount;
+ //assert fst.arcCount == arcCount;
+ //assert fst.arcWithOutputCount == arcWithOutputCount: "fst.arcWithOutputCount=" + fst.arcWithOutputCount + " arcWithOutputCount=" + arcWithOutputCount;
fst.bytes.Finish();
fst.CacheRootArcs();
@@ -1290,69 +1589,136 @@ namespace Lucene.Net.Util.Fst
internal const int BIT_LAST_ARC = 1 << 1;
internal const int BIT_TARGET_NEXT = 1 << 2;
+ // TODO: we can free up a bit if we can nuke this:
internal const int BIT_STOP_NODE = 1 << 3;
internal const int BIT_ARC_HAS_OUTPUT = 1 << 4;
internal const int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
+ // Arcs are stored as fixed-size (per entry) array, so
+ // that we can find an arc using binary search. We do
+ // this when number of arcs is > NUM_ARCS_ARRAY:
+
+ // If set, thie target node is delta coded vs current position:
internal const int BIT_TARGET_DELTA = 1 << 6;
+ // We use this as a marker (because this one flag is
+ // illegal by itself ...):
internal const sbyte ARCS_AS_FIXED_ARRAY = BIT_ARC_HAS_FINAL_OUTPUT;
+ /// <summary>
+ /// <see cref="UnCompiledNode"/>
+ /// </summary>
internal const int FIXED_ARRAY_SHALLOW_DISTANCE = 3;
-
+
+ /// <summary>
+ /// <see cref="UnCompiledNode"/>
+ /// </summary>
internal const int FIXED_ARRAY_NUM_ARCS_SHALLOW = 5;
+ /// <summary>
+ /// <see cref="UnCompiledNode"/>
+ /// </summary>
internal const int FIXED_ARRAY_NUM_ARCS_DEEP = 10;
+ // Increment version to change it
internal const string FILE_FORMAT_NAME = "FST";
internal const int VERSION_START = 0;
+ /// <summary>
+ /// Changed numBytesPerArc for array'd case from byte to int.
+ /// </summary>
internal const int VERSION_INT_NUM_BYTES_PER_ARC = 1;
+ /// <summary>
+ /// Write BYTE2 labels as 2-byte short, not vInt.
+ /// </summary>
internal const int VERSION_SHORT_BYTE2_LABELS = 2;
- internal const int VERSIONpacked = 3;
+ /// <summary>
+ /// Added optional packed format.
+ /// </summary>
+ internal const int VERSION_PACKED = 3;
+ /// <summary>
+ /// Changed from int to vInt for encoding arc targets.
+ /// Also changed maxBytesPerArc from int to vInt in the array case.
+ /// </summary>
internal const int VERSION_VINT_TARGET = 4;
internal const int VERSION_CURRENT = VERSION_VINT_TARGET;
+ // Never serialized; just used to represent the virtual
+ // final node w/ no arcs:
internal const long FINAL_END_NODE = -1;
+ // Never serialized; just used to represent the virtual
+ // non-final node w/ no arcs:
internal const long NON_FINAL_END_NODE = 0;
-
+ /// <summary>
+ /// Reads bytes stored in an FST.
+ /// </summary>
public abstract class BytesReader : DataInput
{
+ /// <summary>
+ /// Current read position
+ /// </summary>
public abstract long Position { get; set; }
+ /// <summary>
+ /// Returns true if this reader uses reversed bytes
+ /// under-the-hood.
+ /// </summary>
+ /// <returns></returns>
public abstract bool Reversed();
+ /// <summary>
+ /// Skips bytes.
+ /// </summary>
+ /// <param name="count"></param>
public abstract void SkipBytes(int count);
}
+ /// <summary>
+ /// Specifies allowed range of each int input label for this FST.
+ /// </summary>
public enum INPUT_TYPE { BYTE1, BYTE2, BYTE4 }
+ /// <summary>
+ /// Represents a single arc.
+ /// </summary>
+ /// <typeparam name="T"></typeparam>
public sealed class Arc<T>
{
public int Label { get; set; }
public T Output { get; set; }
+ // From node (ord or address); currently only used when
+ // building an FST w/ willPackFST=true:
internal long Node { get; set; }
+ /// <summary>
+ /// To node (ord or address)
+ /// </summary>
public long Target { get; set; }
internal sbyte Flags { get; set; }
-
public T NextFinalOutput { get; set; }
+ // address (into the byte[]), or ord/address if label == END_LABEL
internal long NextArc { get; set; }
+ // This is non-zero if current arcs are fixed array:
internal long PosArcsStart { get; set; }
internal int BytesPerArc { get; set; }
internal int ArcIdx { get; set; }
internal int NumArcs { get; set; }
+ /// <summary>
+ /// Return this
+ /// </summary>
+ /// <param name="other"></param>
+ /// <returns></returns>
public Arc<T> CopyFrom(Arc<T> other)
{
Node = other.Node;
@@ -1436,8 +1802,11 @@ namespace Lucene.Net.Util.Fst
public int CompareTo(NodeAndInCount other)
{
- if (Count > other.Count) return 1;
- if (Count < other.Count) return -1;
+ if (Count > other.Count)
+ return 1;
+ if (Count < other.Count)
+ return -1;
+ // Tie-break: smaller node compares as greater than
return other.Node - Node;
}
}
@@ -1452,8 +1821,7 @@ namespace Lucene.Net.Util.Fst
public override bool LessThan(NodeAndInCount a, NodeAndInCount b)
{
var cmp = a.CompareTo(b);
- // TODO: assert correct here?
- Debug.Assert(cmp != 0);
+ // assert cmp != 0;
return cmp < 0;
}
}