You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/01/08 20:27:25 UTC
svn commit: r1228928 - in /lucene/dev/trunk/lucene: CHANGES.txt
src/java/org/apache/lucene/util/fst/FST.java
Author: mikemccand
Date: Sun Jan 8 19:27:25 2012
New Revision: 1228928
URL: http://svn.apache.org/viewvc?rev=1228928&view=rev
Log:
LUCENE-3681: use 2 bytes (unsigned short) to save label for FST.INPUT_TYPE.BYTE2 case
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1228928&r1=1228927&r2=1228928&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Sun Jan 8 19:27:25 2012
@@ -714,6 +714,11 @@ Changes in backwards compatibility polic
contrib/queryparser. If you have used those classes in your code
just add the lucene-queryparser.jar file to your classpath.
(Uwe Schindler)
+
+* LUCENE-3681: FST now stores labels for BYTE2 input type as 2 bytes
+ instead of vInt; this can make FSTs smaller and faster, but it
+ breaks the binary format, so any FSTs you previously built and
+ saved must be rebuilt. (Robert Muir, Mike McCandless)
Security fixes
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java?rev=1228928&r1=1228927&r2=1228928&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java Sun Jan 8 19:27:25 2012
@@ -92,7 +92,10 @@ public class FST<T> {
/** Changed numBytesPerArc for array'd case from byte to int. */
private final static int VERSION_INT_NUM_BYTES_PER_ARC = 1;
- private final static int VERSION_CURRENT = VERSION_INT_NUM_BYTES_PER_ARC;
+ /** Write BYTE2 labels as 2-byte short, not vInt. */
+ private final static int VERSION_SHORT_BYTE2_LABELS = 2;
+
+ private final static int VERSION_CURRENT = VERSION_SHORT_BYTE2_LABELS;
// Never serialized; just used to represent the virtual
// final node w/ no arcs:
@@ -199,7 +202,9 @@ public class FST<T> {
public FST(DataInput in, Outputs<T> outputs) throws IOException {
this.outputs = outputs;
writer = null;
- CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_INT_NUM_BYTES_PER_ARC, VERSION_INT_NUM_BYTES_PER_ARC);
+ // NOTE: only reads the most recent format; we make no
+ // back-compat promise for FSTs (they are experimental):
+ CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_SHORT_BYTE2_LABELS, VERSION_SHORT_BYTE2_LABELS);
if (in.readByte() == 1) {
// accepts empty string
int numBytes = in.readVInt();
@@ -389,7 +394,7 @@ public class FST<T> {
writer.writeByte((byte) v);
} else if (inputType == INPUT_TYPE.BYTE2) {
assert v <= 65535: "v=" + v;
- writer.writeVInt(v);
+ writer.writeShort((short) v);
} else {
//writeInt(v);
writer.writeVInt(v);
@@ -399,7 +404,11 @@ public class FST<T> {
int readLabel(DataInput in) throws IOException {
final int v;
if (inputType == INPUT_TYPE.BYTE1) {
+ // Unsigned byte:
v = in.readByte()&0xFF;
+ } else if (inputType == INPUT_TYPE.BYTE2) {
+ // Unsigned short:
+ v = in.readShort()&0xFFFF;
} else {
v = in.readVInt();
}