You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ha...@apache.org on 2013/07/08 18:08:32 UTC
svn commit: r1500814 - in
/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene:
codecs/temp/TempFSTTermsWriter.java codecs/temp/TempTermOutputs.java
util/fst/Outputs.java
Author: han
Date: Mon Jul 8 16:08:32 2013
New Revision: 1500814
URL: http://svn.apache.org/r1500814
Log:
LUCENE-3069: reader part, update logic in outputs
Modified:
lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsWriter.java
lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java
lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/fst/Outputs.java
Modified: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsWriter.java?rev=1500814&r1=1500813&r2=1500814&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsWriter.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsWriter.java Mon Jul 8 16:08:32 2013
@@ -46,7 +46,6 @@ import org.apache.lucene.codecs.CodecUti
/** FST based term dict, all the metadata held
* as output of FST */
-// nocommit: where is 'TermStats' ???
public class TempFSTTermsWriter extends FieldsConsumer {
static final String TERMS_EXTENSION = "tmp";
static final String TERMS_CODEC_NAME = "FST_TERMS_DICT";
Modified: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java?rev=1500814&r1=1500813&r2=1500814&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java Mon Jul 8 16:08:32 2013
@@ -33,7 +33,7 @@ import org.apache.lucene.util.LongsRef;
public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
private final static TempMetaData NO_OUTPUT = new TempMetaData();
private static boolean DEBUG = false;
- private FieldInfo fieldInfo;
+ private boolean hasPos;
private int longsSize;
public static class TempMetaData {
@@ -104,23 +104,26 @@ public class TempTermOutputs extends Out
}
protected TempTermOutputs(FieldInfo fieldInfo, int longsSize) {
- this.fieldInfo = fieldInfo;
+ this.hasPos = (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY);
this.longsSize = longsSize;
}
@Override
//
- // Since longs blob is fixed length, when these two are 'comparable'
- // i.e. when every value in long[] fits the same ordering, the smaller one
- // will be the result.
+ // The return value will be the smaller one, when these two are
+ // 'comparable', i.e. every value in long[] fits the same ordering.
//
- // NOTE: only long[] is 'shared', i.e. if there are two byte[] on the successive
- // arcs, only the last byte[] is valid. (this somewhat saves nodes, but might affect
- // compression, since we'll have to load metadata block for other terms as well, currently,
- // we don't support this)
+ // NOTE:
+ // Only long[] is 'shared', byte[] and term stats simply act
+ // as 'attachment': when walking on the FST, if we see two byte[] on
+ // successive arcs, only the second byte[] is valid.
//
- // nocommit: get the byte[] from smaller one as well, so that
- // byte[] is actually inherited
+ // Therefore, during building, we always make sure that, for most nodes,
+ // the first output is 'pushed' one step towards root and reduced to
+ // be NO_OUTPUT, so that we get rid of the 'all zero' long[], and netly
+ // get smaller amount of total outputs.
+ //
+ // However, when decoding, terms might have to load redundant byte[] blob.
//
public TempMetaData common(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("common("+t1+", "+t2+") = ");
@@ -128,14 +131,11 @@ public class TempTermOutputs extends Out
if (DEBUG) System.out.println("ret:"+NO_OUTPUT);
return NO_OUTPUT;
}
- assert t1.longs != null;
- assert t2.longs != null;
assert t1.longs.length == t2.longs.length;
- long accum = 0;
long[] longs1 = t1.longs, longs2 = t2.longs;
int pos = 0;
- boolean order = true;
+ boolean smaller = true;
TempMetaData ret;
while (pos < longsSize && longs1[pos] == longs2[pos]) {
@@ -143,56 +143,45 @@ public class TempTermOutputs extends Out
}
if (pos < longsSize) {
// unequal
- order = (longs1[pos] > longs2[pos]);
- if (order) {
- // check whether strictly longs1 >= longs2
- while (pos < longsSize && longs1[pos] >= longs2[pos]) {
- accum += longs2[pos];
+ smaller = (longs1[pos] < longs2[pos]);
+ if (smaller) {
+ // check whether strictly longs1 <= longs2
+ while (pos < longsSize && longs1[pos] <= longs2[pos]) {
pos++;
}
} else {
- // check whether strictly longs1 <= longs2
- while (pos < longsSize && longs1[pos] <= longs2[pos]) {
- accum += longs1[pos];
+ // check whether strictly longs1 >= longs2
+ while (pos < longsSize && longs1[pos] >= longs2[pos]) {
pos++;
}
}
- if (pos < longsSize || accum == 0) {
+ if (pos < longsSize) { // not fully 'comparable'
ret = NO_OUTPUT;
- } else if (order) {
- ret = new TempMetaData(longs2, null, 0, -1);
- } else {
- ret = new TempMetaData(longs1, null, 0, -1);
- }
- } else {
- // equal
- if (t1.bytes!= null && bytesEqual(t1, t2) && statsEqual(t1, t2)) { // all fields are equal
+ } else if (smaller) {
ret = t1;
- } else if (accum == 0) { // all zero case
- ret = NO_OUTPUT;
} else {
- ret = new TempMetaData(longs1, null, 0, -1);
+ ret = t2;
}
+ } else {
+ // equal, we won't check byte[] and docFreq
+ ret = t1;
}
if (DEBUG) System.out.println("ret:"+ret);
return ret;
}
@Override
- // nocommit:
- // this *actually* always assume that t2 <= t1 before calling the method
public TempMetaData subtract(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("subtract("+t1+", "+t2+") = ");
if (t2 == NO_OUTPUT) {
if (DEBUG) System.out.println("ret:"+t1);
return t1;
}
- assert t1.longs != null;
- assert t2.longs != null;
+ assert t1.longs.length == t2.longs.length;
int pos = 0;
long diff = 0;
- long[] share = new long[longsSize]; // nocommit: reuse
+ long[] share = new long[longsSize];
while (pos < longsSize) {
share[pos] = t1.longs[pos] - t2.longs[pos];
@@ -201,7 +190,7 @@ public class TempTermOutputs extends Out
}
TempMetaData ret;
- if (diff == 0 && bytesEqual(t1, t2) && statsEqual(t1, t2)) {
+ if (diff == 0 && statsEqual(t1, t2) && bytesEqual(t1, t2)) {
ret = NO_OUTPUT;
} else {
ret = new TempMetaData(share, t1.bytes, t1.docFreq, t1.totalTermFreq);
@@ -210,16 +199,7 @@ public class TempTermOutputs extends Out
return ret;
}
- static boolean statsEqual(final TempMetaData t1, final TempMetaData t2) {
- return t1.docFreq == t2.docFreq && t1.totalTermFreq == t2.totalTermFreq;
- }
- static boolean bytesEqual(final TempMetaData t1, final TempMetaData t2) {
- return Arrays.equals(t1.bytes, t2.bytes);
- }
-
@Override
- // nocommit: need to check all-zero case?
- // so we can reuse one long[]
public TempMetaData add(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("add("+t1+", "+t2+") = ");
if (t1 == NO_OUTPUT) {
@@ -229,22 +209,16 @@ public class TempTermOutputs extends Out
if (DEBUG) System.out.println("ret:"+t1);
return t1;
}
- assert t1.longs != null;
- assert t2.longs != null;
+ assert t1.longs.length == t2.longs.length;
int pos = 0;
- long[] accum = new long[longsSize]; // nocommit: reuse?
+ long[] accum = new long[longsSize];
while (pos < longsSize) {
accum[pos] = t1.longs[pos] + t2.longs[pos];
- assert(accum[pos] >= 0);
pos++;
}
TempMetaData ret;
- if (t2.bytes != null || t2.docFreq > 0) {
- ret = new TempMetaData(accum, t2.bytes, t2.docFreq, t2.totalTermFreq);
- } else {
- ret = new TempMetaData(accum, t1.bytes, t1.docFreq, t1.totalTermFreq);
- }
+ ret = new TempMetaData(accum, t2.bytes, t2.docFreq, t2.totalTermFreq);
if (DEBUG) System.out.println("ret:"+ret);
return ret;
}
@@ -263,7 +237,7 @@ public class TempTermOutputs extends Out
}
if (data.docFreq > 0) {
out.writeVInt(data.docFreq);
- if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+ if (hasPos) {
out.writeVLong(data.totalTermFreq - data.docFreq);
}
}
@@ -272,26 +246,25 @@ public class TempTermOutputs extends Out
@Override
public TempMetaData read(DataInput in) throws IOException {
long[] longs = new long[longsSize];
+ byte[] bytes = null;
+ int docFreq = 0;
+ long totalTermFreq = -1;
for (int pos = 0; pos < longsSize; pos++) {
longs[pos] = in.readVLong();
}
int code = in.readVInt();
int bytesSize = code >>> 1;
- int docFreq = 0;
- long totalTermFreq = -1;
- byte[] bytes = null;
if (bytesSize > 0) {
bytes = new byte[bytesSize];
in.readBytes(bytes, 0, bytes.length);
}
if ((code & 1) == 1) {
docFreq = in.readVInt();
- if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+ if (hasPos) {
totalTermFreq = docFreq + in.readVLong();
}
}
- TempMetaData meta = new TempMetaData(longs, bytes, docFreq, totalTermFreq);
- return meta;
+ return new TempMetaData(longs, bytes, docFreq, totalTermFreq);
}
@Override
@@ -303,5 +276,11 @@ public class TempTermOutputs extends Out
public String outputToString(TempMetaData data) {
return data.toString();
}
+
+ static boolean statsEqual(final TempMetaData t1, final TempMetaData t2) {
+ return t1.docFreq == t2.docFreq && t1.totalTermFreq == t2.totalTermFreq;
+ }
+ static boolean bytesEqual(final TempMetaData t1, final TempMetaData t2) {
+ return Arrays.equals(t1.bytes, t2.bytes);
+ }
}
-
Modified: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/fst/Outputs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/fst/Outputs.java?rev=1500814&r1=1500813&r2=1500814&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/fst/Outputs.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/fst/Outputs.java Mon Jul 8 16:08:32 2013
@@ -40,7 +40,7 @@ public abstract class Outputs<T> {
// (new object per byte/char/int) if eg used during
// analysis
- /** Eg common("foo", "foobar") -> "foo" */
+ /** Eg common("foobar", "food") -> "foo" */
public abstract T common(T output1, T output2);
/** Eg subtract("foobar", "foo") -> "bar" */