You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/03/12 13:37:46 UTC
svn commit: r1299651 - in /lucene/dev/trunk/lucene/core/src:
java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
test/org/apache/lucene/index/TestPostingsOffsets.java
Author: rmuir
Date: Mon Mar 12 12:37:46 2012
New Revision: 1299651
URL: http://svn.apache.org/viewvc?rev=1299651&view=rev
Log:
LUCENE-3864: support offsets in memorypostings
Modified:
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java?rev=1299651&r1=1299650&r2=1299651&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java Mon Mar 12 12:37:46 2012
@@ -98,8 +98,6 @@ public class MemoryPostingsFormat extend
return "PostingsFormat(name=" + getName() + " doPackFST= " + doPackFST + ")";
}
- private static final boolean VERBOSE = false;
-
private final static class TermsWriter extends TermsConsumer {
private final IndexOutput out;
private final FieldInfo field;
@@ -123,10 +121,13 @@ public class MemoryPostingsFormat extend
// NOTE: not private so we don't pay access check at runtime:
int docCount;
RAMOutputStream buffer = new RAMOutputStream();
+
+ int lastOffsetLength;
+ int lastOffset;
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
- if (VERBOSE) System.out.println(" startDoc docID=" + docID + " freq=" + termDocFreq);
+ //System.out.println(" startDoc docID=" + docID + " freq=" + termDocFreq);
final int delta = docID - lastDocID;
assert docID == 0 || delta > 0;
lastDocID = docID;
@@ -143,20 +144,23 @@ public class MemoryPostingsFormat extend
}
lastPos = 0;
+ lastOffset = 0;
}
@Override
public void addPosition(int pos, BytesRef payload, int startOffset, int endOffset) throws IOException {
assert payload == null || field.storePayloads;
- if (VERBOSE) System.out.println(" addPos pos=" + pos + " payload=" + payload);
+ //System.out.println(" addPos pos=" + pos + " payload=" + payload);
final int delta = pos - lastPos;
assert delta >= 0;
lastPos = pos;
+ int payloadLen = 0;
+
if (field.storePayloads) {
- final int payloadLen = payload == null ? 0 : payload.length;
+ payloadLen = payload == null ? 0 : payload.length;
if (payloadLen != lastPayloadLen) {
lastPayloadLen = payloadLen;
buffer.writeVInt((delta<<1)|1);
@@ -164,13 +168,28 @@ public class MemoryPostingsFormat extend
} else {
buffer.writeVInt(delta<<1);
}
-
- if (payloadLen > 0) {
- buffer.writeBytes(payload.bytes, payload.offset, payloadLen);
- }
} else {
buffer.writeVInt(delta);
}
+
+ if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
+ // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
+ // and the numbers aren't that much smaller anyways.
+ int offsetDelta = startOffset - lastOffset;
+ int offsetLength = endOffset - startOffset;
+ if (offsetLength != lastOffsetLength) {
+ buffer.writeVInt(offsetDelta << 1 | 1);
+ buffer.writeVInt(offsetLength);
+ } else {
+ buffer.writeVInt(offsetDelta << 1);
+ }
+ lastOffset = startOffset;
+ lastOffsetLength = offsetLength;
+ }
+
+ if (payloadLen > 0) {
+ buffer.writeBytes(payload.bytes, payload.offset, payloadLen);
+ }
}
@Override
@@ -182,6 +201,8 @@ public class MemoryPostingsFormat extend
lastDocID = 0;
docCount = 0;
lastPayloadLen = 0;
+ // force first offset to write its length
+ lastOffsetLength = -1;
return this;
}
}
@@ -190,7 +211,7 @@ public class MemoryPostingsFormat extend
@Override
public PostingsConsumer startTerm(BytesRef text) {
- if (VERBOSE) System.out.println(" startTerm term=" + text.utf8ToString());
+ //System.out.println(" startTerm term=" + text.utf8ToString());
return postingsWriter.reset();
}
@@ -224,12 +245,12 @@ public class MemoryPostingsFormat extend
spare.bytes = finalBuffer;
spare.length = totalBytes;
- if (VERBOSE) {
- System.out.println(" finishTerm term=" + text.utf8ToString() + " " + totalBytes + " bytes totalTF=" + stats.totalTermFreq);
- for(int i=0;i<totalBytes;i++) {
- System.out.println(" " + Integer.toHexString(finalBuffer[i]&0xFF));
- }
- }
+
+ //System.out.println(" finishTerm term=" + text.utf8ToString() + " " + totalBytes + " bytes totalTF=" + stats.totalTermFreq);
+ //for(int i=0;i<totalBytes;i++) {
+ // System.out.println(" " + Integer.toHexString(finalBuffer[i]&0xFF));
+ //}
+
builder.add(Util.toIntsRef(text, scratchIntsRef), BytesRef.deepCopyOf(spare));
termCount++;
}
@@ -249,7 +270,7 @@ public class MemoryPostingsFormat extend
fst = fst.pack(3, Math.max(10, fst.getNodeCount()/4));
}
fst.save(out);
- if (VERBOSE) System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer());
+ //System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer());
}
}
@@ -270,10 +291,7 @@ public class MemoryPostingsFormat extend
return new FieldsConsumer() {
@Override
public TermsConsumer addField(FieldInfo field) {
- if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
- throw new UnsupportedOperationException("this codec cannot index offsets");
- }
- if (VERBOSE) System.out.println("\naddField field=" + field.name);
+ //System.out.println("\naddField field=" + field.name);
return new TermsWriter(out, field, doPackFST);
}
@@ -331,11 +349,9 @@ public class MemoryPostingsFormat extend
@Override
public int nextDoc() {
while(true) {
- if (VERBOSE) System.out.println(" nextDoc cycle docUpto=" + docUpto + " numDocs=" + numDocs + " fp=" + in.getPosition() + " this=" + this);
+ //System.out.println(" nextDoc cycle docUpto=" + docUpto + " numDocs=" + numDocs + " fp=" + in.getPosition() + " this=" + this);
if (docUpto == numDocs) {
- if (VERBOSE) {
- System.out.println(" END");
- }
+ // System.out.println(" END");
return docID = NO_MORE_DOCS;
}
docUpto++;
@@ -344,7 +360,7 @@ public class MemoryPostingsFormat extend
} else {
final int code = in.readVInt();
accum += code >>> 1;
- if (VERBOSE) System.out.println(" docID=" + accum + " code=" + code);
+ //System.out.println(" docID=" + accum + " code=" + code);
if ((code & 1) != 0) {
freq = 1;
} else {
@@ -352,8 +368,8 @@ public class MemoryPostingsFormat extend
assert freq > 0;
}
- if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
- // Skip positions
+ if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+ // Skip positions/payloads
for(int posUpto=0;posUpto<freq;posUpto++) {
if (!storePayloads) {
in.readVInt();
@@ -365,11 +381,26 @@ public class MemoryPostingsFormat extend
in.skipBytes(payloadLen);
}
}
+ } else if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
+ // Skip positions/offsets/payloads
+ for(int posUpto=0;posUpto<freq;posUpto++) {
+ int posCode = in.readVInt();
+ if (storePayloads && ((posCode & 1) != 0)) {
+ payloadLen = in.readVInt();
+ }
+ if ((in.readVInt() & 1) != 0) {
+ // new offset length
+ in.readVInt();
+ }
+ if (storePayloads) {
+ in.skipBytes(payloadLen);
+ }
+ }
}
}
if (liveDocs == null || liveDocs.get(accum)) {
- if (VERBOSE) System.out.println(" return docID=" + accum + " freq=" + freq);
+ //System.out.println(" return docID=" + accum + " freq=" + freq);
return (docID = accum);
}
}
@@ -413,26 +444,30 @@ public class MemoryPostingsFormat extend
private int posPending;
private int payloadLength;
private boolean payloadRetrieved;
+ final boolean storeOffsets;
+ int offsetLength;
+ int startOffset;
private int pos;
private final BytesRef payload = new BytesRef();
- public FSTDocsAndPositionsEnum(boolean storePayloads) {
+ public FSTDocsAndPositionsEnum(boolean storePayloads, boolean storeOffsets) {
this.storePayloads = storePayloads;
+ this.storeOffsets = storeOffsets;
}
- public boolean canReuse(boolean storePayloads) {
- return storePayloads == this.storePayloads;
+ public boolean canReuse(boolean storePayloads, boolean storeOffsets) {
+ return storePayloads == this.storePayloads && storeOffsets == this.storeOffsets;
}
public FSTDocsAndPositionsEnum reset(BytesRef bufferIn, Bits liveDocs, int numDocs) {
assert numDocs > 0;
- if (VERBOSE) {
- System.out.println("D&P reset bytes this=" + this);
- for(int i=bufferIn.offset;i<bufferIn.length;i++) {
- System.out.println(" " + Integer.toHexString(bufferIn.bytes[i]&0xFF));
- }
- }
+
+ // System.out.println("D&P reset bytes this=" + this);
+ // for(int i=bufferIn.offset;i<bufferIn.length;i++) {
+ // System.out.println(" " + Integer.toHexString(bufferIn.bytes[i]&0xFF));
+ // }
+
if (buffer.length < bufferIn.length - bufferIn.offset) {
buffer = ArrayUtil.grow(buffer, bufferIn.length - bufferIn.offset);
}
@@ -447,6 +482,8 @@ public class MemoryPostingsFormat extend
this.numDocs = numDocs;
posPending = 0;
payloadRetrieved = false;
+ startOffset = storeOffsets ? 0 : -1; // always return -1 if no offsets are stored
+ offsetLength = 0;
return this;
}
@@ -456,9 +493,9 @@ public class MemoryPostingsFormat extend
nextPosition();
}
while(true) {
- if (VERBOSE) System.out.println(" nextDoc cycle docUpto=" + docUpto + " numDocs=" + numDocs + " fp=" + in.getPosition() + " this=" + this);
+ //System.out.println(" nextDoc cycle docUpto=" + docUpto + " numDocs=" + numDocs + " fp=" + in.getPosition() + " this=" + this);
if (docUpto == numDocs) {
- if (VERBOSE) System.out.println(" END");
+ //System.out.println(" END");
return docID = NO_MORE_DOCS;
}
docUpto++;
@@ -474,8 +511,9 @@ public class MemoryPostingsFormat extend
if (liveDocs == null || liveDocs.get(accum)) {
pos = 0;
+ startOffset = storeOffsets ? 0 : -1;
posPending = freq;
- if (VERBOSE) System.out.println(" return docID=" + accum + " freq=" + freq);
+ //System.out.println(" return docID=" + accum + " freq=" + freq);
return (docID = accum);
}
@@ -487,8 +525,18 @@ public class MemoryPostingsFormat extend
final int skipCode = in.readVInt();
if ((skipCode & 1) != 0) {
payloadLength = in.readVInt();
- if (VERBOSE) System.out.println(" new payloadLen=" + payloadLength);
+ //System.out.println(" new payloadLen=" + payloadLength);
}
+ }
+
+ if (storeOffsets) {
+ if ((in.readVInt() & 1) != 0) {
+ // new offset length
+ offsetLength = in.readVInt();
+ }
+ }
+
+ if (storePayloads) {
in.skipBytes(payloadLength);
}
}
@@ -497,7 +545,7 @@ public class MemoryPostingsFormat extend
@Override
public int nextPosition() {
- if (VERBOSE) System.out.println(" nextPos storePayloads=" + storePayloads + " this=" + this);
+ //System.out.println(" nextPos storePayloads=" + storePayloads + " this=" + this);
assert posPending > 0;
posPending--;
if (!storePayloads) {
@@ -511,6 +559,18 @@ public class MemoryPostingsFormat extend
//} else {
//System.out.println(" same payloadLen=" + payloadLength);
}
+ }
+
+ if (storeOffsets) {
+ int offsetCode = in.readVInt();
+ if ((offsetCode & 1) != 0) {
+ // new offset length
+ offsetLength = in.readVInt();
+ }
+ startOffset += offsetCode >>> 1;
+ }
+
+ if (storePayloads) {
payload.offset = in.getPosition();
in.skipBytes(payloadLength);
payload.length = payloadLength;
@@ -520,18 +580,18 @@ public class MemoryPostingsFormat extend
payloadRetrieved = false;
}
- if (VERBOSE) System.out.println(" pos=" + pos + " payload=" + payload + " fp=" + in.getPosition());
+ //System.out.println(" pos=" + pos + " payload=" + payload + " fp=" + in.getPosition());
return pos;
}
@Override
public int startOffset() {
- return -1;
+ return startOffset;
}
@Override
public int endOffset() {
- return -1;
+ return startOffset + offsetLength;
}
@Override
@@ -594,14 +654,14 @@ public class MemoryPostingsFormat extend
totalTermFreq = -1;
}
current.output.offset = buffer.getPosition();
- if (VERBOSE) System.out.println(" df=" + docFreq + " totTF=" + totalTermFreq + " offset=" + buffer.getPosition() + " len=" + current.output.length);
+ //System.out.println(" df=" + docFreq + " totTF=" + totalTermFreq + " offset=" + buffer.getPosition() + " len=" + current.output.length);
didDecode = true;
}
}
@Override
public boolean seekExact(BytesRef text, boolean useCache /* ignored */) throws IOException {
- if (VERBOSE) System.out.println("te.seekExact text=" + field.name + ":" + text.utf8ToString() + " this=" + this);
+ //System.out.println("te.seekExact text=" + field.name + ":" + text.utf8ToString() + " this=" + this);
current = fstEnum.seekExact(text);
didDecode = false;
return current != null;
@@ -609,25 +669,24 @@ public class MemoryPostingsFormat extend
@Override
public SeekStatus seekCeil(BytesRef text, boolean useCache /* ignored */) throws IOException {
- if (VERBOSE) System.out.println("te.seek text=" + field.name + ":" + text.utf8ToString() + " this=" + this);
+ //System.out.println("te.seek text=" + field.name + ":" + text.utf8ToString() + " this=" + this);
current = fstEnum.seekCeil(text);
if (current == null) {
return SeekStatus.END;
} else {
- if (VERBOSE) {
- System.out.println(" got term=" + current.input.utf8ToString());
- for(int i=0;i<current.output.length;i++) {
- System.out.println(" " + Integer.toHexString(current.output.bytes[i]&0xFF));
- }
- }
+
+ // System.out.println(" got term=" + current.input.utf8ToString());
+ // for(int i=0;i<current.output.length;i++) {
+ // System.out.println(" " + Integer.toHexString(current.output.bytes[i]&0xFF));
+ // }
didDecode = false;
if (text.equals(current.input)) {
- if (VERBOSE) System.out.println(" found!");
+ //System.out.println(" found!");
return SeekStatus.FOUND;
} else {
- if (VERBOSE) System.out.println(" not found: " + current.input.utf8ToString());
+ //System.out.println(" not found: " + current.input.utf8ToString());
return SeekStatus.NOT_FOUND;
}
}
@@ -654,9 +713,9 @@ public class MemoryPostingsFormat extend
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
- if (needsOffsets) {
- // Not until we can index offsets...
- return null;
+ boolean hasOffsets = field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+ if (needsOffsets && !hasOffsets) {
+ return null; // not available
}
if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
@@ -665,14 +724,14 @@ public class MemoryPostingsFormat extend
decodeMetaData();
FSTDocsAndPositionsEnum docsAndPositionsEnum;
if (reuse == null || !(reuse instanceof FSTDocsAndPositionsEnum)) {
- docsAndPositionsEnum = new FSTDocsAndPositionsEnum(field.storePayloads);
+ docsAndPositionsEnum = new FSTDocsAndPositionsEnum(field.storePayloads, hasOffsets);
} else {
docsAndPositionsEnum = (FSTDocsAndPositionsEnum) reuse;
- if (!docsAndPositionsEnum.canReuse(field.storePayloads)) {
- docsAndPositionsEnum = new FSTDocsAndPositionsEnum(field.storePayloads);
+ if (!docsAndPositionsEnum.canReuse(field.storePayloads, hasOffsets)) {
+ docsAndPositionsEnum = new FSTDocsAndPositionsEnum(field.storePayloads, hasOffsets);
}
}
- if (VERBOSE) System.out.println("D&P reset this=" + this);
+ //System.out.println("D&P reset this=" + this);
return docsAndPositionsEnum.reset(current.output, liveDocs, docFreq);
}
@@ -683,14 +742,14 @@ public class MemoryPostingsFormat extend
@Override
public BytesRef next() throws IOException {
- if (VERBOSE) System.out.println("te.next");
+ //System.out.println("te.next");
current = fstEnum.next();
if (current == null) {
- if (VERBOSE) System.out.println(" END");
+ //System.out.println(" END");
return null;
}
didDecode = false;
- if (VERBOSE) System.out.println(" term=" + field.name + ":" + current.input.utf8ToString());
+ //System.out.println(" term=" + field.name + ":" + current.input.utf8ToString());
return current.input;
}
@@ -794,9 +853,7 @@ public class MemoryPostingsFormat extend
break;
}
final TermsReader termsReader = new TermsReader(state.fieldInfos, in, termCount);
- if (VERBOSE) {
- System.out.println("load field=" + termsReader.field.name);
- }
+ // System.out.println("load field=" + termsReader.field.name);
fields.put(termsReader.field.name, termsReader);
}
} finally {
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java?rev=1299651&r1=1299650&r2=1299651&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java Mon Mar 12 12:37:46 2012
@@ -29,6 +29,7 @@ import org.apache.lucene.analysis.MockPa
import org.apache.lucene.analysis.Token;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
+import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
@@ -43,6 +44,8 @@ import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
+// TODO: we really need to test indexing offsets, but then retrieving only docs / docs + freqs.
+// not all codecs store prx separately...
public class TestPostingsOffsets extends LuceneTestCase {
IndexWriterConfig iwc;
@@ -54,7 +57,11 @@ public class TestPostingsOffsets extends
if (Codec.getDefault().getName().equals("Lucene40")) {
// pulsing etc are not implemented
- iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
+ if (random.nextBoolean()) {
+ iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
+ } else {
+ iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat()));
+ }
}
}
@@ -126,7 +133,11 @@ public class TestPostingsOffsets extends
iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
if (Codec.getDefault().getName().equals("Lucene40")) {
// pulsing etc are not implemented
- iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
+ if (random.nextBoolean()) {
+ iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
+ } else {
+ iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat()));
+ }
}
iwc.setMergePolicy(newLogMergePolicy()); // will rely on docids a bit for skipping
RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);