You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ha...@apache.org on 2013/07/07 11:14:17 UTC
svn commit: r1500391 - in /lucene/dev/branches/lucene3069/lucene/core/src:
java/org/apache/lucene/codecs/temp/ resources/META-INF/services/
test/org/apache/lucene/index/
Author: han
Date: Sun Jul 7 09:14:17 2013
New Revision: 1500391
URL: http://svn.apache.org/r1500391
Log:
reader part, support basic enums
Added:
lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempBlockPostingsFormat.java
- copied, changed from r1500371, lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempPostingsFormat.java
lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTPostingsFormat.java
Removed:
lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempPostingsFormat.java
Modified:
lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsReader.java
lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsWriter.java
lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempPostingsReader.java
lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempPostingsWriter.java
lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java
lucene/dev/branches/lucene3069/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
lucene/dev/branches/lucene3069/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java
Copied: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempBlockPostingsFormat.java (from r1500371, lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempPostingsFormat.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempBlockPostingsFormat.java?p2=lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempBlockPostingsFormat.java&p1=lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempPostingsFormat.java&r1=1500371&r2=1500391&rev=1500391&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempPostingsFormat.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempBlockPostingsFormat.java Sun Jul 7 09:14:17 2013
@@ -56,7 +56,7 @@ import org.apache.lucene.util.packed.Pac
*
* <li>
* <b>Block structure</b>:
- * <p>When the postings are long enough, TempPostingsFormat will try to encode most integer data
+ * <p>When the postings are long enough, TempBlockPostingsFormat will try to encode most integer data
* as a packed block.</p>
* <p>Take a term with 259 documents as an example, the first 256 document ids are encoded as two packed
* blocks, while the remaining 3 are encoded as one VInt block. </p>
@@ -159,7 +159,7 @@ import org.apache.lucene.util.packed.Pac
* <li>SkipFPDelta determines the position of this term's SkipData within the .doc
* file. In particular, it is the length of the TermFreq data.
* SkipDelta is only stored if DocFreq is not smaller than SkipMinimum
- * (i.e. 8 in TempPostingsFormat).</li>
+ * (i.e. 8 in TempBlockPostingsFormat).</li>
* <li>SingletonDocID is an optimization when a term only appears in one document. In this case, instead
* of writing a file pointer to the .doc file (DocFPDelta), and then a VIntBlock at that location, the
* single document ID is written to the term dictionary.</li>
@@ -239,7 +239,7 @@ import org.apache.lucene.util.packed.Pac
* We use this trick since the definition of skip entry is a little different from base interface.
* In {@link MultiLevelSkipListWriter}, skip data is assumed to be saved for
* skipInterval<sup>th</sup>, 2*skipInterval<sup>th</sup> ... posting in the list. However,
- * in TempPostingsFormat, the skip data is saved for skipInterval+1<sup>th</sup>,
+ * in TempBlockPostingsFormat, the skip data is saved for skipInterval+1<sup>th</sup>,
* 2*skipInterval+1<sup>th</sup> ... posting (skipInterval==PackedBlockSize in this case).
* When DocFreq is multiple of PackedBlockSize, MultiLevelSkipListWriter will expect one
* more skip data than TempSkipWriter. </li>
@@ -352,7 +352,7 @@ import org.apache.lucene.util.packed.Pac
* @lucene.experimental
*/
-public final class TempPostingsFormat extends PostingsFormat {
+public final class TempBlockPostingsFormat extends PostingsFormat {
/**
* Filename extension for document number, frequencies, and skip data.
* See chapter: <a href="#Frequencies">Frequencies and Skip Data</a>
@@ -381,20 +381,17 @@ public final class TempPostingsFormat ex
// NOTE: must be multiple of 64 because of PackedInts long-aligned encoding/decoding
public final static int BLOCK_SIZE = 128;
- /** Creates {@code TempPostingsFormat} with default
+ /** Creates {@code TempBlockPostingsFormat} with default
* settings. */
- public TempPostingsFormat() {
- super("TempFST");
- minTermBlockSize = 0;
- maxTermBlockSize = 0;
- //this(TempBlockTermsWriter.DEFAULT_MIN_BLOCK_SIZE, TempBlockTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
+ public TempBlockPostingsFormat() {
+ this(TempBlockTermsWriter.DEFAULT_MIN_BLOCK_SIZE, TempBlockTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
- /** Creates {@code TempPostingsFormat} with custom
+ /** Creates {@code TempBlockPostingsFormat} with custom
* values for {@code minBlockSize} and {@code
* maxBlockSize} passed to block terms dictionary.
* @see TempBlockTermsWriter#TempBlockTermsWriter(SegmentWriteState,TempPostingsWriterBase,int,int) */
- public TempPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
+ public TempBlockPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super("TempBlock");
this.minTermBlockSize = minTermBlockSize;
assert minTermBlockSize > 1;
@@ -413,11 +410,10 @@ public final class TempPostingsFormat ex
boolean success = false;
try {
- //FieldsConsumer ret = new TempBlockTermsWriter(state,
- // postingsWriter,
- // minTermBlockSize,
- // maxTermBlockSize);
- FieldsConsumer ret = new TempFSTTermsWriter(state, postingsWriter);
+ FieldsConsumer ret = new TempBlockTermsWriter(state,
+ postingsWriter,
+ minTermBlockSize,
+ maxTermBlockSize);
success = true;
return ret;
} finally {
@@ -436,14 +432,13 @@ public final class TempPostingsFormat ex
state.segmentSuffix);
boolean success = false;
try {
- //FieldsProducer ret = new TempBlockTermsReader(state.directory,
- // state.fieldInfos,
- // state.segmentInfo,
- // postingsReader,
- // state.context,
- // state.segmentSuffix,
- // state.termsIndexDivisor);
- FieldsProducer ret = new TempFSTTermsReader(state, postingsReader);
+ FieldsProducer ret = new TempBlockTermsReader(state.directory,
+ state.fieldInfos,
+ state.segmentInfo,
+ postingsReader,
+ state.context,
+ state.segmentSuffix,
+ state.termsIndexDivisor);
success = true;
return ret;
} finally {
Added: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTPostingsFormat.java?rev=1500391&view=auto
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTPostingsFormat.java (added)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTPostingsFormat.java Sun Jul 7 09:14:17 2013
@@ -0,0 +1,77 @@
+package org.apache.lucene.codecs.temp;
+
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.TempPostingsReaderBase;
+import org.apache.lucene.codecs.TempPostingsWriterBase;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.IOUtils;
+
+public final class TempFSTPostingsFormat extends PostingsFormat {
+ public TempFSTPostingsFormat() {
+ super("TempFST");
+ }
+
+ @Override
+ public String toString() {
+ return getName();
+ }
+
+ @Override
+ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+ TempPostingsWriterBase postingsWriter = new TempPostingsWriter(state);
+
+ boolean success = false;
+ try {
+ FieldsConsumer ret = new TempFSTTermsWriter(state, postingsWriter);
+ success = true;
+ return ret;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(postingsWriter);
+ }
+ }
+ }
+
+ @Override
+ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
+ TempPostingsReaderBase postingsReader = new TempPostingsReader(state.directory,
+ state.fieldInfos,
+ state.segmentInfo,
+ state.context,
+ state.segmentSuffix);
+ boolean success = false;
+ try {
+ FieldsProducer ret = new TempFSTTermsReader(state, postingsReader);
+ success = true;
+ return ret;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(postingsReader);
+ }
+ }
+ }
+}
Modified: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsReader.java?rev=1500391&r1=1500390&r2=1500391&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsReader.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsReader.java Sun Jul 7 09:14:17 2013
@@ -18,8 +18,6 @@ package org.apache.lucene.codecs.temp;
*/
import java.io.IOException;
-import java.io.PrintWriter;
-import java.io.File;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
@@ -43,23 +41,19 @@ import org.apache.lucene.store.IndexInpu
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.automaton.CompiledAutomaton;
-import org.apache.lucene.util.automaton.RunAutomaton;
-import org.apache.lucene.util.automaton.Transition;
-import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.BytesRefFSTEnum;
+import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
-import org.apache.lucene.util.fst.Util;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.TempPostingsReaderBase;
import org.apache.lucene.codecs.CodecUtil;
-
public class TempFSTTermsReader extends FieldsProducer {
final TempPostingsReaderBase postingsReader;
final IndexInput in;
- final TreeMap<String, FieldReader> fields = new TreeMap<String, FieldReader>();
-
+ final TreeMap<String, TermsReader> fields = new TreeMap<String, TermsReader>();
+ boolean DEBUG = false;
public TempFSTTermsReader(SegmentReadState state, TempPostingsReaderBase postingsReader) throws IOException {
final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempFSTTermsWriter.TERMS_EXTENSION);
@@ -83,8 +77,8 @@ public class TempFSTTermsReader extends
long sumDocFreq = in.readVLong();
int docCount = in.readVInt();
int longsSize = in.readVInt();
- FieldReader current = new FieldReader(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize);
- FieldReader previous = fields.put(fieldInfo.name, current);
+ TermsReader current = new TermsReader(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize);
+ TermsReader previous = fields.put(fieldInfo.name, current);
checkFieldSummary(state.segmentInfo, current, previous);
}
success = true;
@@ -96,7 +90,8 @@ public class TempFSTTermsReader extends
}
private int readHeader(IndexInput in) throws IOException {
- return CodecUtil.checkHeader(in, TempFSTTermsWriter.TERMS_CODEC_NAME,
+ return CodecUtil.checkHeader(in,
+ TempFSTTermsWriter.TERMS_CODEC_NAME,
TempFSTTermsWriter.TERMS_VERSION_START,
TempFSTTermsWriter.TERMS_VERSION_CURRENT);
}
@@ -104,7 +99,7 @@ public class TempFSTTermsReader extends
in.seek(in.length() - 8);
in.seek(in.readLong());
}
- private void checkFieldSummary(SegmentInfo info, FieldReader field, FieldReader previous) throws IOException {
+ private void checkFieldSummary(SegmentInfo info, TermsReader field, TermsReader previous) throws IOException {
// #docs with field must be <= #docs
if (field.docCount < 0 || field.docCount > info.getDocCount()) {
throw new CorruptIndexException("invalid docCount: " + field.docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")");
@@ -147,7 +142,7 @@ public class TempFSTTermsReader extends
}
}
- final class FieldReader extends Terms {
+ final class TermsReader extends Terms {
final FieldInfo fieldInfo;
final long numTerms;
final long sumTotalTermFreq;
@@ -156,16 +151,14 @@ public class TempFSTTermsReader extends
final int longsSize;
final FST<TempTermOutputs.TempMetaData> dict;
- FieldReader(FieldInfo fieldInfo, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) throws IOException {
+ TermsReader(FieldInfo fieldInfo, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) throws IOException {
this.fieldInfo = fieldInfo;
this.numTerms = numTerms;
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
this.longsSize = longsSize;
- this.dict = new FST<TempTermOutputs.TempMetaData>(in, new TempTermOutputs(longsSize));
- //PrintWriter pw = new PrintWriter(new File("../temp/xxx.txt"));
- //Util.toDot(dict, pw, false, false);
+ this.dict = new FST<TempTermOutputs.TempMetaData>(in, new TempTermOutputs(fieldInfo, longsSize));
}
// nocommit: implement intersect
@@ -216,8 +209,32 @@ public class TempFSTTermsReader extends
}
// Iterates through terms in this field
- private final class SegmentTermsEnum extends TermsEnum {
- SegmentTermsEnum() {
+ final class SegmentTermsEnum extends TermsEnum {
+ final BytesRefFSTEnum<TempTermOutputs.TempMetaData> fstEnum;
+
+ /* Current term; null when the enum is exhausted or unpositioned */
+ BytesRef term;
+
+ /* Current term stats + decoded metadata (customized by PBF) */
+ final TempTermState state;
+
+ /* Current term stats + undecoded metadata (long[] & byte[]) */
+ TempTermOutputs.TempMetaData meta;
+ ByteArrayDataInput bytesReader;
+
+ /* True when current term's metadata is decoded */
+ boolean decoded;
+
+ /* True when current enum is 'positioned' by seekExact(TermState) */
+ boolean seekPending;
+
+ SegmentTermsEnum() throws IOException {
+ this.fstEnum = new BytesRefFSTEnum<TempTermOutputs.TempMetaData>(dict);
+ this.state = postingsReader.newTermState();
+ this.bytesReader = new ByteArrayDataInput();
+ this.term = null;
+ this.decoded = false;
+ this.seekPending = false;
}
@Override
@@ -226,56 +243,115 @@ public class TempFSTTermsReader extends
}
@Override
- public SeekStatus seekCeil(final BytesRef target, final boolean useCache) throws IOException {
- return null;
- }
-
- @Override
- public BytesRef next() throws IOException {
- return null;
+ public TermState termState() throws IOException {
+ decodeMetaData();
+ return state.clone();
}
@Override
public BytesRef term() {
- return null;
+ return term;
}
@Override
public int docFreq() throws IOException {
- return 0;
+ return state.docFreq;
}
@Override
public long totalTermFreq() throws IOException {
- return 0;
+ return state.totalTermFreq;
+ }
+
+ // Let PBF decode metadata from long[] and byte[]
+ private void decodeMetaData() throws IOException {
+ if (!decoded && !seekPending) {
+ if (meta.bytes != null) {
+ bytesReader.reset(meta.bytes, 0, meta.bytes.length);
+ }
+ postingsReader.decodeTerm(meta.longs, bytesReader, fieldInfo, state);
+ decoded = true;
+ }
+ }
+
+ // Update current enum according to FSTEnum
+ private void updateEnum(final InputOutput<TempTermOutputs.TempMetaData> pair) {
+ if (pair == null) {
+ term = null;
+ } else {
+ term = pair.input;
+ meta = pair.output;
+ state.docFreq = meta.docFreq;
+ state.totalTermFreq = meta.totalTermFreq;
+ }
+ decoded = false;
+ seekPending = false;
}
+ // nocommit: reuse?
@Override
public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
- return null;
+ decodeMetaData();
+ return postingsReader.docs(fieldInfo, state, liveDocs, reuse, flags);
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
- return null;
+ if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
+ return null;
+ }
+ decodeMetaData();
+ return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, flags);
}
@Override
- public void seekExact(BytesRef target, TermState otherState) {
+ public BytesRef next() throws IOException {
+ if (seekPending) { // previously positioned, but termOutputs not fetched
+ seekPending = false;
+ if (seekCeil(term, false) != SeekStatus.FOUND) {
+ return term;
+ }
+ }
+ updateEnum(fstEnum.next());
+ return term;
}
@Override
- public TermState termState() throws IOException {
- return null;
+ public boolean seekExact(final BytesRef target, final boolean useCache) throws IOException {
+ updateEnum(fstEnum.seekExact(target));
+ return term != null;
+ }
+
+ // nocommit: when will we useCache?
+ @Override
+ public SeekStatus seekCeil(final BytesRef target, final boolean useCache) throws IOException {
+ updateEnum(fstEnum.seekCeil(target));
+ if (term == null) {
+ return SeekStatus.END;
+ } else {
+ return term.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
+ }
+ }
+
+ // nocommit: this method doesn't act as 'seekExact' right?
+ @Override
+ public void seekExact(BytesRef target, TermState otherState) {
+ if (term == null || target.compareTo(term) != 0) {
+ state.copyFrom(otherState);
+ term = BytesRef.deepCopyOf(target);
+ seekPending = true;
+ }
}
+ // nocommit: do we need this?
@Override
public void seekExact(long ord) throws IOException {
+ throw new UnsupportedOperationException();
}
@Override
public long ord() {
- return 0;
+ throw new UnsupportedOperationException();
}
}
}
Modified: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsWriter.java?rev=1500391&r1=1500390&r2=1500391&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsWriter.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsWriter.java Sun Jul 7 09:14:17 2013
@@ -46,6 +46,7 @@ import org.apache.lucene.codecs.CodecUti
/** FST based term dict, all the metadata held
* as output of FST */
+// nocommit: where is 'TermStats' ???
public class TempFSTTermsWriter extends FieldsConsumer {
static final String TERMS_EXTENSION = "tmp";
static final String TERMS_CODEC_NAME = "FST_TERMS_DICT";
@@ -135,7 +136,7 @@ public class TempFSTTermsWriter extends
}
}
- class TermsWriter extends TermsConsumer {
+ final class TermsWriter extends TermsConsumer {
private final Builder<TempTermOutputs.TempMetaData> builder;
private final TempTermOutputs outputs;
private final FieldInfo fieldInfo;
@@ -143,13 +144,14 @@ public class TempFSTTermsWriter extends
private long numTerms;
private final IntsRef scratchTerm = new IntsRef();
+ private final RAMOutputStream statsWriter = new RAMOutputStream();
private final RAMOutputStream metaWriter = new RAMOutputStream();
TermsWriter(FieldInfo fieldInfo) {
this.numTerms = 0;
this.fieldInfo = fieldInfo;
this.longsSize = postingsWriter.setField(fieldInfo);
- this.outputs = new TempTermOutputs(longsSize);
+ this.outputs = new TempTermOutputs(fieldInfo, longsSize);
this.builder = new Builder<TempTermOutputs.TempMetaData>(FST.INPUT_TYPE.BYTE1, outputs);
}
@@ -166,16 +168,14 @@ public class TempFSTTermsWriter extends
@Override
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+ // write term meta data into fst
final TempTermOutputs.TempMetaData meta = new TempTermOutputs.TempMetaData();
meta.longs = new long[longsSize];
meta.bytes = null;
+ meta.docFreq = stats.docFreq;
+ meta.totalTermFreq = stats.totalTermFreq;
postingsWriter.finishTerm(meta.longs, metaWriter, stats);
- /*
- meta.bytes = new byte[(int)metaWriter.getFilePointer()];
- metaWriter.writeTo(meta.bytes, 0);
- metaWriter.reset();
- */
- int bytesSize = (int)metaWriter.getFilePointer();
+ final int bytesSize = (int)metaWriter.getFilePointer();
if (bytesSize > 0) {
meta.bytes = new byte[bytesSize];
metaWriter.writeTo(meta.bytes, 0);
@@ -191,6 +191,7 @@ public class TempFSTTermsWriter extends
// save FST dict
if (numTerms > 0) {
final FST<TempTermOutputs.TempMetaData> fst = builder.finish();
+ //fst.dump();
fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst));
}
}
Modified: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempPostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempPostingsReader.java?rev=1500391&r1=1500390&r2=1500391&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempPostingsReader.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempPostingsReader.java Sun Jul 7 09:14:17 2013
@@ -71,7 +71,7 @@ public final class TempPostingsReader ex
IndexInput posIn = null;
IndexInput payIn = null;
try {
- docIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempPostingsFormat.DOC_EXTENSION),
+ docIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempBlockPostingsFormat.DOC_EXTENSION),
ioContext);
CodecUtil.checkHeader(docIn,
TempPostingsWriter.DOC_CODEC,
@@ -80,7 +80,7 @@ public final class TempPostingsReader ex
forUtil = new ForUtil(docIn);
if (fieldInfos.hasProx()) {
- posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempPostingsFormat.POS_EXTENSION),
+ posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempBlockPostingsFormat.POS_EXTENSION),
ioContext);
CodecUtil.checkHeader(posIn,
TempPostingsWriter.POS_CODEC,
@@ -88,7 +88,7 @@ public final class TempPostingsReader ex
TempPostingsWriter.VERSION_CURRENT);
if (fieldInfos.hasPayloads() || fieldInfos.hasOffsets()) {
- payIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempPostingsFormat.PAY_EXTENSION),
+ payIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempBlockPostingsFormat.PAY_EXTENSION),
ioContext);
CodecUtil.checkHeader(payIn,
TempPostingsWriter.PAY_CODEC,
Modified: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempPostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempPostingsWriter.java?rev=1500391&r1=1500390&r2=1500391&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempPostingsWriter.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempPostingsWriter.java Sun Jul 7 09:14:17 2013
@@ -119,7 +119,7 @@ public final class TempPostingsWriter ex
public TempPostingsWriter(SegmentWriteState state, float acceptableOverheadRatio) throws IOException {
super();
- docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempPostingsFormat.DOC_EXTENSION),
+ docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempBlockPostingsFormat.DOC_EXTENSION),
state.context);
IndexOutput posOut = null;
IndexOutput payOut = null;
@@ -129,7 +129,7 @@ public final class TempPostingsWriter ex
forUtil = new ForUtil(acceptableOverheadRatio, docOut);
if (state.fieldInfos.hasProx()) {
posDeltaBuffer = new int[MAX_DATA_SIZE];
- posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempPostingsFormat.POS_EXTENSION),
+ posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempBlockPostingsFormat.POS_EXTENSION),
state.context);
CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT);
@@ -150,7 +150,7 @@ public final class TempPostingsWriter ex
}
if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
- payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempPostingsFormat.PAY_EXTENSION),
+ payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempBlockPostingsFormat.PAY_EXTENSION),
state.context);
CodecUtil.writeHeader(payOut, PAY_CODEC, VERSION_CURRENT);
}
Modified: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java?rev=1500391&r1=1500390&r2=1500391&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java Sun Jul 7 09:14:17 2013
@@ -20,6 +20,8 @@ package org.apache.lucene.codecs.temp;
import java.io.IOException;
import java.util.Arrays;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.fst.Outputs;
@@ -31,18 +33,25 @@ import org.apache.lucene.util.LongsRef;
public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
private final static TempMetaData NO_OUTPUT = new TempMetaData();
private static boolean DEBUG = false;
+ private FieldInfo fieldInfo;
private int longsSize;
public static class TempMetaData {
public long[] longs;
public byte[] bytes;
+ int docFreq;
+ long totalTermFreq;
TempMetaData() {
this.longs = null;
this.bytes = null;
+ this.docFreq = 0;
+ this.totalTermFreq = -1;
}
- TempMetaData(long[] longs, byte[] bytes) {
+ TempMetaData(long[] longs, byte[] bytes, int docFreq, long totalTermFreq) {
this.longs = longs;
this.bytes = bytes;
+ this.docFreq = docFreq;
+ this.totalTermFreq = totalTermFreq;
}
@Override
public int hashCode() {
@@ -79,12 +88,14 @@ public class TempTermOutputs extends Out
if (bytes != null) {
sb.append(" [ ");
for (int i = 0; i < bytes.length; i++) {
- sb.append(bytes[i]+" ");
+ sb.append(Integer.toHexString((int)bytes[i] & 0xff)+" ");
}
sb.append("]");
} else {
sb.append(" null");
}
+ sb.append(" "+docFreq);
+ sb.append(" "+totalTermFreq);
return sb.toString();
}
}
@@ -92,7 +103,8 @@ public class TempTermOutputs extends Out
private TempTermOutputs() {
}
- protected TempTermOutputs(int longsSize) {
+ protected TempTermOutputs(FieldInfo fieldInfo, int longsSize) {
+ this.fieldInfo = fieldInfo;
this.longsSize = longsSize;
}
@@ -102,13 +114,13 @@ public class TempTermOutputs extends Out
// i.e. when every value in long[] fits the same ordering, the smaller one
// will be the result.
//
- // NOTE: only long[] is 'shared', i.e. after sharing common value,
- // the output of smaller one will be a all-zero long[] with original byte[] blob.
+ // NOTE: only long[] is 'shared', i.e. if there are two byte[] on successive
+ // arcs, only the last byte[] is valid. (This somewhat saves nodes, but might affect
+ // compression, since we'll have to load the metadata block for other terms as well;
+ // currently, we don't support this.)
//
- // nocommit: Builder.add() doesn't immediatelly consumes the output data,
- // which means, the longs after one add() should all be deeply copied
- // instead of being reused? quite hairly to detect it here, so the caller
- // must be careful about this.
+ // nocommit: get the byte[] from smaller one as well, so that
+ // byte[] is actually inherited
//
public TempMetaData common(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("common("+t1+", "+t2+") = ");
@@ -148,18 +160,18 @@ public class TempTermOutputs extends Out
if (pos < longsSize || accum == 0) {
ret = NO_OUTPUT;
} else if (order) {
- ret = new TempMetaData(longs2, null);
+ ret = new TempMetaData(longs2, null, 0, -1);
} else {
- ret = new TempMetaData(longs1, null);
+ ret = new TempMetaData(longs1, null, 0, -1);
}
} else {
// equal
- if (t1.bytes!= null && Arrays.equals(t1.bytes, t2.bytes)) { // all fields are equal
+ if (t1.bytes!= null && bytesEqual(t1, t2) && statsEqual(t1, t2)) { // all fields are equal
ret = t1;
} else if (accum == 0) { // all zero case
ret = NO_OUTPUT;
} else {
- ret = new TempMetaData(longs1, null);
+ ret = new TempMetaData(longs1, null, 0, -1);
}
}
if (DEBUG) System.out.println("ret:"+ret);
@@ -189,21 +201,27 @@ public class TempTermOutputs extends Out
}
TempMetaData ret;
- if (diff == 0 && (t1.bytes == null || t1.bytes.length == 0)) {
+ if (diff == 0 && bytesEqual(t1, t2) && statsEqual(t1, t2)) {
ret = NO_OUTPUT;
} else {
- ret = new TempMetaData(share, t1.bytes);
+ ret = new TempMetaData(share, t1.bytes, t1.docFreq, t1.totalTermFreq);
}
if (DEBUG) System.out.println("ret:"+ret);
return ret;
}
+ static boolean statsEqual(final TempMetaData t1, final TempMetaData t2) {
+ return t1.docFreq == t2.docFreq && t1.totalTermFreq == t2.totalTermFreq;
+ }
+ static boolean bytesEqual(final TempMetaData t1, final TempMetaData t2) {
+ return Arrays.equals(t1.bytes, t2.bytes);
+ }
+
@Override
// nocommit: need to check all-zero case?
// so we can reuse one long[]
public TempMetaData add(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("add("+t1+", "+t2+") = ");
- // nocommit: necessary?
if (t1 == NO_OUTPUT) {
if (DEBUG) System.out.println("ret:"+t2);
return t2;
@@ -215,17 +233,17 @@ public class TempTermOutputs extends Out
assert t2.longs != null;
int pos = 0;
- long[] accum = new long[longsSize]; // nocommit: reuse
+ long[] accum = new long[longsSize]; // nocommit: reuse?
while (pos < longsSize) {
accum[pos] = t1.longs[pos] + t2.longs[pos];
assert(accum[pos] >= 0);
pos++;
}
TempMetaData ret;
- if (t2.bytes != null) {
- ret = new TempMetaData(accum, t2.bytes);
+ if (t2.bytes != null || t2.docFreq > 0) {
+ ret = new TempMetaData(accum, t2.bytes, t2.docFreq, t2.totalTermFreq);
} else {
- ret = new TempMetaData(accum, t1.bytes);
+ ret = new TempMetaData(accum, t1.bytes, t1.docFreq, t1.totalTermFreq);
}
if (DEBUG) System.out.println("ret:"+ret);
return ret;
@@ -236,14 +254,20 @@ public class TempTermOutputs extends Out
for (int pos = 0; pos < longsSize; pos++) {
out.writeVLong(data.longs[pos]);
}
+ int code = data.docFreq == 0 ? 0 : 1;
if (data.bytes != null) {
- out.writeVInt(data.bytes.length);
+ out.writeVInt((data.bytes.length << 1) | code);
out.writeBytes(data.bytes, 0, data.bytes.length);
} else {
- out.writeVInt(0);
+ out.writeVInt(code);
+ }
+ if (data.docFreq > 0) {
+ out.writeVInt(data.docFreq);
+ if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+ out.writeVLong(data.totalTermFreq - data.docFreq);
+ }
}
}
- // nocommit: can this non-null byte case be used in Final Output?
@Override
public TempMetaData read(DataInput in) throws IOException {
@@ -251,13 +275,22 @@ public class TempTermOutputs extends Out
for (int pos = 0; pos < longsSize; pos++) {
longs[pos] = in.readVLong();
}
- int bytesSize = in.readVInt();
+ int code = in.readVInt();
+ int bytesSize = code >>> 1;
+ int docFreq = 0;
+ long totalTermFreq = -1;
byte[] bytes = null;
if (bytesSize > 0) {
bytes = new byte[bytesSize];
in.readBytes(bytes, 0, bytes.length);
}
- TempMetaData meta = new TempMetaData(longs, bytes);
+ if ((code & 1) == 1) {
+ docFreq = in.readVInt();
+ if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+ totalTermFreq = docFreq + in.readVLong();
+ }
+ }
+ TempMetaData meta = new TempMetaData(longs, bytes, docFreq, totalTermFreq);
return meta;
}
Modified: lucene/dev/branches/lucene3069/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat?rev=1500391&r1=1500390&r2=1500391&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat Sun Jul 7 09:14:17 2013
@@ -15,4 +15,5 @@
org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat
org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat
-org.apache.lucene.codecs.temp.TempPostingsFormat
+org.apache.lucene.codecs.temp.TempBlockPostingsFormat
+org.apache.lucene.codecs.temp.TempFSTPostingsFormat
Modified: lucene/dev/branches/lucene3069/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java?rev=1500391&r1=1500390&r2=1500391&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java Sun Jul 7 09:14:17 2013
@@ -1015,7 +1015,7 @@ public class TestIndexWriterReader exten
// Don't proceed if picked Codec is in the list of illegal ones.
final String format = _TestUtil.getPostingsFormat("f");
assumeFalse("Format: " + format + " does not support ReaderTermsIndexDivisor!",
- (format.equals("SimpleText") || format.equals("Memory") || format.equals("Direct")));
+ (format.equals("SimpleText") || format.equals("Memory") || format.equals("Direct") || format.equals("TempFST")));
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, conf);