You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2011/01/20 19:53:58 UTC
svn commit: r1061453 [1/3] - in /lucene/dev/trunk:
lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/
lucene/contrib/memory/src/java/org/apache/lucene/index/memory/
lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/app...
Author: mikemccand
Date: Thu Jan 20 18:53:55 2011
New Revision: 1061453
URL: http://svn.apache.org/viewvc?rev=1061453&view=rev
Log:
LUCENE-2872: block-encode terms in between indexed terms
Added:
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java (with props)
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java (with props)
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java (with props)
Removed:
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesReader.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesWriter.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermState.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java
Modified:
lucene/dev/trunk/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java
lucene/dev/trunk/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
lucene/dev/trunk/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java
lucene/dev/trunk/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java
lucene/dev/trunk/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictWriter.java
lucene/dev/trunk/lucene/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/Terms.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsEnum.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/store/DataInput.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/TestExternalCodecs.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/TestSearch.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/TestSearchForDuplicates.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexReader.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestMultiFields.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexInput.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/search/TestFieldCache.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java
lucene/dev/trunk/solr/src/java/org/apache/solr/request/UnInvertedField.java
lucene/dev/trunk/solr/src/java/org/apache/solr/util/HighFrequencyDictionary.java
Modified: lucene/dev/trunk/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java (original)
+++ lucene/dev/trunk/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java Thu Jan 20 18:53:55 2011
@@ -144,9 +144,9 @@ public class InstantiatedTermsEnum exten
}
@Override
- public SeekStatus seek(BytesRef term, TermState state) throws IOException {
+ public void seek(BytesRef term, TermState state) throws IOException {
assert state != null && state instanceof OrdTermState;
- return seek(((OrdTermState)state).ord); // just use the ord for simplicity
+ seek(((OrdTermState)state).ord); // just use the ord for simplicity
}
}
Modified: lucene/dev/trunk/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (original)
+++ lucene/dev/trunk/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java Thu Jan 20 18:53:55 2011
@@ -938,9 +938,9 @@ public class MemoryIndex implements Seri
}
@Override
- public SeekStatus seek(BytesRef term, TermState state) throws IOException {
+ public void seek(BytesRef term, TermState state) throws IOException {
assert state != null;
- return this.seek(((OrdTermState)state).ord);
+ this.seek(((OrdTermState)state).ord);
}
@Override
Modified: lucene/dev/trunk/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java (original)
+++ lucene/dev/trunk/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java Thu Jan 20 18:53:55 2011
@@ -32,7 +32,7 @@ import org.apache.lucene.index.codecs.Po
import org.apache.lucene.index.codecs.standard.StandardPostingsReader;
import org.apache.lucene.index.codecs.PostingsWriterBase;
import org.apache.lucene.index.codecs.standard.StandardPostingsWriter;
-import org.apache.lucene.index.codecs.PrefixCodedTermsReader;
+import org.apache.lucene.index.codecs.BlockTermsReader;
import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
@@ -131,7 +131,7 @@ public class AppendingCodec extends Code
public void files(Directory dir, SegmentInfo segmentInfo, String codecId, Set<String> files)
throws IOException {
StandardPostingsReader.files(dir, segmentInfo, codecId, files);
- PrefixCodedTermsReader.files(dir, segmentInfo, codecId, files);
+ BlockTermsReader.files(dir, segmentInfo, codecId, files);
FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files);
}
Modified: lucene/dev/trunk/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java (original)
+++ lucene/dev/trunk/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java Thu Jan 20 18:53:55 2011
@@ -22,15 +22,15 @@ import java.util.Comparator;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.codecs.PostingsReaderBase;
-import org.apache.lucene.index.codecs.PrefixCodedTermsReader;
-import org.apache.lucene.index.codecs.PrefixCodedTermsWriter;
+import org.apache.lucene.index.codecs.BlockTermsReader;
+import org.apache.lucene.index.codecs.BlockTermsWriter;
import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
-public class AppendingTermsDictReader extends PrefixCodedTermsReader {
+public class AppendingTermsDictReader extends BlockTermsReader {
public AppendingTermsDictReader(TermsIndexReaderBase indexReader,
Directory dir, FieldInfos fieldInfos, String segment,
@@ -43,7 +43,7 @@ public class AppendingTermsDictReader ex
@Override
protected void readHeader(IndexInput in) throws IOException {
CodecUtil.checkHeader(in, AppendingTermsDictWriter.CODEC_NAME,
- PrefixCodedTermsWriter.VERSION_START, PrefixCodedTermsWriter.VERSION_CURRENT);
+ BlockTermsWriter.VERSION_START, BlockTermsWriter.VERSION_CURRENT);
}
@Override
Modified: lucene/dev/trunk/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictWriter.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictWriter.java (original)
+++ lucene/dev/trunk/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictWriter.java Thu Jan 20 18:53:55 2011
@@ -22,13 +22,13 @@ import java.util.Comparator;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.PostingsWriterBase;
-import org.apache.lucene.index.codecs.PrefixCodedTermsWriter;
+import org.apache.lucene.index.codecs.BlockTermsWriter;
import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
-public class AppendingTermsDictWriter extends PrefixCodedTermsWriter {
+public class AppendingTermsDictWriter extends BlockTermsWriter {
final static String CODEC_NAME = "APPENDING_TERMS_DICT";
public AppendingTermsDictWriter(TermsIndexWriterBase indexWriter,
Modified: lucene/dev/trunk/lucene/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java (original)
+++ lucene/dev/trunk/lucene/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java Thu Jan 20 18:53:55 2011
@@ -215,7 +215,10 @@ public class TestParser extends LuceneTe
}
private void dumpResults(String qType,Query q, int numDocs) throws IOException
{
- TopDocs hits = searcher.search(q, null, numDocs);
+ if (VERBOSE) {
+ System.out.println("TEST: query=" + q);
+ }
+ TopDocs hits = searcher.search(q, null, numDocs);
assertTrue(qType +" should produce results ", hits.totalHits>0);
if(VERBOSE)
{
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java Thu Jan 20 18:53:55 2011
@@ -802,7 +802,7 @@ public class CheckIndex {
msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]");
} catch (Throwable e) {
- msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
+ msg("ERROR: " + e);
status.error = e;
if (infoStream != null) {
e.printStackTrace(infoStream);
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java Thu Jan 20 18:53:55 2011
@@ -903,8 +903,7 @@ final class DocumentsWriter {
final static int BYTE_BLOCK_NOT_MASK = ~BYTE_BLOCK_MASK;
/* if you increase this, you must fix field cache impl for
- * getTerms/getTermsIndex requires <= 32768. Also fix
- * DeltaBytesWriter's TERM_EOF if necessary. */
+ * getTerms/getTermsIndex requires <= 32768. */
final static int MAX_TERM_LENGTH_UTF8 = BYTE_BLOCK_SIZE-2;
/* Initial chunks size of the shared int[] blocks used to
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java Thu Jan 20 18:53:55 2011
@@ -156,12 +156,12 @@ public class FilterIndexReader extends I
}
@Override
- public int docFreq() {
+ public int docFreq() throws IOException {
return in.docFreq();
}
@Override
- public long totalTermFreq() {
+ public long totalTermFreq() throws IOException {
return in.totalTermFreq();
}
@@ -181,8 +181,8 @@ public class FilterIndexReader extends I
}
@Override
- public SeekStatus seek(BytesRef term, TermState state) throws IOException {
- return in.seek(term, state);
+ public void seek(BytesRef term, TermState state) throws IOException {
+ in.seek(term, state);
}
@Override
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java Thu Jan 20 18:53:55 2011
@@ -257,7 +257,7 @@ public final class MultiTermsEnum extend
}
@Override
- public int docFreq() {
+ public int docFreq() throws IOException {
int sum = 0;
for(int i=0;i<numTop;i++) {
sum += top[i].terms.docFreq();
@@ -266,7 +266,7 @@ public final class MultiTermsEnum extend
}
@Override
- public long totalTermFreq() {
+ public long totalTermFreq() throws IOException {
long sum = 0;
for(int i=0;i<numTop;i++) {
final long v = top[i].terms.totalTermFreq();
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/Terms.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/Terms.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/Terms.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/Terms.java Thu Jan 20 18:53:55 2011
@@ -100,11 +100,8 @@ public abstract class Terms {
* @see TermsEnum#seek(BytesRef, TermState) */
public DocsEnum docs(Bits skipDocs, BytesRef term, TermState termState, DocsEnum reuse) throws IOException {
final TermsEnum termsEnum = getThreadTermsEnum();
- if (termsEnum.seek(term, termState) == TermsEnum.SeekStatus.FOUND) {
- return termsEnum.docs(skipDocs, reuse);
- } else {
- return null;
- }
+ termsEnum.seek(term, termState);
+ return termsEnum.docs(skipDocs, reuse);
}
/**
@@ -116,11 +113,8 @@ public abstract class Terms {
* @see TermsEnum#seek(BytesRef, TermState) */
public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, BytesRef term, TermState termState, DocsAndPositionsEnum reuse) throws IOException {
final TermsEnum termsEnum = getThreadTermsEnum();
- if (termsEnum.seek(term, termState) == TermsEnum.SeekStatus.FOUND) {
- return termsEnum.docsAndPositions(skipDocs, reuse);
- } else {
- return null;
- }
+ termsEnum.seek(term, termState);
+ return termsEnum.docsAndPositions(skipDocs, reuse);
}
public long getUniqueTermCount() throws IOException {
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsEnum.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsEnum.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsEnum.java Thu Jan 20 18:53:55 2011
@@ -80,7 +80,7 @@ public abstract class TermsEnum {
* use this method. Low-level implementations may position the TermsEnum
* without re-seeking the term dictionary.
* <p>
- * Seeking by {@link TermState} should only be used iff the enu the state was
+ * Seeking by {@link TermState} should only be used iff the enum the state was
* obtained from and the enum the state is used for seeking are obtained from
* the same {@link IndexReader}, otherwise a {@link #seek(BytesRef, TermState)} call can
* leave the enum in undefined state.
@@ -97,8 +97,8 @@ public abstract class TermsEnum {
* @param term the term the TermState corresponds to
* @param state the {@link TermState}
* */
- public SeekStatus seek(BytesRef term, TermState state) throws IOException {
- return seek(term);
+ public void seek(BytesRef term, TermState state) throws IOException {
+ seek(term);
}
/** Increments the enumeration to the next element.
@@ -124,7 +124,7 @@ public abstract class TermsEnum {
* term. Do not call this before calling next() for the
* first time, after next() returns null or seek returns
* {@link SeekStatus#END}.*/
- public abstract int docFreq();
+ public abstract int docFreq() throws IOException;
/** Returns the total number of occurrences of this term
* across all documents (the sum of the freq() for each
@@ -132,7 +132,7 @@ public abstract class TermsEnum {
* codec doesn't support this measure. Note that, like
* other term measures, this measure does not take
* deleted documents into account. */
- public abstract long totalTermFreq();
+ public abstract long totalTermFreq() throws IOException;
/** Get {@link DocsEnum} for the current term. Do not
* call this before calling {@link #next} or {@link
@@ -242,7 +242,7 @@ public abstract class TermsEnum {
}
@Override
- public SeekStatus seek(BytesRef term, TermState state) throws IOException {
+ public void seek(BytesRef term, TermState state) throws IOException {
throw new IllegalStateException("this method should never be called");
}
};
Added: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java?rev=1061453&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java (added)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java Thu Jan 20 18:53:55 2011
@@ -0,0 +1,55 @@
+package org.apache.lucene.index.codecs;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.OrdTermState;
+import org.apache.lucene.index.TermState;
+
+/**
+ * Holds all state required for {@link PostingsReaderBase}
+ * to produce a {@link DocsEnum} without re-seeking the
+ * terms dict.
+ */
+public class BlockTermState extends OrdTermState {
+ public int docFreq; // how many docs have this term
+ public long totalTermFreq; // total number of occurrences of this term
+
+ public int termCount; // the term's ord within the current block
+ public long blockFilePointer; // fp into the terms dict primary file (_X.tib) that holds this term
+
+ public int blockTermCount; // how many terms in current block
+
+ @Override
+ public void copyFrom(TermState _other) {
+ assert _other instanceof BlockTermState : "can not copy from " + _other.getClass().getName();
+ BlockTermState other = (BlockTermState) _other;
+ super.copyFrom(_other);
+ docFreq = other.docFreq;
+ totalTermFreq = other.totalTermFreq;
+ termCount = other.termCount;
+ blockFilePointer = other.blockFilePointer;
+
+ // NOTE: don't copy blockTermCount;
+ // it's "transient": used only by the "primary"
+ // termState, and regenerated on seek by TermState
+ }
+
+ @Override
+ public String toString() {
+ return super.toString() + "ord=" + ord + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termCount=" + termCount + " blockFP=" + blockFilePointer;
+ }
+}
Added: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java?rev=1061453&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java (added)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java Thu Jan 20 18:53:55 2011
@@ -0,0 +1,741 @@
+package org.apache.lucene.index.codecs;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.TreeMap;
+
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.FieldsEnum;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.codecs.standard.StandardPostingsReader; // javadocs
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.DoubleBarrelLRUCache;
+
+/** Handles a terms dict, but decouples all details of
+ * doc/freqs/positions reading to an instance of {@link
+ * PostingsReaderBase}. This class is reusable for
+ * codecs that use a different format for
+ * docs/freqs/positions (though codecs are also free to
+ * make their own terms dict impl).
+ *
+ * <p>This class also interacts with an instance of {@link
+ * TermsIndexReaderBase}, to abstract away the specific
+ * implementation of the terms dict index.
+ * @lucene.experimental */
+
+public class BlockTermsReader extends FieldsProducer {
+ // Open input to the main terms dict file (_X.tis)
+ private final IndexInput in;
+
+ // Reads the terms dict entries, to gather state to
+ // produce DocsEnum on demand
+ private final PostingsReaderBase postingsReader;
+
+ private final TreeMap<String,FieldReader> fields = new TreeMap<String,FieldReader>();
+
+ // Comparator that orders our terms
+ private final Comparator<BytesRef> termComp;
+
+ // Caches the most recently looked-up field + terms:
+ private final DoubleBarrelLRUCache<FieldAndTerm,BlockTermState> termsCache;
+
+ // Reads the terms index
+ private TermsIndexReaderBase indexReader;
+
+ // keeps the dirStart offset
+ protected long dirOffset;
+
+ // Used as key for the terms cache
+ private static class FieldAndTerm extends DoubleBarrelLRUCache.CloneableKey {
+ String field;
+ BytesRef term;
+
+ public FieldAndTerm() {
+ }
+
+ public FieldAndTerm(FieldAndTerm other) {
+ field = other.field;
+ term = new BytesRef(other.term);
+ }
+
+ @Override
+ public boolean equals(Object _other) {
+ FieldAndTerm other = (FieldAndTerm) _other;
+ return other.field == field && term.bytesEquals(other.term);
+ }
+
+ @Override
+ public Object clone() {
+ return new FieldAndTerm(this);
+ }
+
+ @Override
+ public int hashCode() {
+ return field.hashCode() * 31 + term.hashCode();
+ }
+ }
+
+ private String segment;
+
+ public BlockTermsReader(TermsIndexReaderBase indexReader, Directory dir, FieldInfos fieldInfos, String segment, PostingsReaderBase postingsReader, int readBufferSize,
+ Comparator<BytesRef> termComp, int termsCacheSize, String codecId)
+ throws IOException {
+
+ this.postingsReader = postingsReader;
+ termsCache = new DoubleBarrelLRUCache<FieldAndTerm,BlockTermState>(termsCacheSize);
+
+ this.termComp = termComp;
+ this.segment = segment;
+ in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, BlockTermsWriter.TERMS_EXTENSION),
+ readBufferSize);
+
+ boolean success = false;
+ try {
+ readHeader(in);
+
+ // Have PostingsReader init itself
+ postingsReader.init(in);
+
+ // Read per-field details
+ seekDir(in, dirOffset);
+
+ final int numFields = in.readVInt();
+
+ for(int i=0;i<numFields;i++) {
+ final int field = in.readVInt();
+ final long numTerms = in.readVLong();
+ assert numTerms >= 0;
+ final long termsStartPointer = in.readVLong();
+ final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong();
+ assert !fields.containsKey(fieldInfo.name);
+ fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq));
+ }
+ success = true;
+ } finally {
+ if (!success) {
+ in.close();
+ }
+ }
+
+ this.indexReader = indexReader;
+ }
+
+ protected void readHeader(IndexInput input) throws IOException {
+ CodecUtil.checkHeader(in, BlockTermsWriter.CODEC_NAME,
+ BlockTermsWriter.VERSION_START,
+ BlockTermsWriter.VERSION_CURRENT);
+ dirOffset = in.readLong();
+ }
+
+ protected void seekDir(IndexInput input, long dirOffset)
+ throws IOException {
+ input.seek(dirOffset);
+ }
+
+ @Override
+ public void loadTermsIndex(int indexDivisor) throws IOException {
+ indexReader.loadTermsIndex(indexDivisor);
+ }
+
+ @Override
+ public void close() throws IOException {
+ try {
+ try {
+ if (indexReader != null) {
+ indexReader.close();
+ }
+ } finally {
+ // null so if an app hangs on to us (ie, we are not
+ // GCable, despite being closed) we still free most
+ // ram
+ indexReader = null;
+ if (in != null) {
+ in.close();
+ }
+ }
+ } finally {
+ try {
+ if (postingsReader != null) {
+ postingsReader.close();
+ }
+ } finally {
+ for(FieldReader field : fields.values()) {
+ field.close();
+ }
+ }
+ }
+ }
+
+ public static void files(Directory dir, SegmentInfo segmentInfo, String id, Collection<String> files) {
+ files.add(IndexFileNames.segmentFileName(segmentInfo.name, id, BlockTermsWriter.TERMS_EXTENSION));
+ }
+
+ public static void getExtensions(Collection<String> extensions) {
+ extensions.add(BlockTermsWriter.TERMS_EXTENSION);
+ }
+
+ @Override
+ public FieldsEnum iterator() {
+ return new TermFieldsEnum();
+ }
+
+ @Override
+ public Terms terms(String field) throws IOException {
+ return fields.get(field);
+ }
+
+ // Enumerates this reader's fields (in the order of the fields map),
+ // exposing a TermsEnum for whichever field is current.
+ private class TermFieldsEnum extends FieldsEnum {
+ final Iterator<FieldReader> it;
+ FieldReader current;
+
+ TermFieldsEnum() {
+ it = fields.values().iterator();
+ }
+
+ @Override
+ public String next() {
+ // Advance to the next field, or null out when exhausted:
+ current = it.hasNext() ? it.next() : null;
+ return current == null ? null : current.fieldInfo.name;
+ }
+
+ @Override
+ public TermsEnum terms() throws IOException {
+ return current.iterator();
+ }
+ }
+
+ private class FieldReader extends Terms implements Closeable {
+ final long numTerms;
+ final FieldInfo fieldInfo;
+ final long termsStartPointer;
+ final long sumTotalTermFreq;
+
+ // numTerms > 0 holds because the writer never records fields with zero terms.
+ FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq) {
+ assert numTerms > 0;
+ this.fieldInfo = fieldInfo;
+ this.numTerms = numTerms;
+ this.termsStartPointer = termsStartPointer;
+ this.sumTotalTermFreq = sumTotalTermFreq;
+ }
+
+ @Override
+ public Comparator<BytesRef> getComparator() {
+ return termComp;
+ }
+
+ @Override
+ public void close() {
+ super.close();
+ }
+
+ // Each iterator clones the shared terms-dict IndexInput, so multiple
+ // enums over the same field are independent.
+ @Override
+ public TermsEnum iterator() throws IOException {
+ return new SegmentTermsEnum();
+ }
+
+ @Override
+ public long getUniqueTermCount() {
+ return numTerms;
+ }
+
+ @Override
+ public long getSumTotalTermFreq() {
+ return sumTotalTermFreq;
+ }
+
+ // Iterates through terms in this field
+ private final class SegmentTermsEnum extends TermsEnum {
+ // Private clone of the terms-dict input, positioned within this field.
+ private final IndexInput in;
+ // Mutable state (docFreq, file pointers, ord...) for the current term.
+ private final BlockTermState state;
+ private final boolean doOrd;
+ private final FieldAndTerm fieldTerm = new FieldAndTerm();
+ private final TermsIndexReaderBase.FieldIndexEnum indexEnum;
+ private final BytesRef term = new BytesRef();
+
+ /* This is true if indexEnum is "still" seek'd to the index term
+ for the current term. We set it to true on seeking, and then it
+ remains valid until next() is called enough times to load another
+ terms block: */
+ private boolean indexIsCurrent;
+
+ /* True if we've already called .next() on the indexEnum, to "bracket"
+ the current block of terms: */
+ private boolean didIndexNext;
+
+ /* Next index term, bracketing the current block of terms; this is
+ only valid if didIndexNext is true: */
+ private BytesRef nextIndexTerm;
+
+ /* True after seek(TermState), do defer seeking. If the app then
+ calls next() (which is not "typical"), then we'll do the real seek */
+ private boolean seekPending;
+
+ /* How many blocks we've read since last seek. Once this
+ is >= indexEnum.getDivisor() we set indexIsCurrent to false (since
+ the index can no long bracket seek-within-block). */
+ private int blocksSinceSeek;
+
+ // Raw suffix bytes for the current block, decoded lazily per term.
+ private byte[] termSuffixes;
+ private ByteArrayDataInput termSuffixesReader = new ByteArrayDataInput(null);
+
+ /* Common prefix used for all terms in this block. */
+ private int termBlockPrefix;
+
+ // Raw docFreq/totalTermFreq blob for the block; decoded on demand.
+ private byte[] docFreqBytes;
+ private final ByteArrayDataInput freqReader = new ByteArrayDataInput(null);
+ private int metaDataUpto;
+
+ /** Clones the shared input, seeks to this field's start, and sizes the
+  *  initial scratch buffers (grown on demand in nextBlock). */
+ public SegmentTermsEnum() throws IOException {
+ in = (IndexInput) BlockTermsReader.this.in.clone();
+ in.seek(termsStartPointer);
+ indexEnum = indexReader.getFieldEnum(fieldInfo);
+ doOrd = indexReader.supportsOrd();
+ fieldTerm.field = fieldInfo.name;
+ state = postingsReader.newTermState();
+ state.totalTermFreq = -1;
+ state.ord = -1;
+
+ termSuffixes = new byte[128];
+ docFreqBytes = new byte[64];
+ //System.out.println("BTR.enum init this=" + this + " postingsReader=" + postingsReader);
+ }
+
+ @Override
+ public Comparator<BytesRef> getComparator() {
+ return termComp;
+ }
+
+ /** Seeks to the smallest term >= target.  Consults the terms cache first,
+  *  then skips the terms-index seek when the target is known to fall
+  *  inside the block we are already positioned on, and finally scans
+  *  term-by-term within the block. */
+ @Override
+ public SeekStatus seek(final BytesRef target, final boolean useCache) throws IOException {
+
+ if (indexEnum == null) {
+ throw new IllegalStateException("terms index was not loaded");
+ }
+
+ //System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " useCache=" + useCache + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this);
+ /*
+ if (didIndexNext) {
+ if (nextIndexTerm == null) {
+ //System.out.println(" nextIndexTerm=null");
+ } else {
+ //System.out.println(" nextIndexTerm=" + nextIndexTerm.utf8ToString());
+ }
+ }
+ */
+
+ // Check cache
+ if (useCache) {
+ fieldTerm.term = target;
+ // TODO: should we differentiate "frozen"
+ // TermState (ie one that was cloned and
+ // cached/returned by termState()) from the
+ // malleable (primary) one?
+ final TermState cachedState = termsCache.get(fieldTerm);
+ if (cachedState != null) {
+ seekPending = true;
+ //System.out.println(" cached!");
+ seek(target, cachedState);
+ //System.out.println(" term=" + term.utf8ToString());
+ return SeekStatus.FOUND;
+ }
+ }
+
+ boolean doSeek = true;
+
+ // See if we can avoid seeking, because target term
+ // is after current term but before next index term:
+ if (indexIsCurrent) {
+
+ final int cmp = termComp.compare(term, target);
+
+ if (cmp == 0) {
+ // Already at the requested term
+ return SeekStatus.FOUND;
+ } else if (cmp < 0) {
+
+ // Target term is after current term
+ if (!didIndexNext) {
+ if (indexEnum.next() == -1) {
+ nextIndexTerm = null;
+ } else {
+ nextIndexTerm = indexEnum.term();
+ }
+ //System.out.println(" now do index next() nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
+ didIndexNext = true;
+ }
+
+ if (nextIndexTerm == null || termComp.compare(target, nextIndexTerm) < 0) {
+ // Optimization: requested term is within the
+ // same term block we are now in; skip seeking
+ // (but do scanning):
+ doSeek = false;
+ //System.out.println(" skip seek: nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
+ }
+ }
+ }
+
+ if (doSeek) {
+ //System.out.println(" seek");
+
+ // Ask terms index to find biggest indexed term (=
+ // first term in a block) that's <= our text:
+ in.seek(indexEnum.seek(target));
+ boolean result = nextBlock();
+
+ // Block must exist since, at least, the indexed term
+ // is in the block:
+ assert result;
+
+ indexIsCurrent = true;
+ didIndexNext = false;
+ blocksSinceSeek = 0;
+
+ if (doOrd) {
+ state.ord = indexEnum.ord()-1;
+ }
+
+ // NOTE: the first _next() after an index seek is
+ // a bit wasteful, since it redundantly reads some
+ // suffix bytes into the buffer. We could avoid storing
+ // those bytes in the primary file, but then when
+ // next()ing over an index term we'd have to
+ // special case it:
+ term.copy(indexEnum.term());
+ //System.out.println(" seek: term=" + term.utf8ToString());
+ } else {
+ ////System.out.println(" skip seek");
+ }
+
+ seekPending = false;
+
+ // Now scan:
+ while (_next() != null) {
+ final int cmp = termComp.compare(term, target);
+ if (cmp == 0) {
+ // Match!
+ if (useCache) {
+ // Store in cache
+ decodeMetaData();
+ termsCache.put(new FieldAndTerm(fieldTerm), (BlockTermState) state.clone());
+ }
+ //System.out.println(" FOUND");
+ return SeekStatus.FOUND;
+ } else if (cmp > 0) {
+ //System.out.println(" NOT_FOUND term=" + term.utf8ToString());
+ return SeekStatus.NOT_FOUND;
+ }
+
+ // The purpose of the terms dict index is to seek
+ // the enum to the closest index term before the
+ // term we are looking for. So, we should never
+ // cross another index term (besides the first
+ // one) while we are scanning:
+ assert indexIsCurrent;
+ }
+
+ indexIsCurrent = false;
+ //System.out.println(" END");
+ return SeekStatus.END;
+ }
+
+ /** Advances to the next term, first replaying a deferred
+  *  seek(TermState) if one is pending. */
+ @Override
+ public BytesRef next() throws IOException {
+ //System.out.println("BTR.next() seekPending=" + seekPending + " pendingSeekCount=" + state.termCount);
+
+ // If seek was previously called and the term was cached,
+ // usually caller is just going to pull a D/&PEnum or get
+ // docFreq, etc. But, if they then call next(),
+ // this method catches up all internal state so next()
+ // works properly:
+ if (seekPending) {
+ assert !indexIsCurrent;
+ in.seek(state.blockFilePointer);
+ final int pendingSeekCount = state.termCount;
+ boolean result = nextBlock();
+
+ final long savOrd = state.ord;
+
+ // Block must exist since seek(TermState) was called w/ a
+ // TermState previously returned by this enum when positioned
+ // on a real term:
+ assert result;
+
+ // Re-scan up to the term the deferred seek pointed at:
+ while(state.termCount < pendingSeekCount) {
+ BytesRef nextResult = _next();
+ assert nextResult != null;
+ }
+ seekPending = false;
+ state.ord = savOrd;
+ }
+ return _next();
+ }
+
+ /* Decodes only the term bytes of the next term. If caller then asks for
+ metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily)
+ decode all metadata up to the current term. */
+ private BytesRef _next() throws IOException {
+ //System.out.println("BTR._next this=" + this + " termCount=" + state.termCount + " (vs " + state.blockTermCount + ")");
+ if (state.termCount == state.blockTermCount) {
+ if (!nextBlock()) {
+ //System.out.println(" eof");
+ indexIsCurrent = false;
+ return null;
+ }
+ }
+
+ // TODO: cutover to something better for these ints! simple64?
+ final int suffix = termSuffixesReader.readVInt();
+ //System.out.println(" suffix=" + suffix);
+
+ // Current term = block-common prefix (already in term.bytes) + suffix:
+ term.length = termBlockPrefix + suffix;
+ if (term.bytes.length < term.length) {
+ term.grow(term.length);
+ }
+ termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
+ state.termCount++;
+
+ // NOTE: meaningless in the non-ord case
+ state.ord++;
+
+ //System.out.println(" return term=" + fieldInfo.name + ":" + term.utf8ToString() + " " + term);
+ return term;
+ }
+
+ @Override
+ public BytesRef term() {
+ return term;
+ }
+
+ // Metadata accessors decode lazily: nothing is read from the freq blob
+ // until one of these is called.
+ @Override
+ public int docFreq() throws IOException {
+ //System.out.println("BTR.docFreq");
+ decodeMetaData();
+ //System.out.println(" return " + state.docFreq);
+ return state.docFreq;
+ }
+
+ @Override
+ public long totalTermFreq() throws IOException {
+ decodeMetaData();
+ return state.totalTermFreq;
+ }
+
+ @Override
+ public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
+ //System.out.println("BTR.docs this=" + this);
+ decodeMetaData();
+ //System.out.println(" state.docFreq=" + state.docFreq);
+ final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
+ assert docsEnum != null;
+ return docsEnum;
+ }
+
+ // Returns null when the field omits freqs/positions (no positions stored).
+ @Override
+ public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
+ //System.out.println("BTR.d&p this=" + this);
+ decodeMetaData();
+ if (fieldInfo.omitTermFreqAndPositions) {
+ return null;
+ } else {
+ DocsAndPositionsEnum dpe = postingsReader.docsAndPositions(fieldInfo, state, skipDocs, reuse);
+ //System.out.println(" return d&pe=" + dpe);
+ return dpe;
+ }
+ }
+
+ /** Deferred seek: records the given state/term but does no I/O until
+  *  next() or metadata access forces a real positioning. */
+ @Override
+ public void seek(BytesRef target, TermState otherState) throws IOException {
+ //System.out.println("BTR.seek termState target=" + target.utf8ToString() + " " + target + " this=" + this);
+ assert otherState != null && otherState instanceof BlockTermState;
+ assert !doOrd || ((BlockTermState) otherState).ord < numTerms;
+ state.copyFrom(otherState);
+ seekPending = true;
+ indexIsCurrent = false;
+ term.copy(target);
+ }
+
+ // Returns a clone so the caller's snapshot survives further iteration.
+ @Override
+ public TermState termState() throws IOException {
+ //System.out.println("BTR.termState this=" + this);
+ decodeMetaData();
+ TermState ts = (TermState) state.clone();
+ //System.out.println(" return ts=" + ts);
+ return ts;
+ }
+
+ /** Seeks to the ord'th term; only valid when the index supports ords. */
+ @Override
+ public SeekStatus seek(long ord) throws IOException {
+ //System.out.println("BTR.seek by ord ord=" + ord);
+ if (indexEnum == null) {
+ throw new IllegalStateException("terms index was not loaded");
+ }
+
+ if (ord >= numTerms) {
+ state.ord = numTerms-1;
+ return SeekStatus.END;
+ }
+
+ // TODO: if ord is in same terms block and
+ // after current ord, we should avoid this seek just
+ // like we do in the seek(BytesRef) case
+ in.seek(indexEnum.seek(ord));
+ boolean result = nextBlock();
+
+ // Block must exist since ord < numTerms:
+ assert result;
+
+ indexIsCurrent = true;
+ didIndexNext = false;
+ blocksSinceSeek = 0;
+ seekPending = false;
+
+ state.ord = indexEnum.ord()-1;
+ assert state.ord >= -1: "ord=" + state.ord;
+ term.copy(indexEnum.term());
+
+ // Now, scan:
+ int left = (int) (ord - state.ord);
+ while(left > 0) {
+ final BytesRef term = _next();
+ assert term != null;
+ left--;
+ assert indexIsCurrent;
+ }
+
+ // always found
+ return SeekStatus.FOUND;
+ }
+
+ /** Returns the ord (position, within this field) of the current term;
+  *  only supported when the terms index supports ords.
+  *  (Removed the empty, never-called private doPendingSeek() stub that
+  *  followed this method — pending seeks are replayed inline in next().) */
+ public long ord() {
+ if (!doOrd) {
+ throw new UnsupportedOperationException();
+ }
+ return state.ord;
+ }
+
+ /* Does initial decode of next block of terms; this
+ doesn't actually decode the docFreq, totalTermFreq,
+ postings details (frq/prx offset, etc.) metadata;
+ it just loads them as byte[] blobs which are then
+ decoded on-demand if the metadata is ever requested
+ for any term in this block. This enables terms-only
+ intensive consumes (eg certain MTQs, respelling) to
+ not pay the price of decoding metadata they won't
+ use. */
+ private boolean nextBlock() throws IOException {
+
+ // TODO: we still lazy-decode the byte[] for each
+ // term (the suffix), but, if we decoded
+ // all N terms up front then seeking could do a fast
+ // bsearch w/in the block...
+
+ //System.out.println("BTR.nextBlock() fp=" + in.getFilePointer() + " this=" + this);
+ state.blockFilePointer = in.getFilePointer();
+ state.blockTermCount = in.readVInt();
+ //System.out.println(" blockTermCount=" + state.blockTermCount);
+ // A zero term count is the EOF marker written by the writer's finish():
+ if (state.blockTermCount == 0) {
+ return false;
+ }
+ termBlockPrefix = in.readVInt();
+
+ // term suffixes:
+ int len = in.readVInt();
+ if (termSuffixes.length < len) {
+ termSuffixes = new byte[ArrayUtil.oversize(len, 1)];
+ }
+ //System.out.println(" termSuffixes len=" + len);
+ in.readBytes(termSuffixes, 0, len);
+ termSuffixesReader.reset(termSuffixes);
+
+ // docFreq, totalTermFreq
+ len = in.readVInt();
+ if (docFreqBytes.length < len) {
+ docFreqBytes = new byte[ArrayUtil.oversize(len, 1)];
+ }
+ //System.out.println(" freq bytes len=" + len);
+ in.readBytes(docFreqBytes, 0, len);
+ freqReader.reset(docFreqBytes);
+ metaDataUpto = 0;
+
+ state.termCount = 0;
+
+ postingsReader.readTermsBlock(in, fieldInfo, state);
+
+ // Once we've read as many blocks as the index divisor, the index enum
+ // can no longer bracket our position:
+ blocksSinceSeek++;
+ indexIsCurrent &= (blocksSinceSeek < indexReader.getDivisor());
+ //System.out.println(" indexIsCurrent=" + indexIsCurrent);
+
+ return true;
+ }
+
+ /** Lazily decodes docFreq/totalTermFreq and per-term postings metadata
+  *  for all terms up to (and including) the current one. */
+ private void decodeMetaData() throws IOException {
+ //System.out.println("BTR.decodeMetadata mdUpto=" + metaDataUpto + " vs termCount=" + state.termCount + " state=" + state);
+ if (!seekPending) {
+ // lazily catch up on metadata decode:
+ final int limit = state.termCount;
+ state.termCount = metaDataUpto;
+ while (metaDataUpto < limit) {
+ //System.out.println(" decode");
+ // TODO: we could make "tiers" of metadata, ie,
+ // decode docFreq/totalTF but don't decode postings
+ // metadata; this way caller could get
+ // docFreq/totalTF w/o paying decode cost for
+ // postings
+ state.docFreq = freqReader.readVInt();
+ if (!fieldInfo.omitTermFreqAndPositions) {
+ state.totalTermFreq = state.docFreq + freqReader.readVLong();
+ }
+ postingsReader.nextTerm(fieldInfo, state);
+ metaDataUpto++;
+ state.termCount++;
+ }
+ } else {
+ //System.out.println(" skip! seekPending");
+ }
+ }
+ }
+ }
+}
Added: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java?rev=1061453&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java (added)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java Thu Jan 20 18:53:55 2011
@@ -0,0 +1,316 @@
+package org.apache.lucene.index.codecs;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+
+// TODO: currently we encode all terms between two indexed
+// terms as a block; but, we could decouple the two, ie
+// allow several blocks in between two indexed terms
+
+/**
+ * Writes terms dict, block-encoding (column stride) each
+ * term's metadata for each set of terms between two
+ * index terms.
+ *
+ * @lucene.experimental
+ */
+
+public class BlockTermsWriter extends FieldsConsumer {
+
+ final static String CODEC_NAME = "BLOCK_TERMS_DICT";
+
+ // Initial format
+ public static final int VERSION_START = 0;
+
+ public static final int VERSION_CURRENT = VERSION_START;
+
+ /** Extension of terms file */
+ static final String TERMS_EXTENSION = "tib";
+
+ protected final IndexOutput out;
+ final PostingsWriterBase postingsWriter;
+ final FieldInfos fieldInfos;
+ // Last field handed to addField; used only to assert ascending field order.
+ FieldInfo currentField;
+ private final TermsIndexWriterBase termsIndexWriter;
+ private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
+ private final Comparator<BytesRef> termComp;
+ // Segment name, kept for debugging output only.
+ private final String segment;
+
+ /** Creates the terms-dict output file, writes the codec header (plus a
+  *  placeholder directory pointer), and lets the postings writer append
+  *  its own header. */
+ public BlockTermsWriter(
+ TermsIndexWriterBase termsIndexWriter,
+ SegmentWriteState state,
+ PostingsWriterBase postingsWriter,
+ Comparator<BytesRef> termComp) throws IOException
+ {
+ final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_EXTENSION);
+ this.termsIndexWriter = termsIndexWriter;
+ this.termComp = termComp;
+ out = state.directory.createOutput(termsFileName);
+ fieldInfos = state.fieldInfos;
+ writeHeader(out);
+ currentField = null;
+ this.postingsWriter = postingsWriter;
+ segment = state.segmentName;
+
+ //System.out.println("BTW.init seg=" + state.segmentName);
+
+ postingsWriter.start(out); // have consumer write its format/header
+ }
+
+ protected void writeHeader(IndexOutput out) throws IOException {
+ CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
+
+ out.writeLong(0); // leave space for end index pointer
+ }
+
+ /** Begins a new field; fields must arrive in ascending name order. */
+ @Override
+ public TermsConsumer addField(FieldInfo field) throws IOException {
+ //System.out.println("\nBTW.addField seg=" + segment + " field=" + field.name);
+ assert currentField == null || currentField.name.compareTo(field.name) < 0;
+ currentField = field;
+ TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field, out.getFilePointer());
+ final TermsWriter terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
+ fields.add(terms);
+ return terms;
+ }
+
+ /** Writes the field directory (only fields with at least one term),
+  *  back-patches the directory pointer, and closes all sub-writers. */
+ @Override
+ public void close() throws IOException {
+
+ try {
+
+ int nonZeroCount = 0;
+ for(TermsWriter field : fields) {
+ if (field.numTerms > 0) {
+ nonZeroCount++;
+ }
+ }
+
+ final long dirStart = out.getFilePointer();
+
+ out.writeVInt(nonZeroCount);
+ for(TermsWriter field : fields) {
+ if (field.numTerms > 0) {
+ out.writeVInt(field.fieldInfo.number);
+ out.writeVLong(field.numTerms);
+ out.writeVLong(field.termsStartPointer);
+ if (!field.fieldInfo.omitTermFreqAndPositions) {
+ out.writeVLong(field.sumTotalTermFreq);
+ }
+ }
+ }
+ writeTrailer(dirStart);
+ } finally {
+ // Close in order even if directory writing failed:
+ try {
+ out.close();
+ } finally {
+ try {
+ postingsWriter.close();
+ } finally {
+ termsIndexWriter.close();
+ }
+ }
+ }
+ }
+
+ protected void writeTrailer(long dirStart) throws IOException {
+ // Seek back to the long reserved in writeHeader and record where the
+ // field directory starts:
+ out.seek(CodecUtil.headerLength(CODEC_NAME));
+ out.writeLong(dirStart);
+ }
+
+ // One pending (buffered, not yet flushed) term: its bytes plus stats.
+ private static class TermEntry {
+ public final BytesRef term = new BytesRef();
+ public TermStats stats;
+ }
+
+ class TermsWriter extends TermsConsumer {
+ private final FieldInfo fieldInfo;
+ private final PostingsWriterBase postingsWriter;
+ private final long termsStartPointer;
+ private long numTerms;
+ private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
+ long sumTotalTermFreq;
+ private final BytesRef lastTerm = new BytesRef();
+
+ // RAM buffer of terms since the last flushed block; recycled across blocks.
+ private TermEntry[] pendingTerms;
+
+ private int pendingCount;
+
+ /** Records this field's start pointer in the terms file and pre-allocates
+  *  the pending-terms buffer (grown on demand in finishTerm). */
+ TermsWriter(
+ TermsIndexWriterBase.FieldWriter fieldIndexWriter,
+ FieldInfo fieldInfo,
+ PostingsWriterBase postingsWriter)
+ {
+ this.fieldInfo = fieldInfo;
+ this.fieldIndexWriter = fieldIndexWriter;
+ pendingTerms = new TermEntry[32];
+ for(int i=0;i<pendingTerms.length;i++) {
+ pendingTerms[i] = new TermEntry();
+ }
+ termsStartPointer = out.getFilePointer();
+ postingsWriter.setField(fieldInfo);
+ this.postingsWriter = postingsWriter;
+ }
+
+ @Override
+ public Comparator<BytesRef> getComparator() {
+ return termComp;
+ }
+
+ @Override
+ public PostingsConsumer startTerm(BytesRef text) throws IOException {
+ //System.out.println("BTW.startTerm seg=" + segment + " term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text);
+ postingsWriter.startTerm();
+ return postingsWriter;
+ }
+
+ // Last term of the previously flushed block; terms in the next block are
+ // prefix-compressed against it (see flushBlock).
+ private final BytesRef lastPrevTerm = new BytesRef();
+
+ /** Buffers the finished term; when the index writer declares it an index
+  *  term, the buffered run is first flushed as one block. */
+ @Override
+ public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+
+ assert stats.docFreq > 0;
+ //System.out.println("BTW.finishTerm seg=" + segment + " term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " df=" + stats.docFreq);
+
+ final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats);
+
+ if (isIndexTerm) {
+ if (pendingCount > 0) {
+ // Instead of writing each term, live, we gather terms
+ // in RAM in a pending buffer, and then write the
+ // entire block in between index terms:
+ flushBlock();
+ }
+ fieldIndexWriter.add(text, stats, out.getFilePointer());
+ }
+
+ if (pendingTerms.length == pendingCount) {
+ final TermEntry[] newArray = new TermEntry[ArrayUtil.oversize(pendingCount+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ System.arraycopy(pendingTerms, 0, newArray, 0, pendingCount);
+ for(int i=pendingCount;i<newArray.length;i++) {
+ newArray[i] = new TermEntry();
+ }
+ pendingTerms = newArray;
+ }
+ final TermEntry te = pendingTerms[pendingCount];
+ te.term.copy(text);
+ te.stats = stats;
+
+ pendingCount++;
+
+ postingsWriter.finishTerm(stats);
+ numTerms++;
+ }
+
+ // Finishes all terms in this field
+ @Override
+ public void finish(long sumTotalTermFreq) throws IOException {
+ if (pendingCount > 0) {
+ flushBlock();
+ }
+ // EOF marker:
+ out.writeVInt(0);
+
+ this.sumTotalTermFreq = sumTotalTermFreq;
+ fieldIndexWriter.finish(out.getFilePointer());
+ }
+
+ private int sharedPrefix(BytesRef term1, BytesRef term2) {
+ assert term1.offset == 0;
+ assert term2.offset == 0;
+ int pos1 = 0;
+ int pos1End = pos1 + Math.min(term1.length, term2.length);
+ int pos2 = 0;
+ while(pos1 < pos1End) {
+ if (term1.bytes[pos1] != term2.bytes[pos2]) {
+ return pos1;
+ }
+ pos1++;
+ pos2++;
+ }
+ return pos1;
+ }
+
+ // Scratch buffer for assembling each blob before copying it to out.
+ private final RAMOutputStream bytesWriter = new RAMOutputStream();
+
+ /** Writes all pending terms as one block: term count, common prefix,
+  *  suffix blob, then stats blob, then the postings writer's block data. */
+ private void flushBlock() throws IOException {
+ //System.out.println("BTW.flushBlock pendingCount=" + pendingCount);
+
+ // First pass: compute common prefix for all terms
+ // in the block, against term before first term in
+ // this block:
+ // (Min over each term's shared prefix with lastPrevTerm is a valid —
+ // though not necessarily maximal — common prefix of the block.)
+ int commonPrefix = sharedPrefix(lastPrevTerm, pendingTerms[0].term);
+ for(int termCount=1;termCount<pendingCount;termCount++) {
+ commonPrefix = Math.min(commonPrefix,
+ sharedPrefix(lastPrevTerm,
+ pendingTerms[termCount].term));
+ }
+
+ out.writeVInt(pendingCount);
+ out.writeVInt(commonPrefix);
+
+ // 2nd pass: write suffixes, as separate byte[] blob
+ for(int termCount=0;termCount<pendingCount;termCount++) {
+ final int suffix = pendingTerms[termCount].term.length - commonPrefix;
+ // TODO: cutover to better intblock codec, instead
+ // of interleaving here:
+ bytesWriter.writeVInt(suffix);
+ bytesWriter.writeBytes(pendingTerms[termCount].term.bytes, commonPrefix, suffix);
+ }
+ out.writeVInt((int) bytesWriter.getFilePointer());
+ bytesWriter.writeTo(out);
+ bytesWriter.reset();
+
+ // 3rd pass: write the freqs as byte[] blob
+ // TODO: cutover to better intblock codec. simple64?
+ // write prefix, suffix first:
+ for(int termCount=0;termCount<pendingCount;termCount++) {
+ final TermStats stats = pendingTerms[termCount].stats;
+ assert stats != null;
+ bytesWriter.writeVInt(stats.docFreq);
+ if (!fieldInfo.omitTermFreqAndPositions) {
+ // totalTermFreq is delta-coded against docFreq (always >= docFreq):
+ bytesWriter.writeVLong(stats.totalTermFreq-stats.docFreq);
+ }
+ }
+
+ out.writeVInt((int) bytesWriter.getFilePointer());
+ bytesWriter.writeTo(out);
+ bytesWriter.reset();
+
+ postingsWriter.flushTermsBlock();
+ lastPrevTerm.copy(pendingTerms[pendingCount-1].term);
+ pendingCount = 0;
+ }
+ }
+}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java Thu Jan 20 18:53:55 2011
@@ -94,6 +94,7 @@ public class FixedGapTermsIndexReader ex
// Read directory
final int numFields = in.readVInt();
+ //System.out.println("FGR: init seg=" + segment + " div=" + indexDivisor + " nF=" + numFields);
for(int i=0;i<numFields;i++) {
final int field = in.readVInt();
final int numIndexTerms = in.readVInt();
@@ -241,9 +242,6 @@ public class FixedGapTermsIndexReader ex
this.packedOffsetsStart = packedOffsetsStart;
this.numIndexTerms = numIndexTerms;
- // We still create the indexReader when indexDivisor
- // is -1, so that PrefixCodedTermsReader can call
- // isIndexTerm for each field:
if (indexDivisor > 0) {
loadTermsIndex();
}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java Thu Jan 20 18:53:55 2011
@@ -53,7 +53,6 @@ public class FixedGapTermsIndexWriter ex
private final List<SimpleFieldWriter> fields = new ArrayList<SimpleFieldWriter>();
private final FieldInfos fieldInfos; // unread
- private IndexOutput termsOut;
public FixedGapTermsIndexWriter(SegmentWriteState state) throws IOException {
final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_INDEX_EXTENSION);
@@ -71,13 +70,9 @@ public class FixedGapTermsIndexWriter ex
}
@Override
- public void setTermsOutput(IndexOutput termsOut) {
- this.termsOut = termsOut;
- }
-
- @Override
- public FieldWriter addField(FieldInfo field) {
- SimpleFieldWriter writer = new SimpleFieldWriter(field);
+ public FieldWriter addField(FieldInfo field, long termsFilePointer) {
+ //System.out.println("FGW: addFfield=" + field.name);
+ SimpleFieldWriter writer = new SimpleFieldWriter(field, termsFilePointer);
fields.add(writer);
return writer;
}
@@ -119,10 +114,10 @@ public class FixedGapTermsIndexWriter ex
private final BytesRef lastTerm = new BytesRef();
- SimpleFieldWriter(FieldInfo fieldInfo) {
+ SimpleFieldWriter(FieldInfo fieldInfo, long termsFilePointer) {
this.fieldInfo = fieldInfo;
indexStart = out.getFilePointer();
- termsStart = lastTermsPointer = termsOut.getFilePointer();
+ termsStart = lastTermsPointer = termsFilePointer;
termLengths = new short[0];
termsPointerDeltas = new int[0];
}
@@ -130,33 +125,8 @@ public class FixedGapTermsIndexWriter ex
@Override
public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
// First term is first indexed term:
+ //System.out.println("FGW: checkIndexTerm text=" + text.utf8ToString());
if (0 == (numTerms++ % termIndexInterval)) {
-
- final int indexedTermLength = indexedTermPrefixLength(lastTerm, text);
-
- // write only the min prefix that shows the diff
- // against prior term
- out.writeBytes(text.bytes, text.offset, indexedTermLength);
-
- if (termLengths.length == numIndexTerms) {
- termLengths = ArrayUtil.grow(termLengths);
- }
- if (termsPointerDeltas.length == numIndexTerms) {
- termsPointerDeltas = ArrayUtil.grow(termsPointerDeltas);
- }
-
- // save delta terms pointer
- final long fp = termsOut.getFilePointer();
- termsPointerDeltas[numIndexTerms] = (int) (fp - lastTermsPointer);
- lastTermsPointer = fp;
-
- // save term length (in bytes)
- assert indexedTermLength <= Short.MAX_VALUE;
- termLengths[numIndexTerms] = (short) indexedTermLength;
- totTermLength += indexedTermLength;
-
- lastTerm.copy(text);
- numIndexTerms++;
return true;
} else {
if (0 == numTerms % termIndexInterval) {
@@ -169,13 +139,41 @@ public class FixedGapTermsIndexWriter ex
}
@Override
- public void finish() throws IOException {
+ public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException {
+ final int indexedTermLength = indexedTermPrefixLength(lastTerm, text);
+ //System.out.println("FGW: add text=" + text.utf8ToString() + " " + text + " fp=" + termsFilePointer);
+
+ // write only the min prefix that shows the diff
+ // against prior term
+ out.writeBytes(text.bytes, text.offset, indexedTermLength);
+
+ if (termLengths.length == numIndexTerms) {
+ termLengths = ArrayUtil.grow(termLengths);
+ }
+ if (termsPointerDeltas.length == numIndexTerms) {
+ termsPointerDeltas = ArrayUtil.grow(termsPointerDeltas);
+ }
+
+ // save delta terms pointer
+ termsPointerDeltas[numIndexTerms] = (int) (termsFilePointer - lastTermsPointer);
+ lastTermsPointer = termsFilePointer;
+
+ // save term length (in bytes)
+ assert indexedTermLength <= Short.MAX_VALUE;
+ termLengths[numIndexTerms] = (short) indexedTermLength;
+ totTermLength += indexedTermLength;
+
+ lastTerm.copy(text);
+ numIndexTerms++;
+ }
+
+ @Override
+ public void finish(long termsFilePointer) throws IOException {
// write primary terms dict offsets
packedIndexStart = out.getFilePointer();
- final long maxValue = termsOut.getFilePointer();
- PackedInts.Writer w = PackedInts.getWriter(out, numIndexTerms, PackedInts.bitsRequired(maxValue));
+ PackedInts.Writer w = PackedInts.getWriter(out, numIndexTerms, PackedInts.bitsRequired(termsFilePointer));
// relative to our indexStart
long upto = 0;
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java Thu Jan 20 18:53:55 2011
@@ -28,12 +28,12 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; // javadocs
-/** PrefixCodedTermsReader interacts with a single instance
- * of this to manage creation of {@link DocsEnum} and
+/** BlockTermsReader interacts with a single instance
+ * of this class to manage creation of {@link DocsEnum} and
* {@link DocsAndPositionsEnum} instances. It provides an
* IndexInput (termsIn) where this class may read any
* previously stored data that it had written in its
- * corresponding {@link StandardPostingsWriter} at indexing
+ * corresponding {@link PostingsWriterBase} at indexing
* time.
* @lucene.experimental */
@@ -42,17 +42,23 @@ public abstract class PostingsReaderBase
public abstract void init(IndexInput termsIn) throws IOException;
/** Return a newly created empty TermState */
- public abstract PrefixCodedTermState newTermState() throws IOException;
+ public abstract BlockTermState newTermState() throws IOException;
- public abstract void readTerm(IndexInput termsIn, FieldInfo fieldInfo, PrefixCodedTermState state, boolean isIndexTerm) throws IOException;
+ /** Actually decode metadata for next term */
+ public abstract void nextTerm(FieldInfo fieldInfo, BlockTermState state) throws IOException;
/** Must fully consume state, since after this call that
* TermState may be reused. */
- public abstract DocsEnum docs(FieldInfo fieldInfo, PrefixCodedTermState state, Bits skipDocs, DocsEnum reuse) throws IOException;
+ public abstract DocsEnum docs(FieldInfo fieldInfo, BlockTermState state, Bits skipDocs, DocsEnum reuse) throws IOException;
/** Must fully consume state, since after this call that
* TermState may be reused. */
- public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, PrefixCodedTermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
+ public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
public abstract void close() throws IOException;
+
+ /** Reads data for all terms in the next block; this
+ * method should merely load the byte[] blob but not
+ * decode, which is done in {@link #nextTerm}. */
+ public abstract void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState termState) throws IOException;
}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java Thu Jan 20 18:53:55 2011
@@ -33,8 +33,10 @@ public abstract class PostingsWriterBase
public abstract void startTerm() throws IOException;
+ public abstract void flushTermsBlock() throws IOException;
+
/** Finishes the current term */
- public abstract void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException;
+ public abstract void finishTerm(TermStats stats) throws IOException;
public abstract void setField(FieldInfo fieldInfo);
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java Thu Jan 20 18:53:55 2011
@@ -17,7 +17,6 @@ package org.apache.lucene.index.codecs;
* limitations under the License.
*/
-import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
@@ -25,14 +24,13 @@ import java.io.IOException;
/** @lucene.experimental */
public abstract class TermsIndexWriterBase {
- public abstract void setTermsOutput(IndexOutput out);
-
public abstract class FieldWriter {
public abstract boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException;
- public abstract void finish() throws IOException;
+ public abstract void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException;
+ public abstract void finish(long termsFilePointer) throws IOException;
}
- public abstract FieldWriter addField(FieldInfo fieldInfo) throws IOException;
+ public abstract FieldWriter addField(FieldInfo fieldInfo, long termsFilePointer) throws IOException;
public abstract void close() throws IOException;
}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java Thu Jan 20 18:53:55 2011
@@ -164,9 +164,6 @@ public class VariableGapTermsIndexReader
this.fieldInfo = fieldInfo;
this.indexStart = indexStart;
- // We still create the indexReader when indexDivisor
- // is -1, so that PrefixCodedTermsReader can call
- // isIndexTerm for each field:
if (indexDivisor > 0) {
loadTermsIndex();
}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java Thu Jan 20 18:53:55 2011
@@ -52,7 +52,6 @@ public class VariableGapTermsIndexWriter
private final List<FSTFieldWriter> fields = new ArrayList<FSTFieldWriter>();
private final FieldInfos fieldInfos; // unread
- private IndexOutput termsOut;
private final IndexTermSelector policy;
/** @lucene.experimental */
@@ -60,6 +59,7 @@ public class VariableGapTermsIndexWriter
// Called sequentially on every term being written,
// returning true if this term should be indexed
public abstract boolean isIndexTerm(BytesRef term, TermStats stats);
+ public abstract void newField(FieldInfo fieldInfo);
}
/** Same policy as {@link FixedGapTermsIndexWriter} */
@@ -83,6 +83,11 @@ public class VariableGapTermsIndexWriter
return false;
}
}
+
+ @Override
+ public void newField(FieldInfo fieldInfo) {
+ count = interval;
+ }
}
/** Sets an index term when docFreq >= docFreqThresh, or
@@ -96,6 +101,9 @@ public class VariableGapTermsIndexWriter
public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval) {
this.interval = interval;
this.docFreqThresh = docFreqThresh;
+
+ // First term is first indexed term:
+ count = interval;
}
@Override
@@ -108,6 +116,11 @@ public class VariableGapTermsIndexWriter
return false;
}
}
+
+ @Override
+ public void newField(FieldInfo fieldInfo) {
+ count = interval;
+ }
}
// TODO: it'd be nice to let the FST builder prune based
@@ -158,14 +171,10 @@ public class VariableGapTermsIndexWriter
}
@Override
- public void setTermsOutput(IndexOutput termsOut) {
- this.termsOut = termsOut;
- }
-
- @Override
- public FieldWriter addField(FieldInfo field) throws IOException {
- //System.out.println("VGW: field=" + field.name);
- FSTFieldWriter writer = new FSTFieldWriter(field);
+ public FieldWriter addField(FieldInfo field, long termsFilePointer) throws IOException {
+ ////System.out.println("VGW: field=" + field.name);
+ policy.newField(field);
+ FSTFieldWriter writer = new FSTFieldWriter(field, termsFilePointer);
fields.add(writer);
return writer;
}
@@ -200,42 +209,48 @@ public class VariableGapTermsIndexWriter
private final BytesRef lastTerm = new BytesRef();
private boolean first = true;
- public FSTFieldWriter(FieldInfo fieldInfo) throws IOException {
+ public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
this.fieldInfo = fieldInfo;
fstOutputs = PositiveIntOutputs.getSingleton(true);
fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1,
0, 0, true,
fstOutputs);
indexStart = out.getFilePointer();
- //System.out.println("VGW: field=" + fieldInfo.name);
+ ////System.out.println("VGW: field=" + fieldInfo.name);
// Always put empty string in
- fstBuilder.add(new BytesRef(), fstOutputs.get(termsOut.getFilePointer()));
+ fstBuilder.add(new BytesRef(), fstOutputs.get(termsFilePointer));
}
@Override
public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
+ //System.out.println("VGW: index term=" + text.utf8ToString());
+ // NOTE: we must force the first term per field to be
+ // indexed, in case policy doesn't:
if (policy.isIndexTerm(text, stats) || first) {
first = false;
- //System.out.println("VGW: index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
- final int lengthSave = text.length;
- text.length = indexedTermPrefixLength(lastTerm, text);
- try {
- fstBuilder.add(text, fstOutputs.get(termsOut.getFilePointer()));
- } finally {
- text.length = lengthSave;
- }
- lastTerm.copy(text);
+ //System.out.println(" YES");
return true;
} else {
- //System.out.println("VGW: not index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
lastTerm.copy(text);
return false;
}
}
@Override
- public void finish() throws IOException {
+ public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException {
+ final int lengthSave = text.length;
+ text.length = indexedTermPrefixLength(lastTerm, text);
+ try {
+ fstBuilder.add(text, fstOutputs.get(termsFilePointer));
+ } finally {
+ text.length = lengthSave;
+ }
+ lastTerm.copy(text);
+ }
+
+ @Override
+ public void finish(long termsFilePointer) throws IOException {
fst = fstBuilder.finish();
if (fst != null) {
fst.save(out);
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java Thu Jan 20 18:53:55 2011
@@ -24,6 +24,7 @@ package org.apache.lucene.index.codecs.i
import java.io.IOException;
import org.apache.lucene.index.codecs.sep.IntIndexInput;
+import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.IntsRef;
@@ -149,7 +150,7 @@ public abstract class FixedIntBlockIndex
private int upto;
@Override
- public void read(final IndexInput indexIn, final boolean absolute) throws IOException {
+ public void read(final DataInput indexIn, final boolean absolute) throws IOException {
if (absolute) {
fp = indexIn.readVLong();
upto = indexIn.readVInt();
@@ -205,5 +206,10 @@ public abstract class FixedIntBlockIndex
other.upto = upto;
return other;
}
+
+ @Override
+ public String toString() {
+ return "fp=" + fp + " upto=" + upto;
+ }
}
}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java Thu Jan 20 18:53:55 2011
@@ -111,6 +111,11 @@ public abstract class FixedIntBlockIndex
lastUpto = upto;
lastFP = fp;
}
+
+ @Override
+ public String toString() {
+ return "fp=" + fp + " upto=" + upto;
+ }
}
@Override
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java Thu Jan 20 18:53:55 2011
@@ -24,6 +24,7 @@ package org.apache.lucene.index.codecs.i
import java.io.IOException;
import org.apache.lucene.index.codecs.sep.IntIndexInput;
+import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.IntsRef;
@@ -168,7 +169,7 @@ public abstract class VariableIntBlockIn
private int upto;
@Override
- public void read(final IndexInput indexIn, final boolean absolute) throws IOException {
+ public void read(final DataInput indexIn, final boolean absolute) throws IOException {
if (absolute) {
fp = indexIn.readVLong();
upto = indexIn.readByte()&0xFF;
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java?rev=1061453&r1=1061452&r2=1061453&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java Thu Jan 20 18:53:55 2011
@@ -32,8 +32,8 @@ import org.apache.lucene.index.codecs.Fi
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.codecs.FixedGapTermsIndexReader;
import org.apache.lucene.index.codecs.FixedGapTermsIndexWriter;
-import org.apache.lucene.index.codecs.PrefixCodedTermsReader;
-import org.apache.lucene.index.codecs.PrefixCodedTermsWriter;
+import org.apache.lucene.index.codecs.BlockTermsReader;
+import org.apache.lucene.index.codecs.BlockTermsWriter;
import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.index.codecs.standard.StandardCodec;
@@ -89,7 +89,7 @@ public class PulsingCodec extends Codec
// Terms dict
success = false;
try {
- FieldsConsumer ret = new PrefixCodedTermsWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
+ FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
return ret;
} finally {
@@ -132,13 +132,13 @@ public class PulsingCodec extends Codec
// Terms dict reader
success = false;
try {
- FieldsProducer ret = new PrefixCodedTermsReader(indexReader,
- state.dir, state.fieldInfos, state.segmentInfo.name,
- pulsingReader,
- state.readBufferSize,
- BytesRef.getUTF8SortedAsUnicodeComparator(),
- StandardCodec.TERMS_CACHE_SIZE,
- state.codecId);
+ FieldsProducer ret = new BlockTermsReader(indexReader,
+ state.dir, state.fieldInfos, state.segmentInfo.name,
+ pulsingReader,
+ state.readBufferSize,
+ BytesRef.getUTF8SortedAsUnicodeComparator(),
+ StandardCodec.TERMS_CACHE_SIZE,
+ state.codecId);
success = true;
return ret;
} finally {
@@ -155,7 +155,7 @@ public class PulsingCodec extends Codec
@Override
public void files(Directory dir, SegmentInfo segmentInfo, String id, Set<String> files) throws IOException {
StandardPostingsReader.files(dir, segmentInfo, id, files);
- PrefixCodedTermsReader.files(dir, segmentInfo, id, files);
+ BlockTermsReader.files(dir, segmentInfo, id, files);
FixedGapTermsIndexReader.files(dir, segmentInfo, id, files);
}