You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/01/20 20:52:08 UTC
svn commit: r1061480 [2/5] - in /lucene/dev/branches/bulkpostings: ./
dev-tools/idea/.idea/copyright/ lucene/ lucene/contrib/
lucene/contrib/demo/src/java/org/apache/lucene/demo/
lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/ l...
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java Thu Jan 20 19:52:03 2011
@@ -23,6 +23,7 @@ import java.nio.CharBuffer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.UnicodeUtil;
@@ -244,6 +245,14 @@ public class CharTermAttributeImpl exten
}
@Override
+ public void reflectWith(AttributeReflector reflector) {
+ reflector.reflect(CharTermAttribute.class, "term", toString());
+ final BytesRef bytes = new BytesRef();
+ toBytesRef(bytes);
+ reflector.reflect(TermToBytesRefAttribute.class, "bytes", bytes);
+ }
+
+ @Override
public void copyTo(AttributeImpl target) {
CharTermAttribute t = (CharTermAttribute) target;
t.copyBuffer(termBuffer, 0, termLength);
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java Thu Jan 20 19:52:03 2011
@@ -229,7 +229,9 @@ class BufferedDeletes {
if (mergedDeletes == null) {
mergedDeletes = getDeletes(segmentInfos.info(firstIdx-1));
numTerms.addAndGet(-mergedDeletes.numTermDeletes.get());
+ assert numTerms.get() >= 0;
bytesUsed.addAndGet(-mergedDeletes.bytesUsed.get());
+ assert bytesUsed.get() >= 0;
}
mergedDeletes.update(deletes, true);
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/CheckIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/CheckIndex.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/CheckIndex.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/CheckIndex.java Thu Jan 20 19:52:03 2011
@@ -610,6 +610,8 @@ public class CheckIndex {
Comparator<BytesRef> termComp = terms.getComparator();
+ long sumTotalTermFreq = 0;
+
while(true) {
final BytesRef term = terms.next();
@@ -660,6 +662,8 @@ public class CheckIndex {
}
int lastDoc = -1;
+ int docCount = 0;
+ long totalTermFreq = 0;
while(true) {
final int doc = docs2.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
@@ -667,6 +671,8 @@ public class CheckIndex {
}
final int freq = docs2.freq();
status.totPos += freq;
+ totalTermFreq += freq;
+ docCount++;
if (doc <= lastDoc) {
throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
@@ -697,22 +703,39 @@ public class CheckIndex {
}
}
}
+
+ final long totalTermFreq2 = terms.totalTermFreq();
+ final boolean hasTotalTermFreq = postings != null && totalTermFreq2 != -1;
- // Now count how many deleted docs occurred in
- // this term:
-
+ // Re-count if there are deleted docs:
if (reader.hasDeletions()) {
final DocsEnum docsNoDel = terms.docs(null, docs);
- int count = 0;
+ docCount = 0;
+ totalTermFreq = 0;
while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
- count++;
+ docCount++;
+ totalTermFreq += docsNoDel.freq();
}
- if (count != docFreq) {
- throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + count);
+ }
+
+ if (docCount != docFreq) {
+ throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + docCount);
+ }
+ if (hasTotalTermFreq) {
+ sumTotalTermFreq += totalTermFreq;
+ if (totalTermFreq != totalTermFreq2) {
+ throw new RuntimeException("term " + term + " totalTermFreq=" + totalTermFreq2 + " != recomputed totalTermFreq=" + totalTermFreq);
}
}
}
+ if (sumTotalTermFreq != 0) {
+ final long v = fields.terms(field).getSumTotalTermFreq();
+ if (v != -1 && sumTotalTermFreq != v) {
+ throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq);
+ }
+ }
+
// Test seek to last term:
if (lastTerm != null) {
if (terms.seek(lastTerm) != TermsEnum.SeekStatus.FOUND) {
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java Thu Jan 20 19:52:03 2011
@@ -142,8 +142,12 @@ public class ConcurrentMergeScheduler ex
}
};
- /** Called whenever the running merges have changed, to
- * pause & unpause threads. */
+ /**
+ * Called whenever the running merges have changed, to pause & unpause
+ * threads. This method sorts the merge threads by their merge size in
+ * descending order and then pauses/unpauses threads from first to last --
+ * that way, smaller merges are guaranteed to run before larger ones.
+ */
protected synchronized void updateMergeThreads() {
// Only look at threads that are alive & not in the
@@ -164,6 +168,7 @@ public class ConcurrentMergeScheduler ex
threadIdx++;
}
+ // Sort the merge threads in descending order.
CollectionUtil.mergeSort(activeMerges, compareByMergeDocCount);
int pri = mergeThreadPriority;
@@ -175,12 +180,8 @@ public class ConcurrentMergeScheduler ex
continue;
}
- final boolean doPause;
- if (threadIdx < activeMergeCount-maxThreadCount) {
- doPause = true;
- } else {
- doPause = false;
- }
+ // pause the thread if maxThreadCount is smaller than the number of merge threads.
+ final boolean doPause = threadIdx < activeMergeCount - maxThreadCount;
if (verbose()) {
if (doPause != merge.getPause()) {
@@ -205,13 +206,26 @@ public class ConcurrentMergeScheduler ex
}
}
- private boolean verbose() {
+ /**
+ * Returns true if verbose output is enabled. This method is usually used in
+ * conjunction with {@link #message(String)}, like this:
+ *
+ * <pre>
+ * if (verbose()) {
+ * message("your message");
+ * }
+ * </pre>
+ */
+ protected boolean verbose() {
return writer != null && writer.verbose();
}
- private void message(String message) {
- if (verbose())
- writer.message("CMS: " + message);
+ /**
+ * Outputs the given message - this method assumes {@link #verbose()} was
+ * called and returned true.
+ */
+ protected void message(String message) {
+ writer.message("CMS: " + message);
}
private synchronized void initMergeThreadPriority() {
@@ -231,10 +245,10 @@ public class ConcurrentMergeScheduler ex
/** Wait for any running merge threads to finish */
public void sync() {
- while(true) {
+ while (true) {
MergeThread toSync = null;
- synchronized(this) {
- for(MergeThread t : mergeThreads) {
+ synchronized (this) {
+ for (MergeThread t : mergeThreads) {
if (t.isAlive()) {
toSync = t;
break;
@@ -253,12 +267,14 @@ public class ConcurrentMergeScheduler ex
}
}
- private synchronized int mergeThreadCount() {
+ /**
+ * Returns the number of merge threads that are alive. Note that this number
+ * is ≤ {@link #mergeThreads} size.
+ */
+ protected synchronized int mergeThreadCount() {
int count = 0;
- final int numThreads = mergeThreads.size();
- for(int i=0;i<numThreads;i++) {
- final MergeThread t = mergeThreads.get(i);
- if (t.isAlive() && t.getCurrentMerge() != null) {
+ for (MergeThread mt : mergeThreads) {
+ if (mt.isAlive() && mt.getCurrentMerge() != null) {
count++;
}
}
@@ -266,8 +282,7 @@ public class ConcurrentMergeScheduler ex
}
@Override
- public void merge(IndexWriter writer)
- throws CorruptIndexException, IOException {
+ public void merge(IndexWriter writer) throws IOException {
assert !Thread.holdsLock(writer);
@@ -291,7 +306,7 @@ public class ConcurrentMergeScheduler ex
// Iterate, pulling from the IndexWriter's queue of
// pending merges, until it's empty:
- while(true) {
+ while (true) {
// TODO: we could be careful about which merges to do in
// the BG (eg maybe the "biggest" ones) vs FG, which
@@ -360,8 +375,7 @@ public class ConcurrentMergeScheduler ex
}
/** Does the actual merge, by calling {@link IndexWriter#merge} */
- protected void doMerge(MergePolicy.OneMerge merge)
- throws IOException {
+ protected void doMerge(MergePolicy.OneMerge merge) throws IOException {
writer.merge(merge);
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java Thu Jan 20 19:52:03 2011
@@ -63,8 +63,6 @@ final class DocInverterPerField extends
fieldState.reset(docState.doc.getBoost());
- final int maxFieldLength = docState.maxFieldLength;
-
final boolean doInvert = consumer.start(fields, count);
for(int i=0;i<count;i++) {
@@ -171,12 +169,8 @@ final class DocInverterPerField extends
if (!success)
docState.docWriter.setAborting();
}
+ fieldState.length++;
fieldState.position++;
- if (++fieldState.length >= maxFieldLength) {
- if (docState.infoStream != null)
- docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
- break;
- }
hasMoreTokens = stream.incrementToken();
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java Thu Jan 20 19:52:03 2011
@@ -127,7 +127,6 @@ final class DocumentsWriter {
private boolean aborting; // True if an abort is pending
PrintStream infoStream;
- int maxFieldLength = IndexWriterConfig.UNLIMITED_FIELD_LENGTH;
Similarity similarity;
// max # simultaneous threads; if there are more than
@@ -140,7 +139,6 @@ final class DocumentsWriter {
static class DocState {
DocumentsWriter docWriter;
Analyzer analyzer;
- int maxFieldLength;
PrintStream infoStream;
Similarity similarity;
int docID;
@@ -191,6 +189,7 @@ final class DocumentsWriter {
/**
* Allocate bytes used from shared pool.
*/
+ @Override
protected byte[] newBuffer(int size) {
assert size == PER_DOC_BLOCK_SIZE;
return perDocAllocator.getByteBlock();
@@ -358,13 +357,6 @@ final class DocumentsWriter {
}
}
- synchronized void setMaxFieldLength(int maxFieldLength) {
- this.maxFieldLength = maxFieldLength;
- for(int i=0;i<threadStates.length;i++) {
- threadStates[i].docState.maxFieldLength = maxFieldLength;
- }
- }
-
synchronized void setSimilarity(Similarity similarity) {
this.similarity = similarity;
for(int i=0;i<threadStates.length;i++) {
@@ -546,6 +538,8 @@ final class DocumentsWriter {
// Lock order: IW -> DW
synchronized SegmentInfo flush(IndexWriter writer, IndexFileDeleter deleter, MergePolicy mergePolicy, SegmentInfos segmentInfos) throws IOException {
+ final long startTime = System.currentTimeMillis();
+
// We change writer's segmentInfos:
assert Thread.holdsLock(writer);
@@ -646,6 +640,10 @@ final class DocumentsWriter {
// Lock order: IW -> DW -> BD
pushDeletes(newSegment, segmentInfos);
+ if (infoStream != null) {
+ message("flush time " + (System.currentTimeMillis()-startTime) + " msec");
+ }
+
return newSegment;
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java Thu Jan 20 19:52:03 2011
@@ -35,7 +35,6 @@ final class DocumentsWriterThreadState {
public DocumentsWriterThreadState(DocumentsWriter docWriter) throws IOException {
this.docWriter = docWriter;
docState = new DocumentsWriter.DocState();
- docState.maxFieldLength = docWriter.maxFieldLength;
docState.infoStream = docWriter.infoStream;
docState.similarity = docWriter.similarity;
docState.docWriter = docWriter;
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java Thu Jan 20 19:52:03 2011
@@ -99,6 +99,11 @@ public class FilterIndexReader extends I
public long getUniqueTermCount() throws IOException {
return in.getUniqueTermCount();
}
+
+ @Override
+ public long getSumTotalTermFreq() throws IOException {
+ return in.getSumTotalTermFreq();
+ }
}
/** Base class for filtering {@link TermsEnum} implementations. */
@@ -156,6 +161,11 @@ public class FilterIndexReader extends I
}
@Override
+ public long totalTermFreq() {
+ return in.totalTermFreq();
+ }
+
+ @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
return in.docs(skipDocs, reuse);
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java Thu Jan 20 19:52:03 2011
@@ -20,13 +20,14 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
+import java.util.Comparator;
import java.util.List;
import java.util.Map;
-import java.util.Comparator;
-import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.FieldsConsumer;
+import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
+import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
@@ -165,6 +166,7 @@ final class FreqProxTermsWriter extends
// multiple threads and interacting with the
// TermsConsumer, only calling out to us (passing us the
// DocsConsumer) to handle delivery of docs/positions
+ long sumTotalTermFreq = 0;
while(numFields > 0) {
// Get the next term to merge
@@ -197,6 +199,7 @@ final class FreqProxTermsWriter extends
// which all share the same term. Now we must
// interleave the docID streams.
int numDocs = 0;
+ long totTF = 0;
while(numToMerge > 0) {
FreqProxFieldMergeState minState = termStates[0];
@@ -222,6 +225,7 @@ final class FreqProxTermsWriter extends
// omitTermFreqAndPositions == false so we do write positions &
// payload
int position = 0;
+ totTF += termDocFreq;
for(int j=0;j<termDocFreq;j++) {
final int code = prox.readVInt();
position += code >> 1;
@@ -286,9 +290,10 @@ final class FreqProxTermsWriter extends
}
assert numDocs > 0;
- termsConsumer.finishTerm(text, numDocs);
+ termsConsumer.finishTerm(text, new TermStats(numDocs, totTF));
+ sumTotalTermFreq += totTF;
}
- termsConsumer.finish();
+ termsConsumer.finish(sumTotalTermFreq);
}
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexReader.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexReader.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexReader.java Thu Jan 20 19:52:03 2011
@@ -997,6 +997,23 @@ public abstract class IndexReader implem
return terms.docFreq(term);
}
+ /** Returns the total number of occurrences of the given
+ * term across all documents (the sum of the freq() for each
+ * doc that contains it). This method returns 0 if the term or
+ * field does not exist. This method does not take into
+ * account deleted documents that have not yet been merged
+ * away. */
+ public long totalTermFreq(String field, BytesRef term) throws IOException {
+ final Fields fields = fields();
+ if (fields == null) {
+ return 0;
+ }
+ final Terms terms = fields.terms(field);
+ if (terms == null) {
+ return 0;
+ }
+ return terms.totalTermFreq(term);
+ }
+
/** This may return null if the field does not exist.*/
public Terms terms(String field) throws IOException {
final Fields fields = fields();
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexWriter.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexWriter.java Thu Jan 20 19:52:03 2011
@@ -662,9 +662,6 @@ public class IndexWriter implements Clos
* IndexWriter. Additionally, calling {@link #getConfig()} and changing the
* parameters does not affect that IndexWriter instance.
* <p>
- * <b>NOTE:</b> by default, {@link IndexWriterConfig#getMaxFieldLength()}
- * returns {@link IndexWriterConfig#UNLIMITED_FIELD_LENGTH}. Pay attention to
- * whether this setting fits your application.
*
* @param d
* the index directory. The index is either created or appended
@@ -689,7 +686,6 @@ public class IndexWriter implements Clos
directory = d;
analyzer = conf.getAnalyzer();
infoStream = defaultInfoStream;
- maxFieldLength = conf.getMaxFieldLength();
termIndexInterval = conf.getTermIndexInterval();
mergePolicy = conf.getMergePolicy();
mergePolicy.setIndexWriter(this);
@@ -768,7 +764,6 @@ public class IndexWriter implements Clos
docWriter = new DocumentsWriter(directory, this, conf.getIndexingChain(), conf.getMaxThreadStates(), getCurrentFieldInfos(), bufferedDeletes);
docWriter.setInfoStream(infoStream);
- docWriter.setMaxFieldLength(maxFieldLength);
// Default deleter (for backwards compatibility) is
// KeepOnlyLastCommitDeleter:
@@ -1177,25 +1172,7 @@ public class IndexWriter implements Clos
}
/**
- * The maximum number of terms that will be indexed for a single field in a
- * document. This limits the amount of memory required for indexing, so that
- * collections with very large files will not crash the indexing process by
- * running out of memory.<p/>
- * Note that this effectively truncates large documents, excluding from the
- * index terms that occur further in the document. If you know your source
- * documents are large, be sure to set this value high enough to accommodate
- * the expected size. If you set it to Integer.MAX_VALUE, then the only limit
- * is your memory, but you should anticipate an OutOfMemoryError.<p/>
- * By default, no more than 10,000 terms will be indexed for a field.
- *
- * @see MaxFieldLength
- */
- private int maxFieldLength;
-
- /**
- * Adds a document to this index. If the document contains more than
- * {@link IndexWriterConfig#setMaxFieldLength(int)} terms for a given field,
- * the remainder are discarded.
+ * Adds a document to this index.
*
* <p> Note that if an Exception is hit (for example disk full)
* then the index will be consistent, but this document
@@ -1242,9 +1219,7 @@ public class IndexWriter implements Clos
/**
* Adds a document to this index, using the provided analyzer instead of the
- * value of {@link #getAnalyzer()}. If the document contains more than
- * {@link IndexWriterConfig#setMaxFieldLength(int)} terms for a given field, the remainder are
- * discarded.
+ * value of {@link #getAnalyzer()}.
*
* <p>See {@link #addDocument(Document)} for details on
* index and IndexWriter state after an Exception, and
@@ -3280,7 +3255,7 @@ public class IndexWriter implements Clos
// NOTE: the callers of this method should in theory
// be able to do simply wait(), but, as a defense
// against thread timing hazards where notifyAll()
- // falls to be called, we wait for at most 1 second
+ // fails to be called, we wait for at most 1 second
// and then return so caller can check if wait
// conditions are satisfied:
try {
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java Thu Jan 20 19:52:03 2011
@@ -41,8 +41,6 @@ import org.apache.lucene.util.Version;
*/
public final class IndexWriterConfig implements Cloneable {
- public static final int UNLIMITED_FIELD_LENGTH = Integer.MAX_VALUE;
-
/**
* Specifies the open mode for {@link IndexWriter}:
* <ul>
@@ -55,7 +53,7 @@ public final class IndexWriterConfig imp
public static enum OpenMode { CREATE, APPEND, CREATE_OR_APPEND }
/** Default value is 32. Change using {@link #setTermIndexInterval(int)}. */
- public static final int DEFAULT_TERM_INDEX_INTERVAL = 32; // TODO: this should be private to the codec, not settable here
+ public static final int DEFAULT_TERM_INDEX_INTERVAL = 32; // TODO: this should be private to the codec, not settable here
/** Denotes a flush trigger is disabled. */
public final static int DISABLE_AUTO_FLUSH = -1;
@@ -113,7 +111,6 @@ public final class IndexWriterConfig imp
private IndexDeletionPolicy delPolicy;
private IndexCommit commit;
private OpenMode openMode;
- private int maxFieldLength;
private Similarity similarity;
private int termIndexInterval; // TODO: this should be private to the codec, not settable here
private MergeScheduler mergeScheduler;
@@ -145,7 +142,6 @@ public final class IndexWriterConfig imp
delPolicy = new KeepOnlyLastCommitDeletionPolicy();
commit = null;
openMode = OpenMode.CREATE_OR_APPEND;
- maxFieldLength = UNLIMITED_FIELD_LENGTH;
similarity = Similarity.getDefault();
termIndexInterval = DEFAULT_TERM_INDEX_INTERVAL; // TODO: this should be private to the codec, not settable here
mergeScheduler = new ConcurrentMergeScheduler();
@@ -220,37 +216,6 @@ public final class IndexWriterConfig imp
}
/**
- * The maximum number of terms that will be indexed for a single field in a
- * document. This limits the amount of memory required for indexing, so that
- * collections with very large files will not crash the indexing process by
- * running out of memory. This setting refers to the number of running terms,
- * not to the number of different terms.
- * <p>
- * <b>NOTE:</b> this silently truncates large documents, excluding from the
- * index all terms that occur further in the document. If you know your source
- * documents are large, be sure to set this value high enough to accomodate
- * the expected size. If you set it to {@link #UNLIMITED_FIELD_LENGTH}, then
- * the only limit is your memory, but you should anticipate an
- * OutOfMemoryError.
- * <p>
- * By default it is set to {@link #UNLIMITED_FIELD_LENGTH}.
- */
- public IndexWriterConfig setMaxFieldLength(int maxFieldLength) {
- this.maxFieldLength = maxFieldLength;
- return this;
- }
-
- /**
- * Returns the maximum number of terms that will be indexed for a single field
- * in a document.
- *
- * @see #setMaxFieldLength(int)
- */
- public int getMaxFieldLength() {
- return maxFieldLength;
- }
-
- /**
* Expert: allows to open a certain commit point. The default is null which
* opens the latest commit point.
*/
@@ -611,7 +576,6 @@ public final class IndexWriterConfig imp
sb.append("delPolicy=").append(delPolicy.getClass().getName()).append("\n");
sb.append("commit=").append(commit == null ? "null" : commit).append("\n");
sb.append("openMode=").append(openMode).append("\n");
- sb.append("maxFieldLength=").append(maxFieldLength).append("\n");
sb.append("similarity=").append(similarity.getClass().getName()).append("\n");
sb.append("termIndexInterval=").append(termIndexInterval).append("\n"); // TODO: this should be private to the codec, not settable here
sb.append("mergeScheduler=").append(mergeScheduler.getClass().getName()).append("\n");
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java Thu Jan 20 19:52:03 2011
@@ -30,9 +30,14 @@ public class LogByteSizeMergePolicy exte
* or larger will never be merged. @see setMaxMergeMB */
public static final double DEFAULT_MAX_MERGE_MB = 2048;
+ /** Default maximum segment size. A segment of this size
+ * or larger will never be merged during optimize. @see setMaxMergeMBForOptimize */
+ public static final double DEFAULT_MAX_MERGE_MB_FOR_OPTIMIZE = Long.MAX_VALUE;
+
public LogByteSizeMergePolicy() {
minMergeSize = (long) (DEFAULT_MIN_MERGE_MB*1024*1024);
maxMergeSize = (long) (DEFAULT_MAX_MERGE_MB*1024*1024);
+ maxMergeSizeForOptimize = (long) (DEFAULT_MAX_MERGE_MB_FOR_OPTIMIZE*1024*1024);
}
@Override
@@ -63,6 +68,23 @@ public class LogByteSizeMergePolicy exte
return ((double) maxMergeSize)/1024/1024;
}
+ /** <p>Determines the largest segment (measured by total
+ * byte size of the segment's files, in MB) that may be
+ * merged with other segments during optimize. Setting
+ * it low will leave the index with more than 1 segment,
+ * even if {@link IndexWriter#optimize()} is called.*/
+ public void setMaxMergeMBForOptimize(double mb) {
+ maxMergeSizeForOptimize = (long) (mb*1024*1024);
+ }
+
+ /** Returns the largest segment (measured by total byte
+ * size of the segment's files, in MB) that may be merged
+ * with other segments during optimize.
+ * @see #setMaxMergeMBForOptimize */
+ public double getMaxMergeMBForOptimize() {
+ return ((double) maxMergeSizeForOptimize)/1024/1024;
+ }
+
/** Sets the minimum size for the lowest level segments.
* Any segments below this size are considered to be on
* the same level (even if they vary drastically in size)
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogDocMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogDocMergePolicy.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogDocMergePolicy.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogDocMergePolicy.java Thu Jan 20 19:52:03 2011
@@ -31,9 +31,10 @@ public class LogDocMergePolicy extends L
public LogDocMergePolicy() {
minMergeSize = DEFAULT_MIN_MERGE_DOCS;
- // maxMergeSize is never used by LogDocMergePolicy; set
+ // maxMergeSize(ForOptimize) are never used by LogDocMergePolicy; set
// it to Long.MAX_VALUE to disable it
maxMergeSize = Long.MAX_VALUE;
+ maxMergeSizeForOptimize = Long.MAX_VALUE;
}
@Override
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java Thu Jan 20 19:52:03 2011
@@ -63,6 +63,9 @@ public abstract class LogMergePolicy ext
protected long minMergeSize;
protected long maxMergeSize;
+ // Although the core MPs set it explicitly, we must default in case someone
+ // out there wrote his own LMP ...
+ protected long maxMergeSizeForOptimize = Long.MAX_VALUE;
protected int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
protected double noCFSRatio = DEFAULT_NO_CFS_RATIO;
@@ -240,9 +243,9 @@ public abstract class LogMergePolicy ext
int start = last - 1;
while (start >= 0) {
SegmentInfo info = infos.info(start);
- if (size(info) > maxMergeSize || sizeDocs(info) > maxMergeDocs) {
+ if (size(info) > maxMergeSizeForOptimize || sizeDocs(info) > maxMergeDocs) {
if (verbose()) {
- message("optimize: skip segment=" + info + ": size is > maxMergeSize (" + maxMergeSize + ") or sizeDocs is > maxMergeDocs (" + maxMergeDocs + ")");
+ message("optimize: skip segment=" + info + ": size is > maxMergeSize (" + maxMergeSizeForOptimize + ") or sizeDocs is > maxMergeDocs (" + maxMergeDocs + ")");
}
// need to skip that segment + add a merge for the 'right' segments,
// unless there is only 1 which is optimized.
@@ -326,9 +329,12 @@ public abstract class LogMergePolicy ext
}
/** Returns the merges necessary to optimize the index.
- * This merge policy defines "optimized" to mean only one
- * segment in the index, where that segment has no
- * deletions pending nor separate norms, and it is in
+ * This merge policy defines "optimized" to mean only the
+ * requested number of segments is left in the index, and
+ * respects the {@link #maxMergeSizeForOptimize} setting.
+ * By default, and assuming {@code maxNumSegments=1}, only
+ * one segment will be left in the index, where that segment
+ * has no deletions pending nor separate norms, and it is in
* compound file format if the current useCompoundFile
* setting is true. This method returns multiple merges
* (mergeFactor at a time) so the {@link MergeScheduler}
@@ -382,7 +388,7 @@ public abstract class LogMergePolicy ext
boolean anyTooLarge = false;
for (int i = 0; i < last; i++) {
SegmentInfo info = infos.info(i);
- if (size(info) > maxMergeSize || sizeDocs(info) > maxMergeDocs) {
+ if (size(info) > maxMergeSizeForOptimize || sizeDocs(info) > maxMergeDocs) {
anyTooLarge = true;
break;
}
@@ -588,6 +594,7 @@ public abstract class LogMergePolicy ext
sb.append("minMergeSize=").append(minMergeSize).append(", ");
sb.append("mergeFactor=").append(mergeFactor).append(", ");
sb.append("maxMergeSize=").append(maxMergeSize).append(", ");
+ sb.append("maxMergeSizeForOptimize=").append(maxMergeSizeForOptimize).append(", ");
sb.append("calibrateSizeByDeletes=").append(calibrateSizeByDeletes).append(", ");
sb.append("maxMergeDocs=").append(maxMergeDocs).append(", ");
sb.append("useCompoundFile=").append(useCompoundFile);
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiTerms.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiTerms.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiTerms.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiTerms.java Thu Jan 20 19:52:03 2011
@@ -77,6 +77,19 @@ public final class MultiTerms extends Te
}
@Override
+ public long getSumTotalTermFreq() throws IOException {
+ long sum = 0;
+ for(Terms terms : subs) {
+ final long v = terms.getSumTotalTermFreq();
+ if (v == -1) {
+ return -1;
+ }
+ sum += v;
+ }
+ return sum;
+ }
+
+ @Override
public Comparator<BytesRef> getComparator() {
return termComp;
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java Thu Jan 20 19:52:03 2011
@@ -270,6 +270,19 @@ public final class MultiTermsEnum extend
}
@Override
+ public long totalTermFreq() {
+ long sum = 0;
+ for(int i=0;i<numTop;i++) {
+ final long v = top[i].terms.totalTermFreq();
+ if (v == -1) {
+ return v;
+ }
+ sum += v;
+ }
+ return sum;
+ }
+
+ @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
final MultiDocsEnum docsEnum;
if (reuse != null) {
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/SegmentReader.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/SegmentReader.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/SegmentReader.java Thu Jan 20 19:52:03 2011
@@ -335,29 +335,6 @@ public class SegmentReader extends Index
}
}
- // Load bytes but do not cache them if they were not
- // already cached
- public synchronized void bytes(byte[] bytesOut, int offset, int len) throws IOException {
- assert refCount > 0 && (origNorm == null || origNorm.refCount > 0);
- if (bytes != null) {
- // Already cached -- copy from cache:
- assert len <= maxDoc();
- System.arraycopy(bytes, 0, bytesOut, offset, len);
- } else {
- // Not cached
- if (origNorm != null) {
- // Ask origNorm to load
- origNorm.bytes(bytesOut, offset, len);
- } else {
- // We are orig -- read ourselves from disk:
- synchronized(in) {
- in.seek(normSeek);
- in.readBytes(bytesOut, offset, len, false);
- }
- }
- }
- }
-
// Load & cache full bytes array. Returns bytes.
public synchronized byte[] bytes() throws IOException {
assert refCount > 0 && (origNorm == null || origNorm.refCount > 0);
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/Terms.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/Terms.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/Terms.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/Terms.java Thu Jan 20 19:52:03 2011
@@ -57,6 +57,18 @@ public abstract class Terms {
}
}
+ /** Returns the total number of occurrences of the
+ * specified term text across all documents (the sum
+ * of the freq() for each doc that has this term).
+ * Returns 0 if the term does not exist. */
+ public long totalTermFreq(BytesRef text) throws IOException {
+ final TermsEnum termsEnum = getThreadTermsEnum();
+ if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) {
+ return termsEnum.totalTermFreq();
+ } else {
+ return 0;
+ }
+ }
+
/** Get {@link DocsEnum} for the specified term. This
* method may return null if the term does not exist. */
public DocsEnum docs(Bits skipDocs, BytesRef text, DocsEnum reuse) throws IOException {
@@ -133,6 +145,14 @@ public abstract class Terms {
throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
}
+ /** Returns the sum of {@link TermsEnum#totalTermFreq} for
+ * all terms in this field, or -1 if this measure isn't
+ * stored by the codec (or if this fields omits term freq
+ * and positions). Note that, just like other term
+ * measures, this measure does not take deleted documents
+ * into account. */
+ public abstract long getSumTotalTermFreq() throws IOException;
+
/**
* Returns a thread-private {@link TermsEnum} instance. Obtaining
* {@link TermsEnum} from this method might be more efficient than using
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/TermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/TermsEnum.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/TermsEnum.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/TermsEnum.java Thu Jan 20 19:52:03 2011
@@ -125,7 +125,15 @@ public abstract class TermsEnum {
* first time, after next() returns null or seek returns
* {@link SeekStatus#END}.*/
public abstract int docFreq();
-
+
+ /** Returns the total number of occurrences of this term
+ * across all documents (the sum of the freq() for each
+ * doc that has this term). This will be -1 if the
+ * codec doesn't support this measure. Note that, like
+ * other term measures, this measure does not take
+ * deleted documents into account. */
+ public abstract long totalTermFreq();
+
/** Get {@link DocsEnum} for the current term. Do not
* call this before calling {@link #next} or {@link
* #seek} for the first time. This method will not
@@ -202,6 +210,11 @@ public abstract class TermsEnum {
public int docFreq() {
throw new IllegalStateException("this method should never be called");
}
+
+ @Override
+ public long totalTermFreq() {
+ throw new IllegalStateException("this method should never be called");
+ }
@Override
public long ord() {
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java Thu Jan 20 19:52:03 2011
@@ -128,7 +128,7 @@ public class FixedGapTermsIndexWriter ex
}
@Override
- public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
+ public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
// First term is first indexed term:
if (0 == (numTerms++ % termIndexInterval)) {
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java Thu Jan 20 19:52:03 2011
@@ -55,9 +55,10 @@ public abstract class PostingsConsumer {
/** Default merge impl: append documents, mapping around
* deletes */
- public int merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
+ public TermStats merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
int df = 0;
+ long totTF = 0;
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
while(true) {
@@ -68,6 +69,7 @@ public abstract class PostingsConsumer {
this.startDoc(doc, postings.freq());
this.finishDoc();
df++;
+ totTF++;
}
} else {
final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings;
@@ -78,6 +80,7 @@ public abstract class PostingsConsumer {
}
final int freq = postingsEnum.freq();
this.startDoc(doc, freq);
+ totTF += freq;
for(int i=0;i<freq;i++) {
final int position = postingsEnum.nextPosition();
final BytesRef payload;
@@ -92,6 +95,6 @@ public abstract class PostingsConsumer {
df++;
}
}
- return df;
+ return new TermStats(df, totTF);
}
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java Thu Jan 20 19:52:03 2011
@@ -34,7 +34,7 @@ public abstract class PostingsWriterBase
public abstract void startTerm() throws IOException;
/** Finishes the current term */
- public abstract void finishTerm(int numDocs, boolean isIndexTerm) throws IOException;
+ public abstract void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException;
public abstract void setField(FieldInfo fieldInfo);
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermState.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermState.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermState.java Thu Jan 20 19:52:03 2011
@@ -16,6 +16,7 @@ package org.apache.lucene.index.codecs;
* limitations under the License.
*/
+import org.apache.lucene.index.DocsEnum; // javadocs
import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.TermState;
@@ -27,7 +28,8 @@ import org.apache.lucene.index.TermState
public class PrefixCodedTermState extends OrdTermState {
public int docFreq; // how many docs have this term
public long filePointer; // fp into the terms dict primary file (_X.tis)
-
+ public long totalTermFreq; // total number of occurrences of this term
+
@Override
public void copyFrom(TermState _other) {
assert _other instanceof PrefixCodedTermState : "can not copy from " + _other.getClass().getName();
@@ -35,11 +37,12 @@ public class PrefixCodedTermState extend
super.copyFrom(_other);
filePointer = other.filePointer;
docFreq = other.docFreq;
+ totalTermFreq = other.totalTermFreq;
}
@Override
public String toString() {
- return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + "]";
+ return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + ", docFreq=" + docFreq + ", totalTermFreq=" + totalTermFreq + "]";
}
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java Thu Jan 20 19:52:03 2011
@@ -130,18 +130,17 @@ public class PrefixCodedTermsReader exte
// Read per-field details
seekDir(in, dirOffset);
- final int numFields = in.readInt();
+ final int numFields = in.readVInt();
for(int i=0;i<numFields;i++) {
- final int field = in.readInt();
- final long numTerms = in.readLong();
+ final int field = in.readVInt();
+ final long numTerms = in.readVLong();
assert numTerms >= 0;
- final long termsStartPointer = in.readLong();
+ final long termsStartPointer = in.readVLong();
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
- if (numTerms > 0) {
- assert !fields.containsKey(fieldInfo.name);
- fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer));
- }
+ final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong();
+ assert !fields.containsKey(fieldInfo.name);
+ fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq));
}
success = true;
} finally {
@@ -246,12 +245,14 @@ public class PrefixCodedTermsReader exte
final long numTerms;
final FieldInfo fieldInfo;
final long termsStartPointer;
+ final long sumTotalTermFreq;
- FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer) {
+ FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq) {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
this.numTerms = numTerms;
this.termsStartPointer = termsStartPointer;
+ this.sumTotalTermFreq = sumTotalTermFreq;
}
@Override
@@ -274,6 +275,11 @@ public class PrefixCodedTermsReader exte
return numTerms;
}
+ @Override
+ public long getSumTotalTermFreq() {
+ return sumTotalTermFreq;
+ }
+
// Iterates through terms in this field, not supporting ord()
private final class SegmentTermsEnum extends TermsEnum {
private final IndexInput in;
@@ -296,6 +302,7 @@ public class PrefixCodedTermsReader exte
bytesReader = new DeltaBytesReader(in);
fieldTerm.field = fieldInfo.name;
state = postingsReader.newTermState();
+ state.totalTermFreq = -1;
state.ord = -1;
}
@@ -496,6 +503,10 @@ public class PrefixCodedTermsReader exte
state.docFreq = (in.readVInt() << 6) | (b & 0x3F);
}
+ if (!fieldInfo.omitTermFreqAndPositions) {
+ state.totalTermFreq = state.docFreq + in.readVLong();
+ }
+
postingsReader.readTerm(in,
fieldInfo, state,
isIndexTerm);
@@ -514,6 +525,11 @@ public class PrefixCodedTermsReader exte
}
@Override
+ public long totalTermFreq() {
+ return state.totalTermFreq;
+ }
+
+ @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
assert docsEnum != null;
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java Thu Jan 20 19:52:03 2011
@@ -60,7 +60,7 @@ public class PrefixCodedTermsWriter exte
final FieldInfos fieldInfos;
FieldInfo currentField;
private final TermsIndexWriterBase termsIndexWriter;
- private final List<TermsConsumer> fields = new ArrayList<TermsConsumer>();
+ private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
private final Comparator<BytesRef> termComp;
public PrefixCodedTermsWriter(
@@ -96,7 +96,7 @@ public class PrefixCodedTermsWriter exte
assert currentField == null || currentField.name.compareTo(field.name) < 0;
currentField = field;
TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field);
- TermsConsumer terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
+ final TermsWriter terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
fields.add(terms);
return terms;
}
@@ -105,16 +105,26 @@ public class PrefixCodedTermsWriter exte
public void close() throws IOException {
try {
- final int fieldCount = fields.size();
+
+ int nonZeroCount = 0;
+ for(TermsWriter field : fields) {
+ if (field.numTerms > 0) {
+ nonZeroCount++;
+ }
+ }
final long dirStart = out.getFilePointer();
- out.writeInt(fieldCount);
- for(int i=0;i<fieldCount;i++) {
- TermsWriter field = (TermsWriter) fields.get(i);
- out.writeInt(field.fieldInfo.number);
- out.writeLong(field.numTerms);
- out.writeLong(field.termsStartPointer);
+ out.writeVInt(nonZeroCount);
+ for(TermsWriter field : fields) {
+ if (field.numTerms > 0) {
+ out.writeVInt(field.fieldInfo.number);
+ out.writeVLong(field.numTerms);
+ out.writeVLong(field.termsStartPointer);
+ if (!field.fieldInfo.omitTermFreqAndPositions) {
+ out.writeVLong(field.sumTotalTermFreq);
+ }
+ }
}
writeTrailer(dirStart);
} finally {
@@ -142,6 +152,7 @@ public class PrefixCodedTermsWriter exte
private final long termsStartPointer;
private long numTerms;
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
+ long sumTotalTermFreq;
TermsWriter(
TermsIndexWriterBase.FieldWriter fieldIndexWriter,
@@ -169,12 +180,12 @@ public class PrefixCodedTermsWriter exte
}
@Override
- public void finishTerm(BytesRef text, int numDocs) throws IOException {
+ public void finishTerm(BytesRef text, TermStats stats) throws IOException {
- assert numDocs > 0;
+ assert stats.docFreq > 0;
//System.out.println("finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " fp=" + out.getFilePointer());
- final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, numDocs);
+ final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats);
termWriter.write(text);
final int highBit = isIndexTerm ? 0x80 : 0;
@@ -182,23 +193,28 @@ public class PrefixCodedTermsWriter exte
// This is a vInt, except, we steal top bit to record
// whether this was an indexed term:
- if ((numDocs & ~0x3F) == 0) {
+ if ((stats.docFreq & ~0x3F) == 0) {
// Fast case -- docFreq fits in 6 bits
- out.writeByte((byte) (highBit | numDocs));
+ out.writeByte((byte) (highBit | stats.docFreq));
} else {
// Write bottom 6 bits of docFreq, then write the
// remainder as vInt:
- out.writeByte((byte) (highBit | 0x40 | (numDocs & 0x3F)));
- out.writeVInt(numDocs >>> 6);
+ out.writeByte((byte) (highBit | 0x40 | (stats.docFreq & 0x3F)));
+ out.writeVInt(stats.docFreq >>> 6);
+ }
+ if (!fieldInfo.omitTermFreqAndPositions) {
+ assert stats.totalTermFreq >= stats.docFreq;
+ out.writeVLong(stats.totalTermFreq - stats.docFreq);
}
- postingsWriter.finishTerm(numDocs, isIndexTerm);
+ postingsWriter.finishTerm(stats, isIndexTerm);
numTerms++;
}
// Finishes all terms in this field
@Override
- public void finish() throws IOException {
+ public void finish(long sumTotalTermFreq) throws IOException {
// EOF marker:
+ this.sumTotalTermFreq = sumTotalTermFreq;
out.writeVInt(DeltaBytesWriter.TERM_EOF);
fieldIndexWriter.finish();
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java Thu Jan 20 19:52:03 2011
@@ -38,10 +38,10 @@ public abstract class TermsConsumer {
public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;
/** Finishes the current term; numDocs must be > 0. */
- public abstract void finishTerm(BytesRef text, int numDocs) throws IOException;
+ public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
/** Called when we are done adding terms to this field */
- public abstract void finish() throws IOException;
+ public abstract void finish(long sumTotalTermFreq) throws IOException;
/** Return the BytesRef Comparator used to sort terms
* before feeding to this API. */
@@ -55,6 +55,7 @@ public abstract class TermsConsumer {
BytesRef term;
assert termsEnum != null;
+ long sumTotalTermFreq = 0;
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
if (docsEnum == null) {
@@ -69,9 +70,9 @@ public abstract class TermsConsumer {
if (docsEnumIn != null) {
docsEnum.reset(docsEnumIn);
final PostingsConsumer postingsConsumer = startTerm(term);
- final int numDocs = postingsConsumer.merge(mergeState, docsEnum);
- if (numDocs > 0) {
- finishTerm(term, numDocs);
+ final TermStats stats = postingsConsumer.merge(mergeState, docsEnum);
+ if (stats.docFreq > 0) {
+ finishTerm(term, stats);
}
}
}
@@ -94,14 +95,15 @@ public abstract class TermsConsumer {
}
}
final PostingsConsumer postingsConsumer = startTerm(term);
- final int numDocs = postingsConsumer.merge(mergeState, postingsEnum);
- if (numDocs > 0) {
- finishTerm(term, numDocs);
+ final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum);
+ if (stats.docFreq > 0) {
+ finishTerm(term, stats);
+ sumTotalTermFreq += stats.totalTermFreq;
}
}
}
}
- finish();
+ finish(sumTotalTermFreq);
}
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java Thu Jan 20 19:52:03 2011
@@ -28,7 +28,7 @@ public abstract class TermsIndexWriterBa
public abstract void setTermsOutput(IndexOutput out);
public abstract class FieldWriter {
- public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException;
+ public abstract boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException;
public abstract void finish() throws IOException;
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java Thu Jan 20 19:52:03 2011
@@ -59,7 +59,7 @@ public class VariableGapTermsIndexWriter
public static abstract class IndexTermSelector {
// Called sequentially on every term being written,
// returning true if this term should be indexed
- public abstract boolean isIndexTerm(BytesRef term, int docFreq);
+ public abstract boolean isIndexTerm(BytesRef term, TermStats stats);
}
/** Same policy as {@link FixedGapTermsIndexWriter} */
@@ -74,9 +74,9 @@ public class VariableGapTermsIndexWriter
}
@Override
- public boolean isIndexTerm(BytesRef term, int docFreq) {
+ public boolean isIndexTerm(BytesRef term, TermStats stats) {
if (count >= interval) {
- count = 0;
+ count = 1;
return true;
} else {
count++;
@@ -99,9 +99,9 @@ public class VariableGapTermsIndexWriter
}
@Override
- public boolean isIndexTerm(BytesRef term, int docFreq) {
- if (docFreq >= docFreqThresh || count >= interval) {
- count = 0;
+ public boolean isIndexTerm(BytesRef term, TermStats stats) {
+ if (stats.docFreq >= docFreqThresh || count >= interval) {
+ count = 1;
return true;
} else {
count++;
@@ -214,8 +214,8 @@ public class VariableGapTermsIndexWriter
}
@Override
- public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
- if (policy.isIndexTerm(text, docFreq) || first) {
+ public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
+ if (policy.isIndexTerm(text, stats) || first) {
first = false;
//System.out.println("VGW: index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
final int lengthSave = text.length;
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java Thu Jan 20 19:52:03 2011
@@ -34,7 +34,6 @@ import org.apache.lucene.index.FieldsEnu
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.CompoundFileReader;
@@ -265,6 +264,11 @@ public class PreFlexFields extends Field
return BytesRef.getUTF8SortedAsUTF16Comparator();
}
}
+
+ @Override
+ public long getSumTotalTermFreq() {
+ return -1;
+ }
}
private class PreTermsEnum extends TermsEnum {
@@ -941,6 +945,11 @@ public class PreFlexFields extends Field
}
@Override
+ public long totalTermFreq() {
+ return -1;
+ }
+
+ @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
PreDocsEnum docsEnum;
if (reuse == null || !(reuse instanceof PreDocsEnum)) {
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java Thu Jan 20 19:52:03 2011
@@ -45,6 +45,7 @@ public class PulsingPostingsReaderImpl e
// Fallback reader for non-pulsed terms:
final PostingsReaderBase wrappedPostingsReader;
+ int maxPositions;
public PulsingPostingsReaderImpl(PostingsReaderBase wrappedPostingsReader) throws IOException {
this.wrappedPostingsReader = wrappedPostingsReader;
@@ -54,6 +55,7 @@ public class PulsingPostingsReaderImpl e
public void init(IndexInput termsIn) throws IOException {
CodecUtil.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC,
PulsingPostingsWriterImpl.VERSION_START, PulsingPostingsWriterImpl.VERSION_START);
+ maxPositions = termsIn.readVInt();
wrappedPostingsReader.init(termsIn);
}
@@ -115,8 +117,10 @@ public class PulsingPostingsReaderImpl e
termState.pendingIndexTerm |= isIndexTerm;
- // TODO: wasteful to use whole byte for this (need just a 1 bit);
- if (termsIn.readByte() == 1) {
+ // total TF, but in the omitTFAP case its computed based on docFreq.
+ long count = fieldInfo.omitTermFreqAndPositions ? termState.docFreq : termState.totalTermFreq;
+
+ if (count <= maxPositions) {
// Inlined into terms dict -- just read the byte[] blob in,
// but don't decode it now (we only decode when a DocsEnum
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java Thu Jan 20 19:52:03 2011
@@ -21,15 +21,16 @@ import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
-// TODO: we now pulse entirely according to docFreq of the
-// term; it might be better to eg pulse by "net bytes used"
-// so that a term that has only 1 doc but zillions of
-// positions would not be inlined. Though this is
+// TODO: we pulse based on total TF of the term,
+// it might be better to eg pulse by "net bytes used"
+// so that a term that has only 1 posting but a huge
+// payload would not be inlined. Though this is
// presumably rare in practice...
/** @lucene.experimental */
@@ -85,6 +86,7 @@ public final class PulsingPostingsWriter
public void start(IndexOutput termsOut) throws IOException {
this.termsOut = termsOut;
CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
+ termsOut.writeVInt(pending.length); // encode maxPositions in header
wrappedPostingsWriter.start(termsOut);
}
@@ -177,7 +179,7 @@ public final class PulsingPostingsWriter
/** Called when we are done adding docs to this term */
@Override
- public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
+ public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
//System.out.println("PW finishTerm docCount=" + docCount);
assert pendingCount > 0 || pendingCount == -1;
@@ -185,8 +187,7 @@ public final class PulsingPostingsWriter
pendingIsIndexTerm |= isIndexTerm;
if (pendingCount == -1) {
- termsOut.writeByte((byte) 0);
- wrappedPostingsWriter.finishTerm(docCount, pendingIsIndexTerm);
+ wrappedPostingsWriter.finishTerm(stats, pendingIsIndexTerm);
pendingIsIndexTerm = false;
} else {
@@ -194,8 +195,6 @@ public final class PulsingPostingsWriter
// term, so we fully inline our postings data into
// terms dict, now:
- termsOut.writeByte((byte) 1);
-
// TODO: it'd be better to share this encoding logic
// in some inner codec that knows how to write a
// single doc / single position, etc. This way if a
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java Thu Jan 20 19:52:03 2011
@@ -25,6 +25,7 @@ import org.apache.lucene.index.FieldInfo
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
@@ -244,11 +245,11 @@ public final class SepPostingsWriterImpl
/** Called when we are done adding docs to this term */
@Override
- public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
+ public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
// TODO: -- wasteful we are counting this in two places?
- assert docCount > 0;
- assert docCount == df;
+ assert stats.docFreq > 0;
+ assert stats.docFreq == df;
docIndex.write(termsOut, isIndexTerm);
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java Thu Jan 20 19:52:03 2011
@@ -21,7 +21,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.FieldsEnum;
-import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
@@ -120,28 +119,31 @@ class SimpleTextFieldsReader extends Fie
private final IndexInput in;
private final boolean omitTF;
private int docFreq;
+ private long totalTermFreq;
private long docsStart;
private boolean ended;
- private final BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum;
+ private final BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstEnum;
- public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,Long>> fst, boolean omitTF) throws IOException {
+ public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst, boolean omitTF) throws IOException {
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
this.omitTF = omitTF;
- fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst);
+ fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(fst);
}
public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {
//System.out.println("seek to text=" + text.utf8ToString());
- final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.seekCeil(text);
+ final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.seekCeil(text);
if (result == null) {
//System.out.println(" end");
return SeekStatus.END;
} else {
//System.out.println(" got text=" + term.utf8ToString());
- PairOutputs.Pair<Long,Long> pair = result.output;
- docsStart = pair.output1;
- docFreq = pair.output2.intValue();
+ PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
+ PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
+ docsStart = pair1.output1;
+ docFreq = pair2.output1.intValue();
+ totalTermFreq = pair2.output2;
if (result.input.equals(text)) {
//System.out.println(" match docsStart=" + docsStart);
@@ -156,11 +158,13 @@ class SimpleTextFieldsReader extends Fie
@Override
public BytesRef next() throws IOException {
assert !ended;
- final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.next();
+ final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.next();
if (result != null) {
- final PairOutputs.Pair<Long,Long> pair = result.output;
- docsStart = pair.output1;
- docFreq = pair.output2.intValue();
+ PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
+ PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
+ docsStart = pair1.output1;
+ docFreq = pair2.output1.intValue();
+ totalTermFreq = pair2.output2;
return result.input;
} else {
return null;
@@ -188,6 +192,11 @@ class SimpleTextFieldsReader extends Fie
}
@Override
+ public long totalTermFreq() {
+ return totalTermFreq;
+ }
+
+ @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
SimpleTextDocsEnum docsEnum;
if (reuse != null && reuse instanceof SimpleTextDocsEnum && ((SimpleTextDocsEnum) reuse).canReuse(SimpleTextFieldsReader.this.in)) {
@@ -636,8 +645,9 @@ class SimpleTextFieldsReader extends Fie
private class SimpleTextTerms extends Terms {
private final long termsStart;
private final boolean omitTF;
- private FST<PairOutputs.Pair<Long,Long>> fst;
-
+ private long sumTotalTermFreq;
+ private FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst;
+ private int termCount;
private final BytesRef scratch = new BytesRef(10);
public SimpleTextTerms(String field, long termsStart) throws IOException {
@@ -648,24 +658,38 @@ class SimpleTextFieldsReader extends Fie
private void loadTerms() throws IOException {
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
- Builder<PairOutputs.Pair<Long,Long>> b = new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true, new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs));
+ final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
+ b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
+ 0,
+ 0,
+ true,
+ new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
+ new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart);
final BytesRef lastTerm = new BytesRef(10);
long lastDocsStart = -1;
int docFreq = 0;
+ long totalTermFreq = 0;
while(true) {
readLine(in, scratch);
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
if (lastDocsStart != -1) {
- b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
+ b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
+ new PairOutputs.Pair<Long,Long>((long) docFreq,
+ posIntOutputs.get(totalTermFreq))));
+ sumTotalTermFreq += totalTermFreq;
}
break;
} else if (scratch.startsWith(DOC)) {
docFreq++;
+ } else if (scratch.startsWith(POS)) {
+ totalTermFreq++;
} else if (scratch.startsWith(TERM)) {
if (lastDocsStart != -1) {
- b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
+ b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
+ new PairOutputs.Pair<Long,Long>((long) docFreq,
+ posIntOutputs.get(totalTermFreq))));
}
lastDocsStart = in.getFilePointer();
final int len = scratch.length - TERM.length;
@@ -675,6 +699,9 @@ class SimpleTextFieldsReader extends Fie
System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
lastTerm.length = len;
docFreq = 0;
+ sumTotalTermFreq += totalTermFreq;
+ totalTermFreq = 0;
+ termCount++;
}
}
fst = b.finish();
@@ -700,6 +727,16 @@ class SimpleTextFieldsReader extends Fie
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
+
+ @Override
+ public long getUniqueTermCount() {
+ return (long) termCount;
+ }
+
+ @Override
+ public long getSumTotalTermFreq() {
+ return sumTotalTermFreq;
+ }
}
@Override
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java Thu Jan 20 19:52:03 2011
@@ -22,6 +22,7 @@ import org.apache.lucene.util.UnicodeUti
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
+import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
@@ -84,11 +85,11 @@ class SimpleTextFieldsWriter extends Fie
}
@Override
- public void finishTerm(BytesRef term, int numDocs) throws IOException {
+ public void finishTerm(BytesRef term, TermStats stats) throws IOException {
}
@Override
- public void finish() throws IOException {
+ public void finish(long sumTotalTermFreq) throws IOException {
}
@Override
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java Thu Jan 20 19:52:03 2011
@@ -28,6 +28,7 @@ import org.apache.lucene.index.SegmentWr
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
@@ -184,12 +185,12 @@ public final class StandardPostingsWrite
/** Called when we are done adding docs to this term */
@Override
- public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
- assert docCount > 0;
+ public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
+ assert stats.docFreq > 0;
// TODO: wasteful we are counting this (counting # docs
// for this term) in two places?
- assert docCount == df;
+ assert stats.docFreq == df;
if (isIndexTerm) {
// Write absolute at seek points
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/BooleanQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/BooleanQuery.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/BooleanQuery.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/BooleanQuery.java Thu Jan 20 19:52:03 2011
@@ -169,7 +169,7 @@ public class BooleanQuery extends Query
public BooleanWeight(IndexSearcher searcher, boolean disableCoord)
throws IOException {
- this.similarity = getSimilarity(searcher);
+ this.similarity = searcher.getSimilarity();
this.disableCoord = disableCoord;
weights = new ArrayList<Weight>(clauses.size());
for (int i = 0 ; i < clauses.size(); i++) {
@@ -201,6 +201,9 @@ public class BooleanQuery extends Query
return sum ;
}
+ public float coord(int overlap, int maxOverlap) {
+ return similarity.coord(overlap, maxOverlap);
+ }
@Override
public void normalize(float norm) {
@@ -273,7 +276,7 @@ public class BooleanQuery extends Query
sumExpl.setMatch(0 < coord ? Boolean.TRUE : Boolean.FALSE);
sumExpl.setValue(sum);
- final float coordFactor = disableCoord ? 1.0f : similarity.coord(coord, maxCoord);
+ final float coordFactor = disableCoord ? 1.0f : coord(coord, maxCoord);
if (coordFactor == 1.0f) {
return sumExpl; // eliminate wrapper
} else {
@@ -312,7 +315,7 @@ public class BooleanQuery extends Query
// Check if we can return a BooleanScorer
if (!scorerContext.scoreDocsInOrder && scorerContext.topScorer && required.size() == 0 && prohibited.size() < 32) {
- return new BooleanScorer(this, disableCoord, similarity, minNrShouldMatch, optional, prohibited, maxCoord);
+ return new BooleanScorer(this, disableCoord, minNrShouldMatch, optional, prohibited, maxCoord);
}
if (required.size() == 0 && optional.size() == 0) {
@@ -326,7 +329,7 @@ public class BooleanQuery extends Query
}
// Return a BooleanScorer2
- return new BooleanScorer2(this, disableCoord, similarity, minNrShouldMatch, required, prohibited, optional, maxCoord);
+ return new BooleanScorer2(this, disableCoord, minNrShouldMatch, required, prohibited, optional, maxCoord);
}
@Override
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/BooleanScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/BooleanScorer.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/BooleanScorer.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/BooleanScorer.java Thu Jan 20 19:52:03 2011
@@ -22,6 +22,7 @@ import java.util.List;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery.BooleanWeight;
/* Description from Doug Cutting (excerpted from
* LUCENE-1483):
@@ -197,7 +198,7 @@ final class BooleanScorer extends Scorer
private Bucket current;
private int doc = -1;
- BooleanScorer(Weight weight, boolean disableCoord, Similarity similarity, int minNrShouldMatch,
+ BooleanScorer(BooleanWeight weight, boolean disableCoord, int minNrShouldMatch,
List<Scorer> optionalScorers, List<Scorer> prohibitedScorers, int maxCoord) throws IOException {
super(null, weight); // Similarity not used
this.minNrShouldMatch = minNrShouldMatch;
@@ -223,7 +224,7 @@ final class BooleanScorer extends Scorer
coordFactors = new float[optionalScorers.size() + 1];
for (int i = 0; i < coordFactors.length; i++) {
- coordFactors[i] = disableCoord ? 1.0f : similarity.coord(i, maxCoord);
+ coordFactors[i] = disableCoord ? 1.0f : weight.coord(i, maxCoord);
}
}