You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by bu...@apache.org on 2011/01/16 03:25:26 UTC
svn commit: r1059434 [1/2] - in /lucene/dev/branches/realtime_search: ./
dev-tools/idea/.idea/copyright/ lucene/
lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/
lucene/contrib/instantiated/src/test/org/apache/lucene/store/ins...
Author: buschmi
Date: Sun Jan 16 02:25:24 2011
New Revision: 1059434
URL: http://svn.apache.org/viewvc?rev=1059434&view=rev
Log:
Merging r1058717 through r1059431 into realtime branch
Added:
lucene/dev/branches/realtime_search/dev-tools/idea/.idea/copyright/
- copied from r1059431, lucene/dev/trunk/dev-tools/idea/.idea/copyright/
lucene/dev/branches/realtime_search/dev-tools/idea/.idea/copyright/Apache_Software_Foundation.xml
- copied unchanged from r1059431, lucene/dev/trunk/dev-tools/idea/.idea/copyright/Apache_Software_Foundation.xml
lucene/dev/branches/realtime_search/dev-tools/idea/.idea/copyright/profiles_settings.xml
- copied unchanged from r1059431, lucene/dev/trunk/dev-tools/idea/.idea/copyright/profiles_settings.xml
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/TermStats.java
- copied unchanged from r1059431, lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/TermStats.java
lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java
- copied unchanged from r1059431, lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java
Modified:
lucene/dev/branches/realtime_search/ (props changed)
lucene/dev/branches/realtime_search/lucene/ (props changed)
lucene/dev/branches/realtime_search/lucene/CHANGES.txt
lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java
lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java
lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java
lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java
lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java
lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java
lucene/dev/branches/realtime_search/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
lucene/dev/branches/realtime_search/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java
lucene/dev/branches/realtime_search/lucene/contrib/misc/src/java/org/apache/lucene/misc/TermStats.java
lucene/dev/branches/realtime_search/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java
lucene/dev/branches/realtime_search/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/CheckIndex.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/FieldInvertState.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/IndexReader.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiTerms.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/Terms.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermsEnum.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermState.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/PhraseQuery.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/TermQuery.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/function/CustomScoreQuery.java
lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/function/ValueSourceQuery.java
lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/TestExternalCodecs.java
lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestCodecs.java
lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestIndexReader.java
lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java
lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java
lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java
lucene/dev/branches/realtime_search/modules/ (props changed)
lucene/dev/branches/realtime_search/solr/ (props changed)
lucene/dev/branches/realtime_search/solr/CHANGES.txt (props changed)
lucene/dev/branches/realtime_search/solr/KEYS (props changed)
lucene/dev/branches/realtime_search/solr/LICENSE.txt (props changed)
lucene/dev/branches/realtime_search/solr/NOTICE.txt (props changed)
lucene/dev/branches/realtime_search/solr/README.txt (props changed)
lucene/dev/branches/realtime_search/solr/build.xml (props changed)
lucene/dev/branches/realtime_search/solr/client/ (props changed)
lucene/dev/branches/realtime_search/solr/common-build.xml (props changed)
lucene/dev/branches/realtime_search/solr/contrib/ (props changed)
lucene/dev/branches/realtime_search/solr/contrib/clustering/CHANGES.txt
lucene/dev/branches/realtime_search/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
lucene/dev/branches/realtime_search/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java
lucene/dev/branches/realtime_search/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/DistributedClusteringComponentTest.java
lucene/dev/branches/realtime_search/solr/example/ (props changed)
lucene/dev/branches/realtime_search/solr/lib/ (props changed)
lucene/dev/branches/realtime_search/solr/site/ (props changed)
lucene/dev/branches/realtime_search/solr/src/ (props changed)
lucene/dev/branches/realtime_search/solr/src/java/org/apache/solr/request/UnInvertedField.java
lucene/dev/branches/realtime_search/solr/testlogging.properties (props changed)
Modified: lucene/dev/branches/realtime_search/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/CHANGES.txt?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/realtime_search/lucene/CHANGES.txt Sun Jan 16 02:25:24 2011
@@ -359,6 +359,9 @@ New features
terms dict. This impl stores the indexed terms in an FST, which is
much more RAM efficient than FixedGapTermsIndex. (Mike McCandless)
+* LUCENE-2862: Added TermsEnum.totalTermFreq() and
+ Terms.getSumTotalTermFreq(). (Mike McCandless, Robert Muir)
+
Optimizations
* LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching.
@@ -737,6 +740,10 @@ New features
is no longer needed and discouraged for that use case. Directly wrapping
Query improves performance, as out-of-order collection is now supported.
(Uwe Schindler)
+
+* LUCENE-2864: Add getMaxTermFrequency (maximum within-document TF) to
+ FieldInvertState so that it can be used in Similarity.computeNorm.
+ (Robert Muir)
Optimizations
Modified: lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java (original)
+++ lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java Sun Jan 16 02:25:24 2011
@@ -238,6 +238,10 @@ public class InstantiatedIndex
while((text = termsEnum.next()) != null) {
String termText = text.utf8ToString();
InstantiatedTerm instantiatedTerm = new InstantiatedTerm(field, termText);
+ final long totalTermFreq = termsEnum.totalTermFreq();
+ if (totalTermFreq != -1) {
+ instantiatedTerm.addPositionsCount(totalTermFreq);
+ }
getTermsByFieldAndText().get(field).put(termText, instantiatedTerm);
instantiatedTerm.setTermIndex(terms.size());
terms.add(instantiatedTerm);
Modified: lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java Sun Jan 16 02:25:24 2011
@@ -398,12 +398,22 @@ public class InstantiatedIndexReader ext
if (i < 0) {
i = -i - 1;
}
- if (i >= orderedTerms.length || !orderedTerms[i].field().equals(field)) {
+ if (i >= orderedTerms.length || orderedTerms[i].field() != field) {
// field does not exist
return null;
}
final int startLoc = i;
+ // TODO: heavy to do this here; would be better to
+ // do it up front & cache
+ long sum = 0;
+ int upto = i;
+ while(upto < orderedTerms.length && orderedTerms[i].field() == field) {
+ sum += orderedTerms[i].getTotalTermFreq();
+ upto++;
+ }
+ final long sumTotalTermFreq = sum;
+
return new Terms() {
@Override
public TermsEnum iterator() {
@@ -411,6 +421,11 @@ public class InstantiatedIndexReader ext
}
@Override
+ public long getSumTotalTermFreq() {
+ return sumTotalTermFreq;
+ }
+
+ @Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
Modified: lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java Sun Jan 16 02:25:24 2011
@@ -315,6 +315,7 @@ public class InstantiatedIndexWriter imp
}
associatedDocuments[associatedDocuments.length - 1] = info;
term.setAssociatedDocuments(associatedDocuments);
+ term.addPositionsCount(positions.length);
// todo optimize, only if term vector?
informationByTermOfCurrentDocument.put(term, info);
Modified: lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java (original)
+++ lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java Sun Jan 16 02:25:24 2011
@@ -45,6 +45,8 @@ public class InstantiatedTerm
private Term term;
+ private long totalTermFreq;
+
/**
* index of term in InstantiatedIndex
* @see org.apache.lucene.store.instantiated.InstantiatedIndex#getOrderedTerms() */
@@ -92,6 +94,14 @@ public class InstantiatedTerm
this.associatedDocuments = associatedDocuments;
}
+ void addPositionsCount(long count) {
+ totalTermFreq += count;
+ }
+
+ public long getTotalTermFreq() {
+ return totalTermFreq;
+ }
+
/**
* Finds index to the first beyond the current whose document number is
* greater than or equal to <i>target</i>, -1 if there is no such element.
Modified: lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java (original)
+++ lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java Sun Jan 16 02:25:24 2011
@@ -24,7 +24,6 @@ import org.apache.lucene.index.TermState
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
-import org.apache.lucene.index.codecs.PrefixCodedTermState;
import java.io.IOException;
import java.util.Arrays;
@@ -111,6 +110,12 @@ public class InstantiatedTermsEnum exten
}
@Override
+ public long totalTermFreq() {
+ final long v = terms[upto].getTotalTermFreq();
+ return v == 0 ? -1 : v;
+ }
+
+ @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
if (reuse == null || !(reuse instanceof InstantiatedDocsEnum)) {
reuse = new InstantiatedDocsEnum();
Modified: lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java (original)
+++ lucene/dev/branches/realtime_search/lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java Sun Jan 16 02:25:24 2011
@@ -66,6 +66,7 @@ public class TestIndicesEquals extends L
// create dir data
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig(
TEST_VERSION_CURRENT, new MockAnalyzer()));
+
for (int i = 0; i < 20; i++) {
Document document = new Document();
assembleDocument(document, i);
@@ -395,6 +396,10 @@ public class TestIndicesEquals extends L
}
assertTrue(aprioriTermEnum.docFreq() == testTermEnum.docFreq());
+ final long totalTermFreq = aprioriTermEnum.totalTermFreq();
+ if (totalTermFreq != -1) {
+ assertEquals(totalTermFreq, testTermEnum.totalTermFreq());
+ }
// compare termDocs seeking
Modified: lucene/dev/branches/realtime_search/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (original)
+++ lucene/dev/branches/realtime_search/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java Sun Jan 16 02:25:24 2011
@@ -610,6 +610,8 @@ public class MemoryIndex implements Seri
/** Term for this field's fieldName, lazily computed on demand */
public transient Term template;
+ private final long sumTotalTermFreq;
+
private static final long serialVersionUID = 2882195016849084649L;
public Info(HashMap<BytesRef,ArrayIntList> terms, int numTokens, int numOverlapTokens, float boost) {
@@ -617,6 +619,15 @@ public class MemoryIndex implements Seri
this.numTokens = numTokens;
this.numOverlapTokens = numOverlapTokens;
this.boost = boost;
+ long sum = 0;
+ for(Map.Entry<BytesRef,ArrayIntList> ent : terms.entrySet()) {
+ sum += ent.getValue().size();
+ }
+ sumTotalTermFreq = sum;
+ }
+
+ public long getSumTotalTermFreq() {
+ return sumTotalTermFreq;
}
/**
@@ -826,6 +837,11 @@ public class MemoryIndex implements Seri
public long getUniqueTermCount() {
return info.sortedTerms.length;
}
+
+ @Override
+ public long getSumTotalTermFreq() {
+ return info.getSumTotalTermFreq();
+ }
};
}
}
@@ -896,6 +912,11 @@ public class MemoryIndex implements Seri
}
@Override
+ public long totalTermFreq() {
+ return info.sortedTerms[termUpto].getValue().size();
+ }
+
+ @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
if (reuse == null || !(reuse instanceof MemoryDocsEnum)) {
reuse = new MemoryDocsEnum();
Modified: lucene/dev/branches/realtime_search/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java (original)
+++ lucene/dev/branches/realtime_search/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java Sun Jan 16 02:25:24 2011
@@ -176,15 +176,34 @@ public class HighFreqTerms {
return ts;
}
- public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception {
- BytesRef br = termtext;
+ public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termText) throws Exception {
+
long totalTF = 0;
- Bits skipDocs = MultiFields.getDeletedDocs(reader);
- DocsEnum de = MultiFields.getTermDocsEnum(reader, skipDocs, field, br);
- // if term is not in index return totalTF of 0
- if (de == null) {
+
+ Terms terms = MultiFields.getTerms(reader, field);
+ if (terms == null) {
+ return 0;
+ }
+
+ TermsEnum termsEnum = terms.iterator();
+ if (termsEnum.seek(termText) != TermsEnum.SeekStatus.FOUND) {
return 0;
}
+
+ Bits skipDocs = MultiFields.getDeletedDocs(reader);
+ if (skipDocs == null) {
+ // TODO: we could do this up front, during the scan
+ // (next()), instead of after-the-fact here w/ seek,
+ // if the codec supports it and there are no del
+ // docs...
+ final long totTF = termsEnum.totalTermFreq();
+ if (totTF != -1) {
+ return totTF;
+ }
+ }
+
+ DocsEnum de = termsEnum.docs(skipDocs, null);
+
// use DocsEnum.read() and BulkResult api
final DocsEnum.BulkReadResult bulkresult = de.getBulkResult();
int count;
Modified: lucene/dev/branches/realtime_search/lucene/contrib/misc/src/java/org/apache/lucene/misc/TermStats.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/contrib/misc/src/java/org/apache/lucene/misc/TermStats.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/contrib/misc/src/java/org/apache/lucene/misc/TermStats.java (original)
+++ lucene/dev/branches/realtime_search/lucene/contrib/misc/src/java/org/apache/lucene/misc/TermStats.java Sun Jan 16 02:25:24 2011
@@ -41,4 +41,9 @@ public final class TermStats {
String getTermText() {
return termtext.utf8ToString();
}
+
+ @Override
+ public String toString() {
+ return("TermStats: term=" + termtext.utf8ToString() + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq);
+ }
}
\ No newline at end of file
Modified: lucene/dev/branches/realtime_search/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java (original)
+++ lucene/dev/branches/realtime_search/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java Sun Jan 16 02:25:24 2011
@@ -17,15 +17,16 @@ package org.apache.lucene.misc;
* limitations under the License.
*/
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
@@ -41,8 +42,10 @@ public class TestHighFreqTerms extends L
writer = new IndexWriter(dir, newIndexWriterConfig(random,
TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false))
.setMaxBufferedDocs(2));
+ writer.setInfoStream(VERBOSE ? System.out : null);
indexDocs(writer);
reader = IndexReader.open(dir, true);
+ _TestUtil.checkIndex(dir);
}
@AfterClass
@@ -75,8 +78,8 @@ public class TestHighFreqTerms extends L
String field="FIELD_1";
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
for (int i = 0; i < terms.length; i++) {
- if (i >0){
- assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq);
+ if (i > 0) {
+ assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq);
}
}
}
@@ -134,11 +137,12 @@ public class TestHighFreqTerms extends L
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
- for (int i = 0; i < termsWithTF.length; i++) {
- // check that they are sorted by descending termfreq order
- if (i >0){
- assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq > termsWithTF[i].totalTermFreq);
- }
+ for (int i = 0; i < termsWithTF.length; i++) {
+ // check that they are sorted by descending termfreq
+ // order
+ if (i > 0) {
+ assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq >= termsWithTF[i].totalTermFreq);
+ }
}
}
Modified: lucene/dev/branches/realtime_search/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java (original)
+++ lucene/dev/branches/realtime_search/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java Sun Jan 16 02:25:24 2011
@@ -123,7 +123,11 @@ public final class FieldCacheRewriteMeth
public TermsEnum iterator() throws IOException {
return fcsi.getTermsEnum();
}
-
+
+ @Override
+ public long getSumTotalTermFreq() {
+ return -1;
+ }
});
assert termsEnum != null;
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/CheckIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/CheckIndex.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/CheckIndex.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/CheckIndex.java Sun Jan 16 02:25:24 2011
@@ -610,6 +610,8 @@ public class CheckIndex {
Comparator<BytesRef> termComp = terms.getComparator();
+ long sumTotalTermFreq = 0;
+
while(true) {
final BytesRef term = terms.next();
@@ -660,6 +662,8 @@ public class CheckIndex {
}
int lastDoc = -1;
+ int docCount = 0;
+ long totalTermFreq = 0;
while(true) {
final int doc = docs2.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
@@ -667,6 +671,8 @@ public class CheckIndex {
}
final int freq = docs2.freq();
status.totPos += freq;
+ totalTermFreq += freq;
+ docCount++;
if (doc <= lastDoc) {
throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
@@ -697,22 +703,39 @@ public class CheckIndex {
}
}
}
+
+ final long totalTermFreq2 = terms.totalTermFreq();
+ final boolean hasTotalTermFreq = postings != null && totalTermFreq2 != -1;
- // Now count how many deleted docs occurred in
- // this term:
-
+ // Re-count if there are deleted docs:
if (reader.hasDeletions()) {
final DocsEnum docsNoDel = terms.docs(null, docs);
- int count = 0;
+ docCount = 0;
+ totalTermFreq = 0;
while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
- count++;
+ docCount++;
+ totalTermFreq += docsNoDel.freq();
}
- if (count != docFreq) {
- throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + count);
+ }
+
+ if (docCount != docFreq) {
+ throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + docCount);
+ }
+ if (hasTotalTermFreq) {
+ sumTotalTermFreq += totalTermFreq;
+ if (totalTermFreq != totalTermFreq2) {
+ throw new RuntimeException("term " + term + " totalTermFreq=" + totalTermFreq2 + " != recomputed totalTermFreq=" + totalTermFreq);
}
}
}
+ if (sumTotalTermFreq != 0) {
+ final long v = fields.terms(field).getSumTotalTermFreq();
+ if (v != -1 && sumTotalTermFreq != v) {
+ throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq);
+ }
+ }
+
// Test seek to last term:
if (lastTerm != null) {
if (terms.seek(lastTerm) != TermsEnum.SeekStatus.FOUND) {
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/FieldInvertState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/FieldInvertState.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/FieldInvertState.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/FieldInvertState.java Sun Jan 16 02:25:24 2011
@@ -30,6 +30,7 @@ public final class FieldInvertState {
int length;
int numOverlap;
int offset;
+ int maxTermFrequency;
float boost;
AttributeSource attributeSource;
@@ -53,6 +54,7 @@ public final class FieldInvertState {
length = 0;
numOverlap = 0;
offset = 0;
+ maxTermFrequency = 0;
boost = docBoost;
attributeSource = null;
}
@@ -110,6 +112,15 @@ public final class FieldInvertState {
public void setBoost(float boost) {
this.boost = boost;
}
+
+ /**
+ * Get the maximum term-frequency encountered for any term in the field. A
+ * field containing "the quick brown fox jumps over the lazy dog" would have
+ * a value of 2, because "the" appears twice.
+ */
+ public int getMaxTermFrequency() {
+ return maxTermFrequency;
+ }
public AttributeSource getAttributeSource() {
return attributeSource;
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java Sun Jan 16 02:25:24 2011
@@ -99,6 +99,11 @@ public class FilterIndexReader extends I
public long getUniqueTermCount() throws IOException {
return in.getUniqueTermCount();
}
+
+ @Override
+ public long getSumTotalTermFreq() throws IOException {
+ return in.getSumTotalTermFreq();
+ }
}
/** Base class for filtering {@link TermsEnum} implementations. */
@@ -156,6 +161,11 @@ public class FilterIndexReader extends I
}
@Override
+ public long totalTermFreq() {
+ return in.totalTermFreq();
+ }
+
+ @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
return in.docs(skipDocs, reuse);
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java Sun Jan 16 02:25:24 2011
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
+import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
@@ -130,6 +131,7 @@ final class FreqProxTermsWriterPerField
postings.docFreqs[termID] = 1;
writeProx(termID, fieldState.position);
}
+ fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
}
@Override
@@ -163,11 +165,12 @@ final class FreqProxTermsWriterPerField
termsHashPerField.writeVInt(0, postings.docFreqs[termID]);
}
postings.docFreqs[termID] = 1;
+ fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
postings.lastDocIDs[termID] = docState.docID;
writeProx(termID, fieldState.position);
} else {
- postings.docFreqs[termID]++;
+ fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]);
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
}
}
@@ -237,7 +240,7 @@ final class FreqProxTermsWriterPerField
final ByteSliceReader freq = new ByteSliceReader();
final ByteSliceReader prox = new ByteSliceReader();
-
+ long sumTotalTermFreq = 0;
for (int i = 0; i < numTerms; i++) {
final int termID = termIDs[i];
// Get BytesRef
@@ -261,6 +264,7 @@ final class FreqProxTermsWriterPerField
// which all share the same term. Now we must
// interleave the docID streams.
int numDocs = 0;
+ long totTF = 0;
int docID = 0;
int termFreq = 0;
@@ -305,6 +309,7 @@ final class FreqProxTermsWriterPerField
// omitTermFreqAndPositions == false so we do write positions &
// payload
int position = 0;
+ totTF += termDocFreq;
for(int j=0;j<termDocFreq;j++) {
final int code = prox.readVInt();
position += code >> 1;
@@ -338,10 +343,11 @@ final class FreqProxTermsWriterPerField
postingsConsumer.finishDoc();
}
}
- termsConsumer.finishTerm(text, numDocs);
+ termsConsumer.finishTerm(text, new TermStats(numDocs, totTF));
+ sumTotalTermFreq += totTF;
}
- termsConsumer.finish();
+ termsConsumer.finish(sumTotalTermFreq);
}
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/IndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/IndexReader.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/IndexReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/IndexReader.java Sun Jan 16 02:25:24 2011
@@ -997,6 +997,23 @@ public abstract class IndexReader implem
return terms.docFreq(term);
}
+ /** Returns the total number of occurrences of
+ * <code>term</code> across all documents. This method returns 0
+ * if the term or field does not exist, and -1 if the codec does
+ * not store this measure. This method does not take into
+ * account deleted documents that have not yet been merged away. */
+ public long totalTermFreq(String field, BytesRef term) throws IOException {
+ final Fields fields = fields();
+ if (fields == null) {
+ return 0;
+ }
+ final Terms terms = fields.terms(field);
+ if (terms == null) {
+ return 0;
+ }
+ return terms.totalTermFreq(term);
+ }
+
/** This may return null if the field does not exist.*/
public Terms terms(String field) throws IOException {
final Fields fields = fields();
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiTerms.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiTerms.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiTerms.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiTerms.java Sun Jan 16 02:25:24 2011
@@ -77,6 +77,19 @@ public final class MultiTerms extends Te
}
@Override
+ public long getSumTotalTermFreq() throws IOException {
+ long sum = 0;
+ for(Terms terms : subs) {
+ final long v = terms.getSumTotalTermFreq();
+ if (v == -1) {
+ return -1;
+ }
+ sum += v;
+ }
+ return sum;
+ }
+
+ @Override
public Comparator<BytesRef> getComparator() {
return termComp;
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java Sun Jan 16 02:25:24 2011
@@ -266,6 +266,19 @@ public final class MultiTermsEnum extend
}
@Override
+ public long totalTermFreq() {
+ long sum = 0;
+ for(int i=0;i<numTop;i++) {
+ final long v = top[i].terms.totalTermFreq();
+ if (v == -1) {
+ return v;
+ }
+ sum += v;
+ }
+ return sum;
+ }
+
+ @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
final MultiDocsEnum docsEnum;
if (reuse != null) {
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/Terms.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/Terms.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/Terms.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/Terms.java Sun Jan 16 02:25:24 2011
@@ -57,6 +57,18 @@ public abstract class Terms {
}
}
+ /** Returns the total number of occurrences of the
+ * specified term text across all documents, or -1 if the codec
+ * does not store this measure. Returns 0 if the term does not exist. */
+ public long totalTermFreq(BytesRef text) throws IOException {
+ final TermsEnum termsEnum = getThreadTermsEnum();
+ if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) {
+ return termsEnum.totalTermFreq();
+ } else {
+ return 0;
+ }
+ }
+
/** Get {@link DocsEnum} for the specified term. This
* method may return null if the term does not exist. */
public DocsEnum docs(Bits skipDocs, BytesRef text, DocsEnum reuse) throws IOException {
@@ -115,6 +127,14 @@ public abstract class Terms {
throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
}
+ /** Returns the sum of {@link TermsEnum#totalTermFreq} for
+ * all terms in this field, or -1 if this measure isn't
+ * stored by the codec (or if this field omits term freq
+ * and positions). Note that, just like other term
+ * measures, this measure does not take deleted documents
+ * into account. */
+ public abstract long getSumTotalTermFreq() throws IOException;
+
/**
* Returns a thread-private {@link TermsEnum} instance. Obtaining
* {@link TermsEnum} from this method might be more efficient than using
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermsEnum.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermsEnum.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermsEnum.java Sun Jan 16 02:25:24 2011
@@ -125,7 +125,15 @@ public abstract class TermsEnum {
* first time, after next() returns null or seek returns
* {@link SeekStatus#END}.*/
public abstract int docFreq();
-
+
+ /** Returns the total number of occurrences of this term
+ * across all documents (the sum of the freq() for each
+ * doc that has this term). This will be -1 if the
+ * codec doesn't support this measure. Note that, like
+ * other term measures, this measure does not take
+ * deleted documents into account. */
+ public abstract long totalTermFreq();
+
/** Get {@link DocsEnum} for the current term. Do not
* call this before calling {@link #next} or {@link
* #seek} for the first time. This method will not
@@ -197,6 +205,11 @@ public abstract class TermsEnum {
public int docFreq() {
throw new IllegalStateException("this method should never be called");
}
+
+ @Override
+ public long totalTermFreq() {
+ throw new IllegalStateException("this method should never be called");
+ }
@Override
public long ord() {
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java Sun Jan 16 02:25:24 2011
@@ -132,7 +132,6 @@ public class FixedGapTermsIndexReader ex
private class IndexEnum extends FieldIndexEnum {
private final FieldIndexData.CoreFieldIndex fieldIndex;
private final BytesRef term = new BytesRef();
- private final BytesRef nextTerm = new BytesRef();
private long ord;
public IndexEnum(FieldIndexData.CoreFieldIndex fieldIndex) {
@@ -192,7 +191,7 @@ public class FixedGapTermsIndexReader ex
final long offset = fieldIndex.termOffsets.get(idx);
final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
- termBytesReader.fillSlice(nextTerm, fieldIndex.termBytesStart + offset, length);
+ termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java Sun Jan 16 02:25:24 2011
@@ -128,7 +128,7 @@ public class FixedGapTermsIndexWriter ex
}
@Override
- public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
+ public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
// First term is first indexed term:
if (0 == (numTerms++ % termIndexInterval)) {
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java Sun Jan 16 02:25:24 2011
@@ -55,9 +55,10 @@ public abstract class PostingsConsumer {
/** Default merge impl: append documents, mapping around
* deletes */
- public int merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
+ public TermStats merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
int df = 0;
+ long totTF = 0;
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
while(true) {
@@ -68,6 +69,7 @@ public abstract class PostingsConsumer {
this.startDoc(doc, postings.freq());
this.finishDoc();
df++;
+ totTF++;
}
} else {
final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings;
@@ -78,6 +80,7 @@ public abstract class PostingsConsumer {
}
final int freq = postingsEnum.freq();
this.startDoc(doc, freq);
+ totTF += freq;
for(int i=0;i<freq;i++) {
final int position = postingsEnum.nextPosition();
final BytesRef payload;
@@ -92,6 +95,6 @@ public abstract class PostingsConsumer {
df++;
}
}
- return df;
+ return new TermStats(df, totTF);
}
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java Sun Jan 16 02:25:24 2011
@@ -34,7 +34,7 @@ public abstract class PostingsWriterBase
public abstract void startTerm() throws IOException;
/** Finishes the current term */
- public abstract void finishTerm(int numDocs, boolean isIndexTerm) throws IOException;
+ public abstract void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException;
public abstract void setField(FieldInfo fieldInfo);
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermState.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermState.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermState.java Sun Jan 16 02:25:24 2011
@@ -27,7 +27,8 @@ import org.apache.lucene.index.TermState
public class PrefixCodedTermState extends OrdTermState {
public int docFreq; // how many docs have this term
public long filePointer; // fp into the terms dict primary file (_X.tis)
-
+ public long totalTermFreq; // total number of occurrences of this term
+
@Override
public void copyFrom(TermState _other) {
assert _other instanceof PrefixCodedTermState : "can not copy from " + _other.getClass().getName();
@@ -35,11 +36,12 @@ public class PrefixCodedTermState extend
super.copyFrom(_other);
filePointer = other.filePointer;
docFreq = other.docFreq;
+ totalTermFreq = other.totalTermFreq;
}
@Override
public String toString() {
- return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + "]";
+ return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + ", docFreq=" + docFreq + ", totalTermFreq=" + totalTermFreq + "]";
}
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java Sun Jan 16 02:25:24 2011
@@ -129,18 +129,17 @@ public class PrefixCodedTermsReader exte
// Read per-field details
seekDir(in, dirOffset);
- final int numFields = in.readInt();
+ final int numFields = in.readVInt();
for(int i=0;i<numFields;i++) {
- final int field = in.readInt();
- final long numTerms = in.readLong();
+ final int field = in.readVInt();
+ final long numTerms = in.readVLong();
assert numTerms >= 0;
- final long termsStartPointer = in.readLong();
+ final long termsStartPointer = in.readVLong();
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
- if (numTerms > 0) {
- assert !fields.containsKey(fieldInfo.name);
- fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer));
- }
+ final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong();
+ assert !fields.containsKey(fieldInfo.name);
+ fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq));
}
success = true;
} finally {
@@ -245,12 +244,14 @@ public class PrefixCodedTermsReader exte
final long numTerms;
final FieldInfo fieldInfo;
final long termsStartPointer;
+ final long sumTotalTermFreq;
- FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer) {
+ FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq) {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
this.numTerms = numTerms;
this.termsStartPointer = termsStartPointer;
+ this.sumTotalTermFreq = sumTotalTermFreq;
}
@Override
@@ -273,6 +274,11 @@ public class PrefixCodedTermsReader exte
return numTerms;
}
+ @Override
+ public long getSumTotalTermFreq() {
+ return sumTotalTermFreq;
+ }
+
// Iterates through terms in this field, not supporting ord()
private final class SegmentTermsEnum extends TermsEnum {
private final IndexInput in;
@@ -295,6 +301,7 @@ public class PrefixCodedTermsReader exte
bytesReader = new DeltaBytesReader(in);
fieldTerm.field = fieldInfo.name;
state = postingsReader.newTermState();
+ state.totalTermFreq = -1;
state.ord = -1;
}
@@ -494,6 +501,10 @@ public class PrefixCodedTermsReader exte
state.docFreq = (in.readVInt() << 6) | (b & 0x3F);
}
+ if (!fieldInfo.omitTermFreqAndPositions) {
+ state.totalTermFreq = state.docFreq + in.readVLong();
+ }
+
postingsReader.readTerm(in,
fieldInfo, state,
isIndexTerm);
@@ -512,6 +523,11 @@ public class PrefixCodedTermsReader exte
}
@Override
+ public long totalTermFreq() {
+ return state.totalTermFreq;
+ }
+
+ @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
assert docsEnum != null;
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java Sun Jan 16 02:25:24 2011
@@ -60,7 +60,7 @@ public class PrefixCodedTermsWriter exte
final FieldInfos fieldInfos;
FieldInfo currentField;
private final TermsIndexWriterBase termsIndexWriter;
- private final List<TermsConsumer> fields = new ArrayList<TermsConsumer>();
+ private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
private final Comparator<BytesRef> termComp;
public PrefixCodedTermsWriter(
@@ -96,7 +96,7 @@ public class PrefixCodedTermsWriter exte
assert currentField == null || currentField.name.compareTo(field.name) < 0;
currentField = field;
TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field);
- TermsConsumer terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
+ final TermsWriter terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
fields.add(terms);
return terms;
}
@@ -105,16 +105,26 @@ public class PrefixCodedTermsWriter exte
public void close() throws IOException {
try {
- final int fieldCount = fields.size();
+
+ int nonZeroCount = 0;
+ for(TermsWriter field : fields) {
+ if (field.numTerms > 0) {
+ nonZeroCount++;
+ }
+ }
final long dirStart = out.getFilePointer();
- out.writeInt(fieldCount);
- for(int i=0;i<fieldCount;i++) {
- TermsWriter field = (TermsWriter) fields.get(i);
- out.writeInt(field.fieldInfo.number);
- out.writeLong(field.numTerms);
- out.writeLong(field.termsStartPointer);
+ out.writeVInt(nonZeroCount);
+ for(TermsWriter field : fields) {
+ if (field.numTerms > 0) {
+ out.writeVInt(field.fieldInfo.number);
+ out.writeVLong(field.numTerms);
+ out.writeVLong(field.termsStartPointer);
+ if (!field.fieldInfo.omitTermFreqAndPositions) {
+ out.writeVLong(field.sumTotalTermFreq);
+ }
+ }
}
writeTrailer(dirStart);
} finally {
@@ -142,6 +152,7 @@ public class PrefixCodedTermsWriter exte
private final long termsStartPointer;
private long numTerms;
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
+ long sumTotalTermFreq;
TermsWriter(
TermsIndexWriterBase.FieldWriter fieldIndexWriter,
@@ -169,12 +180,12 @@ public class PrefixCodedTermsWriter exte
}
@Override
- public void finishTerm(BytesRef text, int numDocs) throws IOException {
+ public void finishTerm(BytesRef text, TermStats stats) throws IOException {
- assert numDocs > 0;
+ assert stats.docFreq > 0;
//System.out.println("finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " fp=" + out.getFilePointer());
- final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, numDocs);
+ final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats);
termWriter.write(text);
final int highBit = isIndexTerm ? 0x80 : 0;
@@ -182,23 +193,28 @@ public class PrefixCodedTermsWriter exte
// This is a vInt, except, we steal top bit to record
// whether this was an indexed term:
- if ((numDocs & ~0x3F) == 0) {
+ if ((stats.docFreq & ~0x3F) == 0) {
// Fast case -- docFreq fits in 6 bits
- out.writeByte((byte) (highBit | numDocs));
+ out.writeByte((byte) (highBit | stats.docFreq));
} else {
// Write bottom 6 bits of docFreq, then write the
// remainder as vInt:
- out.writeByte((byte) (highBit | 0x40 | (numDocs & 0x3F)));
- out.writeVInt(numDocs >>> 6);
+ out.writeByte((byte) (highBit | 0x40 | (stats.docFreq & 0x3F)));
+ out.writeVInt(stats.docFreq >>> 6);
+ }
+ if (!fieldInfo.omitTermFreqAndPositions) {
+ assert stats.totalTermFreq >= stats.docFreq;
+ out.writeVLong(stats.totalTermFreq - stats.docFreq);
}
- postingsWriter.finishTerm(numDocs, isIndexTerm);
+ postingsWriter.finishTerm(stats, isIndexTerm);
numTerms++;
}
// Finishes all terms in this field
@Override
- public void finish() throws IOException {
+ public void finish(long sumTotalTermFreq) throws IOException {
// EOF marker:
+ this.sumTotalTermFreq = sumTotalTermFreq;
out.writeVInt(DeltaBytesWriter.TERM_EOF);
fieldIndexWriter.finish();
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java Sun Jan 16 02:25:24 2011
@@ -38,10 +38,10 @@ public abstract class TermsConsumer {
public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;
/** Finishes the current term; numDocs must be > 0. */
- public abstract void finishTerm(BytesRef text, int numDocs) throws IOException;
+ public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
/** Called when we are done adding terms to this field */
- public abstract void finish() throws IOException;
+ public abstract void finish(long sumTotalTermFreq) throws IOException;
/** Return the BytesRef Comparator used to sort terms
* before feeding to this API. */
@@ -55,6 +55,7 @@ public abstract class TermsConsumer {
BytesRef term;
assert termsEnum != null;
+ long sumTotalTermFreq = 0;
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
if (docsEnum == null) {
@@ -69,9 +70,9 @@ public abstract class TermsConsumer {
if (docsEnumIn != null) {
docsEnum.reset(docsEnumIn);
final PostingsConsumer postingsConsumer = startTerm(term);
- final int numDocs = postingsConsumer.merge(mergeState, docsEnum);
- if (numDocs > 0) {
- finishTerm(term, numDocs);
+ final TermStats stats = postingsConsumer.merge(mergeState, docsEnum);
+ if (stats.docFreq > 0) {
+ finishTerm(term, stats);
}
}
}
@@ -94,14 +95,15 @@ public abstract class TermsConsumer {
}
}
final PostingsConsumer postingsConsumer = startTerm(term);
- final int numDocs = postingsConsumer.merge(mergeState, postingsEnum);
- if (numDocs > 0) {
- finishTerm(term, numDocs);
+ final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum);
+ if (stats.docFreq > 0) {
+ finishTerm(term, stats);
+ sumTotalTermFreq += stats.totalTermFreq;
}
}
}
}
- finish();
+ finish(sumTotalTermFreq);
}
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java Sun Jan 16 02:25:24 2011
@@ -28,7 +28,7 @@ public abstract class TermsIndexWriterBa
public abstract void setTermsOutput(IndexOutput out);
public abstract class FieldWriter {
- public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException;
+ public abstract boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException;
public abstract void finish() throws IOException;
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java Sun Jan 16 02:25:24 2011
@@ -59,7 +59,7 @@ public class VariableGapTermsIndexWriter
public static abstract class IndexTermSelector {
// Called sequentially on every term being written,
// returning true if this term should be indexed
- public abstract boolean isIndexTerm(BytesRef term, int docFreq);
+ public abstract boolean isIndexTerm(BytesRef term, TermStats stats);
}
/** Same policy as {@link FixedGapTermsIndexWriter} */
@@ -74,7 +74,7 @@ public class VariableGapTermsIndexWriter
}
@Override
- public boolean isIndexTerm(BytesRef term, int docFreq) {
+ public boolean isIndexTerm(BytesRef term, TermStats stats) {
if (count >= interval) {
count = 0;
return true;
@@ -99,8 +99,8 @@ public class VariableGapTermsIndexWriter
}
@Override
- public boolean isIndexTerm(BytesRef term, int docFreq) {
- if (docFreq >= docFreqThresh || count >= interval) {
+ public boolean isIndexTerm(BytesRef term, TermStats stats) {
+ if (stats.docFreq >= docFreqThresh || count >= interval) {
count = 0;
return true;
} else {
@@ -214,8 +214,8 @@ public class VariableGapTermsIndexWriter
}
@Override
- public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
- if (policy.isIndexTerm(text, docFreq) || first) {
+ public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
+ if (policy.isIndexTerm(text, stats) || first) {
first = false;
//System.out.println("VGW: index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
final int lengthSave = text.length;
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java Sun Jan 16 02:25:24 2011
@@ -33,7 +33,6 @@ import org.apache.lucene.index.FieldsEnu
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.CompoundFileReader;
@@ -263,6 +262,11 @@ public class PreFlexFields extends Field
return BytesRef.getUTF8SortedAsUTF16Comparator();
}
}
+
+ @Override
+ public long getSumTotalTermFreq() {
+ return -1;
+ }
}
private class PreTermsEnum extends TermsEnum {
@@ -939,6 +943,11 @@ public class PreFlexFields extends Field
}
@Override
+ public long totalTermFreq() {
+ return -1;
+ }
+
+ @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
PreDocsEnum docsEnum;
if (reuse == null || !(reuse instanceof PreDocsEnum)) {
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java Sun Jan 16 02:25:24 2011
@@ -54,6 +54,7 @@ public class PulsingPostingsReaderImpl e
public void init(IndexInput termsIn) throws IOException {
CodecUtil.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC,
PulsingPostingsWriterImpl.VERSION_START, PulsingPostingsWriterImpl.VERSION_START);
+ maxPositions = termsIn.readVInt();
wrappedPostingsReader.init(termsIn);
}
@@ -115,8 +116,10 @@ public class PulsingPostingsReaderImpl e
termState.pendingIndexTerm |= isIndexTerm;
- // TODO: wasteful to use whole byte for this (need just a 1 bit);
- if (termsIn.readByte() == 1) {
+ // total TF, but in the omitTFAP case it's computed based on docFreq.
+ long count = fieldInfo.omitTermFreqAndPositions ? termState.docFreq : termState.totalTermFreq;
+
+ if (count <= maxPositions) {
// Inlined into terms dict -- just read the byte[] blob in,
// but don't decode it now (we only decode when a DocsEnum
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java Sun Jan 16 02:25:24 2011
@@ -21,15 +21,16 @@ import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
-// TODO: we now pulse entirely according to docFreq of the
-// term; it might be better to eg pulse by "net bytes used"
-// so that a term that has only 1 doc but zillions of
-// positions would not be inlined. Though this is
+// TODO: we pulse based on total TF of the term;
+// it might be better to e.g. pulse by "net bytes used"
+// so that a term that has only 1 posting but a huge
+// payload would not be inlined. Though this is
// presumably rare in practice...
/** @lucene.experimental */
@@ -85,6 +86,7 @@ public final class PulsingPostingsWriter
public void start(IndexOutput termsOut) throws IOException {
this.termsOut = termsOut;
CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
+ termsOut.writeVInt(pending.length); // encode maxPositions in header
wrappedPostingsWriter.start(termsOut);
}
@@ -177,7 +179,7 @@ public final class PulsingPostingsWriter
/** Called when we are done adding docs to this term */
@Override
- public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
+ public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
//System.out.println("PW finishTerm docCount=" + docCount);
assert pendingCount > 0 || pendingCount == -1;
@@ -185,8 +187,7 @@ public final class PulsingPostingsWriter
pendingIsIndexTerm |= isIndexTerm;
if (pendingCount == -1) {
- termsOut.writeByte((byte) 0);
- wrappedPostingsWriter.finishTerm(docCount, pendingIsIndexTerm);
+ wrappedPostingsWriter.finishTerm(stats, pendingIsIndexTerm);
pendingIsIndexTerm = false;
} else {
@@ -194,8 +195,6 @@ public final class PulsingPostingsWriter
// term, so we fully inline our postings data into
// terms dict, now:
- termsOut.writeByte((byte) 1);
-
// TODO: it'd be better to share this encoding logic
// in some inner codec that knows how to write a
// single doc / single position, etc. This way if a
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java Sun Jan 16 02:25:24 2011
@@ -25,6 +25,7 @@ import org.apache.lucene.index.FieldInfo
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
@@ -239,11 +240,11 @@ public final class SepPostingsWriterImpl
/** Called when we are done adding docs to this term */
@Override
- public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
+ public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
// TODO: -- wasteful we are counting this in two places?
- assert docCount > 0;
- assert docCount == df;
+ assert stats.docFreq > 0;
+ assert stats.docFreq == df;
docIndex.write(termsOut, isIndexTerm);
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java Sun Jan 16 02:25:24 2011
@@ -21,7 +21,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.FieldsEnum;
-import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
@@ -119,28 +118,31 @@ class SimpleTextFieldsReader extends Fie
private final IndexInput in;
private final boolean omitTF;
private int docFreq;
+ private long totalTermFreq;
private long docsStart;
private boolean ended;
- private final BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum;
+ private final BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstEnum;
- public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,Long>> fst, boolean omitTF) throws IOException {
+ public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst, boolean omitTF) throws IOException {
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
this.omitTF = omitTF;
- fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst);
+ fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(fst);
}
public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {
//System.out.println("seek to text=" + text.utf8ToString());
- final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.seekCeil(text);
+ final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.seekCeil(text);
if (result == null) {
//System.out.println(" end");
return SeekStatus.END;
} else {
//System.out.println(" got text=" + term.utf8ToString());
- PairOutputs.Pair<Long,Long> pair = result.output;
- docsStart = pair.output1;
- docFreq = pair.output2.intValue();
+ PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
+ PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
+ docsStart = pair1.output1;
+ docFreq = pair2.output1.intValue();
+ totalTermFreq = pair2.output2;
if (result.input.equals(text)) {
//System.out.println(" match docsStart=" + docsStart);
@@ -155,11 +157,13 @@ class SimpleTextFieldsReader extends Fie
@Override
public BytesRef next() throws IOException {
assert !ended;
- final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.next();
+ final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.next();
if (result != null) {
- final PairOutputs.Pair<Long,Long> pair = result.output;
- docsStart = pair.output1;
- docFreq = pair.output2.intValue();
+ PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
+ PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
+ docsStart = pair1.output1;
+ docFreq = pair2.output1.intValue();
+ totalTermFreq = pair2.output2;
return result.input;
} else {
return null;
@@ -187,6 +191,11 @@ class SimpleTextFieldsReader extends Fie
}
@Override
+ public long totalTermFreq() {
+ return totalTermFreq;
+ }
+
+ @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
SimpleTextDocsEnum docsEnum;
if (reuse != null && reuse instanceof SimpleTextDocsEnum && ((SimpleTextDocsEnum) reuse).canReuse(in)) {
@@ -438,8 +447,9 @@ class SimpleTextFieldsReader extends Fie
private class SimpleTextTerms extends Terms {
private final long termsStart;
private final boolean omitTF;
- private FST<PairOutputs.Pair<Long,Long>> fst;
-
+ private long sumTotalTermFreq;
+ private FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst;
+ private int termCount;
private final BytesRef scratch = new BytesRef(10);
public SimpleTextTerms(String field, long termsStart) throws IOException {
@@ -450,24 +460,38 @@ class SimpleTextFieldsReader extends Fie
private void loadTerms() throws IOException {
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
- Builder<PairOutputs.Pair<Long,Long>> b = new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true, new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs));
+ final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
+ b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
+ 0,
+ 0,
+ true,
+ new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
+ new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart);
final BytesRef lastTerm = new BytesRef(10);
long lastDocsStart = -1;
int docFreq = 0;
+ long totalTermFreq = 0;
while(true) {
readLine(in, scratch);
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
if (lastDocsStart != -1) {
- b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
+ b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
+ new PairOutputs.Pair<Long,Long>((long) docFreq,
+ posIntOutputs.get(totalTermFreq))));
+ sumTotalTermFreq += totalTermFreq;
}
break;
} else if (scratch.startsWith(DOC)) {
docFreq++;
+ } else if (scratch.startsWith(POS)) {
+ totalTermFreq++;
} else if (scratch.startsWith(TERM)) {
if (lastDocsStart != -1) {
- b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
+ b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
+ new PairOutputs.Pair<Long,Long>((long) docFreq,
+ posIntOutputs.get(totalTermFreq))));
}
lastDocsStart = in.getFilePointer();
final int len = scratch.length - TERM.length;
@@ -477,6 +501,9 @@ class SimpleTextFieldsReader extends Fie
System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
lastTerm.length = len;
docFreq = 0;
+ sumTotalTermFreq += totalTermFreq;
+ totalTermFreq = 0;
+ termCount++;
}
}
fst = b.finish();
@@ -502,6 +529,16 @@ class SimpleTextFieldsReader extends Fie
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
+
+ @Override
+ public long getUniqueTermCount() {
+ return (long) termCount;
+ }
+
+ @Override
+ public long getSumTotalTermFreq() {
+ return sumTotalTermFreq;
+ }
}
@Override
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java Sun Jan 16 02:25:24 2011
@@ -22,6 +22,7 @@ import org.apache.lucene.util.UnicodeUti
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
+import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
@@ -84,11 +85,11 @@ class SimpleTextFieldsWriter extends Fie
}
@Override
- public void finishTerm(BytesRef term, int numDocs) throws IOException {
+ public void finishTerm(BytesRef term, TermStats stats) throws IOException {
}
@Override
- public void finish() throws IOException {
+ public void finish(long sumTotalTermFreq) throws IOException {
}
@Override
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java Sun Jan 16 02:25:24 2011
@@ -28,6 +28,7 @@ import org.apache.lucene.index.SegmentWr
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
@@ -184,12 +185,12 @@ public final class StandardPostingsWrite
/** Called when we are done adding docs to this term */
@Override
- public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
- assert docCount > 0;
+ public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
+ assert stats.docFreq > 0;
// TODO: wasteful we are counting this (counting # docs
// for this term) in two places?
- assert docCount == df;
+ assert stats.docFreq == df;
if (isIndexTerm) {
// Write absolute at seek points
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java Sun Jan 16 02:25:24 2011
@@ -20,7 +20,6 @@ package org.apache.lucene.search;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
-import org.apache.lucene.search.Weight.ScorerContext;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java Sun Jan 16 02:25:24 2011
@@ -126,6 +126,11 @@ public abstract class FilteredTermsEnum
return tenum.docFreq();
}
+ @Override
+ public long totalTermFreq() {
+ return tenum.totalTermFreq();
+ }
+
/** This enum does not support seeking!
* @throws UnsupportedOperationException
*/
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java?rev=1059434&r1=1059433&r2=1059434&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java Sun Jan 16 02:25:24 2011
@@ -244,6 +244,11 @@ public final class FuzzyTermsEnum extend
public int docFreq() {
return actualEnum.docFreq();
}
+
+ @Override
+ public long totalTermFreq() {
+ return actualEnum.totalTermFreq();
+ }
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {