You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/01/14 12:00:42 UTC

svn commit: r1058939 - in /lucene/dev/trunk/lucene: CHANGES.txt src/java/org/apache/lucene/index/FieldInvertState.java src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java src/test/org/apache/lucene/index/TestMaxTermFrequency.java

Author: rmuir
Date: Fri Jan 14 11:00:42 2011
New Revision: 1058939

URL: http://svn.apache.org/viewvc?rev=1058939&view=rev
Log:
LUCENE-2864: add FieldInvertState.getMaxTermFrequency

Added:
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java   (with props)
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FieldInvertState.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1058939&r1=1058938&r2=1058939&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Fri Jan 14 11:00:42 2011
@@ -737,6 +737,10 @@ New features
   is no longer needed and discouraged for that use case. Directly wrapping
   Query improves performance, as out-of-order collection is now supported.
   (Uwe Schindler)
+
+* LUCENE-2864: Add getMaxTermFrequency (maximum within-document TF) to 
+  FieldInvertState so that it can be used in Similarity.computeNorm.
+  (Robert Muir)
   
 Optimizations
 

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FieldInvertState.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FieldInvertState.java?rev=1058939&r1=1058938&r2=1058939&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FieldInvertState.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FieldInvertState.java Fri Jan 14 11:00:42 2011
@@ -30,6 +30,7 @@ public final class FieldInvertState {
   int length;
   int numOverlap;
   int offset;
+  int maxTermFrequency;
   float boost;
   AttributeSource attributeSource;
 
@@ -53,6 +54,7 @@ public final class FieldInvertState {
     length = 0;
     numOverlap = 0;
     offset = 0;
+    maxTermFrequency = 0;
     boost = docBoost;
     attributeSource = null;
   }
@@ -110,6 +112,15 @@ public final class FieldInvertState {
   public void setBoost(float boost) {
     this.boost = boost;
   }
+
+  /**
+   * Get the maximum term-frequency encountered for any term in the field.  A
+   * field containing "the quick brown fox jumps over the lazy dog" would have
+   * a value of 2, because "the" appears twice.
+   */
+  public int getMaxTermFrequency() {
+    return maxTermFrequency;
+  }
   
   public AttributeSource getAttributeSource() {
     return attributeSource;

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java?rev=1058939&r1=1058938&r2=1058939&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java Fri Jan 14 11:00:42 2011
@@ -125,6 +125,7 @@ final class FreqProxTermsWriterPerField 
       postings.docFreqs[termID] = 1;
       writeProx(termID, fieldState.position);
     }
+    fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
   }
 
   @Override
@@ -158,11 +159,12 @@ final class FreqProxTermsWriterPerField 
           termsHashPerField.writeVInt(0, postings.docFreqs[termID]);
         }
         postings.docFreqs[termID] = 1;
+        fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
         postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
         postings.lastDocIDs[termID] = docState.docID;
         writeProx(termID, fieldState.position);
       } else {
-        postings.docFreqs[termID]++;
+        fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]);
         writeProx(termID, fieldState.position-postings.lastPositions[termID]);
       }
     }

Added: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java?rev=1058939&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java (added)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java Fri Jan 14 11:00:42 2011
@@ -0,0 +1,116 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+/**
+ * Tests the maxTermFrequency statistic in FieldInvertState
+ */
+public class TestMaxTermFrequency extends LuceneTestCase { 
+  Directory dir;
+  IndexReader reader;
+  /* expected maxTermFrequency values for our documents */
+  ArrayList<Integer> expected = new ArrayList<Integer>();
+  
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    dir = newDirectory();
+    IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT, 
+        new MockAnalyzer(MockTokenizer.SIMPLE, true));
+    config.setSimilarity(new TestSimilarity());
+    RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
+    Document doc = new Document();
+    Field foo = newField("foo", "", Field.Store.NO, Field.Index.ANALYZED);
+    doc.add(foo);
+    for (int i = 0; i < 100; i++) {
+      foo.setValue(addValue());
+      writer.addDocument(doc);
+    }
+    reader = writer.getReader();
+    writer.close();
+  }
+  
+  @Override
+  public void tearDown() throws Exception {
+    reader.close();
+    dir.close();
+    super.tearDown();
+  }
+  
+  public void test() throws Exception {
+    byte fooNorms[] = MultiNorms.norms(reader, "foo");
+    for (int i = 0; i < reader.maxDoc(); i++)
+      assertEquals(expected.get(i).intValue(), fooNorms[i] & 0xff);
+  }
+
+  /**
+   * Makes a bunch of single-char tokens (the max freq will at most be 255).
+   * shuffles them around, and returns the whole list with Arrays.toString().
+   * This works fine because we use lettertokenizer.
+   * puts the max-frequency term into expected, to be checked against the norm.
+   */
+  private String addValue() {
+    List<String> terms = new ArrayList<String>();
+    int maxCeiling = _TestUtil.nextInt(random, 0, 255);
+    int max = 0;
+    for (char ch = 'a'; ch <= 'z'; ch++) {
+      int num = _TestUtil.nextInt(random, 0, maxCeiling);
+      for (int i = 0; i < num; i++)
+        terms.add(Character.toString(ch));
+      max = Math.max(max, num);
+    }
+    expected.add(max);
+    Collections.shuffle(terms, random);
+    return Arrays.toString(terms.toArray(new String[terms.size()]));
+  }
+  
+  /**
+   * Simple similarity that encodes maxTermFrequency directly as a byte
+   */
+  class TestSimilarity extends DefaultSimilarity {
+
+    @Override
+    public byte encodeNormValue(float f) {
+      return (byte) f;
+    }
+    
+    @Override
+    public float decodeNormValue(byte b) {
+      return (float) b;
+    }
+
+    @Override
+    public float computeNorm(String field, FieldInvertState state) {
+      return (float) state.getMaxTermFrequency();
+    }
+  }
+}