You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2008/11/03 19:03:59 UTC
svn commit: r710117 - in /lucene/java/trunk: ./
contrib/memory/src/java/org/apache/lucene/index/memory/
contrib/miscellaneous/src/java/org/apache/lucene/misc/
src/java/org/apache/lucene/document/ src/java/org/apache/lucene/index/
src/java/org/apache/lu...
Author: mikemccand
Date: Mon Nov 3 10:03:58 2008
New Revision: 710117
URL: http://svn.apache.org/viewvc?rev=710117&view=rev
Log:
LUCENE-1420: let Similarity.computeNorm compute the norm; add option to discount overlap tokens when computing lengthNorm
Added:
lucene/java/trunk/src/java/org/apache/lucene/index/FieldInvertState.java (with props)
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
lucene/java/trunk/contrib/miscellaneous/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java
lucene/java/trunk/src/java/org/apache/lucene/document/AbstractField.java
lucene/java/trunk/src/java/org/apache/lucene/document/Fieldable.java
lucene/java/trunk/src/java/org/apache/lucene/index/DocInverter.java
lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java
lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerThread.java
lucene/java/trunk/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriterPerField.java
lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashPerField.java
lucene/java/trunk/src/java/org/apache/lucene/search/DefaultSimilarity.java
lucene/java/trunk/src/java/org/apache/lucene/search/Similarity.java
lucene/java/trunk/src/java/org/apache/lucene/search/SimilarityDelegator.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=710117&r1=710116&r2=710117&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Mon Nov 3 10:03:58 2008
@@ -36,9 +36,17 @@
then retrievable via IndexReader.getCommitUserData instance and
static methods. (Shalin Shekhar Mangar via Mike McCandless)
-
3. LUCENE-1406: Added Arabic analyzer. (Robert Muir via Grant Ingersoll)
+ 4. LUCENE-1420: Similarity now has a computeNorm method that allows
+ custom Similarity classes to override how norm is computed. It's
+ provided a FieldInvertState instance that contains details from
+ inverting the field. The default impl is boost *
+ lengthNorm(numTerms), to be backwards compatible. Also added
+ {set/get}DiscountOverlaps to DefaultSimilarity, to control whether
+ overlapping tokens (tokens with 0 position increment) should be
+ counted in lengthNorm. (Andrzej Bialecki via Mike McCandless)
+
Optimizations
1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing
Modified: lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java?rev=710117&r1=710116&r2=710117&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (original)
+++ lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java Mon Nov 3 10:03:58 2008
@@ -41,6 +41,7 @@
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.index.TermVectorMapper;
+import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
@@ -348,6 +349,7 @@
HashMap terms = new HashMap();
int numTokens = 0;
+ int numOverlapTokens = 0;
int pos = -1;
final Token reusableToken = new Token();
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
@@ -355,7 +357,10 @@
if (term.length() == 0) continue; // nothing to do
// if (DEBUG) System.err.println("token='" + term + "'");
numTokens++;
- pos += nextToken.getPositionIncrement();
+ final int posIncr = nextToken.getPositionIncrement();
+ if (posIncr == 0)
+ numOverlapTokens++;
+ pos += posIncr;
ArrayIntList positions = (ArrayIntList) terms.get(term);
if (positions == null) { // term not seen before
@@ -372,7 +377,7 @@
// ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
if (numTokens > 0) {
boost = boost * docBoost; // see DocumentWriter.addDocument(...)
- fields.put(fieldName, new Info(terms, numTokens, boost));
+ fields.put(fieldName, new Info(terms, numTokens, numOverlapTokens, boost));
sortedFields = null; // invalidate sorted view, if any
}
} catch (IOException e) { // can never happen
@@ -574,6 +579,9 @@
/** Number of added tokens for this field */
private final int numTokens;
+ /** Number of overlapping tokens for this field */
+ private final int numOverlapTokens;
+
/** Boost factor for hits for this field */
private final float boost;
@@ -582,9 +590,10 @@
private static final long serialVersionUID = 2882195016849084649L;
- public Info(HashMap terms, int numTokens, float boost) {
+ public Info(HashMap terms, int numTokens, int numOverlapTokens, float boost) {
this.terms = terms;
this.numTokens = numTokens;
+ this.numOverlapTokens = numOverlapTokens;
this.boost = boost;
}
@@ -1067,9 +1076,10 @@
if (fieldName != cachedFieldName || sim != cachedSimilarity) { // not cached?
Info info = getInfo(fieldName);
int numTokens = info != null ? info.numTokens : 0;
- float n = sim.lengthNorm(fieldName, numTokens);
+ int numOverlapTokens = info != null ? info.numOverlapTokens : 0;
float boost = info != null ? info.getBoost() : 1.0f;
- n = n * boost; // see DocumentWriter.writeNorms(String segment)
+ FieldInvertState invertState = new FieldInvertState(0, numTokens, numOverlapTokens, 0, boost);
+ float n = sim.computeNorm(fieldName, invertState);
byte norm = Similarity.encodeNorm(n);
norms = new byte[] {norm};
Modified: lucene/java/trunk/contrib/miscellaneous/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/miscellaneous/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java?rev=710117&r1=710116&r2=710117&view=diff
==============================================================================
--- lucene/java/trunk/contrib/miscellaneous/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java (original)
+++ lucene/java/trunk/contrib/miscellaneous/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java Mon Nov 3 10:03:58 2008
@@ -19,6 +19,7 @@
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.index.FieldInvertState;
import java.util.Map;
import java.util.HashMap;
@@ -53,6 +54,7 @@
private Map ln_mins = new HashMap(7);
private Map ln_maxs = new HashMap(7);
private Map ln_steeps = new HashMap(7);
+ private Map ln_overlaps = new HashMap(7);
private float tf_base = 0.0f;
private float tf_min = 0.0f;
@@ -106,18 +108,67 @@
}
/**
- * Sets the function variables used by lengthNorm for a specific named field
+ * Sets the function variables used by lengthNorm for a
+ * specific named field.
+ *
+ * @deprecated Please call {@link #setLengthNormFactors(String,
+ * int, int, float, boolean)} instead.
+ *
+ * @param field field name
+ * @param min minimum value
+ * @param max maximum value
+ * @param steepness steepness of the curve
*
* @see #lengthNorm
*/
public void setLengthNormFactors(String field, int min, int max,
float steepness) {
+ setLengthNormFactors(field, min, max, steepness, false);
+ }
+
+ /**
+ * Sets the function variables used by lengthNorm for a specific named field.
+ *
+ * @param field field name
+ * @param min minimum value
+ * @param max maximum value
+ * @param steepness steepness of the curve
+ * @param discountOverlaps if true, <code>numOverlapTokens</code> will be
+ * subtracted from <code>numTokens</code>; if false then
+ * <code>numOverlapTokens</code> will be assumed to be 0 (see
+ * {@link DefaultSimilarity#computeNorm(String, FieldInvertState)} for details).
+ *
+ * @see #lengthNorm
+ */
+ public void setLengthNormFactors(String field, int min, int max,
+ float steepness, boolean discountOverlaps) {
ln_mins.put(field, new Integer(min));
ln_maxs.put(field, new Integer(max));
ln_steeps.put(field, new Float(steepness));
+ ln_overlaps.put(field, new Boolean(discountOverlaps));
}
/**
+ * Implemented as <code> state.getBoost() *
+ * lengthNorm(fieldName, numTokens) </code> where
+ * numTokens does not count overlap tokens if
+ * discountOverlaps is true by default or true for this
+ * specific field. */
+ public float computeNorm(String fieldName, FieldInvertState state) {
+ final int numTokens;
+ boolean overlaps = discountOverlaps;
+ if (ln_overlaps.containsKey(fieldName)) {
+ overlaps = ((Boolean)ln_overlaps.get(fieldName)).booleanValue();
+ }
+ if (overlaps)
+ numTokens = state.getLength() - state.getNumOverlap();
+ else
+ numTokens = state.getLength();
+
+ return state.getBoost() * lengthNorm(fieldName, numTokens);
+ }
+
+ /**
* Implemented as:
* <code>
* 1/sqrt( steepness * (abs(x-min) + abs(x-max) - (max-min)) + 1 )
Modified: lucene/java/trunk/src/java/org/apache/lucene/document/AbstractField.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/document/AbstractField.java?rev=710117&r1=710116&r2=710117&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/document/AbstractField.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/document/AbstractField.java Mon Nov 3 10:03:58 2008
@@ -98,13 +98,19 @@
* <p>The boost is multiplied by {@link org.apache.lucene.document.Document#getBoost()} of the document
* containing this field. If a document has multiple fields with the same
* name, all such values are multiplied together. This product is then
- * multipled by the value {@link org.apache.lucene.search.Similarity#lengthNorm(String,int)}, and
+ * used to compute the norm factor for the field. By
+ * default, in the {@link
+ * org.apache.lucene.search.Similarity#computeNorm(String,
+ * FieldInvertState)} method, the boost value is multiplied
+ * by the {@link
+ * org.apache.lucene.search.Similarity#lengthNorm(String,
+ * int)} and then
* rounded by {@link org.apache.lucene.search.Similarity#encodeNorm(float)} before it is stored in the
* index. One should attempt to ensure that this product does not overflow
* the range of that encoding.
*
* @see org.apache.lucene.document.Document#setBoost(float)
- * @see org.apache.lucene.search.Similarity#lengthNorm(String, int)
+ * @see org.apache.lucene.search.Similarity#computeNorm(String, org.apache.lucene.index.FieldInvertState)
* @see org.apache.lucene.search.Similarity#encodeNorm(float)
*/
public void setBoost(float boost) {
Modified: lucene/java/trunk/src/java/org/apache/lucene/document/Fieldable.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/document/Fieldable.java?rev=710117&r1=710116&r2=710117&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/document/Fieldable.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/document/Fieldable.java Mon Nov 3 10:03:58 2008
@@ -17,6 +17,7 @@
*/
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.FieldInvertState;
import java.io.Reader;
import java.io.Serializable;
@@ -39,13 +40,18 @@
* <p>The boost is multiplied by {@link org.apache.lucene.document.Document#getBoost()} of the document
* containing this field. If a document has multiple fields with the same
* name, all such values are multiplied together. This product is then
- * multipled by the value {@link org.apache.lucene.search.Similarity#lengthNorm(String,int)}, and
- * rounded by {@link org.apache.lucene.search.Similarity#encodeNorm(float)} before it is stored in the
+ * used to compute the norm factor for the field. By
+ * default, in the {@link
+ * org.apache.lucene.search.Similarity#computeNorm(String,
+ * FieldInvertState)} method, the boost value is multiplied
+ * by the {@link
+ * org.apache.lucene.search.Similarity#lengthNorm(String,
+ * int)} and then rounded by {@link org.apache.lucene.search.Similarity#encodeNorm(float)} before it is stored in the
* index. One should attempt to ensure that this product does not overflow
* the range of that encoding.
*
* @see org.apache.lucene.document.Document#setBoost(float)
- * @see org.apache.lucene.search.Similarity#lengthNorm(String, int)
+ * @see org.apache.lucene.search.Similarity#computeNorm(String, FieldInvertState)
* @see org.apache.lucene.search.Similarity#encodeNorm(float)
*/
void setBoost(float boost);
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocInverter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocInverter.java?rev=710117&r1=710116&r2=710117&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocInverter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocInverter.java Mon Nov 3 10:03:58 2008
@@ -92,18 +92,4 @@
public DocFieldConsumerPerThread addThread(DocFieldProcessorPerThread docFieldProcessorPerThread) {
return new DocInverterPerThread(docFieldProcessorPerThread, this);
}
-
- final static class FieldInvertState {
- int position;
- int length;
- int offset;
- float boost;
-
- void reset(float docBoost) {
- position = 0;
- length = 0;
- offset = 0;
- boost = docBoost;
- }
- }
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java?rev=710117&r1=710116&r2=710117&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java Mon Nov 3 10:03:58 2008
@@ -39,7 +39,7 @@
final InvertedDocConsumerPerField consumer;
final InvertedDocEndConsumerPerField endConsumer;
final DocumentsWriter.DocState docState;
- final DocInverter.FieldInvertState fieldState;
+ final FieldInvertState fieldState;
public DocInverterPerField(DocInverterPerThread perThread, FieldInfo fieldInfo) {
this.perThread = perThread;
@@ -134,7 +134,11 @@
Token token = stream.next(localToken);
if (token == null) break;
- fieldState.position += (token.getPositionIncrement() - 1);
+ final int posIncr = token.getPositionIncrement();
+ fieldState.position += posIncr - 1;
+ if (posIncr == 0)
+ fieldState.numOverlap++;
+
boolean success = false;
try {
// If we hit an exception in here, we abort
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerThread.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerThread.java?rev=710117&r1=710116&r2=710117&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerThread.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerThread.java Mon Nov 3 10:03:58 2008
@@ -32,7 +32,7 @@
final Token localToken = new Token();
final DocumentsWriter.DocState docState;
- final DocInverter.FieldInvertState fieldState = new DocInverter.FieldInvertState();
+ final FieldInvertState fieldState = new FieldInvertState();
// Used to read a string value for a field
final ReusableStringReader stringReader = new ReusableStringReader();
Added: lucene/java/trunk/src/java/org/apache/lucene/index/FieldInvertState.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FieldInvertState.java?rev=710117&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FieldInvertState.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FieldInvertState.java Mon Nov 3 10:03:58 2008
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.index;
+
+import org.apache.lucene.search.Similarity;
+
+/**
+ * This class tracks the number and position / offset parameters of terms
+ * being added to the index. The information collected in this class is
+ * also used to calculate the normalization factor for a field.
+ *
+ * <p><b>WARNING</b>: This API is new and experimental, and may suddenly
+ * change.</p>
+ */
+public final class FieldInvertState {
+ int position;
+ int length;
+ int numOverlap;
+ int offset;
+ float boost;
+
+ public FieldInvertState() {
+ }
+
+ public FieldInvertState(int position, int length, int numOverlap, int offset, float boost) {
+ this.position = position;
+ this.length = length;
+ this.numOverlap = numOverlap;
+ this.offset = offset;
+ this.boost = boost;
+ }
+
+ /**
+ * Re-initialize the state, using this boost value.
+ * @param docBoost boost value to use.
+ */
+ void reset(float docBoost) {
+ position = 0;
+ length = 0;
+ numOverlap = 0;
+ offset = 0;
+ boost = docBoost;
+ }
+
+ /**
+ * Get the last processed term position.
+ * @return the position
+ */
+ public int getPosition() {
+ return position;
+ }
+
+ /**
+ * Get total number of terms in this field.
+ * @return the length
+ */
+ public int getLength() {
+ return length;
+ }
+
+ /**
+ * Get the number of terms with <code>positionIncrement == 0</code>.
+ * @return the numOverlap
+ */
+ public int getNumOverlap() {
+ return numOverlap;
+ }
+
+ /**
+ * Get end offset of the last processed term.
+ * @return the offset
+ */
+ public int getOffset() {
+ return offset;
+ }
+
+ /**
+ * Get boost value. This is the cumulative product of
+ * document boost and field boost for all field instances
+ * sharing the same field name.
+ * @return the boost
+ */
+ public float getBoost() {
+ return boost;
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/FieldInvertState.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java?rev=710117&r1=710116&r2=710117&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java Mon Nov 3 10:03:58 2008
@@ -30,7 +30,7 @@
final TermsHashPerField termsHashPerField;
final FieldInfo fieldInfo;
final DocumentsWriter.DocState docState;
- final DocInverter.FieldInvertState fieldState;
+ final FieldInvertState fieldState;
boolean omitTf;
public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriterPerThread perThread, FieldInfo fieldInfo) {
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriterPerField.java?rev=710117&r1=710116&r2=710117&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriterPerField.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriterPerField.java Mon Nov 3 10:03:58 2008
@@ -36,7 +36,7 @@
byte[] norms = new byte[1];
int upto;
- final DocInverter.FieldInvertState fieldState;
+ final FieldInvertState fieldState;
public void reset() {
// Shrink back if we are overallocated now:
@@ -68,7 +68,7 @@
docIDs = ArrayUtil.grow(docIDs, 1+upto);
norms = ArrayUtil.grow(norms, 1+upto);
}
- final float norm = fieldState.boost * docState.similarity.lengthNorm(fieldInfo.name, fieldState.length);
+ final float norm = docState.similarity.computeNorm(fieldInfo.name, fieldState);
norms[upto] = Similarity.encodeNorm(norm);
docIDs[upto] = docState.docID;
upto++;
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java?rev=710117&r1=710116&r2=710117&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java Mon Nov 3 10:03:58 2008
@@ -30,7 +30,7 @@
final TermVectorsTermsWriter termsWriter;
final FieldInfo fieldInfo;
final DocumentsWriter.DocState docState;
- final DocInverter.FieldInvertState fieldState;
+ final FieldInvertState fieldState;
boolean doVectors;
boolean doVectorPositions;
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashPerField.java?rev=710117&r1=710116&r2=710117&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashPerField.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashPerField.java Mon Nov 3 10:03:58 2008
@@ -30,7 +30,7 @@
final TermsHashPerField nextPerField;
final TermsHashPerThread perThread;
final DocumentsWriter.DocState docState;
- final DocInverter.FieldInvertState fieldState;
+ final FieldInvertState fieldState;
// Copied from our perThread
final CharBlockPool charPool;
Modified: lucene/java/trunk/src/java/org/apache/lucene/search/DefaultSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/DefaultSimilarity.java?rev=710117&r1=710116&r2=710117&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/DefaultSimilarity.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/DefaultSimilarity.java Mon Nov 3 10:03:58 2008
@@ -1,5 +1,7 @@
package org.apache.lucene.search;
+import org.apache.lucene.index.FieldInvertState;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -19,6 +21,25 @@
/** Expert: Default scoring implementation. */
public class DefaultSimilarity extends Similarity {
+
+ /** Implemented as
+ * <code>state.getBoost()*lengthNorm(numTerms)</code>, where
+ * <code>numTerms</code> is {@link FieldInvertState#getLength()} if {@link
+ * #setDiscountOverlaps} is false, else it's {@link
+ * FieldInvertState#getLength()} - {@link
+ * FieldInvertState#getNumOverlap()}.
+ *
+ * <p><b>WARNING</b>: This API is new and experimental, and may suddenly
+ * change.</p> */
+ public float computeNorm(String field, FieldInvertState state) {
+ final int numTerms;
+ if (discountOverlaps)
+ numTerms = state.getLength() - state.getNumOverlap();
+ else
+ numTerms = state.getLength();
+ return (float) (state.getBoost() * lengthNorm(field, numTerms));
+ }
+
/** Implemented as <code>1/sqrt(numTerms)</code>. */
public float lengthNorm(String fieldName, int numTerms) {
return (float)(1.0 / Math.sqrt(numTerms));
@@ -48,4 +69,26 @@
public float coord(int overlap, int maxOverlap) {
return overlap / (float)maxOverlap;
}
+
+ // Default false
+ protected boolean discountOverlaps;
+
+ /** Determines whether overlap tokens (Tokens with
+ * 0 position increment) are ignored when computing
+ * norm. By default this is false, meaning overlap
+ * tokens are counted just like non-overlap tokens.
+ *
+ * <p><b>WARNING</b>: This API is new and experimental, and may suddenly
+ * change.</p>
+ *
+ * @see #computeNorm
+ */
+ public void setDiscountOverlaps(boolean v) {
+ discountOverlaps = v;
+ }
+
+ /** @see #setDiscountOverlaps */
+ public boolean getDiscountOverlaps() {
+ return discountOverlaps;
+ }
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/search/Similarity.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/Similarity.java?rev=710117&r1=710116&r2=710117&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/Similarity.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/Similarity.java Mon Nov 3 10:03:58 2008
@@ -17,6 +17,7 @@
* limitations under the License.
*/
+import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.SmallFloat;
@@ -333,6 +334,29 @@
return NORM_TABLE;
}
+ /**
+ * Compute the normalization value for a field, given the accumulated
+ * state of term processing for this field (see {@link FieldInvertState}).
+ *
+ * <p>Implementations should calculate a float value based on the field
+ * state and then return that value.
+ *
+ * <p>For backward compatibility this method by default calls
+ * {@link #lengthNorm(String, int)} passing
+ * {@link FieldInvertState#getLength()} as the second argument, and
+ * then multiplies this value by {@link FieldInvertState#getBoost()}.</p>
+ *
+ * <p><b>WARNING</b>: This API is new and experimental and may
+ * suddenly change.</p>
+ *
+ * @param field field name
+ * @param state current processing state for this field
+ * @return the calculated float norm
+ */
+ public float computeNorm(String field, FieldInvertState state) {
+ return (float) (state.getBoost() * lengthNorm(field, state.getLength()));
+ }
+
/** Computes the normalization value for a field given the total number of
* terms contained in a field. These values, together with field boosts, are
* stored in an index and multipled into scores for hits on each field by the
@@ -341,10 +365,10 @@
* <p>Matches in longer fields are less precise, so implementations of this
* method usually return smaller values when <code>numTokens</code> is large,
* and larger values when <code>numTokens</code> is small.
- *
- * <p>That these values are computed under
+ *
+ * <p>Note that the return values are computed under
* {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)}
- * and stored then using
+ * and then stored using
* {@link #encodeNorm(float)}.
* Thus they have limited precision, and documents
* must be re-indexed if this method is altered.
Modified: lucene/java/trunk/src/java/org/apache/lucene/search/SimilarityDelegator.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/SimilarityDelegator.java?rev=710117&r1=710116&r2=710117&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/SimilarityDelegator.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/SimilarityDelegator.java Mon Nov 3 10:03:58 2008
@@ -1,5 +1,7 @@
package org.apache.lucene.search;
+import org.apache.lucene.index.FieldInvertState;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -32,6 +34,10 @@
this.delegee = delegee;
}
+ public float computeNorm(String fieldName, FieldInvertState state) {
+ return delegee.computeNorm(fieldName, state);
+ }
+
public float lengthNorm(String fieldName, int numTerms) {
return delegee.lengthNorm(fieldName, numTerms);
}