You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/08/20 22:23:53 UTC

svn commit: r1159912 [2/2] - in /lucene/dev/branches/flexscoring/lucene/src: java/org/apache/lucene/search/similarities/ test/org/apache/lucene/search/ test/org/apache/lucene/search/similarities/

Added: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java?rev=1159912&view=auto
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java (added)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java Sat Aug 20 20:23:51 2011
@@ -0,0 +1,152 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.util.TermContext;
+
+/**
+ * Abstract superclass for language modeling Similarities. The following inner
+ * types are introduced:
+ * <ul>
+ *   <li>{@link LMStats}, which defines a new statistic, the probability that
+ *   the collection language model generates the current term;</li>
+ *   <li>{@link CollectionModel}, which is a strategy interface for objects that
+ *   compute the collection language model {@code p(w|C)};</li>
+ *   <li>{@link DefaultCollectionModel}, an implementation of the former, that
+ *   computes the term probability as the number of occurrences of the term in the
+ *   collection, divided by the total number of tokens {@code + 1}.</li>
+ * </ul>
+ *
+ * @lucene.experimental
+ */
+public abstract class LMSimilarity extends EasySimilarity {
+  /** The collection model. */
+  protected final CollectionModel collectionModel;
+  
+  /** Creates a new instance with the specified collection language model. */
+  public LMSimilarity(CollectionModel collectionModel) {
+    this.collectionModel = collectionModel;
+  }
+  
+  /** Creates a new instance with the default collection language model. */
+  public LMSimilarity() {
+    this(new DefaultCollectionModel());
+  }
+  
+  /**
+   * Computes the collection probability of the current term in addition to the
+   * usual statistics.
+   */
+  @Override
+  public EasyStats computeStats(IndexSearcher searcher, String fieldName,
+      float queryBoost, TermContext... termContexts) throws IOException {
+    LMStats stats = new LMStats(queryBoost);
+    fillEasyStats(stats, searcher, fieldName, termContexts);
+    stats.setCollectionProbability(collectionModel.computeProbability(stats));
+    return stats;
+  }
+
+  @Override
+  protected void explain(Explanation expl, EasyStats stats, int doc,
+      float freq, float docLen) { // doc, freq and docLen are not consulted here
+    expl.addDetail(new Explanation(collectionModel.computeProbability(stats),
+                                   "collection probability"));
+  }
+  
+  /**
+   * Returns the name of the LM method. The values of the parameters should be
+   * included as well.
+   * <p>Used in {@link #toString()}.</p>
+   */
+  public abstract String getName();
+  
+  /**
+   * Returns the name of the LM method. If the collection model reports a
+   * non-null name, that name is appended as well.
+   * @see #getName()
+   * @see CollectionModel#getName()
+   * @see DefaultCollectionModel
+   */
+  @Override
+  public String toString() {
+    String coll = collectionModel.getName();
+    if (coll != null) {
+      return String.format("LM %s - %s", getName(), coll);
+    } else {
+      return String.format("LM %s", getName());
+    }
+  }
+
+  /** Stores the collection probability of the current term. */
+  public static class LMStats extends EasyStats {
+    /** The probability that the current term is generated by the collection. */
+    private float collectionProbability;
+    
+    public LMStats(float queryBoost) {
+      super(queryBoost);
+    }
+    
+    /**
+     * Returns the probability that the current term is generated by the
+     * collection.
+     */
+    public final float getCollectionProbability() {
+      return collectionProbability;
+    }
+    
+    /**
+     * Sets the probability that the current term is generated by the
+     * collection.
+     */
+    public final void setCollectionProbability(float collectionProbability) {
+      this.collectionProbability = collectionProbability;
+    } 
+  }
+  
+  /** A strategy for computing the collection language model. */
+  public static interface CollectionModel {
+    /**
+     * Computes the probability {@code p(w|C)} according to the language model
+     * strategy for the current term.
+     */
+    public float computeProbability(EasyStats stats);
+    
+    /** The name of the collection model strategy, or {@code null} for none. */
+    public String getName();
+  }
+  
+  /**
+   * Models {@code p(w|C)} as the number of occurrences of the term in the
+   * collection, divided by the total number of tokens {@code + 1}.
+   */
+  public static class DefaultCollectionModel implements CollectionModel {
+    @Override
+    public float computeProbability(EasyStats stats) {
+      return (float)stats.getTotalTermFreq() / (stats.getNumberOfFieldTokens() +1);
+    }
+    
+    @Override
+    public String getName() {
+      return null; // no name: LMSimilarity#toString() then prints only the LM method name
+    }
+  }
+}

Added: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/Lambda.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/Lambda.java?rev=1159912&view=auto
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/Lambda.java (added)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/Lambda.java Sat Aug 20 20:23:51 2011
@@ -0,0 +1,42 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Explanation;
+
+/**
+ * The <em>lambda (&lambda;<sub>w</sub>)</em> parameter in information-based
+ * models.
+ * @see IBSimilarity
+ * @lucene.experimental
+ */
+public abstract class Lambda {
+  /** Computes the lambda parameter from the given statistics. */
+  public abstract float lambda(EasyStats stats);
+  /** Returns an explanation of the lambda parameter for the given statistics. */
+  public abstract Explanation explain(EasyStats stats);
+  
+  /**
+   * Subclasses must override this method to return the code of the lambda
+   * formula. Since the original paper is not very clear on this matter, and
+   * also uses the DFR naming scheme incorrectly, the codes here were chosen
+   * arbitrarily.
+   */
+  @Override
+  public abstract String toString();
+}

Added: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java?rev=1159912&view=auto
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java (added)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java Sat Aug 20 20:23:51 2011
@@ -0,0 +1,48 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Explanation;
+
+/**
+ * Computes lambda as {@code docFreq / numberOfDocuments}.
+ * @lucene.experimental
+ */
+public class LambdaDF extends Lambda {
+  @Override
+  public final float lambda(EasyStats stats) {
+    return (float)stats.getDocFreq() / stats.getNumberOfDocuments();
+  }
+  
+  @Override
+  public final Explanation explain(EasyStats stats) {
+    Explanation result = new Explanation();
+    result.setDescription(getClass().getSimpleName() + ", computed from: ");
+    result.setValue(lambda(stats));
+    result.addDetail(
+        new Explanation(stats.getDocFreq(), "docFreq"));
+    result.addDetail(
+        new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments"));
+    return result;
+  }
+  
+  @Override
+  public String toString() {
+    return "D";
+  }
+}

Added: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java?rev=1159912&view=auto
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java (added)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java Sat Aug 20 20:23:51 2011
@@ -0,0 +1,48 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Explanation;
+
+/**
+ * Computes lambda as {@code totalTermFreq / numberOfDocuments}.
+ * @lucene.experimental
+ */
+public class LambdaTTF extends Lambda {  
+  @Override
+  public final float lambda(EasyStats stats) {
+    return (float)stats.getTotalTermFreq() / stats.getNumberOfDocuments();
+  }
+
+  @Override
+  public final Explanation explain(EasyStats stats) {
+    Explanation result = new Explanation();
+    result.setDescription(getClass().getSimpleName() + ", computed from: ");
+    result.setValue(lambda(stats));
+    result.addDetail(
+        new Explanation(stats.getTotalTermFreq(), "totalTermFreq"));
+    result.addDetail(
+        new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments"));
+    return result;
+  }
+  
+  @Override
+  public String toString() {
+    return "L";
+  }
+}

Added: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/Normalization.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/Normalization.java?rev=1159912&view=auto
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/Normalization.java (added)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/Normalization.java Sat Aug 20 20:23:51 2011
@@ -0,0 +1,75 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Explanation;
+
+/**
+ * This class acts as the base class for the implementations of the term
+ * frequency normalization methods in the DFR framework.
+ * 
+ * @see DFRSimilarity
+ * @lucene.experimental
+ */
+public abstract class Normalization {
+  /** Returns the normalized term frequency.
+   * @param len the field length. */
+  public abstract float tfn(EasyStats stats, float tf, float len);
+  
+  /** Returns an explanation for the normalized term frequency.
+   * <p>The default normalization methods use the field length of the document
+   * and the average field length to compute the normalized term frequency.
+   * This method provides a generic explanation for such methods.
+   * Subclasses that use other statistics must override this method.</p>
+   */
+  public Explanation explain(EasyStats stats, float tf, float len) {
+    Explanation result = new Explanation();
+    result.setDescription(getClass().getSimpleName() + ", computed from: ");
+    result.setValue(tfn(stats, tf, len));
+    result.addDetail(new Explanation(tf, "tf"));
+    result.addDetail(
+        new Explanation(stats.getAvgFieldLength(), "avgFieldLength"));
+    result.addDetail(new Explanation(len, "len"));
+    return result;
+  }
+
+  /** Implementation used when there is no normalization; {@code tfn} returns {@code tf}. */
+  public static final class NoNormalization extends Normalization {
+    @Override
+    public final float tfn(EasyStats stats, float tf, float len) {
+      return tf;
+    }
+
+    @Override
+    public final Explanation explain(EasyStats stats, float tf, float len) {
+      return new Explanation(1, "no normalization"); // constant 1, independent of tf and len
+    }
+    
+    @Override
+    public String toString() {
+      return "";
+    }
+  }
+  
+  /**
+   * Subclasses must override this method to return the code of the
+   * normalization formula. Refer to the original paper for the list.
+   */
+  @Override
+  public abstract String toString();
+}

Added: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java?rev=1159912&view=auto
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java (added)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java Sat Aug 20 20:23:51 2011
@@ -0,0 +1,33 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Normalization model that assumes a uniform distribution of the term
+ * frequency: {@code tfn = tf * avgFieldLength / len}.
+ */
+public class NormalizationH1 extends Normalization {
+  @Override
+  public final float tfn(EasyStats stats, float tf, float len) {
+    return tf * stats.getAvgFieldLength() / len;
+  }
+
+  @Override
+  public String toString() {
+    return "1";
+  }
+}

Added: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java?rev=1159912&view=auto
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java (added)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java Sat Aug 20 20:23:51 2011
@@ -0,0 +1,36 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.lucene.search.similarities.EasySimilarity.log2;
+
+/**
+ * Normalization model in which the term frequency is inversely related to the
+ * length: {@code tfn = tf * log2(1 + avgFieldLength / len)}.
+ */
+public class NormalizationH2 extends Normalization {
+  @Override
+  public final float tfn(EasyStats stats, float tf, float len) {
+    return (float)(tf * log2(1 + stats.getAvgFieldLength() / len));
+  }
+
+  @Override
+  public String toString() {
+    return "2";
+  }
+}
\ No newline at end of file