You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/08/20 22:23:53 UTC
svn commit: r1159912 [2/2] - in /lucene/dev/branches/flexscoring/lucene/src:
java/org/apache/lucene/search/similarities/ test/org/apache/lucene/search/
test/org/apache/lucene/search/similarities/
Added: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java?rev=1159912&view=auto
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java (added)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java Sat Aug 20 20:23:51 2011
@@ -0,0 +1,152 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.util.TermContext;
+
+/**
+ * Abstract superclass for language modeling Similarities. The following inner
+ * types are introduced:
+ * <ul>
+ * <li>{@link LMStats}, which defines a new statistic, the probability that
+ * the collection language model generates the current term;</li>
+ * <li>{@link CollectionModel}, which is a strategy interface for objects that
+ * compute the collection language model {@code p(w|C)};</li>
+ * <li>{@link DefaultCollectionModel}, an implementation of the latter, that
+ * computes the term probability as the number of occurrences of the term in the
+ * collection, divided by the total number of tokens {@code + 1}.</li>
+ * </ul>
+ *
+ * @lucene.experimental
+ */
+public abstract class LMSimilarity extends EasySimilarity {
+ /** The collection model. */
+ protected final CollectionModel collectionModel;
+
+ /** Creates a new instance with the specified collection language model. */
+ public LMSimilarity(CollectionModel collectionModel) {
+ this.collectionModel = collectionModel;
+ }
+
+ /** Creates a new instance with the default collection language model. */
+ public LMSimilarity() {
+ this(new DefaultCollectionModel());
+ }
+
+ /**
+ * Computes the collection probability of the current term in addition to the
+ * usual statistics.
+ */
+ @Override
+ public EasyStats computeStats(IndexSearcher searcher, String fieldName,
+ float queryBoost, TermContext... termContexts) throws IOException {
+ LMStats stats = new LMStats(queryBoost);
+ fillEasyStats(stats, searcher, fieldName, termContexts);
+ stats.setCollectionProbability(collectionModel.computeProbability(stats));
+ return stats;
+ }
+
+ @Override
+ protected void explain(Explanation expl, EasyStats stats, int doc,
+ float freq, float docLen) {
+ expl.addDetail(new Explanation(collectionModel.computeProbability(stats), // recomputed via the model, not read back from LMStats
+ "collection probability"));
+ }
+
+ /**
+ * Returns the name of the LM method. The values of the parameters should be
+ * included as well.
+ * <p>Used in {@link #toString()}.</p>
+ */
+ public abstract String getName();
+
+ /**
+ * Returns the name of the LM method, prefixed with {@code "LM "}. The collection
+ * model's name is appended only when its {@code getName()} returns non-null.
+ * @see #getName()
+ * @see CollectionModel#getName()
+ * @see DefaultCollectionModel
+ */
+ @Override
+ public String toString() {
+ String coll = collectionModel.getName();
+ if (coll != null) {
+ return String.format("LM %s - %s", getName(), coll);
+ } else {
+ return String.format("LM %s", getName());
+ }
+ }
+
+ /** Stores the collection distribution of the current term. */
+ public static class LMStats extends EasyStats {
+ /** The probability that the current term is generated by the collection. */
+ private float collectionProbability;
+
+ public LMStats(float queryBoost) { // queryBoost is handed straight to the EasyStats constructor
+ super(queryBoost);
+ }
+
+ /**
+ * Returns the probability that the current term is generated by the
+ * collection.
+ */
+ public final float getCollectionProbability() {
+ return collectionProbability;
+ }
+
+ /**
+ * Sets the probability that the current term is generated by the
+ * collection.
+ */
+ public final void setCollectionProbability(float collectionProbability) {
+ this.collectionProbability = collectionProbability;
+ }
+ }
+
+ /** A strategy for computing the collection language model. */
+ public static interface CollectionModel {
+ /**
+ * Computes the probability {@code p(w|C)} according to the language model
+ * strategy for the current term.
+ */
+ public float computeProbability(EasyStats stats);
+
+ /** The name of the collection model strategy; {@code null} keeps it out of {@link LMSimilarity#toString()}. */
+ public String getName();
+ }
+
+ /**
+ * Models {@code p(w|C)} as the number of occurrences of the term in the
+ * collection, divided by the total number of tokens {@code + 1}.
+ */
+ public static class DefaultCollectionModel implements CollectionModel {
+ @Override
+ public float computeProbability(EasyStats stats) {
+ return (float)stats.getTotalTermFreq() / (stats.getNumberOfFieldTokens() +1); // cast binds to the numerator; denominator is tokens + 1
+ }
+
+ @Override
+ public String getName() {
+ return null; // null: LMSimilarity.toString() then prints only the LM method name
+ }
+ }
+}
Added: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/Lambda.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/Lambda.java?rev=1159912&view=auto
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/Lambda.java (added)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/Lambda.java Sat Aug 20 20:23:51 2011
@@ -0,0 +1,42 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Explanation;
+
+/**
+ * The <em>lambda (λ<sub>w</sub>)</em> parameter in information-based
+ * models.
+ * @see IBSimilarity
+ * @lucene.experimental
+ */
+public abstract class Lambda {
+ /** Computes the lambda parameter from the given statistics. */
+ public abstract float lambda(EasyStats stats);
+ /** Explains the lambda parameter for the given statistics. */
+ public abstract Explanation explain(EasyStats stats);
+
+ /**
+ * Subclasses must override this method to return the code of the lambda
+ * formula. Since the original paper is not very clear on this matter, and
+ * also uses the DFR naming scheme incorrectly, the codes here were chosen
+ * arbitrarily.
+ */
+ @Override
+ public abstract String toString();
+}
Added: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java?rev=1159912&view=auto
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java (added)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java Sat Aug 20 20:23:51 2011
@@ -0,0 +1,48 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Explanation;
+
+/**
+ * Computes lambda as {@code docFreq / numberOfDocuments}.
+ * @lucene.experimental
+ */
+public class LambdaDF extends Lambda {
+ @Override
+ public final float lambda(EasyStats stats) {
+ return (float)stats.getDocFreq() / stats.getNumberOfDocuments(); // cast binds to docFreq, so the division is floating point
+ }
+
+ @Override
+ public final Explanation explain(EasyStats stats) {
+ Explanation result = new Explanation();
+ result.setDescription(getClass().getSimpleName() + ", computed from: ");
+ result.setValue(lambda(stats)); // same value lambda() returns for these stats
+ result.addDetail(
+ new Explanation(stats.getDocFreq(), "docFreq"));
+ result.addDetail(
+ new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments"));
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ return "D"; // code of this lambda formula
+ }
+}
Added: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java?rev=1159912&view=auto
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java (added)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java Sat Aug 20 20:23:51 2011
@@ -0,0 +1,48 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Explanation;
+
+/**
+ * Computes lambda as {@code totalTermFreq / numberOfDocuments}.
+ * @lucene.experimental
+ */
+public class LambdaTTF extends Lambda {
+ @Override
+ public final float lambda(EasyStats stats) {
+ return (float)stats.getTotalTermFreq() / stats.getNumberOfDocuments(); // cast binds to totalTermFreq, so the division is floating point
+ }
+
+ @Override
+ public final Explanation explain(EasyStats stats) {
+ Explanation result = new Explanation();
+ result.setDescription(getClass().getSimpleName() + ", computed from: ");
+ result.setValue(lambda(stats)); // same value lambda() returns for these stats
+ result.addDetail(
+ new Explanation(stats.getTotalTermFreq(), "totalTermFreq"));
+ result.addDetail(
+ new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments"));
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ return "L"; // code of this lambda formula
+ }
+}
Added: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/Normalization.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/Normalization.java?rev=1159912&view=auto
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/Normalization.java (added)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/Normalization.java Sat Aug 20 20:23:51 2011
@@ -0,0 +1,75 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Explanation;
+
+/**
+ * This class acts as the base class for the implementations of the term
+ * frequency normalization methods in the DFR framework.
+ *
+ * @see DFRSimilarity
+ * @lucene.experimental
+ */
+public abstract class Normalization {
+ /** Returns the normalized term frequency.
+ * @param len the field length. */
+ public abstract float tfn(EasyStats stats, float tf, float len);
+
+ /** Returns an explanation for the normalized term frequency.
+ * <p>The default normalization methods use the field length of the document
+ * and the average field length to compute the normalized term frequency.
+ * This method provides a generic explanation for such methods.
+ * Subclasses that use other statistics must override this method.</p>
+ */
+ public Explanation explain(EasyStats stats, float tf, float len) {
+ Explanation result = new Explanation();
+ result.setDescription(getClass().getSimpleName() + ", computed from: ");
+ result.setValue(tfn(stats, tf, len)); // the explained value is exactly what tfn() returns
+ result.addDetail(new Explanation(tf, "tf"));
+ result.addDetail(
+ new Explanation(stats.getAvgFieldLength(), "avgFieldLength"));
+ result.addDetail(new Explanation(len, "len"));
+ return result;
+ }
+
+ /** Implementation used when there is no normalization. */
+ public static final class NoNormalization extends Normalization {
+ @Override
+ public final float tfn(EasyStats stats, float tf, float len) {
+ return tf; // tf passes through unchanged; stats and len are ignored
+ }
+
+ @Override
+ public final Explanation explain(EasyStats stats, float tf, float len) {
+ return new Explanation(1, "no normalization"); // NOTE(review): reports a constant 1, not the tf that tfn() returns - confirm intended
+ }
+
+ @Override
+ public String toString() {
+ return ""; // empty normalization code
+ }
+ }
+
+ /**
+ * Subclasses must override this method to return the code of the
+ * normalization formula. Refer to the original paper for the list.
+ */
+ @Override
+ public abstract String toString();
+}
Added: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java?rev=1159912&view=auto
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java (added)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java Sat Aug 20 20:23:51 2011
@@ -0,0 +1,33 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Normalization model that assumes a uniform distribution of the term frequency: {@code tfn = tf * avgFieldLength / len}.
+ */
+public class NormalizationH1 extends Normalization {
+ @Override
+ public final float tfn(EasyStats stats, float tf, float len) {
+ return tf * stats.getAvgFieldLength() / len; // NOTE(review): no guard for len == 0 here - confirm callers never pass it
+ }
+
+ @Override
+ public String toString() {
+ return "1"; // code of this normalization formula
+ }
+}
Added: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java?rev=1159912&view=auto
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java (added)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java Sat Aug 20 20:23:51 2011
@@ -0,0 +1,36 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.lucene.search.similarities.EasySimilarity.log2;
+
+/**
+ * Normalization model in which the term frequency is inversely related to the
+ * length: {@code tfn = tf * log2(1 + avgFieldLength / len)}.
+ */
+public class NormalizationH2 extends Normalization {
+ @Override
+ public final float tfn(EasyStats stats, float tf, float len) {
+ return (float)(tf * log2(1 + stats.getAvgFieldLength() / len)); // log2 is statically imported from EasySimilarity; result narrowed to float
+ }
+
+ @Override
+ public String toString() {
+ return "2"; // code of this normalization formula
+ }
+}
\ No newline at end of file