You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2019/06/10 07:07:29 UTC

[lucene-solr] branch master updated: LUCENE-8815: Adds a DoubleValues implementation for feature fields (#687)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 5ef2b3f  LUCENE-8815: Adds a DoubleValues implementation for feature fields (#687)
5ef2b3f is described below

commit 5ef2b3f6b87671769dba61f67c80b410a54b6004
Author: Colin Goodheart-Smithe <co...@users.noreply.github.com>
AuthorDate: Mon Jun 10 08:07:24 2019 +0100

    LUCENE-8815: Adds a DoubleValues implementation for feature fields (#687)
    
    This change adds a static method FeatureField#newDoubleValues() which can be used to retrieve the values of a feature for documents directly, rather than having to store the values in a numeric field alongside the feature field.
---
 lucene/CHANGES.txt                                 |   5 +
 .../lucene/document/FeatureDoubleValuesSource.java | 132 +++++++++++
 .../org/apache/lucene/document/FeatureField.java   |  14 ++
 .../lucene/document/TestFeatureDoubleValues.java   | 248 +++++++++++++++++++++
 4 files changed, 399 insertions(+)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 61a056b..70b1952 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -35,6 +35,11 @@ Other
 
 New Features
 
+* LUCENE-8815: Provide a DoubleValues implementation for retrieving the value of features without
+  requiring a separate numeric field. Note that because feature values are stored with only 8 bits
+  of mantissa, the values returned may differ slightly from the values originally indexed.
+  (Colin Goodheart-Smithe via Adrien Grand)
+
 * LUCENE-8803: Provide a FeatureSortfield to allow sorting search hits by descending value of a
   feature. This is exposed via the factory method FeatureField#newFeatureSort.
   (Colin Goodheart-Smithe via Adrien Grand)
diff --git a/lucene/core/src/java/org/apache/lucene/document/FeatureDoubleValuesSource.java b/lucene/core/src/java/org/apache/lucene/document/FeatureDoubleValuesSource.java
new file mode 100644
index 0000000..41af9f8
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/document/FeatureDoubleValuesSource.java
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import java.io.IOException;
+import java.util.Objects;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.DoubleValues;
+import org.apache.lucene.search.DoubleValuesSource;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * A {@link DoubleValuesSource} instance which can be used to read the values of a feature from a 
+ * {@link FeatureField} for documents.
+ */
+class FeatureDoubleValuesSource extends DoubleValuesSource {
+  
+  private final BytesRef featureName;
+  private final String field;
+
+  /**
+   * Creates a {@link DoubleValuesSource} instance which can be used to read the values of a feature from a
+   * {@link FeatureField} for documents.
+   * 
+   * @param field field name. Must not be null.
+   * @param featureName feature name. Must not be null.
+   * @throws NullPointerException if {@code field} or {@code featureName} is null.
+   */
+  public FeatureDoubleValuesSource(String field, String featureName) {
+    this.field = Objects.requireNonNull(field);
+    this.featureName = new BytesRef(Objects.requireNonNull(featureName));
+  }
+
+  @Override
+  public boolean isCacheable(LeafReaderContext ctx) {
+    return true;
+  }
+
+  @Override
+  public DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
+    Terms terms = ctx.reader().terms(field);
+    if (terms == null) {
+      return DoubleValues.EMPTY;
+    } else {
+      TermsEnum termsEnum = terms.iterator();
+      if (termsEnum.seekExact(featureName) == false) {
+        return DoubleValues.EMPTY;
+      } else {
+        PostingsEnum currentReaderPostingsValues = termsEnum.postings(null, PostingsEnum.FREQS);
+        return new FeatureDoubleValues(currentReaderPostingsValues);
+      }
+    }
+  }
+
+  @Override
+  public boolean needsScores() {
+    return false;
+  }
+
+  @Override
+  public DoubleValuesSource rewrite(IndexSearcher reader) throws IOException {
+    return this;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(field, featureName);
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (obj == null) {
+      return false;
+    }
+    if (obj.getClass() != getClass()) {
+      return false;
+    }
+    FeatureDoubleValuesSource other = (FeatureDoubleValuesSource) obj;
+    return Objects.equals(field, other.field) &&
+        Objects.equals(featureName, other.featureName);
+  }
+
+  @Override
+  public String toString() {
+    return "FeatureDoubleValuesSource("+field+", "+featureName.utf8ToString()+")";
+  }
+  
+  static class FeatureDoubleValues extends DoubleValues {
+    
+    private final PostingsEnum currentReaderPostingsValues;
+
+    public FeatureDoubleValues(PostingsEnum currentReaderPostingsValues) throws IOException {
+      this.currentReaderPostingsValues = currentReaderPostingsValues;
+    }
+
+    @Override
+    public double doubleValue() throws IOException {
+      return FeatureField.decodeFeatureValue(currentReaderPostingsValues.freq());
+    }
+
+    @Override
+    public boolean advanceExact(int doc) throws IOException {
+      if (doc >= currentReaderPostingsValues.docID()
+          && (currentReaderPostingsValues.docID() == doc || currentReaderPostingsValues.advance(doc) == doc)) {
+        return true;
+      } else {
+        return false;
+      }
+    }
+    
+  }
+
+}
diff --git a/lucene/core/src/java/org/apache/lucene/document/FeatureField.java b/lucene/core/src/java/org/apache/lucene/document/FeatureField.java
index d060829..7cf22ef 100644
--- a/lucene/core/src/java/org/apache/lucene/document/FeatureField.java
+++ b/lucene/core/src/java/org/apache/lucene/document/FeatureField.java
@@ -29,6 +29,7 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermStates;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.BoostQuery;
+import org.apache.lucene.search.DoubleValuesSource;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.FieldDoc;
 import org.apache.lucene.search.Query;
@@ -538,4 +539,17 @@ public final class FeatureField extends Field {
   public static SortField newFeatureSort(String field, String featureName) {
     return new FeatureSortField(field, featureName);
   }
+  
+  /**
+   * Creates a {@link DoubleValuesSource} instance which can be used to read the values of a feature from a
+   * {@link FeatureField} for documents.
+   * 
+   * @param field field name. Must not be null.
+   * @param featureName feature name. Must not be null.
+   * @return a {@link DoubleValuesSource} which can be used to access the values of the feature for documents
+   * @throws NullPointerException if {@code field} or {@code featureName} is null.
+   */
+  public static DoubleValuesSource newDoubleValues(String field, String featureName) {
+    return new FeatureDoubleValuesSource(field, featureName);
+  }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/document/TestFeatureDoubleValues.java b/lucene/core/src/test/org/apache/lucene/document/TestFeatureDoubleValues.java
new file mode 100644
index 0000000..5640d18
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/document/TestFeatureDoubleValues.java
@@ -0,0 +1,248 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.search.DoubleValues;
+import org.apache.lucene.search.DoubleValuesSource;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/*
+ * Test for retrieving values from a feature using a FeatureDoubleValuesSource.
+ *
+ * THE RULES:
+ * 1. keywords like 'abstract' and 'static' should not appear in this file.
+ * 2. each test method should be self-contained and understandable.
+ * 3. no test methods should share code with other test methods.
+ * 4. no testing of things unrelated to feature double values.
+ * 5. no tracers.
+ * 6. keyword 'class' should appear only once in this file, here ----
+ *                                                                  |
+ *        -----------------------------------------------------------
+ *        |
+ *       \./
+ */
+public class TestFeatureDoubleValues extends LuceneTestCase {
+
+  public void testFeature() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    doc.add(new FeatureField("field", "name", 30F));
+    writer.addDocument(doc);
+    doc = new Document();
+    doc.add(new FeatureField("field", "name", 1F));
+    writer.addDocument(doc);
+    doc = new Document();
+    doc.add(new FeatureField("field", "name", 4F));
+    writer.addDocument(doc);
+    IndexReader ir = writer.getReader();
+    writer.close();
+
+    assertEquals(1, ir.leaves().size());
+    LeafReaderContext context = ir.leaves().get(0);
+    DoubleValuesSource valuesSource = FeatureField.newDoubleValues("field", "name");
+    DoubleValues values = valuesSource.getValues(context, null);
+
+    assertTrue(values.advanceExact(0));
+    assertEquals(30, values.doubleValue(), 0f);
+    assertTrue(values.advanceExact(1));
+    assertEquals(1, values.doubleValue(), 0f);
+    assertTrue(values.advanceExact(2));
+    assertEquals(4, values.doubleValue(), 0f);
+    assertFalse(values.advanceExact(3));
+
+    ir.close();
+    dir.close();
+  }
+
+  public void testFeatureMissing() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    writer.addDocument(doc);
+    doc = new Document();
+    doc.add(new FeatureField("field", "name", 1F));
+    writer.addDocument(doc);
+    doc = new Document();
+    doc.add(new FeatureField("field", "name", 4F));
+    writer.addDocument(doc);
+    IndexReader ir = writer.getReader();
+    writer.close();
+
+    assertEquals(1, ir.leaves().size());
+    LeafReaderContext context = ir.leaves().get(0);
+    DoubleValuesSource valuesSource = FeatureField.newDoubleValues("field", "name");
+    DoubleValues values = valuesSource.getValues(context, null);
+
+    assertFalse(values.advanceExact(0));
+    assertTrue(values.advanceExact(1));
+    assertEquals(1, values.doubleValue(), 0f);
+    assertTrue(values.advanceExact(2));
+    assertEquals(4, values.doubleValue(), 0f);
+    assertFalse(values.advanceExact(3));
+
+    ir.close();
+    dir.close();
+  }
+
+  public void testFeatureMissingFieldInSegment() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    writer.addDocument(doc);
+    writer.commit();
+    IndexReader ir = writer.getReader();
+    writer.close();
+    
+    assertEquals(1, ir.leaves().size());
+    LeafReaderContext context = ir.leaves().get(0);
+    DoubleValuesSource valuesSource = FeatureField.newDoubleValues("field", "name");
+    DoubleValues values = valuesSource.getValues(context, null);
+
+    assertFalse(values.advanceExact(0));
+    assertFalse(values.advanceExact(1));
+
+    ir.close();
+    dir.close();
+  }
+
+  public void testFeatureMissingFeatureNameInSegment() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    doc.add(new FeatureField("field", "different_name", 0.5F));
+    writer.addDocument(doc);
+    writer.commit();
+    IndexReader ir = writer.getReader();
+    writer.close();
+    
+    assertEquals(1, ir.leaves().size());
+    LeafReaderContext context = ir.leaves().get(0);
+    DoubleValuesSource valuesSource = FeatureField.newDoubleValues("field", "name");
+    DoubleValues values = valuesSource.getValues(context, null);
+
+    assertFalse(values.advanceExact(0));
+    assertFalse(values.advanceExact(1));
+
+    ir.close();
+    dir.close();
+  }
+
+  public void testFeatureMultipleMissing() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    writer.addDocument(doc);
+    doc = new Document();
+    writer.addDocument(doc);
+    doc = new Document();
+    writer.addDocument(doc);
+    doc = new Document();
+    writer.addDocument(doc);
+    doc = new Document();
+    writer.addDocument(doc);
+    doc = new Document();
+    doc.add(new FeatureField("field", "name", 1F));
+    writer.addDocument(doc);
+    doc = new Document();
+    doc.add(new FeatureField("field", "name", 4F));
+    writer.addDocument(doc);
+    IndexReader ir = writer.getReader();
+    writer.close();
+
+    assertEquals(1, ir.leaves().size());
+    LeafReaderContext context = ir.leaves().get(0);
+    DoubleValuesSource valuesSource = FeatureField.newDoubleValues("field", "name");
+    DoubleValues values = valuesSource.getValues(context, null);
+
+    assertFalse(values.advanceExact(0));
+    assertFalse(values.advanceExact(1));
+    assertFalse(values.advanceExact(2));
+    assertFalse(values.advanceExact(3));
+    assertFalse(values.advanceExact(4));
+    assertTrue(values.advanceExact(5));
+    assertEquals(1, values.doubleValue(), 0f);
+    assertTrue(values.advanceExact(6));
+    assertEquals(4, values.doubleValue(), 0f);
+    assertFalse(values.advanceExact(7));
+
+    ir.close();
+    dir.close();
+  }
+  
+  public void testHashCodeAndEquals() {
+    FeatureDoubleValuesSource valuesSource = new FeatureDoubleValuesSource("test_field", "test_feature");
+    FeatureDoubleValuesSource equal = new FeatureDoubleValuesSource("test_field", "test_feature");
+
+    FeatureDoubleValuesSource differentField = new FeatureDoubleValuesSource("other field", "test_feature");
+    FeatureDoubleValuesSource differentFeature = new FeatureDoubleValuesSource("test_field", "other_feature");
+    DoubleValuesSource otherImpl = new DoubleValuesSource() {
+      
+      @Override
+      public boolean isCacheable(LeafReaderContext ctx) {
+        return false;
+      }
+      
+      @Override
+      public String toString() {
+        return null;
+      }
+      
+      @Override
+      public DoubleValuesSource rewrite(IndexSearcher reader) throws IOException {
+        return null;
+      }
+      
+      @Override
+      public boolean needsScores() {
+        return false;
+      }
+      
+      @Override
+      public int hashCode() {
+        return 0;
+      }
+      
+      @Override
+      public DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
+        return null;
+      }
+      
+      @Override
+      public boolean equals(Object obj) {
+        return false;
+      }
+    };
+    
+    assertTrue(valuesSource.equals(equal));
+    assertEquals(valuesSource.hashCode(), equal.hashCode());
+    assertFalse(valuesSource.equals(null));
+    assertFalse(valuesSource.equals(otherImpl));
+    assertNotEquals(valuesSource.hashCode(), otherImpl.hashCode());
+    assertFalse(valuesSource.equals(differentField));
+    assertNotEquals(valuesSource.hashCode(), differentField.hashCode());
+    assertFalse(valuesSource.equals(differentFeature));
+    assertNotEquals(valuesSource.hashCode(), differentFeature.hashCode());
+  }
+}