You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2023/02/21 13:04:58 UTC

[lucene] branch branch_9x updated (57e1a4a40ef -> 668246439dd)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a change to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git


    from 57e1a4a40ef Ensure caching all leaves from the upper tier (#12147)
     new ecaee57aceb Introduce a new `KeywordField`. (#12054)
     new 668246439dd Skip the TokenStream overhead when indexing simple keywords. (#12139)

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 lucene/CHANGES.txt                                 |   6 +-
 .../src/java/org/apache/lucene/document/Field.java |   5 +
 .../org/apache/lucene/document/InvertableType.java |  40 ++++
 .../org/apache/lucene/document/KeywordField.java   | 193 +++++++++++++++++
 .../org/apache/lucene/document/StringField.java    |  15 ++
 .../org/apache/lucene/index/FieldInvertState.java  |  18 +-
 .../lucene/index/FreqProxTermsWriterPerField.java  |   4 +-
 .../org/apache/lucene/index/IndexableField.java    |   7 +
 .../org/apache/lucene/index/IndexingChain.java     |  60 ++++++
 .../lucene/index/TermVectorsConsumerPerField.java  |   3 +
 .../apache/lucene/document/TestKeywordField.java   | 126 ++++++++++++
 .../apache/lucene/index/TestDocumentWriter.java    | 228 ++++++++++++++++++++-
 .../lucene/index/TestExceedMaxTermLength.java      |  67 +++++-
 .../org/apache/lucene/index/TestFieldReuse.java    |  12 +-
 .../apache/lucene/index/TestIndexableField.java    |  11 +
 .../apache/lucene/search/TestSortOptimization.java |  44 ++--
 .../lucene/search/TestSortedSetSortField.java      |  30 +--
 .../java/org/apache/lucene/demo/IndexFiles.java    |   5 +-
 .../apache/lucene/index/memory/MemoryIndex.java    |  30 +++
 .../lucene/index/memory/TestMemoryIndex.java       | 149 ++++++++++++++
 .../apache/lucene/misc/document/LazyDocument.java  |   8 +-
 21 files changed, 1004 insertions(+), 57 deletions(-)
 create mode 100644 lucene/core/src/java/org/apache/lucene/document/InvertableType.java
 create mode 100644 lucene/core/src/java/org/apache/lucene/document/KeywordField.java
 create mode 100644 lucene/core/src/test/org/apache/lucene/document/TestKeywordField.java


[lucene] 01/02: Introduce a new `KeywordField`. (#12054)

Posted by jp...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git

commit ecaee57aceb7106e52a317490bc9448cf6861962
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Tue Feb 7 18:19:09 2023 +0100

    Introduce a new `KeywordField`. (#12054)
    
    `KeywordField` is a combination of `StringField` and `SortedSetDocValuesField`,
    similarly to how `LongField` is a combination of `LongPoint` and
    `SortedNumericDocValuesField`. This makes it easier for users to create fields
    that can be used for filtering, sorting and faceting.
---
 lucene/CHANGES.txt                                 |   4 +-
 .../org/apache/lucene/document/KeywordField.java   | 188 +++++++++++++++++++++
 .../apache/lucene/document/TestKeywordField.java   | 126 ++++++++++++++
 .../apache/lucene/search/TestSortOptimization.java |  44 ++---
 .../lucene/search/TestSortedSetSortField.java      |  30 ++--
 .../java/org/apache/lucene/demo/IndexFiles.java    |   5 +-
 6 files changed, 356 insertions(+), 41 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 585715f03e9..54b411b53e2 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -17,7 +17,9 @@ API Changes
 
 New Features
 ---------------------
-(No changes)
+
+* GITHUB#12054: Introduce a new KeywordField for simple and efficient
+  filtering, sorting and faceting. (Adrien Grand)
 
 Improvements
 ---------------------
diff --git a/lucene/core/src/java/org/apache/lucene/document/KeywordField.java b/lucene/core/src/java/org/apache/lucene/document/KeywordField.java
new file mode 100644
index 00000000000..70b27ad671a
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/document/KeywordField.java
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import java.util.Objects;
+import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.IndexOrDocValuesQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.SortedSetSelector;
+import org.apache.lucene.search.SortedSetSortField;
+import org.apache.lucene.search.TermInSetQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Field that indexes a per-document String or {@link BytesRef} into an inverted index for fast
+ * filtering, stores values in a columnar fashion using {@link DocValuesType#SORTED_SET} doc values
+ * for sorting and faceting, and optionally stores values as stored fields for top-hits retrieval.
+ * This field does not support scoring: queries produce constant scores. If you need more
+ * fine-grained control you can use {@link StringField}, {@link SortedDocValuesField} or {@link
+ * SortedSetDocValuesField}, and {@link StoredField}.
+ *
+ * <p>This field defines static factory methods for creating common query objects:
+ *
+ * <ul>
+ *   <li>{@link #newExactQuery} for matching a value.
+ *   <li>{@link #newSetQuery} for matching any of the values coming from a set.
+ *   <li>{@link #newSortField} for sorting on the values of this field.
+ * </ul>
+ */
+public class KeywordField extends Field {
+
+  private static final FieldType FIELD_TYPE = new FieldType();
+  private static final FieldType FIELD_TYPE_STORED;
+
+  static {
+    FIELD_TYPE.setIndexOptions(IndexOptions.DOCS);
+    FIELD_TYPE.setOmitNorms(true);
+    FIELD_TYPE.setTokenized(false);
+    FIELD_TYPE.setDocValuesType(DocValuesType.SORTED_SET);
+    FIELD_TYPE.freeze();
+
+    FIELD_TYPE_STORED = new FieldType(FIELD_TYPE);
+    FIELD_TYPE_STORED.setStored(true);
+    FIELD_TYPE_STORED.freeze();
+  }
+
+  private final StoredValue storedValue;
+
+  /**
+   * Creates a new KeywordField.
+   *
+   * @param name field name
+   * @param value the BytesRef value
+   * @param stored whether to store the field
+   * @throws IllegalArgumentException if the field name or value is null.
+   */
+  public KeywordField(String name, BytesRef value, Store stored) {
+    super(name, value, stored == Field.Store.YES ? FIELD_TYPE_STORED : FIELD_TYPE);
+    if (stored == Store.YES) {
+      storedValue = new StoredValue(value);
+    } else {
+      storedValue = null;
+    }
+  }
+
+  /**
+   * Creates a new KeywordField from a String value, by indexing its UTF-8 representation.
+   *
+   * @param name field name
+   * @param value the String value
+   * @param stored whether to store the field
+   * @throws IllegalArgumentException if the field name or value is null.
+   */
+  public KeywordField(String name, String value, Store stored) {
+    super(name, value, stored == Field.Store.YES ? FIELD_TYPE_STORED : FIELD_TYPE);
+    if (stored == Store.YES) {
+      storedValue = new StoredValue(value);
+    } else {
+      storedValue = null;
+    }
+  }
+
+  @Override
+  public BytesRef binaryValue() {
+    BytesRef binaryValue = super.binaryValue();
+    if (binaryValue != null) {
+      return binaryValue;
+    } else {
+      return new BytesRef(stringValue());
+    }
+  }
+
+  @Override
+  public void setStringValue(String value) {
+    super.setStringValue(value);
+    if (storedValue != null) {
+      storedValue.setStringValue(value);
+    }
+  }
+
+  @Override
+  public void setBytesValue(BytesRef value) {
+    super.setBytesValue(value);
+    if (storedValue != null) {
+      storedValue.setBinaryValue(value);
+    }
+  }
+
+  @Override
+  public StoredValue storedValue() {
+    return storedValue;
+  }
+
+  /**
+   * Create a query for matching an exact {@link BytesRef} value.
+   *
+   * @param field field name. must not be {@code null}.
+   * @param value exact value
+   * @throws NullPointerException if {@code field} is null.
+   * @return a query matching documents with this exact value
+   */
+  public static Query newExactQuery(String field, BytesRef value) {
+    Objects.requireNonNull(field, "field must not be null");
+    Objects.requireNonNull(value, "value must not be null");
+    return new ConstantScoreQuery(new TermQuery(new Term(field, value)));
+  }
+
+  /**
+   * Create a query for matching an exact {@link String} value.
+   *
+   * @param field field name. must not be {@code null}.
+   * @param value exact value
+   * @throws NullPointerException if {@code field} is null.
+   * @return a query matching documents with this exact value
+   */
+  public static Query newExactQuery(String field, String value) {
+    Objects.requireNonNull(value, "value must not be null");
+    return newExactQuery(field, new BytesRef(value));
+  }
+
+  /**
+   * Create a query for matching any of a set of provided {@link BytesRef} values.
+   *
+   * @param field field name. must not be {@code null}.
+   * @param values the set of values to match
+   * @throws NullPointerException if {@code field} is null.
+   * @return a query matching documents with this exact value
+   */
+  public static Query newSetQuery(String field, BytesRef... values) {
+    Objects.requireNonNull(field, "field must not be null");
+    Objects.requireNonNull(values, "values must not be null");
+    return new IndexOrDocValuesQuery(
+        new TermInSetQuery(field, values), new SortedSetDocValuesSetQuery(field, values));
+  }
+
+  /**
+   * Create a new {@link SortField} for {@link BytesRef} values.
+   *
+   * @param field field name. must not be {@code null}.
+   * @param reverse true if natural order should be reversed.
+   * @param selector custom selector type for choosing the sort value from the set.
+   */
+  public static SortField newSortField(
+      String field, boolean reverse, SortedSetSelector.Type selector) {
+    Objects.requireNonNull(field, "field must not be null");
+    Objects.requireNonNull(selector, "selector must not be null");
+    return new SortedSetSortField(field, reverse, selector);
+  }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/document/TestKeywordField.java b/lucene/core/src/test/org/apache/lucene/document/TestKeywordField.java
new file mode 100644
index 00000000000..41ffd13b9ea
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/document/TestKeywordField.java
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import java.io.IOException;
+import java.util.Collections;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.util.BytesRef;
+
+public class TestKeywordField extends LuceneTestCase {
+
+  public void testSetBytesValue() {
+    Field[] fields =
+        new Field[] {
+          new KeywordField("name", newBytesRef("value"), Field.Store.NO),
+          new KeywordField("name", newBytesRef("value"), Field.Store.YES)
+        };
+    for (Field field : fields) {
+      assertEquals(newBytesRef("value"), field.binaryValue());
+      assertNull(field.stringValue());
+      if (field.fieldType().stored()) {
+        assertEquals(newBytesRef("value"), field.storedValue().getBinaryValue());
+      } else {
+        assertNull(field.storedValue());
+      }
+      assertThrows(
+          IllegalArgumentException.class, () -> field.setBytesValue(newBytesRef("value2")));
+      assertEquals(newBytesRef("value"), field.binaryValue());
+      assertNull(field.stringValue());
+      if (field.fieldType().stored()) {
+        assertEquals(newBytesRef("value"), field.storedValue().getBinaryValue());
+      } else {
+        assertNull(field.storedValue());
+      }
+    }
+  }
+
+  public void testSetStringValue() {
+    Field[] fields =
+        new Field[] {
+          new KeywordField("name", "value", Field.Store.NO),
+          new KeywordField("name", "value", Field.Store.YES)
+        };
+    for (Field field : fields) {
+      assertEquals("value", field.stringValue());
+      assertEquals(newBytesRef("value"), field.binaryValue());
+      if (field.fieldType().stored()) {
+        assertEquals("value", field.storedValue().getStringValue());
+      } else {
+        assertNull(field.storedValue());
+      }
+      field.setStringValue("value2");
+      assertEquals("value2", field.stringValue());
+      assertEquals(newBytesRef("value2"), field.binaryValue());
+      if (field.fieldType().stored()) {
+        assertEquals("value2", field.storedValue().getStringValue());
+      } else {
+        assertNull(field.storedValue());
+      }
+    }
+  }
+
+  public void testIndexBytesValue() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
+    w.addDocument(
+        Collections.singleton(new KeywordField("field", newBytesRef("value"), Field.Store.YES)));
+    IndexReader reader = DirectoryReader.open(w);
+    w.close();
+    LeafReader leaf = getOnlyLeafReader(reader);
+    TermsEnum terms = leaf.terms("field").iterator();
+    assertEquals(new BytesRef("value"), terms.next());
+    assertNull(terms.next());
+    SortedSetDocValues values = leaf.getSortedSetDocValues("field");
+    assertTrue(values.advanceExact(0));
+    assertEquals(1, values.docValueCount());
+    assertEquals(0L, values.nextOrd());
+    assertEquals(new BytesRef("value"), values.lookupOrd(0));
+    Document storedDoc = leaf.storedFields().document(0);
+    assertEquals(new BytesRef("value"), storedDoc.getBinaryValue("field"));
+    reader.close();
+    dir.close();
+  }
+
+  public void testIndexStringValue() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
+    w.addDocument(Collections.singleton(new KeywordField("field", "value", Field.Store.YES)));
+    IndexReader reader = DirectoryReader.open(w);
+    w.close();
+    LeafReader leaf = getOnlyLeafReader(reader);
+    TermsEnum terms = leaf.terms("field").iterator();
+    assertEquals(new BytesRef("value"), terms.next());
+    assertNull(terms.next());
+    SortedSetDocValues values = leaf.getSortedSetDocValues("field");
+    assertTrue(values.advanceExact(0));
+    assertEquals(1, values.docValueCount());
+    assertEquals(0L, values.nextOrd());
+    assertEquals(new BytesRef("value"), values.lookupOrd(0));
+    Document storedDoc = leaf.storedFields().document(0);
+    assertEquals("value", storedDoc.get("field"));
+    reader.close();
+    dir.close();
+  }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
index 5c0aad74d97..d30146f39a3 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
@@ -26,15 +26,14 @@ import java.util.Collections;
 import java.util.List;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.FloatDocValuesField;
 import org.apache.lucene.document.FloatPoint;
 import org.apache.lucene.document.IntPoint;
 import org.apache.lucene.document.IntRange;
+import org.apache.lucene.document.KeywordField;
 import org.apache.lucene.document.LongField;
 import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.document.NumericDocValuesField;
-import org.apache.lucene.document.SortedDocValuesField;
 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.DirectoryReader;
@@ -809,8 +808,8 @@ public class TestSortOptimization extends LuceneTestCase {
       int value = random().nextInt();
       int value2 = random().nextInt();
       final Document doc = new Document();
-      doc.add(new LongField("my_field", value, Store.NO));
-      doc.add(new LongField("my_field", value2, Store.NO));
+      doc.add(new LongField("my_field", value, Field.Store.NO));
+      doc.add(new LongField("my_field", value2, Field.Store.NO));
       writer.addDocument(doc);
     }
     final IndexReader reader = DirectoryReader.open(writer);
@@ -891,8 +890,7 @@ public class TestSortOptimization extends LuceneTestCase {
     for (int i = 0; i < numDocs; ++i) {
       final Document doc = new Document();
       final BytesRef value = new BytesRef(Integer.toString(random().nextInt(1000)));
-      doc.add(new StringField("my_field", value, Store.NO));
-      doc.add(new SortedDocValuesField("my_field", value));
+      doc.add(new KeywordField("my_field", value, Field.Store.NO));
       writer.addDocument(doc);
       if (i % 2000 == 0) writer.flush(); // multiple segments
     }
@@ -916,8 +914,7 @@ public class TestSortOptimization extends LuceneTestCase {
       final Document doc = new Document();
       if (random().nextInt(2) == 0) {
         final BytesRef value = new BytesRef(Integer.toString(random().nextInt(1000)));
-        doc.add(new StringField("my_field", value, Store.NO));
-        doc.add(new SortedDocValuesField("my_field", value));
+        doc.add(new KeywordField("my_field", value, Field.Store.NO));
       }
       writer.addDocument(doc);
     }
@@ -936,7 +933,8 @@ public class TestSortOptimization extends LuceneTestCase {
     final int numHits = 5;
 
     { // simple ascending sort
-      SortField sortField = new SortField("my_field", SortField.Type.STRING);
+      SortField sortField =
+          KeywordField.newSortField("my_field", false, SortedSetSelector.Type.MIN);
       sortField.setMissingValue(SortField.STRING_LAST);
       Sort sort = new Sort(sortField);
       TopDocs topDocs = assertSort(reader, sort, numHits, null);
@@ -944,7 +942,7 @@ public class TestSortOptimization extends LuceneTestCase {
     }
 
     { // simple descending sort
-      SortField sortField = new SortField("my_field", SortField.Type.STRING, true);
+      SortField sortField = KeywordField.newSortField("my_field", true, SortedSetSelector.Type.MIN);
       sortField.setMissingValue(SortField.STRING_FIRST);
       Sort sort = new Sort(sortField);
       TopDocs topDocs = assertSort(reader, sort, numHits, null);
@@ -952,21 +950,23 @@ public class TestSortOptimization extends LuceneTestCase {
     }
 
     { // ascending sort that returns missing values first
-      SortField sortField = new SortField("my_field", SortField.Type.STRING);
+      SortField sortField =
+          KeywordField.newSortField("my_field", false, SortedSetSelector.Type.MIN);
       sortField.setMissingValue(SortField.STRING_FIRST);
       Sort sort = new Sort(sortField);
       assertSort(reader, sort, numHits, null);
     }
 
     { // descending sort that returns missing values last
-      SortField sortField = new SortField("my_field", SortField.Type.STRING, true);
+      SortField sortField = KeywordField.newSortField("my_field", true, SortedSetSelector.Type.MIN);
       sortField.setMissingValue(SortField.STRING_LAST);
       Sort sort = new Sort(sortField);
       assertSort(reader, sort, numHits, null);
     }
 
     { // paging ascending sort with after
-      SortField sortField = new SortField("my_field", SortField.Type.STRING);
+      SortField sortField =
+          KeywordField.newSortField("my_field", false, SortedSetSelector.Type.MIN);
       sortField.setMissingValue(SortField.STRING_LAST);
       Sort sort = new Sort(sortField);
       BytesRef afterValue = new BytesRef(random().nextBoolean() ? "23" : "230000000");
@@ -976,7 +976,7 @@ public class TestSortOptimization extends LuceneTestCase {
     }
 
     { // paging descending sort with after
-      SortField sortField = new SortField("my_field", SortField.Type.STRING, true);
+      SortField sortField = KeywordField.newSortField("my_field", true, SortedSetSelector.Type.MIN);
       sortField.setMissingValue(SortField.STRING_FIRST);
       Sort sort = new Sort(sortField);
       BytesRef afterValue = new BytesRef(random().nextBoolean() ? "17" : "170000000");
@@ -986,7 +986,8 @@ public class TestSortOptimization extends LuceneTestCase {
     }
 
     { // paging ascending sort with after that returns missing values first
-      SortField sortField = new SortField("my_field", SortField.Type.STRING);
+      SortField sortField =
+          KeywordField.newSortField("my_field", false, SortedSetSelector.Type.MIN);
       sortField.setMissingValue(SortField.STRING_FIRST);
       Sort sort = new Sort(sortField);
       BytesRef afterValue = new BytesRef(random().nextBoolean() ? "23" : "230000000");
@@ -996,7 +997,7 @@ public class TestSortOptimization extends LuceneTestCase {
     }
 
     { // paging descending sort with after that returns missing values first
-      SortField sortField = new SortField("my_field", SortField.Type.STRING, true);
+      SortField sortField = KeywordField.newSortField("my_field", true, SortedSetSelector.Type.MIN);
       sortField.setMissingValue(SortField.STRING_LAST);
       Sort sort = new Sort(sortField);
       BytesRef afterValue = new BytesRef(random().nextBoolean() ? "17" : "170000000");
@@ -1006,7 +1007,8 @@ public class TestSortOptimization extends LuceneTestCase {
     }
 
     { // test that if there is the secondary sort on _score, hits are still skipped
-      SortField sortField = new SortField("my_field", SortField.Type.STRING);
+      SortField sortField =
+          KeywordField.newSortField("my_field", false, SortedSetSelector.Type.MIN);
       sortField.setMissingValue(SortField.STRING_LAST);
       Sort sort = new Sort(sortField, FIELD_SCORE);
       TopDocs topDocs = assertSort(reader, sort, numHits, null);
@@ -1014,7 +1016,8 @@ public class TestSortOptimization extends LuceneTestCase {
     }
 
     { // test that if string field is a secondary sort, no optimization is run
-      SortField sortField = new SortField("my_field", SortField.Type.STRING);
+      SortField sortField =
+          KeywordField.newSortField("my_field", false, SortedSetSelector.Type.MIN);
       sortField.setMissingValue(SortField.STRING_LAST);
       Sort sort = new Sort(FIELD_SCORE, sortField);
       TopDocs topDocs = assertSort(reader, sort, numHits, null);
@@ -1025,10 +1028,7 @@ public class TestSortOptimization extends LuceneTestCase {
   }
 
   public void doTestStringSortOptimizationDisabled(DirectoryReader reader) throws IOException {
-    SortField sortField =
-        random().nextBoolean()
-            ? new SortedSetSortField("my_field", false)
-            : new SortField("my_field", SortField.Type.STRING);
+    SortField sortField = KeywordField.newSortField("my_field", false, SortedSetSelector.Type.MIN);
     sortField.setMissingValue(SortField.STRING_LAST);
     sortField.setOptimizeSortWithIndexedData(false);
 
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSortedSetSortField.java b/lucene/core/src/test/org/apache/lucene/search/TestSortedSetSortField.java
index aad6552ccf8..873d948373f 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestSortedSetSortField.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSortedSetSortField.java
@@ -18,7 +18,7 @@ package org.apache.lucene.search;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.document.SortedSetDocValuesField;
+import org.apache.lucene.document.KeywordField;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiReader;
 import org.apache.lucene.index.Term;
@@ -64,12 +64,12 @@ public class TestSortedSetSortField extends LuceneTestCase {
     Directory dir = newDirectory();
     RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
     Document doc = new Document();
-    doc.add(new SortedSetDocValuesField("value", newBytesRef("baz")));
+    doc.add(new KeywordField("value", newBytesRef("baz"), Field.Store.NO));
     doc.add(newStringField("id", "2", Field.Store.YES));
     writer.addDocument(doc);
     doc = new Document();
-    doc.add(new SortedSetDocValuesField("value", newBytesRef("foo")));
-    doc.add(new SortedSetDocValuesField("value", newBytesRef("bar")));
+    doc.add(new KeywordField("value", newBytesRef("foo"), Field.Store.NO));
+    doc.add(new KeywordField("value", newBytesRef("bar"), Field.Store.NO));
     doc.add(newStringField("id", "1", Field.Store.YES));
     writer.addDocument(doc);
     IndexReader ir = writer.getReader();
@@ -92,12 +92,12 @@ public class TestSortedSetSortField extends LuceneTestCase {
     Directory dir = newDirectory();
     RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
     Document doc = new Document();
-    doc.add(new SortedSetDocValuesField("value", newBytesRef("foo")));
-    doc.add(new SortedSetDocValuesField("value", newBytesRef("bar")));
+    doc.add(new KeywordField("value", newBytesRef("foo"), Field.Store.NO));
+    doc.add(new KeywordField("value", newBytesRef("bar"), Field.Store.NO));
     doc.add(newStringField("id", "1", Field.Store.YES));
     writer.addDocument(doc);
     doc = new Document();
-    doc.add(new SortedSetDocValuesField("value", newBytesRef("baz")));
+    doc.add(new KeywordField("value", newBytesRef("baz"), Field.Store.NO));
     doc.add(newStringField("id", "2", Field.Store.YES));
     writer.addDocument(doc);
 
@@ -121,12 +121,12 @@ public class TestSortedSetSortField extends LuceneTestCase {
     Directory dir = newDirectory();
     RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
     Document doc = new Document();
-    doc.add(new SortedSetDocValuesField("value", newBytesRef("baz")));
+    doc.add(new KeywordField("value", newBytesRef("baz"), Field.Store.NO));
     doc.add(newStringField("id", "2", Field.Store.YES));
     writer.addDocument(doc);
     doc = new Document();
-    doc.add(new SortedSetDocValuesField("value", newBytesRef("foo")));
-    doc.add(new SortedSetDocValuesField("value", newBytesRef("bar")));
+    doc.add(new KeywordField("value", newBytesRef("foo"), Field.Store.NO));
+    doc.add(new KeywordField("value", newBytesRef("bar"), Field.Store.NO));
     doc.add(newStringField("id", "1", Field.Store.YES));
     writer.addDocument(doc);
     doc = new Document();
@@ -156,12 +156,12 @@ public class TestSortedSetSortField extends LuceneTestCase {
     Directory dir = newDirectory();
     RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
     Document doc = new Document();
-    doc.add(new SortedSetDocValuesField("value", newBytesRef("baz")));
+    doc.add(new KeywordField("value", newBytesRef("baz"), Field.Store.NO));
     doc.add(newStringField("id", "2", Field.Store.YES));
     writer.addDocument(doc);
     doc = new Document();
-    doc.add(new SortedSetDocValuesField("value", newBytesRef("foo")));
-    doc.add(new SortedSetDocValuesField("value", newBytesRef("bar")));
+    doc.add(new KeywordField("value", newBytesRef("foo"), Field.Store.NO));
+    doc.add(new KeywordField("value", newBytesRef("bar"), Field.Store.NO));
     doc.add(newStringField("id", "1", Field.Store.YES));
     writer.addDocument(doc);
     doc = new Document();
@@ -191,11 +191,11 @@ public class TestSortedSetSortField extends LuceneTestCase {
     Directory dir = newDirectory();
     RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
     Document doc = new Document();
-    doc.add(new SortedSetDocValuesField("value", newBytesRef("baz")));
+    doc.add(new KeywordField("value", newBytesRef("baz"), Field.Store.NO));
     doc.add(newStringField("id", "2", Field.Store.YES));
     writer.addDocument(doc);
     doc = new Document();
-    doc.add(new SortedSetDocValuesField("value", newBytesRef("bar")));
+    doc.add(new KeywordField("value", newBytesRef("bar"), Field.Store.NO));
     doc.add(newStringField("id", "1", Field.Store.YES));
     writer.addDocument(doc);
     IndexReader ir = writer.getReader();
diff --git a/lucene/demo/src/java/org/apache/lucene/demo/IndexFiles.java b/lucene/demo/src/java/org/apache/lucene/demo/IndexFiles.java
index 7b172d65d41..9c683d3937c 100644
--- a/lucene/demo/src/java/org/apache/lucene/demo/IndexFiles.java
+++ b/lucene/demo/src/java/org/apache/lucene/demo/IndexFiles.java
@@ -34,9 +34,9 @@ import org.apache.lucene.demo.knn.DemoEmbeddings;
 import org.apache.lucene.demo.knn.KnnVectorDict;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.KeywordField;
 import org.apache.lucene.document.KnnFloatVectorField;
 import org.apache.lucene.document.LongField;
-import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
@@ -234,8 +234,7 @@ public class IndexFiles implements AutoCloseable {
       // field that is indexed (i.e. searchable), but don't tokenize
       // the field into separate words and don't index term frequency
       // or positional information:
-      Field pathField = new StringField("path", file.toString(), Field.Store.YES);
-      doc.add(pathField);
+      doc.add(new KeywordField("path", file.toString(), Field.Store.YES));
 
       // Add the last modified date of the file a field named "modified".
       // Use a LongField that is indexed with points and doc values, and is efficient


[lucene] 02/02: Skip the TokenStream overhead when indexing simple keywords. (#12139)

Posted by jp...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git

commit 668246439ddfcc6ece0b1ee4a06d51e2693b6ff3
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Tue Feb 21 14:00:11 2023 +0100

    Skip the TokenStream overhead when indexing simple keywords. (#12139)
    
    Indexing simple keywords through a `TokenStream` abstraction introduces a bit
    of overhead due to attribute management. Not much, but indexing keywords boils
    down to adding to a hash map and appending to a postings list, which is quite
    cheap too so even some low overhead can significantly impact indexing speed.
---
 lucene/CHANGES.txt                                 |   2 +
 .../src/java/org/apache/lucene/document/Field.java |   5 +
 .../org/apache/lucene/document/InvertableType.java |  40 ++++
 .../org/apache/lucene/document/KeywordField.java   |  17 +-
 .../org/apache/lucene/document/StringField.java    |  15 ++
 .../org/apache/lucene/index/FieldInvertState.java  |  18 +-
 .../lucene/index/FreqProxTermsWriterPerField.java  |   4 +-
 .../org/apache/lucene/index/IndexableField.java    |   7 +
 .../org/apache/lucene/index/IndexingChain.java     |  60 ++++++
 .../lucene/index/TermVectorsConsumerPerField.java  |   3 +
 .../apache/lucene/index/TestDocumentWriter.java    | 228 ++++++++++++++++++++-
 .../lucene/index/TestExceedMaxTermLength.java      |  67 +++++-
 .../org/apache/lucene/index/TestFieldReuse.java    |  12 +-
 .../apache/lucene/index/TestIndexableField.java    |  11 +
 .../apache/lucene/index/memory/MemoryIndex.java    |  30 +++
 .../lucene/index/memory/TestMemoryIndex.java       | 149 ++++++++++++++
 .../apache/lucene/misc/document/LazyDocument.java  |   8 +-
 17 files changed, 654 insertions(+), 22 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 54b411b53e2..82cf4fa3075 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -40,6 +40,8 @@ Optimizations
 
 * GITHUB#12155: Speed up DocValuesRewriteMethod by making use of sortedness. (Greg Miller)
 
+* GITHUB#12139: Faster indexing of string fields. (Adrien Grand)
+
 Bug Fixes
 ---------------------
 (No changes)
diff --git a/lucene/core/src/java/org/apache/lucene/document/Field.java b/lucene/core/src/java/org/apache/lucene/document/Field.java
index db038ff37cc..68a9059039a 100644
--- a/lucene/core/src/java/org/apache/lucene/document/Field.java
+++ b/lucene/core/src/java/org/apache/lucene/document/Field.java
@@ -449,6 +449,11 @@ public class Field implements IndexableField {
     return type;
   }
 
+  @Override
+  public InvertableType invertableType() {
+    return InvertableType.TOKEN_STREAM;
+  }
+
   @Override
   public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
     if (fieldType().indexOptions() == IndexOptions.NONE) {
diff --git a/lucene/core/src/java/org/apache/lucene/document/InvertableType.java b/lucene/core/src/java/org/apache/lucene/document/InvertableType.java
new file mode 100644
index 00000000000..febc660817b
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/document/InvertableType.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexableField;
+
+/** Describes how an {@link IndexableField} should be inverted for indexing terms and postings. */
+public enum InvertableType {
+
+  /**
+   * The field should be treated as a single value whose binary content is returned by {@link
+   * IndexableField#binaryValue()}. The term frequency is assumed to be one. If you need to index
+   * multiple values, you should pass multiple {@link IndexableField} instances to the {@link
+   * IndexWriter}. If the same value is provided multiple times, the term frequency will be equal to
+   * the number of times that this value occurred in the same document.
+   */
+  BINARY,
+
+  /**
+   * The field should be inverted through its {@link
+   * IndexableField#tokenStream(org.apache.lucene.analysis.Analyzer,
+   * org.apache.lucene.analysis.TokenStream)}.
+   */
+  TOKEN_STREAM;
+}
diff --git a/lucene/core/src/java/org/apache/lucene/document/KeywordField.java b/lucene/core/src/java/org/apache/lucene/document/KeywordField.java
index 70b27ad671a..6ff0ecaf4dd 100644
--- a/lucene/core/src/java/org/apache/lucene/document/KeywordField.java
+++ b/lucene/core/src/java/org/apache/lucene/document/KeywordField.java
@@ -63,6 +63,7 @@ public class KeywordField extends Field {
     FIELD_TYPE_STORED.freeze();
   }
 
+  private BytesRef binaryValue;
   private final StoredValue storedValue;
 
   /**
@@ -75,6 +76,7 @@ public class KeywordField extends Field {
    */
   public KeywordField(String name, BytesRef value, Store stored) {
     super(name, value, stored == Field.Store.YES ? FIELD_TYPE_STORED : FIELD_TYPE);
+    this.binaryValue = value;
     if (stored == Store.YES) {
       storedValue = new StoredValue(value);
     } else {
@@ -92,6 +94,7 @@ public class KeywordField extends Field {
    */
   public KeywordField(String name, String value, Store stored) {
     super(name, value, stored == Field.Store.YES ? FIELD_TYPE_STORED : FIELD_TYPE);
+    this.binaryValue = new BytesRef(value);
     if (stored == Store.YES) {
       storedValue = new StoredValue(value);
     } else {
@@ -101,17 +104,18 @@ public class KeywordField extends Field {
 
   @Override
   public BytesRef binaryValue() {
-    BytesRef binaryValue = super.binaryValue();
-    if (binaryValue != null) {
-      return binaryValue;
-    } else {
-      return new BytesRef(stringValue());
-    }
+    return binaryValue;
+  }
+
+  @Override
+  public InvertableType invertableType() {
+    return InvertableType.BINARY;
   }
 
   @Override
   public void setStringValue(String value) {
     super.setStringValue(value);
+    binaryValue = new BytesRef(value);
     if (storedValue != null) {
       storedValue.setStringValue(value);
     }
@@ -120,6 +124,7 @@ public class KeywordField extends Field {
   @Override
   public void setBytesValue(BytesRef value) {
     super.setBytesValue(value);
+    binaryValue = value;
     if (storedValue != null) {
       storedValue.setBinaryValue(value);
     }
diff --git a/lucene/core/src/java/org/apache/lucene/document/StringField.java b/lucene/core/src/java/org/apache/lucene/document/StringField.java
index fb242346258..4a502d79f89 100644
--- a/lucene/core/src/java/org/apache/lucene/document/StringField.java
+++ b/lucene/core/src/java/org/apache/lucene/document/StringField.java
@@ -45,6 +45,7 @@ public final class StringField extends Field {
     TYPE_STORED.freeze();
   }
 
+  private BytesRef binaryValue;
   private final StoredValue storedValue;
 
   /**
@@ -57,6 +58,7 @@ public final class StringField extends Field {
    */
   public StringField(String name, String value, Store stored) {
     super(name, value, stored == Store.YES ? TYPE_STORED : TYPE_NOT_STORED);
+    binaryValue = new BytesRef(value);
     if (stored == Store.YES) {
       storedValue = new StoredValue(value);
     } else {
@@ -76,6 +78,7 @@ public final class StringField extends Field {
    */
   public StringField(String name, BytesRef value, Store stored) {
     super(name, value, stored == Store.YES ? TYPE_STORED : TYPE_NOT_STORED);
+    binaryValue = value;
     if (stored == Store.YES) {
       storedValue = new StoredValue(value);
     } else {
@@ -83,9 +86,20 @@ public final class StringField extends Field {
     }
   }
 
+  @Override
+  public InvertableType invertableType() {
+    return InvertableType.BINARY;
+  }
+
+  @Override
+  public BytesRef binaryValue() {
+    return binaryValue;
+  }
+
   @Override
   public void setStringValue(String value) {
     super.setStringValue(value);
+    binaryValue = new BytesRef(value);
     if (storedValue != null) {
       storedValue.setStringValue(value);
     }
@@ -94,6 +108,7 @@ public final class StringField extends Field {
   @Override
   public void setBytesValue(BytesRef value) {
     super.setBytesValue(value);
+    binaryValue = value;
     if (storedValue != null) {
       storedValue.setBinaryValue(value);
     }
diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInvertState.java b/lucene/core/src/java/org/apache/lucene/index/FieldInvertState.java
index 4bf16bb6c27..d5357fa4875 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FieldInvertState.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FieldInvertState.java
@@ -96,11 +96,19 @@ public final class FieldInvertState {
   void setAttributeSource(AttributeSource attributeSource) {
     if (this.attributeSource != attributeSource) {
       this.attributeSource = attributeSource;
-      termAttribute = attributeSource.getAttribute(TermToBytesRefAttribute.class);
-      termFreqAttribute = attributeSource.addAttribute(TermFrequencyAttribute.class);
-      posIncrAttribute = attributeSource.addAttribute(PositionIncrementAttribute.class);
-      offsetAttribute = attributeSource.addAttribute(OffsetAttribute.class);
-      payloadAttribute = attributeSource.getAttribute(PayloadAttribute.class);
+      if (attributeSource == null) {
+        termAttribute = null;
+        termFreqAttribute = null;
+        posIncrAttribute = null;
+        offsetAttribute = null;
+        payloadAttribute = null;
+      } else {
+        termAttribute = attributeSource.getAttribute(TermToBytesRefAttribute.class);
+        termFreqAttribute = attributeSource.addAttribute(TermFrequencyAttribute.class);
+        posIncrAttribute = attributeSource.addAttribute(PositionIncrementAttribute.class);
+        offsetAttribute = attributeSource.addAttribute(OffsetAttribute.class);
+        payloadAttribute = attributeSource.getAttribute(PayloadAttribute.class);
+      }
     }
   }
 
diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
index b05747b8755..04d949f5ef9 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
@@ -144,7 +144,7 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
 
     if (!hasFreq) {
       assert postings.termFreqs == null;
-      if (termFreqAtt.getTermFrequency() != 1) {
+      if (termFreqAtt != null && termFreqAtt.getTermFrequency() != 1) {
         throw new IllegalStateException(
             "field \""
                 + getFieldName()
@@ -203,7 +203,7 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
   }
 
   private int getTermFreq() {
-    int freq = termFreqAtt.getTermFrequency();
+    int freq = termFreqAtt == null ? 1 : termFreqAtt.getTermFrequency();
     if (freq != 1) {
       if (hasProx) {
         throw new IllegalStateException(
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexableField.java b/lucene/core/src/java/org/apache/lucene/index/IndexableField.java
index 0715d327b83..7293580df82 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexableField.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexableField.java
@@ -19,6 +19,7 @@ package org.apache.lucene.index;
 import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.InvertableType;
 import org.apache.lucene.document.StoredValue;
 import org.apache.lucene.util.BytesRef;
 
@@ -75,4 +76,10 @@ public interface IndexableField {
    * if the field stored.
    */
   public StoredValue storedValue();
+
+  /**
+   * Describes how this field should be inverted. This must return a non-null value if the field
+   * indexes terms and postings.
+   */
+  public InvertableType invertableType();
 }
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java
index 3f4930403b6..a68a84ff5ac 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java
@@ -1111,11 +1111,27 @@ final class IndexingChain implements Accountable {
      * this field name in this document.
      */
     public void invert(int docID, IndexableField field, boolean first) throws IOException {
+      assert field.fieldType().indexOptions().compareTo(IndexOptions.DOCS) >= 0;
+
       if (first) {
         // First time we're seeing this field (indexed) in this document
         invertState.reset();
       }
 
+      switch (field.invertableType()) {
+        case BINARY:
+          invertTerm(docID, field, first);
+          break;
+        case TOKEN_STREAM:
+          invertTokenStream(docID, field, first);
+          break;
+        default:
+          throw new AssertionError();
+      }
+    }
+
+    private void invertTokenStream(int docID, IndexableField field, boolean first)
+        throws IOException {
       final boolean analyzed = field.fieldType().tokenized() && analyzer != null;
       /*
        * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
@@ -1258,6 +1274,50 @@ final class IndexingChain implements Accountable {
         invertState.offset += analyzer.getOffsetGap(fieldInfo.name);
       }
     }
+
+    private void invertTerm(int docID, IndexableField field, boolean first) throws IOException {
+      BytesRef binaryValue = field.binaryValue();
+      if (binaryValue == null) {
+        throw new IllegalArgumentException(
+            "Field "
+                + field.name()
+                + " returns TERM for invertableType() and null for binaryValue(), which is illegal");
+      }
+      final IndexableFieldType fieldType = field.fieldType();
+      if (fieldType.tokenized()
+          || fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) > 0
+          || fieldType.storeTermVectorPositions()
+          || fieldType.storeTermVectorOffsets()
+          || fieldType.storeTermVectorPayloads()) {
+        throw new IllegalArgumentException(
+            "Fields that are tokenized or index proximity data must produce a non-null TokenStream, but "
+                + field.name()
+                + " did not");
+      }
+      invertState.setAttributeSource(null);
+      invertState.position++;
+      invertState.length++;
+      termsHashPerField.start(field, first);
+      invertState.length = Math.addExact(invertState.length, 1);
+      try {
+        termsHashPerField.add(binaryValue, docID);
+      } catch (MaxBytesLengthExceededException e) {
+        byte[] prefix = new byte[30];
+        System.arraycopy(binaryValue.bytes, binaryValue.offset, prefix, 0, 30);
+        String msg =
+            "Document contains at least one immense term in field=\""
+                + fieldInfo.name
+                + "\" (whose length is longer than the max length "
+                + IndexWriter.MAX_TERM_LENGTH
+                + "), all of which were skipped. The prefix of the first immense term is: '"
+                + Arrays.toString(prefix)
+                + "...'";
+        if (infoStream.isEnabled("IW")) {
+          infoStream.message("IW", "ERROR: " + msg);
+        }
+        throw new IllegalArgumentException(msg, e);
+      }
+    }
   }
 
   DocIdSetIterator getHasDocValues(String field) {
diff --git a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java
index a4c24482201..f2e350de78d 100644
--- a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java
+++ b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java
@@ -284,6 +284,9 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
   }
 
   private int getTermFreq() {
+    if (termFreqAtt == null) {
+      return 1;
+    }
     int freq = termFreqAtt.getTermFrequency();
     if (freq != 1) {
       if (doVectorPositions) {
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java
index 1ba00844885..7d8cdf95d23 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java
@@ -17,6 +17,9 @@
 package org.apache.lucene.index;
 
 import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.function.Function;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenFilter;
@@ -31,14 +34,17 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.InvertableType;
 import org.apache.lucene.document.KnnFloatVectorField;
 import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.document.SortedDocValuesField;
 import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.document.StoredField;
+import org.apache.lucene.document.StoredValue;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.analysis.MockAnalyzer;
@@ -67,10 +73,6 @@ public class TestDocumentWriter extends LuceneTestCase {
     super.tearDown();
   }
 
-  public void test() {
-    assertTrue(dir != null);
-  }
-
   public void testAddDocument() throws Exception {
     Document testDoc = new Document();
     DocHelper.setupDoc(testDoc);
@@ -385,4 +387,222 @@ public class TestDocumentWriter extends LuceneTestCase {
             new KnnFloatVectorField(
                 field, new float[] {1, 2, 3, 4}, VectorSimilarityFunction.EUCLIDEAN));
   }
+
+  private static class MockIndexableField implements IndexableField {
+
+    private final String field;
+    private final BytesRef value;
+    private final IndexableFieldType fieldType;
+
+    MockIndexableField(String field, BytesRef value, IndexableFieldType fieldType) {
+      this.field = field;
+      this.value = value;
+      this.fieldType = fieldType;
+    }
+
+    @Override
+    public String name() {
+      return field;
+    }
+
+    @Override
+    public IndexableFieldType fieldType() {
+      return fieldType;
+    }
+
+    @Override
+    public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
+      return null;
+    }
+
+    @Override
+    public BytesRef binaryValue() {
+      return value;
+    }
+
+    @Override
+    public String stringValue() {
+      return null;
+    }
+
+    @Override
+    public Reader readerValue() {
+      return null;
+    }
+
+    @Override
+    public Number numericValue() {
+      return null;
+    }
+
+    @Override
+    public StoredValue storedValue() {
+      return null;
+    }
+
+    @Override
+    public InvertableType invertableType() {
+      return InvertableType.BINARY;
+    }
+  }
+
+  public void testIndexBinaryValueWithoutTokenStream() throws IOException {
+    List<FieldType> illegalFieldTypes = new ArrayList<>();
+    {
+      FieldType illegalFT = new FieldType();
+      // cannot index a tokenized binary field
+      illegalFT.setTokenized(true);
+      illegalFT.setIndexOptions(IndexOptions.DOCS);
+      illegalFT.freeze();
+      illegalFieldTypes.add(illegalFT);
+    }
+    {
+      FieldType illegalFT = new FieldType();
+      illegalFT.setTokenized(false);
+      // cannot index positions on a binary field
+      illegalFT.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
+      illegalFT.freeze();
+      illegalFieldTypes.add(illegalFT);
+    }
+    {
+      FieldType illegalFT = new FieldType();
+      illegalFT.setTokenized(false);
+      illegalFT.setIndexOptions(IndexOptions.DOCS);
+      illegalFT.setStoreTermVectors(true);
+      // cannot index term vector positions
+      illegalFT.setStoreTermVectorPositions(true);
+      illegalFT.freeze();
+      illegalFieldTypes.add(illegalFT);
+    }
+    {
+      FieldType illegalFT = new FieldType();
+      illegalFT.setTokenized(false);
+      illegalFT.setIndexOptions(IndexOptions.DOCS);
+      illegalFT.setStoreTermVectors(true);
+      // cannot index term vector offsets
+      illegalFT.setStoreTermVectorOffsets(true);
+      illegalFT.freeze();
+      illegalFieldTypes.add(illegalFT);
+    }
+
+    for (FieldType ft : illegalFieldTypes) {
+      try (IndexWriter w =
+          new IndexWriter(dir, newIndexWriterConfig().setOpenMode(OpenMode.CREATE))) {
+        MockIndexableField field = new MockIndexableField("field", new BytesRef("a"), ft);
+        Document doc = new Document();
+        doc.add(field);
+        expectThrows(IllegalArgumentException.class, () -> w.addDocument(doc));
+      }
+    }
+
+    try (IndexWriter w =
+        new IndexWriter(dir, newIndexWriterConfig().setOpenMode(OpenMode.CREATE))) {
+      // Field that has both a null token stream and a null binary value
+      MockIndexableField field = new MockIndexableField("field", null, StringField.TYPE_NOT_STORED);
+      Document doc = new Document();
+      doc.add(field);
+      expectThrows(IllegalArgumentException.class, () -> w.addDocument(doc));
+    }
+
+    List<FieldType> legalFieldTypes = new ArrayList<>();
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS);
+      ft.setOmitNorms(false);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+      ft.setOmitNorms(false);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS);
+      ft.setOmitNorms(true);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+      ft.setOmitNorms(true);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS);
+      ft.setStoreTermVectors(true);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+      ft.setStoreTermVectors(true);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+
+    for (FieldType ft : legalFieldTypes) {
+      try (IndexWriter w =
+          new IndexWriter(dir, newIndexWriterConfig().setOpenMode(OpenMode.CREATE))) {
+        MockIndexableField field = new MockIndexableField("field", new BytesRef("a"), ft);
+        Document doc = new Document();
+        doc.add(field);
+        doc.add(field);
+        w.addDocument(doc);
+      }
+
+      try (DirectoryReader reader = DirectoryReader.open(dir)) {
+        LeafReader leafReader = getOnlyLeafReader(reader);
+
+        {
+          Terms terms = leafReader.terms("field");
+          assertEquals(1, terms.getSumDocFreq());
+          if (ft.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) {
+            assertEquals(2, terms.getSumTotalTermFreq());
+          } else {
+            assertEquals(1, terms.getSumTotalTermFreq());
+          }
+          TermsEnum termsEnum = terms.iterator();
+          assertTrue(termsEnum.seekExact(new BytesRef("a")));
+          PostingsEnum pe = termsEnum.postings(null, PostingsEnum.ALL);
+          assertEquals(0, pe.nextDoc());
+          if (ft.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) {
+            assertEquals(2, pe.freq());
+          } else {
+            assertEquals(1, pe.freq());
+          }
+          assertEquals(-1, pe.nextPosition());
+          assertEquals(DocIdSetIterator.NO_MORE_DOCS, pe.nextDoc());
+        }
+
+        if (ft.storeTermVectors()) {
+          Terms tvTerms = leafReader.termVectors().get(0).terms("field");
+          assertEquals(1, tvTerms.getSumDocFreq());
+          assertEquals(2, tvTerms.getSumTotalTermFreq());
+          TermsEnum tvTermsEnum = tvTerms.iterator();
+          assertTrue(tvTermsEnum.seekExact(new BytesRef("a")));
+          PostingsEnum pe = tvTermsEnum.postings(null, PostingsEnum.ALL);
+          assertEquals(0, pe.nextDoc());
+          assertEquals(2, pe.freq());
+          assertEquals(-1, pe.nextPosition());
+          assertEquals(DocIdSetIterator.NO_MORE_DOCS, pe.nextDoc());
+        } else {
+          assertNull(leafReader.termVectors().get(0));
+        }
+      }
+    }
+  }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestExceedMaxTermLength.java b/lucene/core/src/test/org/apache/lucene/index/TestExceedMaxTermLength.java
index 11f58b88348..3b252f5d5de 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestExceedMaxTermLength.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestExceedMaxTermLength.java
@@ -16,7 +16,9 @@
  */
 package org.apache.lucene.index;
 
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 import java.io.IOException;
+import java.util.Arrays;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
@@ -24,6 +26,7 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.analysis.MockAnalyzer;
 import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.tests.util.TestUtil;
+import org.apache.lucene.util.BytesRef;
 import org.junit.After;
 import org.junit.Before;
 
@@ -50,7 +53,7 @@ public class TestExceedMaxTermLength extends LuceneTestCase {
     dir = null;
   }
 
-  public void test() throws Exception {
+  public void testTokenStream() throws Exception {
 
     MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
     mockAnalyzer.setMaxTokenLength(Integer.MAX_VALUE);
@@ -109,4 +112,66 @@ public class TestExceedMaxTermLength extends LuceneTestCase {
       w.close();
     }
   }
+
+  public void testBinaryValue() throws Exception {
+
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
+    try {
+      final FieldType ft = new FieldType();
+      ft.setIndexOptions(
+          RandomPicks.randomFrom(
+              random(), Arrays.asList(IndexOptions.DOCS, IndexOptions.DOCS_AND_FREQS)));
+      ft.setStored(random().nextBoolean());
+      ft.setTokenized(false);
+      ft.freeze();
+
+      final Document doc = new Document();
+      if (random().nextBoolean()) {
+        // totally ok short field value
+        doc.add(
+            new Field(
+                TestUtil.randomSimpleString(random(), 1, 10),
+                TestUtil.randomBinaryTerm(random(), 10),
+                ft));
+      }
+      // problematic field
+      final String name = TestUtil.randomSimpleString(random(), 1, 50);
+      final BytesRef value =
+          TestUtil.randomBinaryTerm(
+              random(), TestUtil.nextInt(random(), minTestTermLength, maxTestTermLength));
+      final Field f = new Field(name, value, ft);
+      if (random().nextBoolean()) {
+        // totally ok short field value
+        doc.add(
+            new Field(
+                TestUtil.randomSimpleString(random(), 1, 10),
+                TestUtil.randomBinaryTerm(random(), 10),
+                ft));
+      }
+      doc.add(f);
+
+      IllegalArgumentException expected =
+          expectThrows(
+              IllegalArgumentException.class,
+              () -> {
+                w.addDocument(doc);
+              });
+      String maxLengthMsg = String.valueOf(IndexWriter.MAX_TERM_LENGTH);
+      String msg = expected.getMessage();
+      assertTrue(
+          "IllegalArgumentException didn't mention 'immense term': " + msg,
+          msg.contains("immense term"));
+      assertTrue(
+          "IllegalArgumentException didn't mention max length (" + maxLengthMsg + "): " + msg,
+          msg.contains(maxLengthMsg));
+      assertTrue(
+          "IllegalArgumentException didn't mention field name (" + name + "): " + msg,
+          msg.contains(name));
+      assertTrue(
+          "IllegalArgumentException didn't mention original message: " + msg,
+          msg.contains("bytes can be at most") && msg.contains("in length; got"));
+    } finally {
+      w.close();
+    }
+  }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFieldReuse.java b/lucene/core/src/test/org/apache/lucene/index/TestFieldReuse.java
index 1a81651800b..0dabb5b17f8 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestFieldReuse.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestFieldReuse.java
@@ -22,6 +22,7 @@ import java.util.Collections;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.InvertableType;
 import org.apache.lucene.document.StoredValue;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.store.Directory;
@@ -34,20 +35,20 @@ import org.apache.lucene.util.BytesRef;
 public class TestFieldReuse extends BaseTokenStreamTestCase {
 
   public void testStringField() throws IOException {
-    StringField stringField = new StringField("foo", "bar", Field.Store.NO);
+    Field stringField = new Field("foo", "bar", StringField.TYPE_NOT_STORED);
 
     // passing null
     TokenStream ts = stringField.tokenStream(null, null);
     assertTokenStreamContents(ts, new String[] {"bar"}, new int[] {0}, new int[] {3});
 
     // now reuse previous stream
-    stringField = new StringField("foo", "baz", Field.Store.NO);
+    stringField = new Field("foo", "baz", StringField.TYPE_NOT_STORED);
     TokenStream ts2 = stringField.tokenStream(null, ts);
     assertSame(ts, ts2);
     assertTokenStreamContents(ts, new String[] {"baz"}, new int[] {0}, new int[] {3});
 
     // pass a bogus stream and ensure it's still ok
-    stringField = new StringField("foo", "beer", Field.Store.NO);
+    stringField = new Field("foo", "beer", StringField.TYPE_NOT_STORED);
     TokenStream bogus = new CannedTokenStream();
     ts = stringField.tokenStream(null, bogus);
     assertNotSame(ts, bogus);
@@ -98,6 +99,11 @@ public class TestFieldReuse extends BaseTokenStreamTestCase {
     public StoredValue storedValue() {
       return null;
     }
+
+    @Override
+    public InvertableType invertableType() {
+      return InvertableType.TOKEN_STREAM;
+    }
   }
 
   public void testIndexWriterActuallyReuses() throws IOException {
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java
index d5f52338519..886417fabc8 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.InvertableType;
 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.StoredValue;
 import org.apache.lucene.search.BooleanClause;
@@ -198,6 +199,11 @@ public class TestIndexableField extends LuceneTestCase {
         return null;
       }
     }
+
+    @Override
+    public InvertableType invertableType() {
+      return InvertableType.TOKEN_STREAM;
+    }
   }
 
   // Silly test showing how to index documents w/o using Lucene's core
@@ -405,6 +411,11 @@ public class TestIndexableField extends LuceneTestCase {
     public StoredValue storedValue() {
       return null;
     }
+
+    @Override
+    public InvertableType invertableType() {
+      return InvertableType.TOKEN_STREAM;
+    }
   }
 
   // LUCENE-5611
diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
index ab59337bd92..82fb61b39ba 100644
--- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
+++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
@@ -391,6 +391,13 @@ public class MemoryIndex {
     }
     if (tokenStream != null) {
       storeTerms(info, tokenStream, positionIncrementGap, offsetGap);
+    } else if (field.fieldType().indexOptions().compareTo(IndexOptions.DOCS) >= 0) {
+      BytesRef binaryValue = field.binaryValue();
+      if (binaryValue == null) {
+        throw new IllegalArgumentException(
+            "Indexed field must provide a TokenStream or a binary value");
+      }
+      storeTerm(info, binaryValue);
     }
 
     DocValuesType docValuesType = field.fieldType().docValuesType();
@@ -632,6 +639,29 @@ public class MemoryIndex {
     }
   }
 
+  private void storeTerm(Info info, BytesRef term) {
+    info.numTokens++;
+    int ord = info.terms.add(term);
+    if (ord < 0) {
+      ord = -ord - 1;
+      postingsWriter.reset(info.sliceArray.end[ord]);
+    } else {
+      info.sliceArray.start[ord] = postingsWriter.startNewSlice();
+    }
+    info.sliceArray.freq[ord]++;
+    info.maxTermFrequency = Math.max(info.maxTermFrequency, info.sliceArray.freq[ord]);
+    info.sumTotalTermFreq++;
+    postingsWriter.writeInt(info.lastPosition++); // fake position
+    if (storeOffsets) { // fake offsets
+      postingsWriter.writeInt(0);
+      postingsWriter.writeInt(0);
+    }
+    if (storePayloads) {
+      postingsWriter.writeInt(-1); // fake payload
+    }
+    info.sliceArray.end[ord] = postingsWriter.getCurrentOffset();
+  }
+
   private void storeTerms(
       Info info, TokenStream tokenStream, int positionIncrementGap, int offsetGap) {
 
diff --git a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java
index 9e716484b47..7cddbe2fe3d 100644
--- a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java
+++ b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java
@@ -21,6 +21,7 @@ import static org.hamcrest.CoreMatchers.not;
 import static org.hamcrest.core.StringContains.containsString;
 
 import java.io.IOException;
+import java.io.Reader;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -31,6 +32,7 @@ import java.util.function.BiFunction;
 import java.util.function.Function;
 import java.util.stream.LongStream;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.BinaryPoint;
@@ -42,12 +44,14 @@ import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.FloatPoint;
 import org.apache.lucene.document.IntField;
 import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.InvertableType;
 import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.document.SortedDocValuesField;
 import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.document.StoredField;
+import org.apache.lucene.document.StoredValue;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.BinaryDocValues;
@@ -56,6 +60,7 @@ import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.IndexableFieldType;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.PostingsEnum;
@@ -63,6 +68,7 @@ import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.DocIdSetIterator;
@@ -843,4 +849,147 @@ public class TestMemoryIndex extends LuceneTestCase {
     assertEquals(1, sndv.docValueCount());
     assertEquals(50, sndv.nextValue());
   }
+
+  private static class MockIndexableField implements IndexableField {
+
+    private final String field;
+    private final BytesRef value;
+    private final IndexableFieldType fieldType;
+
+    MockIndexableField(String field, BytesRef value, IndexableFieldType fieldType) {
+      this.field = field;
+      this.value = value;
+      this.fieldType = fieldType;
+    }
+
+    @Override
+    public String name() {
+      return field;
+    }
+
+    @Override
+    public IndexableFieldType fieldType() {
+      return fieldType;
+    }
+
+    @Override
+    public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
+      return null;
+    }
+
+    @Override
+    public BytesRef binaryValue() {
+      return value;
+    }
+
+    @Override
+    public String stringValue() {
+      return null;
+    }
+
+    @Override
+    public Reader readerValue() {
+      return null;
+    }
+
+    @Override
+    public Number numericValue() {
+      return null;
+    }
+
+    @Override
+    public StoredValue storedValue() {
+      return null;
+    }
+
+    @Override
+    public InvertableType invertableType() {
+      return InvertableType.BINARY;
+    }
+  }
+
+  public void testKeywordWithoutTokenStream() throws IOException {
+    List<FieldType> legalFieldTypes = new ArrayList<>();
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS);
+      ft.setOmitNorms(false);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+      ft.setOmitNorms(false);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS);
+      ft.setOmitNorms(true);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+      ft.setOmitNorms(true);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS);
+      ft.setStoreTermVectors(true);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+      ft.setStoreTermVectors(true);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+
+    for (FieldType ft : legalFieldTypes) {
+      MockIndexableField field = new MockIndexableField("field", new BytesRef("a"), ft);
+      MemoryIndex index = MemoryIndex.fromDocument(Arrays.asList(field, field), null);
+      LeafReader leafReader = index.createSearcher().getIndexReader().leaves().get(0).reader();
+      {
+        Terms terms = leafReader.terms("field");
+        assertEquals(1, terms.getSumDocFreq());
+        assertEquals(2, terms.getSumTotalTermFreq());
+        TermsEnum termsEnum = terms.iterator();
+        assertTrue(termsEnum.seekExact(new BytesRef("a")));
+        PostingsEnum pe = termsEnum.postings(null, PostingsEnum.ALL);
+        assertEquals(0, pe.nextDoc());
+        assertEquals(2, pe.freq());
+        assertEquals(0, pe.nextPosition());
+        assertEquals(1, pe.nextPosition());
+        assertEquals(DocIdSetIterator.NO_MORE_DOCS, pe.nextDoc());
+      }
+
+      if (ft.storeTermVectors()) {
+        Terms tvTerms = leafReader.termVectors().get(0).terms("field");
+        assertEquals(1, tvTerms.getSumDocFreq());
+        assertEquals(2, tvTerms.getSumTotalTermFreq());
+        TermsEnum tvTermsEnum = tvTerms.iterator();
+        assertTrue(tvTermsEnum.seekExact(new BytesRef("a")));
+        PostingsEnum pe = tvTermsEnum.postings(null, PostingsEnum.ALL);
+        assertEquals(0, pe.nextDoc());
+        assertEquals(2, pe.freq());
+        assertEquals(0, pe.nextPosition());
+        assertEquals(1, pe.nextPosition());
+        assertEquals(DocIdSetIterator.NO_MORE_DOCS, pe.nextDoc());
+      }
+    }
+  }
 }
diff --git a/lucene/misc/src/java/org/apache/lucene/misc/document/LazyDocument.java b/lucene/misc/src/java/org/apache/lucene/misc/document/LazyDocument.java
index 581eb90785c..3962f0ee2b6 100644
--- a/lucene/misc/src/java/org/apache/lucene/misc/document/LazyDocument.java
+++ b/lucene/misc/src/java/org/apache/lucene/misc/document/LazyDocument.java
@@ -27,6 +27,7 @@ import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.document.InvertableType;
 import org.apache.lucene.document.StoredValue;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexReader;
@@ -193,7 +194,12 @@ public class LazyDocument {
 
     @Override
     public StoredValue storedValue() {
-      return null;
+      return getRealValue().storedValue();
+    }
+
+    @Override
+    public InvertableType invertableType() {
+      return getRealValue().invertableType();
     }
   }
 }