You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2023/02/21 13:05:00 UTC
[lucene] 02/02: Skip the TokenStream overhead when indexing simple keywords. (#12139)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git

commit 668246439ddfcc6ece0b1ee4a06d51e2693b6ff3
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Tue Feb 21 14:00:11 2023 +0100

    Skip the TokenStream overhead when indexing simple keywords. (#12139)
    
    Indexing simple keywords through a `TokenStream` abstraction introduces a bit
    of overhead due to attribute management. Not much, but indexing keywords boils
    down to adding to a hash map and appending to a postings list, which is quite
    cheap too, so even a low overhead can significantly impact indexing speed.
---
 lucene/CHANGES.txt                                 |   2 +
 .../src/java/org/apache/lucene/document/Field.java |   5 +
 .../org/apache/lucene/document/InvertableType.java |  40 ++++
 .../org/apache/lucene/document/KeywordField.java   |  17 +-
 .../org/apache/lucene/document/StringField.java    |  15 ++
 .../org/apache/lucene/index/FieldInvertState.java  |  18 +-
 .../lucene/index/FreqProxTermsWriterPerField.java  |   4 +-
 .../org/apache/lucene/index/IndexableField.java    |   7 +
 .../org/apache/lucene/index/IndexingChain.java     |  60 ++++++
 .../lucene/index/TermVectorsConsumerPerField.java  |   3 +
 .../apache/lucene/index/TestDocumentWriter.java    | 228 ++++++++++++++++++++-
 .../lucene/index/TestExceedMaxTermLength.java      |  67 +++++-
 .../org/apache/lucene/index/TestFieldReuse.java    |  12 +-
 .../apache/lucene/index/TestIndexableField.java    |  11 +
 .../apache/lucene/index/memory/MemoryIndex.java    |  30 +++
 .../lucene/index/memory/TestMemoryIndex.java       | 149 ++++++++++++++
 .../apache/lucene/misc/document/LazyDocument.java  |   8 +-
 17 files changed, 654 insertions(+), 22 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 54b411b53e2..82cf4fa3075 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -40,6 +40,8 @@ Optimizations
 
 * GITHUB#12155: Speed up DocValuesRewriteMethod by making use of sortedness. (Greg Miller)
 
+* GITHUB#12139: Faster indexing of string fields. (Adrien Grand)
+
 Bug Fixes
 ---------------------
 (No changes)
diff --git a/lucene/core/src/java/org/apache/lucene/document/Field.java b/lucene/core/src/java/org/apache/lucene/document/Field.java
index db038ff37cc..68a9059039a 100644
--- a/lucene/core/src/java/org/apache/lucene/document/Field.java
+++ b/lucene/core/src/java/org/apache/lucene/document/Field.java
@@ -449,6 +449,11 @@ public class Field implements IndexableField {
     return type;
   }
 
+  @Override
+  public InvertableType invertableType() {
+    return InvertableType.TOKEN_STREAM;
+  }
+
   @Override
   public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
     if (fieldType().indexOptions() == IndexOptions.NONE) {
diff --git a/lucene/core/src/java/org/apache/lucene/document/InvertableType.java b/lucene/core/src/java/org/apache/lucene/document/InvertableType.java
new file mode 100644
index 00000000000..febc660817b
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/document/InvertableType.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexableField;
+
+/** Describes how an {@link IndexableField} should be inverted for indexing terms and postings. */
+public enum InvertableType {
+
+  /**
+   * The field is indexed as a single term whose bytes are returned by {@link
+   * IndexableField#binaryValue()}, which must not be {@code null}; each field instance counts for
+   * a term frequency of one. To index multiple values, pass multiple {@link IndexableField}
+   * instances to the {@link IndexWriter}. If the same value is provided multiple times in a
+   * document, its term frequency is the number of times it occurred in that document.
+   */
+  BINARY,
+
+  /**
+   * The field is inverted by consuming the token stream returned by {@link
+   * IndexableField#tokenStream(org.apache.lucene.analysis.Analyzer,
+   * org.apache.lucene.analysis.TokenStream)}.
+   */
+  TOKEN_STREAM;
+}
diff --git a/lucene/core/src/java/org/apache/lucene/document/KeywordField.java b/lucene/core/src/java/org/apache/lucene/document/KeywordField.java
index 70b27ad671a..6ff0ecaf4dd 100644
--- a/lucene/core/src/java/org/apache/lucene/document/KeywordField.java
+++ b/lucene/core/src/java/org/apache/lucene/document/KeywordField.java
@@ -63,6 +63,7 @@ public class KeywordField extends Field {
     FIELD_TYPE_STORED.freeze();
   }
 
+  private BytesRef binaryValue;
   private final StoredValue storedValue;
 
   /**
@@ -75,6 +76,7 @@ public class KeywordField extends Field {
    */
   public KeywordField(String name, BytesRef value, Store stored) {
     super(name, value, stored == Field.Store.YES ? FIELD_TYPE_STORED : FIELD_TYPE);
+    this.binaryValue = value;
     if (stored == Store.YES) {
       storedValue = new StoredValue(value);
     } else {
@@ -92,6 +94,7 @@ public class KeywordField extends Field {
    */
   public KeywordField(String name, String value, Store stored) {
     super(name, value, stored == Field.Store.YES ? FIELD_TYPE_STORED : FIELD_TYPE);
+    this.binaryValue = new BytesRef(value);
     if (stored == Store.YES) {
       storedValue = new StoredValue(value);
     } else {
@@ -101,17 +104,18 @@ public class KeywordField extends Field {
 
   @Override
   public BytesRef binaryValue() {
-    BytesRef binaryValue = super.binaryValue();
-    if (binaryValue != null) {
-      return binaryValue;
-    } else {
-      return new BytesRef(stringValue());
-    }
+    return binaryValue;
+  }
+
+  @Override
+  public InvertableType invertableType() {
+    return InvertableType.BINARY;
   }
 
   @Override
   public void setStringValue(String value) {
     super.setStringValue(value);
+    binaryValue = new BytesRef(value);
     if (storedValue != null) {
       storedValue.setStringValue(value);
     }
@@ -120,6 +124,7 @@ public class KeywordField extends Field {
   @Override
   public void setBytesValue(BytesRef value) {
     super.setBytesValue(value);
+    binaryValue = value;
     if (storedValue != null) {
       storedValue.setBinaryValue(value);
     }
diff --git a/lucene/core/src/java/org/apache/lucene/document/StringField.java b/lucene/core/src/java/org/apache/lucene/document/StringField.java
index fb242346258..4a502d79f89 100644
--- a/lucene/core/src/java/org/apache/lucene/document/StringField.java
+++ b/lucene/core/src/java/org/apache/lucene/document/StringField.java
@@ -45,6 +45,7 @@ public final class StringField extends Field {
     TYPE_STORED.freeze();
   }
 
+  private BytesRef binaryValue;
   private final StoredValue storedValue;
 
   /**
@@ -57,6 +58,7 @@ public final class StringField extends Field {
    */
   public StringField(String name, String value, Store stored) {
     super(name, value, stored == Store.YES ? TYPE_STORED : TYPE_NOT_STORED);
+    binaryValue = new BytesRef(value);
     if (stored == Store.YES) {
       storedValue = new StoredValue(value);
     } else {
@@ -76,6 +78,7 @@ public final class StringField extends Field {
    */
   public StringField(String name, BytesRef value, Store stored) {
     super(name, value, stored == Store.YES ? TYPE_STORED : TYPE_NOT_STORED);
+    binaryValue = value;
     if (stored == Store.YES) {
       storedValue = new StoredValue(value);
     } else {
@@ -83,9 +86,20 @@ public final class StringField extends Field {
     }
   }
 
+  @Override
+  public InvertableType invertableType() {
+    return InvertableType.BINARY;
+  }
+
+  @Override
+  public BytesRef binaryValue() {
+    return binaryValue;
+  }
+
   @Override
   public void setStringValue(String value) {
     super.setStringValue(value);
+    binaryValue = new BytesRef(value);
     if (storedValue != null) {
       storedValue.setStringValue(value);
     }
@@ -94,6 +108,7 @@ public final class StringField extends Field {
   @Override
   public void setBytesValue(BytesRef value) {
     super.setBytesValue(value);
+    binaryValue = value;
     if (storedValue != null) {
       storedValue.setBinaryValue(value);
     }
diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInvertState.java b/lucene/core/src/java/org/apache/lucene/index/FieldInvertState.java
index 4bf16bb6c27..d5357fa4875 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FieldInvertState.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FieldInvertState.java
@@ -96,11 +96,19 @@ public final class FieldInvertState {
   void setAttributeSource(AttributeSource attributeSource) {
     if (this.attributeSource != attributeSource) {
       this.attributeSource = attributeSource;
-      termAttribute = attributeSource.getAttribute(TermToBytesRefAttribute.class);
-      termFreqAttribute = attributeSource.addAttribute(TermFrequencyAttribute.class);
-      posIncrAttribute = attributeSource.addAttribute(PositionIncrementAttribute.class);
-      offsetAttribute = attributeSource.addAttribute(OffsetAttribute.class);
-      payloadAttribute = attributeSource.getAttribute(PayloadAttribute.class);
+      if (attributeSource == null) {
+        termAttribute = null;
+        termFreqAttribute = null;
+        posIncrAttribute = null;
+        offsetAttribute = null;
+        payloadAttribute = null;
+      } else {
+        termAttribute = attributeSource.getAttribute(TermToBytesRefAttribute.class);
+        termFreqAttribute = attributeSource.addAttribute(TermFrequencyAttribute.class);
+        posIncrAttribute = attributeSource.addAttribute(PositionIncrementAttribute.class);
+        offsetAttribute = attributeSource.addAttribute(OffsetAttribute.class);
+        payloadAttribute = attributeSource.getAttribute(PayloadAttribute.class);
+      }
     }
   }
 
diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
index b05747b8755..04d949f5ef9 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
@@ -144,7 +144,7 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
 
     if (!hasFreq) {
       assert postings.termFreqs == null;
-      if (termFreqAtt.getTermFrequency() != 1) {
+      if (termFreqAtt != null && termFreqAtt.getTermFrequency() != 1) {
         throw new IllegalStateException(
             "field \""
                 + getFieldName()
@@ -203,7 +203,7 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
   }
 
   private int getTermFreq() {
-    int freq = termFreqAtt.getTermFrequency();
+    int freq = termFreqAtt == null ? 1 : termFreqAtt.getTermFrequency();
     if (freq != 1) {
       if (hasProx) {
         throw new IllegalStateException(
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexableField.java b/lucene/core/src/java/org/apache/lucene/index/IndexableField.java
index 0715d327b83..7293580df82 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexableField.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexableField.java
@@ -19,6 +19,7 @@ package org.apache.lucene.index;
 import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.InvertableType;
 import org.apache.lucene.document.StoredValue;
 import org.apache.lucene.util.BytesRef;
 
@@ -75,4 +76,10 @@ public interface IndexableField {
    * if the field stored.
    */
   public StoredValue storedValue();
+
+  /**
+   * Describes how this field should be inverted. This must return a non-null value if the field
+   * indexes terms and postings.
+   */
+  public InvertableType invertableType();
 }
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java
index 3f4930403b6..a68a84ff5ac 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java
@@ -1111,11 +1111,27 @@ final class IndexingChain implements Accountable {
      * this field name in this document.
      */
     public void invert(int docID, IndexableField field, boolean first) throws IOException {
+      assert field.fieldType().indexOptions().compareTo(IndexOptions.DOCS) >= 0;
+
       if (first) {
         // First time we're seeing this field (indexed) in this document
         invertState.reset();
       }
 
+      switch (field.invertableType()) {
+        case BINARY:
+          invertTerm(docID, field, first);
+          break;
+        case TOKEN_STREAM:
+          invertTokenStream(docID, field, first);
+          break;
+        default:
+          throw new AssertionError();
+      }
+    }
+
+    private void invertTokenStream(int docID, IndexableField field, boolean first)
+        throws IOException {
       final boolean analyzed = field.fieldType().tokenized() && analyzer != null;
       /*
        * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
@@ -1258,6 +1274,50 @@ final class IndexingChain implements Accountable {
         invertState.offset += analyzer.getOffsetGap(fieldInfo.name);
       }
     }
+
+    /** Inverts a BINARY field by indexing its binary value as a single term (frequency 1). */
+    private void invertTerm(int docID, IndexableField field, boolean first) throws IOException {
+      BytesRef binaryValue = field.binaryValue();
+      if (binaryValue == null) {
+        throw new IllegalArgumentException(
+            "Field "
+                + field.name()
+                + " returns BINARY for invertableType() and null for binaryValue(), which is illegal");
+      }
+      final IndexableFieldType fieldType = field.fieldType();
+      if (fieldType.tokenized()
+          || fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) > 0
+          || fieldType.storeTermVectorPositions()
+          || fieldType.storeTermVectorOffsets()
+          || fieldType.storeTermVectorPayloads()) {
+        throw new IllegalArgumentException(
+            "Fields that are tokenized or index proximity data must produce a non-null TokenStream, but "
+                + field.name()
+                + " did not");
+      }
+      invertState.setAttributeSource(null); // no token stream, hence no attributes on this path
+      invertState.position++;
+      termsHashPerField.start(field, first);
+      invertState.length = Math.addExact(invertState.length, 1); // one term, counted exactly once
+      try {
+        termsHashPerField.add(binaryValue, docID);
+      } catch (MaxBytesLengthExceededException e) {
+        byte[] prefix = new byte[30];
+        System.arraycopy(binaryValue.bytes, binaryValue.offset, prefix, 0, 30);
+        String msg =
+            "Document contains at least one immense term in field=\""
+                + fieldInfo.name
+                + "\" (whose length is longer than the max length "
+                + IndexWriter.MAX_TERM_LENGTH
+                + "), all of which were skipped. The prefix of the first immense term is: '"
+                + Arrays.toString(prefix)
+                + "...'";
+        if (infoStream.isEnabled("IW")) {
+          infoStream.message("IW", "ERROR: " + msg);
+        }
+        throw new IllegalArgumentException(msg, e);
+      }
+    }
   }
 
   DocIdSetIterator getHasDocValues(String field) {
diff --git a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java
index a4c24482201..f2e350de78d 100644
--- a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java
+++ b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java
@@ -284,6 +284,9 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
   }
 
   private int getTermFreq() {
+    if (termFreqAtt == null) {
+      return 1;
+    }
     int freq = termFreqAtt.getTermFrequency();
     if (freq != 1) {
       if (doVectorPositions) {
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java
index 1ba00844885..7d8cdf95d23 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java
@@ -17,6 +17,9 @@
 package org.apache.lucene.index;
 
 import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.function.Function;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenFilter;
@@ -31,14 +34,17 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.InvertableType;
 import org.apache.lucene.document.KnnFloatVectorField;
 import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.document.SortedDocValuesField;
 import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.document.StoredField;
+import org.apache.lucene.document.StoredValue;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.analysis.MockAnalyzer;
@@ -67,10 +73,6 @@ public class TestDocumentWriter extends LuceneTestCase {
     super.tearDown();
   }
 
-  public void test() {
-    assertTrue(dir != null);
-  }
-
   public void testAddDocument() throws Exception {
     Document testDoc = new Document();
     DocHelper.setupDoc(testDoc);
@@ -385,4 +387,222 @@ public class TestDocumentWriter extends LuceneTestCase {
             new KnnFloatVectorField(
                 field, new float[] {1, 2, 3, 4}, VectorSimilarityFunction.EUCLIDEAN));
   }
+
+  private static class MockIndexableField implements IndexableField {
+    // Test-only field that exposes just a binary value and asks to be inverted as BINARY.
+    private final String field;
+    private final BytesRef value;
+    private final IndexableFieldType fieldType;
+
+    MockIndexableField(String field, BytesRef value, IndexableFieldType fieldType) {
+      this.field = field;
+      this.value = value;
+      this.fieldType = fieldType;
+    }
+
+    @Override
+    public String name() {
+      return field;
+    }
+
+    @Override
+    public IndexableFieldType fieldType() {
+      return fieldType;
+    }
+
+    @Override
+    public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
+      return null; // this mock never supplies a token stream
+    }
+
+    @Override
+    public BytesRef binaryValue() {
+      return value; // null in the test that checks the null-binaryValue rejection
+    }
+
+    @Override
+    public String stringValue() {
+      return null;
+    }
+
+    @Override
+    public Reader readerValue() {
+      return null;
+    }
+
+    @Override
+    public Number numericValue() {
+      return null;
+    }
+
+    @Override
+    public StoredValue storedValue() {
+      return null;
+    }
+
+    @Override
+    public InvertableType invertableType() {
+      return InvertableType.BINARY;
+    }
+  }
+
+  public void testIndexBinaryValueWithoutTokenStream() throws IOException {
+    List<FieldType> illegalFieldTypes = new ArrayList<>(); // each of these must be rejected
+    {
+      FieldType illegalFT = new FieldType();
+      // cannot index a tokenized binary field
+      illegalFT.setTokenized(true);
+      illegalFT.setIndexOptions(IndexOptions.DOCS);
+      illegalFT.freeze();
+      illegalFieldTypes.add(illegalFT);
+    }
+    {
+      FieldType illegalFT = new FieldType();
+      illegalFT.setTokenized(false);
+      // cannot index positions on a binary field
+      illegalFT.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
+      illegalFT.freeze();
+      illegalFieldTypes.add(illegalFT);
+    }
+    {
+      FieldType illegalFT = new FieldType();
+      illegalFT.setTokenized(false);
+      illegalFT.setIndexOptions(IndexOptions.DOCS);
+      illegalFT.setStoreTermVectors(true);
+      // cannot index term vector positions
+      illegalFT.setStoreTermVectorPositions(true);
+      illegalFT.freeze();
+      illegalFieldTypes.add(illegalFT);
+    }
+    {
+      FieldType illegalFT = new FieldType();
+      illegalFT.setTokenized(false);
+      illegalFT.setIndexOptions(IndexOptions.DOCS);
+      illegalFT.setStoreTermVectors(true);
+      // cannot index term vector offsets
+      illegalFT.setStoreTermVectorOffsets(true);
+      illegalFT.freeze();
+      illegalFieldTypes.add(illegalFT);
+    }
+
+    for (FieldType ft : illegalFieldTypes) {
+      try (IndexWriter w =
+          new IndexWriter(dir, newIndexWriterConfig().setOpenMode(OpenMode.CREATE))) {
+        MockIndexableField field = new MockIndexableField("field", new BytesRef("a"), ft);
+        Document doc = new Document();
+        doc.add(field);
+        expectThrows(IllegalArgumentException.class, () -> w.addDocument(doc));
+      }
+    }
+
+    try (IndexWriter w =
+        new IndexWriter(dir, newIndexWriterConfig().setOpenMode(OpenMode.CREATE))) {
+      // Field that has both a null token stream and a null binary value
+      MockIndexableField field = new MockIndexableField("field", null, StringField.TYPE_NOT_STORED);
+      Document doc = new Document();
+      doc.add(field);
+      expectThrows(IllegalArgumentException.class, () -> w.addDocument(doc));
+    }
+
+    List<FieldType> legalFieldTypes = new ArrayList<>(); // each of these must index fine
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS);
+      ft.setOmitNorms(false);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+      ft.setOmitNorms(false);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS);
+      ft.setOmitNorms(true);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+      ft.setOmitNorms(true);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS);
+      ft.setStoreTermVectors(true);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    {
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+      ft.setStoreTermVectors(true);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+
+    for (FieldType ft : legalFieldTypes) {
+      try (IndexWriter w =
+          new IndexWriter(dir, newIndexWriterConfig().setOpenMode(OpenMode.CREATE))) {
+        MockIndexableField field = new MockIndexableField("field", new BytesRef("a"), ft);
+        Document doc = new Document();
+        doc.add(field);
+        doc.add(field); // same value twice in one document
+        w.addDocument(doc);
+      }
+
+      try (DirectoryReader reader = DirectoryReader.open(dir)) {
+        LeafReader leafReader = getOnlyLeafReader(reader);
+
+        {
+          Terms terms = leafReader.terms("field");
+          assertEquals(1, terms.getSumDocFreq());
+          if (ft.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) {
+            assertEquals(2, terms.getSumTotalTermFreq());
+          } else {
+            assertEquals(1, terms.getSumTotalTermFreq()); // DOCS only: no frequencies
+          }
+          TermsEnum termsEnum = terms.iterator();
+          assertTrue(termsEnum.seekExact(new BytesRef("a")));
+          PostingsEnum pe = termsEnum.postings(null, PostingsEnum.ALL);
+          assertEquals(0, pe.nextDoc());
+          if (ft.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) {
+            assertEquals(2, pe.freq());
+          } else {
+            assertEquals(1, pe.freq());
+          }
+          assertEquals(-1, pe.nextPosition());
+          assertEquals(DocIdSetIterator.NO_MORE_DOCS, pe.nextDoc());
+        }
+
+        if (ft.storeTermVectors()) {
+          Terms tvTerms = leafReader.termVectors().get(0).terms("field");
+          assertEquals(1, tvTerms.getSumDocFreq());
+          assertEquals(2, tvTerms.getSumTotalTermFreq());
+          TermsEnum tvTermsEnum = tvTerms.iterator();
+          assertTrue(tvTermsEnum.seekExact(new BytesRef("a")));
+          PostingsEnum pe = tvTermsEnum.postings(null, PostingsEnum.ALL);
+          assertEquals(0, pe.nextDoc());
+          assertEquals(2, pe.freq());
+          assertEquals(-1, pe.nextPosition());
+          assertEquals(DocIdSetIterator.NO_MORE_DOCS, pe.nextDoc());
+        } else {
+          assertNull(leafReader.termVectors().get(0));
+        }
+      }
+    }
+  }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestExceedMaxTermLength.java b/lucene/core/src/test/org/apache/lucene/index/TestExceedMaxTermLength.java
index 11f58b88348..3b252f5d5de 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestExceedMaxTermLength.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestExceedMaxTermLength.java
@@ -16,7 +16,9 @@
  */
 package org.apache.lucene.index;
 
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 import java.io.IOException;
+import java.util.Arrays;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
@@ -24,6 +26,7 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.analysis.MockAnalyzer;
 import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.tests.util.TestUtil;
+import org.apache.lucene.util.BytesRef;
 import org.junit.After;
 import org.junit.Before;
 
@@ -50,7 +53,7 @@ public class TestExceedMaxTermLength extends LuceneTestCase {
     dir = null;
   }
 
-  public void test() throws Exception {
+  public void testTokenStream() throws Exception {
 
     MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
     mockAnalyzer.setMaxTokenLength(Integer.MAX_VALUE);
@@ -109,4 +112,66 @@ public class TestExceedMaxTermLength extends LuceneTestCase {
       w.close();
     }
   }
+
+  public void testBinaryValue() throws Exception {
+    // Expects addDocument to reject the immense binary term with an IllegalArgumentException.
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
+    try {
+      final FieldType ft = new FieldType();
+      ft.setIndexOptions(
+          RandomPicks.randomFrom(
+              random(), Arrays.asList(IndexOptions.DOCS, IndexOptions.DOCS_AND_FREQS)));
+      ft.setStored(random().nextBoolean());
+      ft.setTokenized(false);
+      ft.freeze();
+
+      final Document doc = new Document();
+      if (random().nextBoolean()) {
+        // totally ok short field value
+        doc.add(
+            new Field(
+                TestUtil.randomSimpleString(random(), 1, 10),
+                TestUtil.randomBinaryTerm(random(), 10),
+                ft));
+      }
+      // problematic field. NOTE(review): a plain Field reports TOKEN_STREAM, not BINARY - confirm
+      final String name = TestUtil.randomSimpleString(random(), 1, 50);
+      final BytesRef value =
+          TestUtil.randomBinaryTerm(
+              random(), TestUtil.nextInt(random(), minTestTermLength, maxTestTermLength));
+      final Field f = new Field(name, value, ft);
+      if (random().nextBoolean()) {
+        // totally ok short field value
+        doc.add(
+            new Field(
+                TestUtil.randomSimpleString(random(), 1, 10),
+                TestUtil.randomBinaryTerm(random(), 10),
+                ft));
+      }
+      doc.add(f);
+
+      IllegalArgumentException expected =
+          expectThrows(
+              IllegalArgumentException.class,
+              () -> {
+                w.addDocument(doc);
+              });
+      String maxLengthMsg = String.valueOf(IndexWriter.MAX_TERM_LENGTH);
+      String msg = expected.getMessage();
+      assertTrue(
+          "IllegalArgumentException didn't mention 'immense term': " + msg,
+          msg.contains("immense term"));
+      assertTrue(
+          "IllegalArgumentException didn't mention max length (" + maxLengthMsg + "): " + msg,
+          msg.contains(maxLengthMsg));
+      assertTrue(
+          "IllegalArgumentException didn't mention field name (" + name + "): " + msg,
+          msg.contains(name));
+      assertTrue(
+          "IllegalArgumentException didn't mention original message: " + msg,
+          msg.contains("bytes can be at most") && msg.contains("in length; got"));
+    } finally {
+      w.close();
+    }
+  }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFieldReuse.java b/lucene/core/src/test/org/apache/lucene/index/TestFieldReuse.java
index 1a81651800b..0dabb5b17f8 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestFieldReuse.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestFieldReuse.java
@@ -22,6 +22,7 @@ import java.util.Collections;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.InvertableType;
 import org.apache.lucene.document.StoredValue;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.store.Directory;
@@ -34,20 +35,20 @@ import org.apache.lucene.util.BytesRef;
 public class TestFieldReuse extends BaseTokenStreamTestCase {
 
   public void testStringField() throws IOException {
-    StringField stringField = new StringField("foo", "bar", Field.Store.NO);
+    Field stringField = new Field("foo", "bar", StringField.TYPE_NOT_STORED);
 
     // passing null
     TokenStream ts = stringField.tokenStream(null, null);
     assertTokenStreamContents(ts, new String[] {"bar"}, new int[] {0}, new int[] {3});
 
     // now reuse previous stream
-    stringField = new StringField("foo", "baz", Field.Store.NO);
+    stringField = new Field("foo", "baz", StringField.TYPE_NOT_STORED);
     TokenStream ts2 = stringField.tokenStream(null, ts);
     assertSame(ts, ts2);
     assertTokenStreamContents(ts, new String[] {"baz"}, new int[] {0}, new int[] {3});
 
     // pass a bogus stream and ensure it's still ok
-    stringField = new StringField("foo", "beer", Field.Store.NO);
+    stringField = new Field("foo", "beer", StringField.TYPE_NOT_STORED);
     TokenStream bogus = new CannedTokenStream();
     ts = stringField.tokenStream(null, bogus);
     assertNotSame(ts, bogus);
@@ -98,6 +99,11 @@ public class TestFieldReuse extends BaseTokenStreamTestCase {
     public StoredValue storedValue() {
       return null;
     }
+
+    @Override
+    public InvertableType invertableType() {
+      return InvertableType.TOKEN_STREAM; // presumably: keep inverting this test field via tokenStream()
+    }
   }
 
   public void testIndexWriterActuallyReuses() throws IOException {
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java
index d5f52338519..886417fabc8 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.InvertableType;
 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.StoredValue;
 import org.apache.lucene.search.BooleanClause;
@@ -198,6 +199,11 @@ public class TestIndexableField extends LuceneTestCase {
         return null;
       }
     }
+
+    @Override
+    public InvertableType invertableType() {
+      return InvertableType.TOKEN_STREAM; // presumably: keep inverting this test field via tokenStream()
+    }
   }
 
   // Silly test showing how to index documents w/o using Lucene's core
@@ -405,6 +411,11 @@ public class TestIndexableField extends LuceneTestCase {
     public StoredValue storedValue() {
       return null;
     }
+
+    @Override
+    public InvertableType invertableType() {
+      return InvertableType.TOKEN_STREAM; // presumably: keep inverting this test field via tokenStream()
+    }
   }
 
   // LUCENE-5611
diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
index ab59337bd92..82fb61b39ba 100644
--- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
+++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
@@ -391,6 +391,13 @@ public class MemoryIndex {
     }
     if (tokenStream != null) {
       storeTerms(info, tokenStream, positionIncrementGap, offsetGap);
+    } else if (field.fieldType().indexOptions().compareTo(IndexOptions.DOCS) >= 0) {
+      BytesRef binaryValue = field.binaryValue();
+      if (binaryValue == null) {
+        throw new IllegalArgumentException(
+            "Indexed field must provide a TokenStream or a binary value");
+      }
+      storeTerm(info, binaryValue);
     }
 
     DocValuesType docValuesType = field.fieldType().docValuesType();
@@ -632,6 +639,29 @@ public class MemoryIndex {
     }
   }
 
+  private void storeTerm(Info info, BytesRef term) { // index one whole value as a single term
+    info.numTokens++; // the value counts as exactly one token
+    int ord = info.terms.add(term);
+    if (ord < 0) { // presumably: term already present and add() returned -(ord + 1)
+      ord = -ord - 1;
+      postingsWriter.reset(info.sliceArray.end[ord]); // continue from this term's recorded end
+    } else {
+      info.sliceArray.start[ord] = postingsWriter.startNewSlice(); // record where its slice starts
+    }
+    info.sliceArray.freq[ord]++;
+    info.maxTermFrequency = Math.max(info.maxTermFrequency, info.sliceArray.freq[ord]);
+    info.sumTotalTermFreq++;
+    postingsWriter.writeInt(info.lastPosition++); // fake position: advances by one per stored value
+    if (storeOffsets) { // fake offsets: both values are written as 0
+      postingsWriter.writeInt(0);
+      postingsWriter.writeInt(0);
+    }
+    if (storePayloads) {
+      postingsWriter.writeInt(-1); // fake payload
+    }
+    info.sliceArray.end[ord] = postingsWriter.getCurrentOffset(); // remembered for the next append
+  }
+
   private void storeTerms(
       Info info, TokenStream tokenStream, int positionIncrementGap, int offsetGap) {
 
diff --git a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java
index 9e716484b47..7cddbe2fe3d 100644
--- a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java
+++ b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java
@@ -21,6 +21,7 @@ import static org.hamcrest.CoreMatchers.not;
 import static org.hamcrest.core.StringContains.containsString;
 
 import java.io.IOException;
+import java.io.Reader;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -31,6 +32,7 @@ import java.util.function.BiFunction;
 import java.util.function.Function;
 import java.util.stream.LongStream;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.BinaryPoint;
@@ -42,12 +44,14 @@ import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.FloatPoint;
 import org.apache.lucene.document.IntField;
 import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.InvertableType;
 import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.document.SortedDocValuesField;
 import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.document.StoredField;
+import org.apache.lucene.document.StoredValue;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.BinaryDocValues;
@@ -56,6 +60,7 @@ import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.IndexableFieldType;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.PostingsEnum;
@@ -63,6 +68,7 @@ import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.DocIdSetIterator;
@@ -843,4 +849,147 @@ public class TestMemoryIndex extends LuceneTestCase {
     assertEquals(1, sndv.docValueCount());
     assertEquals(50, sndv.nextValue());
   }
+
+  private static class MockIndexableField implements IndexableField { // test stub: binary value only
+
+    private final String field; // returned by name()
+    private final BytesRef value; // returned as-is by binaryValue()
+    private final IndexableFieldType fieldType;
+
+    MockIndexableField(String field, BytesRef value, IndexableFieldType fieldType) {
+      this.field = field;
+      this.value = value;
+      this.fieldType = fieldType;
+    }
+
+    @Override
+    public String name() {
+      return field;
+    }
+
+    @Override
+    public IndexableFieldType fieldType() {
+      return fieldType;
+    }
+
+    @Override
+    public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
+      return null; // never offers a TokenStream, whatever analyzer/reuse are passed
+    }
+
+    @Override
+    public BytesRef binaryValue() {
+      return value; // the only non-null value accessor of this stub
+    }
+
+    @Override
+    public String stringValue() {
+      return null;
+    }
+
+    @Override
+    public Reader readerValue() {
+      return null;
+    }
+
+    @Override
+    public Number numericValue() {
+      return null;
+    }
+
+    @Override
+    public StoredValue storedValue() {
+      return null;
+    }
+
+    @Override
+    public InvertableType invertableType() {
+      return InvertableType.BINARY; // paired with the null tokenStream() above
+    }
+  }
+
+  public void testKeywordWithoutTokenStream() throws IOException { // binary-only fields get indexed
+    List<FieldType> legalFieldTypes = new ArrayList<>();
+    { // DOCS, norms kept
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS);
+      ft.setOmitNorms(false);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    { // DOCS_AND_FREQS, norms kept
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+      ft.setOmitNorms(false);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    { // DOCS, norms omitted
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS);
+      ft.setOmitNorms(true);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    { // DOCS_AND_FREQS, norms omitted
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+      ft.setOmitNorms(true);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    { // DOCS with term vectors
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS);
+      ft.setStoreTermVectors(true);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+    { // DOCS_AND_FREQS with term vectors
+      FieldType ft = new FieldType();
+      ft.setTokenized(false);
+      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+      ft.setStoreTermVectors(true);
+      ft.freeze();
+      legalFieldTypes.add(ft);
+    }
+
+    for (FieldType ft : legalFieldTypes) { // same expectations for every variant above
+      MockIndexableField field = new MockIndexableField("field", new BytesRef("a"), ft);
+      MemoryIndex index = MemoryIndex.fromDocument(Arrays.asList(field, field), null); // value "a" twice
+      LeafReader leafReader = index.createSearcher().getIndexReader().leaves().get(0).reader();
+      { // expectations on the postings
+        Terms terms = leafReader.terms("field");
+        assertEquals(1, terms.getSumDocFreq());
+        assertEquals(2, terms.getSumTotalTermFreq()); // the same value was added twice
+        TermsEnum termsEnum = terms.iterator();
+        assertTrue(termsEnum.seekExact(new BytesRef("a")));
+        PostingsEnum pe = termsEnum.postings(null, PostingsEnum.ALL);
+        assertEquals(0, pe.nextDoc());
+        assertEquals(2, pe.freq());
+        assertEquals(0, pe.nextPosition()); // first occurrence
+        assertEquals(1, pe.nextPosition()); // second occurrence, one position later
+        assertEquals(DocIdSetIterator.NO_MORE_DOCS, pe.nextDoc());
+      }
+
+      if (ft.storeTermVectors()) { // same expectations when read back through term vectors
+        Terms tvTerms = leafReader.termVectors().get(0).terms("field");
+        assertEquals(1, tvTerms.getSumDocFreq());
+        assertEquals(2, tvTerms.getSumTotalTermFreq());
+        TermsEnum tvTermsEnum = tvTerms.iterator();
+        assertTrue(tvTermsEnum.seekExact(new BytesRef("a")));
+        PostingsEnum pe = tvTermsEnum.postings(null, PostingsEnum.ALL);
+        assertEquals(0, pe.nextDoc());
+        assertEquals(2, pe.freq());
+        assertEquals(0, pe.nextPosition());
+        assertEquals(1, pe.nextPosition());
+        assertEquals(DocIdSetIterator.NO_MORE_DOCS, pe.nextDoc());
+      }
+    }
+  }
 }
diff --git a/lucene/misc/src/java/org/apache/lucene/misc/document/LazyDocument.java b/lucene/misc/src/java/org/apache/lucene/misc/document/LazyDocument.java
index 581eb90785c..3962f0ee2b6 100644
--- a/lucene/misc/src/java/org/apache/lucene/misc/document/LazyDocument.java
+++ b/lucene/misc/src/java/org/apache/lucene/misc/document/LazyDocument.java
@@ -27,6 +27,7 @@ import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.document.InvertableType;
 import org.apache.lucene.document.StoredValue;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexReader;
@@ -193,7 +194,12 @@ public class LazyDocument {
 
     @Override
     public StoredValue storedValue() {
-      return null;
+      return getRealValue().storedValue();
+    }
+
+    @Override
+    public InvertableType invertableType() {
+      return getRealValue().invertableType(); // delegate to the field returned by getRealValue()
     }
   }
 }