Posted to commits@lucene.apache.org by ds...@apache.org on 2016/10/04 20:12:10 UTC
[5/6] lucene-solr:master: LUCENE-7438: New UnifiedHighlighter
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/722e8271/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
new file mode 100644
index 0000000..5225041
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
@@ -0,0 +1,581 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import org.apache.lucene.index.*;
+import org.apache.lucene.search.*;
+import org.apache.lucene.search.highlight.WeightedSpanTerm;
+import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
+import org.apache.lucene.search.spans.*;
+import org.apache.lucene.util.BytesRef;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.function.Function;
+
+/**
+ * Helps the {@link FieldOffsetStrategy} with strict position highlighting (e.g. highlighting phrases correctly).
+ * This is a stateful class holding information about the query, but it can be (and is) re-used across the
+ * documents being highlighted. Despite this state, it's immutable after construction. The approach taken in this
+ * class is very similar to the standard Highlighter's {@link WeightedSpanTermExtractor}, which is in fact re-used
+ * here. However, we ought to completely rewrite it to use the SpanCollector interface to collect offsets directly.
+ * That would give better phrase accuracy.
+ *
+ * @lucene.internal
+ */
+public class PhraseHelper {
+
+ public static final PhraseHelper NONE = new PhraseHelper(new MatchAllDocsQuery(), "_ignored_",
+ spanQuery -> null, true);
+
+ //TODO it seems this ought to be a general thing on Spans?
+ private static final Comparator<? super Spans> SPANS_COMPARATOR = (o1, o2) -> {
+ int cmp = Integer.compare(o1.docID(), o2.docID());
+ if (cmp != 0) {
+ return cmp;
+ }
+ if (o1.docID() == DocIdSetIterator.NO_MORE_DOCS) {
+ return 0; // don't ask for start/end position; not sure if we can even call those methods
+ }
+ cmp = Integer.compare(o1.startPosition(), o2.startPosition());
+ if (cmp != 0) {
+ return cmp;
+ } else {
+ return Integer.compare(o1.endPosition(), o2.endPosition());
+ }
+ };
+
+ private final String fieldName; // if non-null, only look at queries/terms for this field
+ private final Set<Term> positionInsensitiveTerms; // (TermQuery terms)
+ private final Set<SpanQuery> spanQueries;
+ private final boolean willRewrite;
+
+ /**
+ * Constructor.
+ * {@code rewriteQueryPred} is an extension hook to override the default choice of
+ * {@link WeightedSpanTermExtractor#mustRewriteQuery(SpanQuery)}. By default unknown query types are rewritten,
+ * so use this to return {@link Boolean#FALSE} if you know the query doesn't need to be rewritten.
+ * {@code ignoreQueriesNeedingRewrite} effectively ignores any query clause that needs to be "rewritten", which is
+ * usually limited to just a {@link SpanMultiTermQueryWrapper} but could include other custom query types.
+ */
+ public PhraseHelper(Query query, String field, Function<SpanQuery, Boolean> rewriteQueryPred,
+ boolean ignoreQueriesNeedingRewrite) {
+ this.fieldName = field; // if null then don't require field match
+ // filter terms to those we want
+ positionInsensitiveTerms = field != null ? new FieldFilteringTermHashSet(field) : new HashSet<>();
+ // requireFieldMatch optional
+ spanQueries = new HashSet<>();
+
+ // TODO Have toSpanQuery(query) Function as an extension point for those with custom Query impls
+
+ boolean[] mustRewriteHolder = {false}; // boolean wrapped in a one-element array so it's mutable from the anonymous class
+
+ // For TermQueries or other position insensitive queries, collect the Terms.
+ // For other Query types, WSTE will convert to an equivalent SpanQuery. NOT extracting position spans here.
+ new WeightedSpanTermExtractor(field) {
+ //instance initializer of the anonymous class
+ {
+ setExpandMultiTermQuery(true); //necessary for mustRewriteQuery(spanQuery) to work.
+
+ try {
+ extract(query, 1f, null); // null because we won't actually extract right now; we're not collecting
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ protected boolean isQueryUnsupported(Class<? extends Query> clazz) {
+ if (MultiTermQuery.class.isAssignableFrom(clazz)) {
+ return true; //We do MTQ processing separately in MultiTermHighlighting.java
+ }
+ return true; //TODO set to false and provide a hook to customize certain queries.
+ }
+
+ @Override
+ protected void extractWeightedTerms(Map<String, WeightedSpanTerm> terms, Query query, float boost)
+ throws IOException {
+ query.createWeight(UnifiedHighlighter.EMPTY_INDEXSEARCHER, false, boost)
+ .extractTerms(positionInsensitiveTerms);
+ }
+
+ @Override
+ protected void extractWeightedSpanTerms(Map<String, WeightedSpanTerm> terms, SpanQuery spanQuery,
+ float boost) throws IOException {
+ if (field != null) {
+ // if this span query isn't for this field, skip it.
+ Set<String> fieldNameSet = new HashSet<>();//TODO reuse. note: almost always size 1
+ collectSpanQueryFields(spanQuery, fieldNameSet);
+ if (!fieldNameSet.contains(field)) {
+ return;
+ }
+ }
+
+ // TODO allow users to override the answer to mustRewriteQuery
+ boolean mustRewriteQuery = mustRewriteQuery(spanQuery);
+ if (ignoreQueriesNeedingRewrite && mustRewriteQuery) {
+ return;// ignore this query
+ }
+ mustRewriteHolder[0] |= mustRewriteQuery;
+
+ spanQueries.add(spanQuery);
+ }
+
+ @Override
+ protected boolean mustRewriteQuery(SpanQuery spanQuery) {
+ Boolean rewriteQ = rewriteQueryPred.apply(spanQuery);// allow to override
+ return rewriteQ != null ? rewriteQ : super.mustRewriteQuery(spanQuery);
+ }
+ }; // calling the constructor triggered the extraction/visiting we want. Hacky; yes.
+
+ willRewrite = mustRewriteHolder[0];
+ }
+
+ Set<SpanQuery> getSpanQueries() {
+ return spanQueries;
+ }
+
+ /**
+ * If there is no position sensitivity then this instance can effectively be ignored.
+ */
+ boolean hasPositionSensitivity() {
+ return spanQueries.isEmpty() == false;
+ }
+
+ /**
+ * Rewrite is needed for handling a {@link SpanMultiTermQueryWrapper} (MTQ / wildcards) or some
+ * custom things. When true, the resulting term list will probably differ from the one initially
+ * extracted from the query.
+ */
+ boolean willRewrite() {
+ return willRewrite;
+ }
+
+ /**
+ * Collect a list of pre-positioned {@link Spans} for each term, given a reader that has just one document.
+ * No mapping is returned for query terms that occur in a position-insensitive way, since those
+ * don't need to be filtered.
+ */
+ Map<BytesRef, Spans> getTermToSpans(LeafReader leafReader, int doc)
+ throws IOException {
+ if (spanQueries.isEmpty()) {
+ return Collections.emptyMap();
+ }
+ // for each SpanQuery, collect the member spans into a map.
+ Map<BytesRef, Spans> result = new HashMap<>();
+ for (SpanQuery spanQuery : spanQueries) {
+ getTermToSpans(spanQuery, leafReader.getContext(), doc, result);
+ }
+ return result;
+ }
+
+ // code extracted & refactored from WSTE.extractWeightedSpanTerms()
+ private void getTermToSpans(SpanQuery spanQuery, LeafReaderContext readerContext,
+ int doc, Map<BytesRef, Spans> result)
+ throws IOException {
+ // note: in WSTE there was some field specific looping that seemed pointless so that isn't here.
+ final IndexSearcher searcher = new IndexSearcher(readerContext);
+ searcher.setQueryCache(null);
+ if (willRewrite) {
+ spanQuery = (SpanQuery) searcher.rewrite(spanQuery); // searcher.rewrite loops till done
+ }
+
+ // Get the underlying query terms
+
+ TreeSet<Term> termSet = new TreeSet<>(); // sorted so we can loop over results in order shortly...
+ searcher.createWeight(spanQuery, false, 1.0f).extractTerms(termSet);//needsScores==false
+
+ // Get Spans by running the query against the reader
+ // TODO it might make sense to re-use/cache the Spans instance, to advance forward between docs
+ SpanWeight spanWeight = (SpanWeight) searcher.createNormalizedWeight(spanQuery, false);
+ Spans spans = spanWeight.getSpans(readerContext, SpanWeight.Postings.POSITIONS);
+ if (spans == null) {
+ return;
+ }
+ TwoPhaseIterator twoPhaseIterator = spans.asTwoPhaseIterator();
+ if (twoPhaseIterator != null) {
+ if (twoPhaseIterator.approximation().advance(doc) != doc || !twoPhaseIterator.matches()) {
+ return;
+ }
+ } else if (spans.advance(doc) != doc) { // pre-position, and return doing nothing if we find none
+ return;
+ }
+
+ // Consume the Spans into a cache. This instance is used as a source for multiple cloned copies.
+ // It's important we do this and not re-use the same original Spans instance since these will be iterated
+ // independently later on; sometimes in ways that prevents sharing the original Spans.
+ CachedSpans cachedSpansSource = new CachedSpans(spans); // consumes spans for this doc only and caches
+ spans = null;// we don't use it below
+
+ // Map terms to a Spans instance (aggregate if necessary)
+ for (final Term queryTerm : termSet) {
+ // note: we expect that at least one query term will pass these filters. This is because the collected
+ // spanQuery list was already filtered by these conditions.
+ if (fieldName != null && fieldName.equals(queryTerm.field()) == false) {
+ continue;
+ }
+ if (positionInsensitiveTerms.contains(queryTerm)) {
+ continue;
+ }
+ // copy-constructor refers to same data (shallow) but has iteration state from the beginning
+ CachedSpans cachedSpans = new CachedSpans(cachedSpansSource);
+ // Add the span to whatever span may or may not exist
+ Spans existingSpans = result.get(queryTerm.bytes());
+ if (existingSpans != null) {
+ if (existingSpans instanceof MultiSpans) {
+ ((MultiSpans) existingSpans).addSpans(cachedSpans);
+ } else { // upgrade to MultiSpans
+ MultiSpans multiSpans = new MultiSpans();
+ multiSpans.addSpans(existingSpans);
+ multiSpans.addSpans(cachedSpans);
+ result.put(queryTerm.bytes(), multiSpans);
+ }
+ } else {
+ result.put(queryTerm.bytes(), cachedSpans);
+ }
+ }
+ }
+
+ /**
+ * Returns terms as a List, but expanded to any terms in strictPhrases' keySet if present. That can only
+ * happen if willRewrite() is true.
+ */
+ List<BytesRef> expandTermsIfRewrite(BytesRef[] terms, Map<BytesRef, Spans> strictPhrasesTermToSpans) {
+ if (willRewrite()) {
+ Set<BytesRef> allTermSet = new LinkedHashSet<>(terms.length + strictPhrasesTermToSpans.size());
+ Collections.addAll(allTermSet, terms);//FYI already sorted; will keep order
+ if (allTermSet.addAll(strictPhrasesTermToSpans.keySet())) { // true if any were added
+ List<BytesRef> sourceTerms = Arrays.asList(allTermSet.toArray(new BytesRef[allTermSet.size()]));
+ sourceTerms.sort(Comparator.naturalOrder());
+ return sourceTerms;
+ }
+ }
+ return Arrays.asList(terms); // no rewrite; use original terms
+ }
+
+ /**
+ * Returns a filtered postings where the position must be in the given Spans.
+ * The Spans must be in a positioned state (not initial) and should not be shared between other terms.
+ * {@code postingsEnum} should be positioned at the
+ * document (the same one as the spans) but it hasn't iterated the positions yet.
+ * The Spans should be the result of a simple
+ * lookup from {@link #getTermToSpans(LeafReader, int)}, and so it could be null which could mean
+ * either it's completely filtered or that there should be no filtering; this class knows what to do.
+ * <p>
+ * Due to limitations in filtering, the {@link PostingsEnum#freq()} is unchanged even if some positions
+ * get filtered. So when {@link PostingsEnum#nextPosition()}, {@code startOffset}, or {@code endOffset} are
+ * called beyond the "real" (filtered) positions, these methods return {@link Integer#MAX_VALUE}.
+ * <p>
+ * <b>This will return null if it's completely filtered out (i.e. effectively has no postings).</b>
+ */
+ PostingsEnum filterPostings(BytesRef term, PostingsEnum postingsEnum, Spans spans)
+ throws IOException {
+ if (spans == null) {
+ if (hasPositionSensitivity() == false || positionInsensitiveTerms.contains(new Term(fieldName, term))) {
+ return postingsEnum; // no filtering
+ } else {
+ return null; // completely filtered out
+ }
+ }
+ if (postingsEnum.docID() != spans.docID()) {
+ throw new IllegalStateException("Spans & Postings doc ID misaligned or not positioned");
+ }
+
+ return new FilterLeafReader.FilterPostingsEnum(postingsEnum) {
+ // freq() is max times nextPosition can be called. We'll set this var to -1 when exhausted.
+ int remainingPositions = postingsEnum.freq();
+
+ @Override
+ public String toString() {
+ String where;
+ try {
+ where = "[" + startOffset() + ":" + endOffset() + "]";
+ } catch (IOException e) {
+ where = "[" + e + "]";
+ }
+ return "'" + term.utf8ToString() + "'@" + where + " filtered by " + spans;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ throw new IllegalStateException("not expected"); // don't need to implement; just used on one doc
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ throw new IllegalStateException("not expected"); // don't need to implement; just used on one doc
+ }
+
+ @Override
+ public int nextPosition() throws IOException {
+ // loop over posting positions...
+ NEXT_POS_LOOP:
+ while (remainingPositions > 0) {
+ final int thisPos = super.nextPosition();
+ remainingPositions--;
+
+ // loop spans forward (if necessary) while the span end is behind thisPos
+ while (spans.endPosition() <= thisPos) {
+ if (spans.nextStartPosition() == Spans.NO_MORE_POSITIONS) { // advance
+ break NEXT_POS_LOOP;
+ }
+ assert spans.docID() == postingsEnum.docID();
+ }
+
+ // is this position within the span?
+ if (thisPos >= spans.startPosition()) {
+ assert thisPos < spans.endPosition(); // guaranteed by previous loop
+ return thisPos; // yay!
+ }
+ // else continue and try the next position
+ }
+ remainingPositions = -1; // signify done
+ return Integer.MAX_VALUE;
+ }
+
+ @Override
+ public int startOffset() throws IOException {
+ return remainingPositions >= 0 ? super.startOffset() : Integer.MAX_VALUE;
+ }
+
+ @Override
+ public int endOffset() throws IOException {
+ return remainingPositions >= 0 ? super.endOffset() : Integer.MAX_VALUE;
+ }
+ };
+ }
+
+ /**
+ * Simple HashSet that filters out Terms not matching a desired field on {@code add()}.
+ */
+ private static class FieldFilteringTermHashSet extends HashSet<Term> {
+ private final String field;
+
+ FieldFilteringTermHashSet(String field) {
+ this.field = field;
+ }
+
+ @Override
+ public boolean add(Term term) {
+ if (term.field().equals(field)) {
+ return super.add(term);
+ } else {
+ return false;
+ }
+ }
+ }
+
+ /**
+ * A single {@link Spans} view over multiple spans. At least one span is mandatory, but you should probably
+ * supply more than one. Furthermore, the given spans are expected to be positioned to a document already
+ * (via a call to next or advance).
+ */ // TODO move to Lucene core as a Spans utility class?
+ static class MultiSpans extends Spans {
+ final PriorityQueue<Spans> spansQueue = new PriorityQueue<>(SPANS_COMPARATOR);
+ long cost;
+
+ void addSpans(Spans spans) {
+ if (spans.docID() < 0 || spans.docID() == NO_MORE_DOCS) {
+ throw new IllegalArgumentException("Expecting given spans to be in a positioned state.");
+ }
+ spansQueue.add(spans);
+ cost = Math.max(cost, spans.cost());
+ }
+
+ // DocIdSetIterator methods:
+
+ @Override
+ public int nextDoc() throws IOException {
+ if (spansQueue.isEmpty()) {
+ return NO_MORE_DOCS;
+ }
+ return advance(spansQueue.peek().docID() + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ if (spansQueue.isEmpty()) {
+ return NO_MORE_DOCS;
+ }
+ while (true) {
+ Spans spans = spansQueue.peek();
+ if (spans.docID() >= target) {
+ return spans.docID();
+ }
+ spansQueue.remove(); // must remove before modify state
+ if (spans.advance(target) != NO_MORE_DOCS) { // ... otherwise it's not re-added
+ spansQueue.add(spans);
+ } else if (spansQueue.isEmpty()) {
+ return NO_MORE_DOCS;
+ }
+ }
+ }
+
+ @Override
+ public int docID() {
+ if (spansQueue.isEmpty()) {
+ return NO_MORE_DOCS;
+ }
+ return spansQueue.peek().docID();
+ }
+
+ @Override
+ public long cost() {
+ return cost;
+ }
+
+ // Spans methods:
+
+ @Override
+ public int nextStartPosition() throws IOException {
+ // advance any spans at the initial position per document
+ boolean atDocStart = false;
+ while (spansQueue.peek().startPosition() == -1) {
+ atDocStart = true;
+ Spans headSpans = spansQueue.remove(); // remove because we will change state
+ headSpans.nextStartPosition();
+ spansQueue.add(headSpans);
+ }
+ if (!atDocStart) {
+ Spans headSpans = spansQueue.remove(); // remove because we will change state
+ headSpans.nextStartPosition();
+ spansQueue.add(headSpans);
+ }
+ return startPosition();
+ }
+
+ @Override
+ public int startPosition() {
+ return spansQueue.peek().startPosition();
+ }
+
+ @Override
+ public int endPosition() {
+ return spansQueue.peek().endPosition();
+ }
+
+ @Override
+ public int width() {
+ return spansQueue.peek().width();
+ }
+
+ @Override
+ public void collect(SpanCollector collector) throws IOException {
+ spansQueue.peek().collect(collector);
+ }
+
+ @Override
+ public float positionsCost() {
+ return 100f;// no idea; and we can't delegate since this method may only legally be called via a TwoPhaseIterator
+ }
+ }
+
+ /**
+ * A Spans based on a list of cached spans for one doc. It is pre-positioned to this doc.
+ */
+ private static class CachedSpans extends Spans {
+
+ private static class CachedSpan {
+ final int start;
+ final int end;
+
+ CachedSpan(int start, int end) {
+ this.start = start;
+ this.end = end;
+ }
+ }
+
+ final int docId;
+ final ArrayList<CachedSpan> cachedSpanList;
+ int index = -1;
+
+ CachedSpans(Spans spans) throws IOException {
+ this.docId = spans.docID();
+ assert this.docId != -1;
+ // Consume the spans for this doc into a list. There's always at least one; the first/current one.
+ cachedSpanList = new ArrayList<>();
+ while (spans.nextStartPosition() != NO_MORE_POSITIONS) {
+ cachedSpanList.add(new CachedSpan(spans.startPosition(), spans.endPosition()));
+ }
+ assert !cachedSpanList.isEmpty(); // bad Span impl?
+ }
+
+ /**
+ * Clone; reset iteration state.
+ */
+ CachedSpans(CachedSpans cloneMe) {
+ docId = cloneMe.docId;
+ cachedSpanList = cloneMe.cachedSpanList;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ throw new UnsupportedOperationException("Not expected");
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ throw new UnsupportedOperationException("Not expected");
+ }
+
+ @Override
+ public int docID() {
+ return docId;
+ }
+
+ @Override
+ public long cost() {
+ return 1;
+ }
+
+ @Override
+ public int nextStartPosition() throws IOException {
+ index++;
+ return startPosition();
+ }
+
+ @Override
+ public int startPosition() {
+ return index < 0 ?
+ -1 : index >= cachedSpanList.size() ?
+ NO_MORE_POSITIONS : cachedSpanList.get(index).start;
+ }
+
+ @Override
+ public int endPosition() {
+ return index < 0 ?
+ -1 : index >= cachedSpanList.size() ?
+ NO_MORE_POSITIONS : cachedSpanList.get(index).end;
+ }
+
+ @Override
+ public int width() {
+ return endPosition() - startPosition();
+ }
+
+ @Override
+ public void collect(SpanCollector collector) throws IOException {
+ throw new UnsupportedOperationException("Not expected");
+ }
+
+ @Override
+ public float positionsCost() {
+ return 1f;
+ }
+
+ } // class CachedSpans
+}
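
Side note: a minimal sketch (not part of this patch) of constructing a PhraseHelper the way a
FieldOffsetStrategy caller might; the query and field name are illustrative, and returning null
from the predicate defers to WeightedSpanTermExtractor.mustRewriteQuery():

    import java.util.function.Function;
    import org.apache.lucene.search.PhraseQuery;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.spans.SpanQuery;
    import org.apache.lucene.search.uhighlight.PhraseHelper;

    class PhraseHelperSketch {
      static PhraseHelper build() {
        Query query = new PhraseQuery("body", "unified", "highlighter");
        // null means: defer to WeightedSpanTermExtractor.mustRewriteQuery(spanQuery)
        Function<SpanQuery, Boolean> rewritePred = spanQuery -> null;
        return new PhraseHelper(query, "body", rewritePred,
            false); // false: don't ignore queries needing rewrite
      }
    }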
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/722e8271/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java
new file mode 100644
index 0000000..4666906
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+
+/**
+ * Uses offsets in postings -- {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}. This
+ * does not support multi-term queries; the highlighter will fall back on analysis for that.
+ *
+ * @lucene.internal
+ */
+public class PostingsOffsetStrategy extends FieldOffsetStrategy {
+
+ public PostingsOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
+ super(field, queryTerms, phraseHelper, automata);
+ }
+
+ @Override
+ public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
+ LeafReader leafReader;
+ if (reader instanceof LeafReader) {
+ leafReader = (LeafReader) reader;
+ } else {
+ List<LeafReaderContext> leaves = reader.leaves();
+ LeafReaderContext leafReaderContext = leaves.get(ReaderUtil.subIndex(docId, leaves));
+ leafReader = leafReaderContext.reader();
+ docId -= leafReaderContext.docBase; // adjust 'doc' to be within this leaf reader
+ }
+
+ return createOffsetsEnumsFromReader(leafReader, docId);
+ }
+
+ @Override
+ public UnifiedHighlighter.OffsetSource getOffsetSource() {
+ return UnifiedHighlighter.OffsetSource.POSTINGS;
+ }
+}
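
Side note: this strategy only applies when offsets were indexed into the postings. A hedged
sketch of such a field configuration (field name and text are illustrative):

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.FieldType;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexOptions;

    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    offsetsType.freeze();
    Document doc = new Document();
    doc.add(new Field("body", "some indexed text", offsetsType));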
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/722e8271/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java
new file mode 100644
index 0000000..81de379
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+
+/**
+ * Like {@link PostingsOffsetStrategy} but also uses term vectors (only the terms are needed) for multi-term queries.
+ *
+ * @lucene.internal
+ */
+public class PostingsWithTermVectorsOffsetStrategy extends FieldOffsetStrategy {
+
+ public PostingsWithTermVectorsOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
+ super(field, queryTerms, phraseHelper, automata);
+ }
+
+ @Override
+ public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
+ LeafReader leafReader;
+ if (reader instanceof LeafReader) {
+ leafReader = (LeafReader) reader;
+ } else {
+ List<LeafReaderContext> leaves = reader.leaves();
+ LeafReaderContext leafReaderContext = leaves.get(ReaderUtil.subIndex(docId, leaves));
+ leafReader = leafReaderContext.reader();
+ docId -= leafReaderContext.docBase; // adjust 'doc' to be within this leaf reader
+ }
+
+ Terms docTerms = leafReader.getTermVector(docId, field);
+ if (docTerms == null) {
+ return Collections.emptyList();
+ }
+ leafReader = new TermVectorFilteredLeafReader(leafReader, docTerms);
+
+ TokenStream tokenStream = automata.length > 0 ? MultiTermHighlighting
+ .uninvertAndFilterTerms(leafReader.terms(field), docId, this.automata, content.length()) : null;
+
+ return createOffsetsEnums(leafReader, docId, tokenStream);
+ }
+
+ @Override
+ public UnifiedHighlighter.OffsetSource getOffsetSource() {
+ return UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS;
+ }
+}
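
Side note: for this strategy the field would presumably carry both postings offsets and term
vectors (terms only); a sketch under that assumption, reusing the imports from the previous sketch:

    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    ft.setStoreTermVectors(true); // terms only; offsets still come from postings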
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/722e8271/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/SplittingBreakIterator.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/SplittingBreakIterator.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/SplittingBreakIterator.java
new file mode 100644
index 0000000..b3a415c
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/SplittingBreakIterator.java
@@ -0,0 +1,244 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.text.BreakIterator;
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
+
+/**
+ * Virtually slices the text on both sides of every occurrence of the specified character. If the slice is 0-length,
+ * which happens for adjacent slice characters or when they are at the beginning or end, that character's offset is
+ * reported as a boundary.
+ * Each slice between the specified characters is further processed with the supplied
+ * BreakIterator. A consequence is that the enclosed BreakIterator will never "see" the splitting character.
+ * <br>
+ * <em>Note: {@link #setText(CharacterIterator)} is unsupported. Use the string version.</em>
+ *
+ * @lucene.experimental
+ */
+public class SplittingBreakIterator extends BreakIterator {
+ private final BreakIterator baseIter;
+ private final char sliceChar;
+
+ private String text;
+ private int sliceStartIdx;
+ private int sliceEndIdx;
+ private int current;
+
+ public SplittingBreakIterator(BreakIterator baseIter, char sliceChar) {
+ this.baseIter = baseIter;
+ this.sliceChar = sliceChar;
+ }
+
+ @Override
+ public void setText(CharacterIterator newText) {
+ throw new UnsupportedOperationException("unexpected");
+ }
+
+ @Override
+ public void setText(String newText) {
+ this.text = newText;
+ first();
+ }
+
+ @Override
+ public CharacterIterator getText() {
+ StringCharacterIterator charIter = new StringCharacterIterator(text);
+ // API doesn't say what the state should be but it should probably be at the current index.
+ charIter.setIndex(current());
+ return charIter;
+ }
+
+ @Override
+ public int current() {
+ assert current != DONE;
+ return current; // MUST be updated by the other methods when result isn't DONE.
+ }
+
+ @Override
+ public int first() {
+ sliceStartIdx = 0;
+ sliceEndIdx = text.indexOf(sliceChar);
+ if (sliceEndIdx == -1) {
+ sliceEndIdx = text.length();
+ }
+ if (sliceStartIdx == sliceEndIdx) {
+ return current = sliceStartIdx;
+ }
+ baseIter.setText(text.substring(sliceStartIdx, sliceEndIdx));
+ return current = sliceStartIdx + baseIter.current();// since setText() sets to first(), just grab current()
+ }
+
+ @Override
+ public int last() {
+ sliceEndIdx = text.length();
+ sliceStartIdx = text.lastIndexOf(sliceChar);
+ if (sliceStartIdx == -1) {
+ sliceStartIdx = 0;
+ } else {
+ sliceStartIdx++;//past sliceChar
+ }
+ if (sliceEndIdx == sliceStartIdx) {
+ return current = sliceEndIdx;
+ }
+ baseIter.setText(text.substring(sliceStartIdx, sliceEndIdx));
+ return current = sliceStartIdx + baseIter.last();
+ }
+
+ @Override
+ public int next() {
+ int prevCurrent = current;
+ current = sliceStartIdx == sliceEndIdx ? DONE : baseIter.next();
+ if (current != DONE) {
+ return current = current + sliceStartIdx;
+ }
+ if (sliceEndIdx >= text.length()) {
+ current = prevCurrent;//keep current where it is
+ return DONE;
+ }
+ sliceStartIdx = sliceEndIdx + 1;
+ sliceEndIdx = text.indexOf(sliceChar, sliceStartIdx);
+ if (sliceEndIdx == -1) {
+ sliceEndIdx = text.length();
+ }
+ if (sliceStartIdx == sliceEndIdx) {
+ return current = sliceStartIdx;
+ }
+ baseIter.setText(text.substring(sliceStartIdx, sliceEndIdx));
+ return current = sliceStartIdx + baseIter.current();//use current() since at first() already
+ }
+
+ @Override
+ public int previous() { // note: closely follows next() but reversed
+ int prevCurrent = current;
+ current = sliceStartIdx == sliceEndIdx ? DONE : baseIter.previous();
+ if (current != DONE) {
+ return current = current + sliceStartIdx;
+ }
+ if (sliceStartIdx == 0) {
+ current = prevCurrent;//keep current where it is
+ return DONE;
+ }
+ sliceEndIdx = sliceStartIdx - 1;
+ sliceStartIdx = text.lastIndexOf(sliceChar, sliceEndIdx - 1);
+ if (sliceStartIdx == -1) {
+ sliceStartIdx = 0;
+ } else {
+ sliceStartIdx++;//past sliceChar
+ }
+ if (sliceStartIdx == sliceEndIdx) {
+ return current = sliceStartIdx;
+ }
+ baseIter.setText(text.substring(sliceStartIdx, sliceEndIdx));
+ return current = sliceStartIdx + baseIter.last();
+ }
+
+ @Override
+ public int following(int offset) {
+ // if the offset is not in this slice, update the slice
+ if (offset + 1 < sliceStartIdx || offset + 1 > sliceEndIdx) {
+ if (offset == text.length()) { // DONE condition
+ last(); // because https://bugs.openjdk.java.net/browse/JDK-8015110
+ return DONE;
+ }
+ sliceStartIdx = text.lastIndexOf(sliceChar, offset);//no +1
+ if (sliceStartIdx == -1) {
+ sliceStartIdx = 0;
+ } else {
+ sliceStartIdx++;//move past separator
+ }
+ sliceEndIdx = text.indexOf(sliceChar, Math.max(offset + 1, sliceStartIdx));
+ if (sliceEndIdx == -1) {
+ sliceEndIdx = text.length();
+ }
+ if (sliceStartIdx != sliceEndIdx) {//otherwise, adjacent separator or separator at end
+ baseIter.setText(text.substring(sliceStartIdx, sliceEndIdx));
+ }
+ }
+
+ // lookup following() in this slice:
+ if (sliceStartIdx == sliceEndIdx) {
+ return current = offset + 1;
+ } else {
+ // note: following() can never be first() if the first character is a boundary (it usually is).
+ // So we have to check if we should call first() instead of following():
+ if (offset == sliceStartIdx - 1) {
+ // the first boundary following this offset is the very first boundary in this slice
+ return current = sliceStartIdx + baseIter.first();
+ } else {
+ return current = sliceStartIdx + baseIter.following(offset - sliceStartIdx);
+ }
+ }
+ }
+
+ @Override
+ public int preceding(int offset) { // note: closely follows following() but reversed
+ if (offset - 1 < sliceStartIdx || offset - 1 > sliceEndIdx) {
+ if (offset == 0) { // DONE condition
+ first(); // because https://bugs.openjdk.java.net/browse/JDK-8015110
+ return DONE;
+ }
+ sliceEndIdx = text.indexOf(sliceChar, offset);//no -1
+ if (sliceEndIdx == -1) {
+ sliceEndIdx = text.length();
+ }
+ sliceStartIdx = text.lastIndexOf(sliceChar, offset - 1);
+ if (sliceStartIdx == -1) {
+ sliceStartIdx = 0;
+ } else {
+ sliceStartIdx = Math.min(sliceStartIdx + 1, sliceEndIdx);
+ }
+ if (sliceStartIdx != sliceEndIdx) {//otherwise, adjacent separator or separator at end
+ baseIter.setText(text.substring(sliceStartIdx, sliceEndIdx));
+ }
+ }
+ // lookup preceding() in this slice:
+ if (sliceStartIdx == sliceEndIdx) {
+ return current = offset - 1;
+ } else {
+ // note: preceding() can never be last() if the last character is a boundary (it usually is).
+ // So we have to check if we should call last() instead of preceding():
+ if (offset == sliceEndIdx + 1) {
+ // the last boundary preceding this offset is the very last boundary in this slice
+ return current = sliceStartIdx + baseIter.last();
+ } else {
+ return current = sliceStartIdx + baseIter.preceding(offset - sliceStartIdx);
+ }
+ }
+ }
+
+ @Override
+ public int next(int n) {
+ if (n < 0) {
+ for (int i = 0; i < -n; i++) {
+ if (previous() == DONE) {
+ return DONE;
+ }
+ }
+ } else {
+ for (int i = 0; i < n; i++) {
+ if (next() == DONE) {
+ return DONE;
+ }
+ }
+ }
+ return current();
+ }
+
+
+}
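
Side note: a short usage sketch of SplittingBreakIterator, wrapping a sentence iterator so that
no segment spans a separator character. The '\u2029' separator here is only an example value:

    import java.text.BreakIterator;
    import java.util.Locale;

    BreakIterator base = BreakIterator.getSentenceInstance(Locale.ROOT);
    SplittingBreakIterator iter = new SplittingBreakIterator(base, '\u2029');
    String text = "First value.\u2029Second value. It has two sentences.";
    iter.setText(text);
    int start = iter.first();
    for (int end = iter.next(); end != BreakIterator.DONE; start = end, end = iter.next()) {
      System.out.println(text.substring(start, end)); // segments never span the separator
    }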
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/722e8271/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorFilteredLeafReader.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorFilteredLeafReader.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorFilteredLeafReader.java
new file mode 100644
index 0000000..954024c
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorFilteredLeafReader.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.FilterLeafReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+
+/**
+ * A filtered LeafReader that only includes the terms that are also in a provided set of terms.
+ * Certain methods may be unimplemented or may trigger large (and therefore slow) operations on
+ * the underlying reader.
+ *
+ * @lucene.internal
+ */
+final class TermVectorFilteredLeafReader extends FilterLeafReader {
+ // NOTE: super ("in") is baseLeafReader
+
+ private final Terms filterTerms;
+
+ /**
+ * <p>Constructs a FilterLeafReader based on the specified base reader.
+ * <p>Note that the base reader is closed if this FilterLeafReader is closed.</p>
+ *
+ * @param baseLeafReader full/original reader.
+ * @param filterTerms set of terms to filter by -- probably from a TermVector or MemoryIndex.
+ */
+ TermVectorFilteredLeafReader(LeafReader baseLeafReader, Terms filterTerms) {
+ super(baseLeafReader);
+ this.filterTerms = filterTerms;
+ }
+
+ @Override
+ public Fields fields() throws IOException {
+ return new TermVectorFilteredFields(in.fields(), filterTerms);
+ }
+
+ private static final class TermVectorFilteredFields extends FilterLeafReader.FilterFields {
+ // NOTE: super ("in") is baseFields
+
+ private final Terms filterTerms;
+
+ TermVectorFilteredFields(Fields baseFields, Terms filterTerms) {
+ super(baseFields);
+ this.filterTerms = filterTerms;
+ }
+
+ @Override
+ public Terms terms(String field) throws IOException {
+ return new TermsFilteredTerms(in.terms(field), filterTerms);
+ }
+ }
+
+ private static final class TermsFilteredTerms extends FilterLeafReader.FilterTerms {
+ // NOTE: super ("in") is the baseTerms
+
+ private final Terms filterTerms;
+
+ TermsFilteredTerms(Terms baseTerms, Terms filterTerms) {
+ super(baseTerms);
+ this.filterTerms = filterTerms;
+ }
+
+ //TODO delegate size() ?
+
+ //TODO delegate getMin, getMax to filterTerms
+
+ @Override
+ public TermsEnum iterator() throws IOException {
+ return new TermVectorFilteredTermsEnum(in.iterator(), filterTerms.iterator());
+ }
+
+ @Override
+ public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
+ return new TermVectorFilteredTermsEnum(in.iterator(), filterTerms.intersect(compiled, startTerm));
+ }
+ }
+
+ private static final class TermVectorFilteredTermsEnum extends FilterLeafReader.FilterTermsEnum {
+ // NOTE: super ("in") is the filteredTermsEnum. This is different than wrappers above because we
+ // navigate the terms using the filter.
+
+ //TODO: track the last term state from the term state method and do some potential optimizations
+ private final TermsEnum baseTermsEnum;
+
+ TermVectorFilteredTermsEnum(TermsEnum baseTermsEnum, TermsEnum filteredTermsEnum) {
+ super(filteredTermsEnum); // note this is reversed from constructors above
+ this.baseTermsEnum = baseTermsEnum;
+ }
+
+ //TODO delegate docFreq & ttf (moveToCurrentTerm() then call on full?
+
+ @Override
+ public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
+ moveToCurrentTerm();
+ return baseTermsEnum.postings(reuse, flags);
+ }
+
+ void moveToCurrentTerm() throws IOException {
+ BytesRef currentTerm = in.term(); // from filteredTermsEnum
+ boolean termInBothTermsEnum = baseTermsEnum.seekExact(currentTerm);
+
+ if (!termInBothTermsEnum) {
+ throw new IllegalStateException("Term vector term " + currentTerm + " does not appear in full index.");
+ }
+ }
+
+ }
+}
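
Side note: a sketch of in-package usage (the class is package-private), mirroring how
PostingsWithTermVectorsOffsetStrategy above wires it up:

    Terms tvTerms = leafReader.getTermVector(docId, field); // terms this doc actually contains
    if (tvTerms != null) {
      LeafReader filtered = new TermVectorFilteredLeafReader(leafReader, tvTerms);
      // filtered.terms(field) now enumerates only this document's terms, while
      // postings (and offsets, if indexed) still come from the full index.
    }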
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/722e8271/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java
new file mode 100644
index 0000000..204679b
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.highlight.TermVectorLeafReader;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+
+/**
+ * Uses term vectors that contain offsets.
+ *
+ * @lucene.internal
+ */
+public class TermVectorOffsetStrategy extends FieldOffsetStrategy {
+
+ public TermVectorOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
+ super(field, queryTerms, phraseHelper, automata);
+ }
+
+ @Override
+ public UnifiedHighlighter.OffsetSource getOffsetSource() {
+ return UnifiedHighlighter.OffsetSource.TERM_VECTORS;
+ }
+
+ @Override
+ public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
+ Terms tvTerms = reader.getTermVector(docId, field);
+ if (tvTerms == null) {
+ return Collections.emptyList();
+ }
+
+ LeafReader leafReader = null;
+ if ((terms.length > 0) || strictPhrases.willRewrite()) {
+ leafReader = new TermVectorLeafReader(field, tvTerms);
+ docId = 0;
+ }
+
+ TokenStream tokenStream = null;
+ if (automata.length > 0) {
+ tokenStream = MultiTermHighlighting.uninvertAndFilterTerms(tvTerms, 0, automata, content.length());
+ }
+
+ return createOffsetsEnums(leafReader, docId, tokenStream);
+ }
+
+}
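
Side note: this strategy requires term vectors that include offsets; a hedged sketch of such a
field configuration (positions are optional but help strict phrase highlighting):

    import org.apache.lucene.document.FieldType;
    import org.apache.lucene.document.TextField;

    FieldType tvType = new FieldType(TextField.TYPE_STORED);
    tvType.setStoreTermVectors(true);
    tvType.setStoreTermVectorOffsets(true);
    tvType.setStoreTermVectorPositions(true);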
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/722e8271/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamFromTermVector.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamFromTermVector.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamFromTermVector.java
new file mode 100644
index 0000000..980c566
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamFromTermVector.java
@@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefArray;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.CharsRefBuilder;
+import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.UnicodeUtil;
+
+/**
+ * TokenStream created from a term vector field. The term vector requires positions and/or offsets (either
+ * will do). If you want payloads, add a PayloadAttribute (as you would normally), but don't assume the attribute
+ * is already present just because you know the term vector has payloads: the first call to incrementToken()
+ * observes whether you asked for them, and if you didn't, you won't get them. This TokenStream supports an
+ * efficient {@link #reset()}, so there's no need to wrap it with a caching impl.
+ *
+ * @lucene.internal
+ */
+final class TokenStreamFromTermVector extends TokenStream {
+ // note: differs from similar class in the standard highlighter. This one is optimized for sparse cases.
+
+ /**
+ * content length divided by distinct positions; an average for dense text.
+ */
+ private static final double AVG_CHARS_PER_POSITION = 6;
+
+ private static final int INSERTION_SORT_THRESHOLD = 16;
+
+ private final Terms vector;
+
+ private final int filteredDocId;
+
+ private final CharTermAttribute termAttribute;
+
+ private final PositionIncrementAttribute positionIncrementAttribute;
+
+ private final int offsetLength;
+
+ private final float loadFactor;
+
+ private OffsetAttribute offsetAttribute;//maybe null
+
+ private PayloadAttribute payloadAttribute;//maybe null
+
+ private CharsRefBuilder termCharsBuilder;//term data here
+
+ private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
+ private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null
+
+ private TokenLL firstToken = null; // the head of a linked-list
+
+ private TokenLL incrementToken = null;
+
+ private boolean initialized = false;//lazy
+
+ public TokenStreamFromTermVector(Terms vector, int offsetLength) throws IOException {
+ this(vector, 0, offsetLength, 1f);
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param vector Terms that contains the data for
+ * creating the TokenStream. Must have positions and/or offsets.
+ * @param filteredDocId The docID we will process.
+ * @param offsetLength Supply the character length of the text being uninverted, or a lower value if you don't want
+ * to invert text beyond an offset (in so doing this will act as a filter). If you don't
+ * know the length, pass -1. In conjunction with {@code loadFactor}, it's used to
+ * determine how many buckets to create during uninversion.
+ * It's also used to filter out tokens with a start offset exceeding this value.
+ * @param loadFactor The percent of tokens from the original terms (by position count) that are
+ * expected to be inverted. If they are filtered (e.g.
+ * {@link org.apache.lucene.index.FilterLeafReader.FilterTerms})
+ * then consider using less than 1.0 to avoid wasting space.
+ * 1.0 means all, 1/64th would suggest 1/64th of all tokens coming from vector.
+ */
+ TokenStreamFromTermVector(Terms vector, int filteredDocId, int offsetLength, float loadFactor) throws IOException {
+ super();
+ this.filteredDocId = filteredDocId;
+ this.offsetLength = offsetLength == Integer.MAX_VALUE ? -1 : offsetLength;
+ if (loadFactor <= 0f || loadFactor > 1f) {
+ throw new IllegalArgumentException("loadFactor should be > 0 and <= 1");
+ }
+ this.loadFactor = loadFactor;
+ assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
+ if (!vector.hasPositions() && !vector.hasOffsets()) {
+ throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
+ }
+ assert vector.hasFreqs();
+ this.vector = vector;
+ termAttribute = addAttribute(CharTermAttribute.class);
+ positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
+ }
+
+ public Terms getTermVectorTerms() {
+ return vector;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ incrementToken = null;
+ super.reset();
+ }
+
+ //We delay initialization because we can see which attributes the consumer wants, particularly payloads
+ private void init() throws IOException {
+ assert !initialized;
+ int dpEnumFlags = 0;
+ if (vector.hasOffsets()) {
+ offsetAttribute = addAttribute(OffsetAttribute.class);
+ dpEnumFlags |= PostingsEnum.OFFSETS;
+ }
+ if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
+ payloadAttribute = getAttribute(PayloadAttribute.class);
+ payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
+ spareBytesRefBuilder = new BytesRefBuilder();
+ dpEnumFlags |= PostingsEnum.PAYLOADS;
+ }
+
+ // We put term data here
+ termCharsBuilder = new CharsRefBuilder();
+ termCharsBuilder.grow(initTotalTermCharLen());
+
+ // Step 1: iterate termsEnum and create a token, placing into a bucketed array (given a load factor)
+
+ final TokenLL[] tokenBuckets = initTokenBucketsArray();
+ final double OFFSET_TO_BUCKET_IDX = loadFactor / AVG_CHARS_PER_POSITION;
+ final double POSITION_TO_BUCKET_IDX = loadFactor;
+
+ final TermsEnum termsEnum = vector.iterator();
+ BytesRef termBytesRef;
+ PostingsEnum dpEnum = null;
+ final CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call
+
+ TERM_LOOP:
+ while ((termBytesRef = termsEnum.next()) != null) {
+ //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
+ // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
+ tempCharsRefBuilder.grow(termBytesRef.length);
+ final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
+ final int termCharsOff = termCharsBuilder.length();
+ termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
+ dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
+ assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
+ int currentDocId = dpEnum.advance(filteredDocId);
+ if (currentDocId != filteredDocId) {
+ continue; //Not expected
+ }
+ final int freq = dpEnum.freq();
+ for (int j = 0; j < freq; j++) {
+ TokenLL token = new TokenLL();
+ token.position = dpEnum.nextPosition(); // can be -1 if not in the TV
+ token.termCharsOff = termCharsOff;
+ token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
+ // copy offset (if it's there) and compute bucketIdx
+ int bucketIdx;
+ if (offsetAttribute != null) {
+ token.startOffset = dpEnum.startOffset();
+ if (offsetLength >= 0 && token.startOffset > offsetLength) {
+ continue TERM_LOOP;//filter this token out; exceeds threshold
+ }
+ token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
+ bucketIdx = (int) (token.startOffset * OFFSET_TO_BUCKET_IDX);
+ } else {
+ bucketIdx = (int) (token.position * POSITION_TO_BUCKET_IDX);
+ }
+ if (bucketIdx >= tokenBuckets.length) {
+ bucketIdx = tokenBuckets.length - 1;
+ }
+
+ if (payloadAttribute != null) {
+ final BytesRef payload = dpEnum.getPayload();
+ token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
+ }
+
+ //Add token to the head of the bucket linked list
+ token.next = tokenBuckets[bucketIdx];
+ tokenBuckets[bucketIdx] = token;
+ }
+ }
+
+ // Step 2: Link all Tokens into a linked-list and sort all tokens at the same position
+
+ firstToken = initLinkAndSortTokens(tokenBuckets);
+
+ // If the term vector didn't have positions, synthesize them
+ if (!vector.hasPositions() && firstToken != null) {
+ TokenLL prevToken = firstToken;
+ prevToken.position = 0;
+ for (TokenLL token = prevToken.next; token != null; prevToken = token, token = token.next) {
+ if (prevToken.startOffset == token.startOffset) {
+ token.position = prevToken.position;
+ } else {
+ token.position = prevToken.position + 1;
+ }
+ }
+ }
+
+ initialized = true;
+ }
+
+ private static TokenLL initLinkAndSortTokens(TokenLL[] tokenBuckets) {
+ TokenLL firstToken = null;
+ List<TokenLL> scratchTokenArray = new ArrayList<>(); // declare here for re-use. TODO use native array
+ TokenLL prevToken = null;
+ for (TokenLL tokenHead : tokenBuckets) {
+ if (tokenHead == null) {
+ continue;
+ }
+ //sort tokens at this position and link them; return the first
+ TokenLL tokenTail;
+ // just one token
+ if (tokenHead.next == null) {
+ tokenTail = tokenHead;
+ } else {
+ // add the linked list to a temporary array
+ for (TokenLL cur = tokenHead; cur != null; cur = cur.next) {
+ scratchTokenArray.add(cur);
+ }
+ // sort; and set tokenHead & tokenTail
+ if (scratchTokenArray.size() < INSERTION_SORT_THRESHOLD) {
+ // insertion sort by creating a linked list (leave scratchTokenArray alone)
+ tokenHead = tokenTail = scratchTokenArray.get(0);
+ tokenHead.next = null;
+ for (int i = 1; i < scratchTokenArray.size(); i++) {
+ TokenLL insertToken = scratchTokenArray.get(i);
+ if (insertToken.compareTo(tokenHead) <= 0) {
+ // takes the place of tokenHead
+ insertToken.next = tokenHead;
+ tokenHead = insertToken;
+ } else {
+ // goes somewhere after tokenHead
+ for (TokenLL prev = tokenHead; true; prev = prev.next) {
+ if (prev.next == null || insertToken.compareTo(prev.next) <= 0) {
+ if (prev.next == null) {
+ tokenTail = insertToken;
+ }
+ insertToken.next = prev.next;
+ prev.next = insertToken;
+ break;
+ }
+ }
+ }
+ }
+ } else {
+ Collections.sort(scratchTokenArray);
+ // take back out and create a linked list
+ TokenLL prev = tokenHead = scratchTokenArray.get(0);
+ for (int i = 1; i < scratchTokenArray.size(); i++) {
+ prev.next = scratchTokenArray.get(i);
+ prev = prev.next;
+ }
+ tokenTail = prev;
+ tokenTail.next = null;
+ }
+ scratchTokenArray.clear();//too bad ArrayList nulls it out; we don't actually need that
+ }
+
+ //link to previous
+ if (prevToken != null) {
+ assert prevToken.next == null;
+ prevToken.next = tokenHead; //concatenate linked-list
+ assert prevToken.compareTo(tokenHead) < 0 : "wrong offset / position ordering expectations";
+ } else {
+ assert firstToken == null;
+ firstToken = tokenHead;
+ }
+
+ prevToken = tokenTail;
+ }
+ return firstToken;
+ }
+
+ private int initTotalTermCharLen() throws IOException {
+ int guessNumTerms;
+ if (vector.size() != -1) {
+ guessNumTerms = (int) vector.size();
+ } else if (offsetLength != -1) {
+ guessNumTerms = (int) (offsetLength * 0.33);//guess 1/3rd
+ } else {
+ return 128;
+ }
+ return Math.max(64, (int) (guessNumTerms * loadFactor * 7.0));//7 is over-estimate of average term len
+ }
+
+ private TokenLL[] initTokenBucketsArray() throws IOException {
+ // Estimate the number of non-empty positions (number of tokens, excluding same-position synonyms).
+ int positionsEstimate;
+ if (offsetLength == -1) { // no clue what the char length is.
+ // Estimate the number of position slots we need from term stats based on Wikipedia.
+ int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
+ if (sumTotalTermFreq == -1) {//unfortunately term vectors seem to not have this stat
+ int size = (int) vector.size();
+ if (size == -1) {//doesn't happen with term vectors, it seems, but pick a default any way
+ size = 128;
+ }
+ sumTotalTermFreq = (int) (size * 2.4);
+ }
+ positionsEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
+ } else {
+ // guess number of token positions by this factor.
+ positionsEstimate = (int) (offsetLength / AVG_CHARS_PER_POSITION);
+ }
+ // apply the load factor.
+ return new TokenLL[Math.max(1, (int) (positionsEstimate * loadFactor))];
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ int posInc;
+ if (incrementToken == null) {
+ if (!initialized) {
+ init();
+ assert initialized;
+ }
+ incrementToken = firstToken;
+ if (incrementToken == null) {
+ return false;
+ }
+ posInc = incrementToken.position + 1;//first token normally has pos 0; add 1 to get posInc
+ } else if (incrementToken.next != null) {
+ int lastPosition = incrementToken.position;
+ incrementToken = incrementToken.next;
+ posInc = incrementToken.position - lastPosition;
+ } else {
+ return false;
+ }
+ clearAttributes();
+ termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
+
+ positionIncrementAttribute.setPositionIncrement(posInc);
+ if (offsetAttribute != null) {
+ offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
+ }
+ if (payloadAttribute != null && incrementToken.payloadIndex >= 0) {
+ payloadAttribute.setPayload(payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex));
+ }
+ return true;
+ }
+
+ private static class TokenLL implements Comparable<TokenLL> {
+ // This class should weigh 32 bytes, including object header
+
+ int termCharsOff; // see termCharsBuilder
+ short termCharsLen;
+
+ int position;
+ int startOffset;
+ short endOffsetInc; // add to startOffset to get endOffset
+ int payloadIndex;
+
+ TokenLL next;
+
+ @Override
+ public int compareTo(TokenLL tokenB) {
+ int cmp = Integer.compare(this.position, tokenB.position);
+ if (cmp == 0) {
+ cmp = Integer.compare(this.startOffset, tokenB.startOffset);
+ if (cmp == 0) {
+ cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
+ }
+ }
+ return cmp;
+ }
+ }
+}
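
Side note: a sketch of consuming this TokenStream from within the package (the class is
package-private); 'vector' and 'contentLength' are assumed inputs, and the vector is assumed
to have offsets:

    // assumes: Terms vector = reader.getTermVector(docId, field); int contentLength = content.length();
    TokenStream ts = new TokenStreamFromTermVector(vector, contentLength);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(termAtt + " @ " + offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
    }
    ts.end();
    ts.close();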