Posted to commits@lucene.apache.org by ds...@apache.org on 2016/11/15 21:17:04 UTC

[1/2] lucene-solr:master: LUCENE-7526: UnifiedHighlighter: enhance MTQ passage relevancy. TokenStreamFromTermVector isn't used by the UH anymore. Refactor AnalysisOffsetStrategy into TokenStream and MemoryIndex strategies, and related refactorings from that.

Repository: lucene-solr
Updated Branches:
  refs/heads/master 280cbfd8f -> 7af454ad7


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
index 5f09d84..ac5f0f6 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
@@ -117,6 +117,8 @@ public class UnifiedHighlighter {
 
   private boolean defaultHighlightPhrasesStrictly = true; // AKA "accuracy" or "query debugging"
 
+  private boolean defaultPassageRelevancyOverSpeed = true; //For analysis, prefer MemoryIndexOffsetStrategy
+
   // private boolean defaultRequireFieldMatch = true; TODO
 
   private int maxLength = DEFAULT_MAX_LENGTH;
@@ -213,6 +215,12 @@ public class UnifiedHighlighter {
     return defaultHighlightPhrasesStrictly;
   }
 
+
+  protected boolean shouldPreferPassageRelevancyOverSpeed(String field) {
+    return defaultPassageRelevancyOverSpeed;
+  }
+
+
   /**
    * The maximum content size to process.  Content will be truncated to this size before highlighting. Typically
    * snippets closer to the beginning of the document better summarize its content.
@@ -716,8 +724,13 @@ public class UnifiedHighlighter {
   }
 
   protected FieldHighlighter getFieldHighlighter(String field, Query query, SortedSet<Term> allTerms, int maxPassages) {
+    BytesRef[] terms = filterExtractedTerms(field, allTerms);
+    Set<HighlightFlag> highlightFlags = getFlags(field);
+    PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
+    CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
+    OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
     return new FieldHighlighter(field,
-        getOffsetStrategy(field, query, allTerms),
+        getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags),
         new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
         getScorer(field),
         maxPassages,
@@ -725,41 +738,7 @@ public class UnifiedHighlighter {
         getFormatter(field));
   }
 
-  protected FieldOffsetStrategy getOffsetStrategy(String field, Query query, SortedSet<Term> allTerms) {
-    EnumSet<HighlightFlag> highlightFlags = getFlags(field);
-    BytesRef[] terms = filterExtractedTerms(field, allTerms);
-    PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
-    CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
-    OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
-    switch (offsetSource) {
-      case ANALYSIS:
-        return new AnalysisOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(),
-            this::preMultiTermQueryRewrite);
-      case NONE_NEEDED:
-        return NoOpOffsetStrategy.INSTANCE;
-      case TERM_VECTORS:
-        return new TermVectorOffsetStrategy(field, terms, phraseHelper, automata);
-      case POSTINGS:
-        return new PostingsOffsetStrategy(field, terms, phraseHelper, automata);
-      case POSTINGS_WITH_TERM_VECTORS:
-        return new PostingsWithTermVectorsOffsetStrategy(field, terms, phraseHelper, automata);
-      default:
-        throw new IllegalArgumentException("Unrecognized offset source " + offsetSource);
-    }
-  }
-
-  protected EnumSet<HighlightFlag> getFlags(String field) {
-    EnumSet<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class);
-    if (shouldHandleMultiTermQuery(field)) {
-      highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY);
-    }
-    if (shouldHighlightPhrasesStrictly(field)) {
-      highlightFlags.add(HighlightFlag.PHRASES);
-    }
-    return highlightFlags;
-  }
-
-  protected BytesRef[] filterExtractedTerms(String field, SortedSet<Term> queryTerms) {
+  protected static BytesRef[] filterExtractedTerms(String field, SortedSet<Term> queryTerms) {
     // TODO consider requireFieldMatch
     Term floor = new Term(field, "");
     Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
@@ -774,7 +753,21 @@ public class UnifiedHighlighter {
     return terms;
   }
 
-  protected PhraseHelper getPhraseHelper(String field, Query query, EnumSet<HighlightFlag> highlightFlags) {
+  protected Set<HighlightFlag> getFlags(String field) {
+    Set<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class);
+    if (shouldHandleMultiTermQuery(field)) {
+      highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY);
+    }
+    if (shouldHighlightPhrasesStrictly(field)) {
+      highlightFlags.add(HighlightFlag.PHRASES);
+    }
+    if (shouldPreferPassageRelevancyOverSpeed(field)) {
+      highlightFlags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED);
+    }
+    return highlightFlags;
+  }
+
+  protected PhraseHelper getPhraseHelper(String field, Query query, Set<HighlightFlag> highlightFlags) {
     boolean highlightPhrasesStrictly = highlightFlags.contains(HighlightFlag.PHRASES);
     boolean handleMultiTermQuery = highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY);
     return highlightPhrasesStrictly ?
@@ -782,7 +775,7 @@ public class UnifiedHighlighter {
         PhraseHelper.NONE;
   }
 
-  protected CharacterRunAutomaton[] getAutomata(String field, Query query, EnumSet<HighlightFlag> highlightFlags) {
+  protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
     return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
         ? MultiTermHighlighting.extractAutomata(query, field, !highlightFlags.contains(HighlightFlag.PHRASES),
           this::preMultiTermQueryRewrite)
@@ -790,11 +783,12 @@ public class UnifiedHighlighter {
   }
 
   protected OffsetSource getOptimizedOffsetSource(String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
+    OffsetSource offsetSource = getOffsetSource(field);
+
     if (terms.length == 0 && automata.length == 0 && !phraseHelper.willRewrite()) {
       return OffsetSource.NONE_NEEDED; //nothing to highlight
     }
 
-    OffsetSource offsetSource = getOffsetSource(field);
     switch (offsetSource) {
       case POSTINGS:
         if (phraseHelper.willRewrite()) {
@@ -822,6 +816,32 @@ public class UnifiedHighlighter {
     return offsetSource;
   }
 
+  protected FieldOffsetStrategy getOffsetStrategy(OffsetSource offsetSource, String field, BytesRef[] terms,
+                                                  PhraseHelper phraseHelper, CharacterRunAutomaton[] automata,
+                                                  Set<HighlightFlag> highlightFlags) {
+    switch (offsetSource) {
+      case ANALYSIS:
+        if (!phraseHelper.hasPositionSensitivity() &&
+            !highlightFlags.contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED)) {
+          //skip using a memory index since it's pure term filtering
+          return new TokenStreamOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer());
+        } else {
+          return new MemoryIndexOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(),
+              this::preMultiTermQueryRewrite);
+        }
+      case NONE_NEEDED:
+        return NoOpOffsetStrategy.INSTANCE;
+      case TERM_VECTORS:
+        return new TermVectorOffsetStrategy(field, terms, phraseHelper, automata);
+      case POSTINGS:
+        return new PostingsOffsetStrategy(field, terms, phraseHelper, automata);
+      case POSTINGS_WITH_TERM_VECTORS:
+        return new PostingsWithTermVectorsOffsetStrategy(field, terms, phraseHelper, automata);
+      default:
+        throw new IllegalArgumentException("Unrecognized offset source " + offsetSource);
+    }
+  }
+
   /**
    * When highlighting phrases accurately, we need to know which {@link SpanQuery}'s need to have
    * {@link Query#rewrite(IndexReader)} called on them.  It helps performance to avoid it if it's not needed.
@@ -1041,10 +1061,9 @@ public class UnifiedHighlighter {
    */
   public enum HighlightFlag {
     PHRASES,
-    MULTI_TERM_QUERY
+    MULTI_TERM_QUERY,
+    PASSAGE_RELEVANCY_OVER_SPEED
     // TODO: ignoreQueryFields
     // TODO: useQueryBoosts
-    // TODO: avoidMemoryIndexIfPossible
-    // TODO: preferMemoryIndexForStats
   }
 }
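
The upshot of the new flag above for users: a UnifiedHighlighter subclass can now
override shouldPreferPassageRelevancyOverSpeed() to return false, which (absent
position-sensitive phrases) steers ANALYSIS offset sources to the cheaper
TokenStreamOffsetStrategy instead of MemoryIndexOffsetStrategy. A minimal sketch,
assuming a searcher, index analyzer, query, and topDocs are already in hand:

    UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) {
      @Override
      protected boolean shouldPreferPassageRelevancyOverSpeed(String field) {
        return false; // trade some MTQ passage relevancy for re-analysis speed
      }
    };
    String[] snippets = highlighter.highlight("body", query, topDocs);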

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
index ddc9507..be0ff1b 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
@@ -773,7 +773,40 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
     ir.close();
   }
 
-  public void testTokenStreamIsClosed() throws IOException {
+  public void testWithMaxLenAndMultipleWildcardMatches() throws IOException {
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);
+
+    Field body = new Field("body", "", fieldType);
+    Document doc = new Document();
+    doc.add(body);
+
+    //tests interleaving of multiple wildcard matches with the CompositeOffsetsPostingsEnum
+    //In this case the CompositeOffsetsPostingsEnum will have an underlying PostingsEnum that jumps from pos 1 to 9 for bravo
+    //and a second with position 2 for Bravado
+    body.setStringValue("Alpha Bravo Bravado foo foo foo. Foo foo Alpha Bravo");
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
+    highlighter.setMaxLength(32);//a little past first sentence
+
+    BooleanQuery query = new BooleanQuery.Builder()
+        .add(new TermQuery(new Term("body", "alpha")), BooleanClause.Occur.MUST)
+        .add(new PrefixQuery(new Term("body", "bra")), BooleanClause.Occur.MUST)
+        .build();
+    TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
+    String snippets[] = highlighter.highlight("body", query, topDocs, 2);//ask for 2 but we'll only get 1
+    assertArrayEquals(
+        new String[]{"<b>Alpha</b> <b>Bravo</b> <b>Bravado</b> foo foo foo."}, snippets
+    );
+
+    ir.close();
+  }
+
+  public void testTokenStreamIsClosed() throws Exception {
     // note: test is a derivative of testWithMaxLen()
     RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);
 
@@ -828,8 +861,8 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
       if (fieldType == UHTestHelper.reanalysisType) {
         fail("Expecting EXPECTED IOException");
       }
-    } catch (IOException e) {
-      if (!e.getMessage().equals("EXPECTED")) {
+    } catch (Exception e) {
+      if (!e.getMessage().contains("EXPECTED")) {
         throw e;
       }
     }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java
index bc2a14d..64570ae 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java
@@ -50,9 +50,8 @@ public class TestUnifiedHighlighterRanking extends LuceneTestCase {
 
   Analyzer indexAnalyzer;
 
-  // note: don't choose reanalysis because it doesn't always know the term frequency, which is a statistic used
-  //   in passage ranking.  Sometimes it does (e.g. when it builds a MemoryIndex) but not necessarily.
-  final FieldType fieldType = UHTestHelper.randomFieldType(random(), UHTestHelper.postingsType, UHTestHelper.tvType);
+  // note: all offset sources, by default, use term freq, so it shouldn't matter which we choose.
+  final FieldType fieldType = UHTestHelper.randomFieldType(random());
 
   /**
    * indexes a bunch of gibberish, and then highlights top(n).

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
index 641a835..d150940 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
@@ -22,11 +22,13 @@ import java.text.BreakIterator;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.SortedSet;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
@@ -68,6 +70,11 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
         return Collections.emptyList();
       }
 
+      @Override
+      protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader leafReader, int doc) throws IOException {
+        return super.createOffsetsEnumsFromReader(leafReader, doc);
+      }
+
     };
     assertEquals(offsetSource, strategy.getOffsetSource());
   }
@@ -142,8 +149,8 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
       }
 
       @Override
-      protected FieldOffsetStrategy getOffsetStrategy(String field, Query query, SortedSet<Term> allTerms) {
-        return super.getOffsetStrategy(field, query, allTerms);
+      protected FieldOffsetStrategy getOffsetStrategy(OffsetSource offsetSource, String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Set<HighlightFlag> highlightFlags) {
+        return super.getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
       }
 
       @Override


[2/2] lucene-solr:master: LUCENE-7526: UnifiedHighlighter: enhance MTQ passage relevancy. TokenStreamFromTermVector isn't used by the UH anymore. Refactor AnalysisOffsetStrategy into TokenStream and MemoryIndex strategies, and related refactorings from that.

Posted by ds...@apache.org.
LUCENE-7526: UnifiedHighlighter: enhance MTQ passage relevancy. TokenStreamFromTermVector isn't used by the UH anymore. Refactor AnalysisOffsetStrategy into TokenStream and MemoryIndex strategies, and related refactorings from that.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/7af454ad
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/7af454ad
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/7af454ad

Branch: refs/heads/master
Commit: 7af454ad767c3a0364757d6fcf55bff9f063febe
Parents: 280cbfd
Author: David Smiley <ds...@apache.org>
Authored: Tue Nov 15 16:16:46 2016 -0500
Committer: David Smiley <ds...@apache.org>
Committed: Tue Nov 15 16:16:46 2016 -0500

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   5 +
 lucene/benchmark/conf/highlighters-postings.alg |   4 +-
 lucene/benchmark/conf/highlighters-tv.alg       |   2 +-
 .../uhighlight/AnalysisOffsetStrategy.java      | 261 ++++++------
 .../CompositeOffsetsPostingsEnum.java           | 145 +++++++
 .../search/uhighlight/FieldOffsetStrategy.java  | 115 ++++--
 .../uhighlight/MemoryIndexOffsetStrategy.java   | 129 ++++++
 .../uhighlight/MultiTermHighlighting.java       | 190 ---------
 .../uhighlight/MultiValueTokenStream.java       | 148 -------
 .../lucene/search/uhighlight/OffsetsEnum.java   |   1 +
 .../lucene/search/uhighlight/Passage.java       |   2 +-
 .../lucene/search/uhighlight/PhraseHelper.java  |   2 +-
 .../uhighlight/PostingsOffsetStrategy.java      |   3 +-
 .../PostingsWithTermVectorsOffsetStrategy.java  |   8 +-
 .../uhighlight/TermVectorOffsetStrategy.java    |  15 +-
 .../uhighlight/TokenStreamFromTermVector.java   | 395 -------------------
 .../uhighlight/TokenStreamOffsetStrategy.java   | 173 ++++++++
 .../search/uhighlight/UnifiedHighlighter.java   | 103 +++--
 .../uhighlight/TestUnifiedHighlighterMTQ.java   |  39 +-
 .../TestUnifiedHighlighterRanking.java          |   5 +-
 .../TestUnifiedHighlighterExtensibility.java    |  11 +-
 21 files changed, 763 insertions(+), 993 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 5182276..a6c6dbe 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -85,6 +85,11 @@ Improvements
 * LUCENE-7524: Added more detailed explanation of how IDF is computed in
   ClassicSimilarity and BM25Similarity. (Adrien Grand)
 
+* LUCENE-7526: Enhanced UnifiedHighlighter's passage relevancy for queries with
+  wildcards and sometimes just terms. Added shouldPreferPassageRelevancyOverSpeed()
+  which can be overridden to return false to eke out more speed in some cases.
+  (Timothy M. Rodriguez, David Smiley)
+
 Other
 
 * LUCENE-7546: Fixed references to benchmark wikipedia data and the Jenkins line-docs file

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/benchmark/conf/highlighters-postings.alg
----------------------------------------------------------------------
diff --git a/lucene/benchmark/conf/highlighters-postings.alg b/lucene/benchmark/conf/highlighters-postings.alg
index cf9df11..610908f 100644
--- a/lucene/benchmark/conf/highlighters-postings.alg
+++ b/lucene/benchmark/conf/highlighters-postings.alg
@@ -34,7 +34,7 @@ content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
 docs.file=temp/enwiki-20070527-pages-articles.xml.bz2
 
 query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
-file.query.maker.file=conf/query-phrases.txt
+file.query.maker.file=conf/query-terms.txt
 log.queries=false
 log.step.SearchTravRetHighlight=-1
 
@@ -55,7 +55,7 @@ highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
 
         { "Warm" SearchTravRetHighlight > : 1000
 
-        { "HL" SearchTravRetHighlight > : 500
+        { "HL" SearchTravRetHighlight > : 2000
 
         CloseReader
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/benchmark/conf/highlighters-tv.alg
----------------------------------------------------------------------
diff --git a/lucene/benchmark/conf/highlighters-tv.alg b/lucene/benchmark/conf/highlighters-tv.alg
index 1e51018..26b64a3 100644
--- a/lucene/benchmark/conf/highlighters-tv.alg
+++ b/lucene/benchmark/conf/highlighters-tv.alg
@@ -54,7 +54,7 @@ highlighter=HlImpl:NONE:SH_V:FVH_V:UH_V
 
         { "Warm" SearchTravRetHighlight > : 1000
 
-        { "HL" SearchTravRetHighlight > : 500
+        { "HL" SearchTravRetHighlight > : 2000
 
         CloseReader
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java
index 6b4cc74..e9db77c 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java
@@ -17,181 +17,154 @@
 package org.apache.lucene.search.uhighlight;
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.function.Function;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.memory.MemoryIndex;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 
-
 /**
- * Uses an {@link Analyzer} on content to get offsets. It may use a {@link MemoryIndex} too.
+ * Provides a base class for analysis-based offset strategies to extend from.
+ * Requires an Analyzer and provides an overridable method for altering how
+ * the TokenStream is created.
  *
  * @lucene.internal
  */
-public class AnalysisOffsetStrategy extends FieldOffsetStrategy {
-
-  //TODO: Consider splitting this highlighter into a MemoryIndexFieldHighlighter and a TokenStreamFieldHighlighter
-  private static final BytesRef[] ZERO_LEN_BYTES_REF_ARRAY = new BytesRef[0];
-  private final Analyzer analyzer;
-  private final MemoryIndex memoryIndex;
-  private final LeafReader leafReader;
-  private final CharacterRunAutomaton preMemIndexFilterAutomaton;
-
-  public AnalysisOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper,
-                                CharacterRunAutomaton[] automata, Analyzer analyzer,
-                                Function<Query, Collection<Query>> multiTermQueryRewrite) {
-    super(field, extractedTerms, phraseHelper, automata);
-    this.analyzer = analyzer;
-    // Automata (Wildcards / MultiTermQuery):
-    this.automata = automata;
+public abstract class AnalysisOffsetStrategy extends FieldOffsetStrategy {
 
-    if (terms.length > 0 && !strictPhrases.hasPositionSensitivity()) {
-      this.automata = convertTermsToAutomata(terms, automata);
-      // clear the terms array now that we've moved them to be expressed as automata
-      terms = ZERO_LEN_BYTES_REF_ARRAY;
-    }
+  protected final Analyzer analyzer;
 
-    if (terms.length > 0 || strictPhrases.willRewrite()) { //needs MemoryIndex
-      // init MemoryIndex
-      boolean storePayloads = strictPhrases.hasPositionSensitivity(); // might be needed
-      memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets
-      leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader();
-      // preFilter for MemoryIndex
-      preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, strictPhrases,
-          multiTermQueryRewrite);
-    } else {
-      memoryIndex = null;
-      leafReader = null;
-      preMemIndexFilterAutomaton = null;
+  public AnalysisOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Analyzer analyzer) {
+    super(field, queryTerms, phraseHelper, automata);
+    this.analyzer = analyzer;
+    if (analyzer.getOffsetGap(field) != 1) { // note: 1 is the default. It is RARELY changed.
+      throw new IllegalArgumentException(
+          "offset gap of the provided analyzer should be 1 (field " + field + ")");
     }
-
   }
 
   @Override
-  public UnifiedHighlighter.OffsetSource getOffsetSource() {
+  public final UnifiedHighlighter.OffsetSource getOffsetSource() {
     return UnifiedHighlighter.OffsetSource.ANALYSIS;
   }
 
-  @Override
-  public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
-    // note: don't need LimitTokenOffsetFilter since content is already truncated to maxLength
-    TokenStream tokenStream = tokenStream(content);
-
-    if (memoryIndex != null) { // also handles automata.length > 0
-      // We use a MemoryIndex and index the tokenStream so that later we have the PostingsEnum with offsets.
-
-      // note: An *alternative* strategy is to get PostingsEnums without offsets from the main index
-      //  and then marry this up with a fake PostingsEnum backed by a TokenStream (which has the offsets) and
-      //  can use that to filter applicable tokens?  It would have the advantage of being able to exit
-      //  early and save some re-analysis.  This would be an additional method/offset-source approach
-      //  since it's still useful to highlight without any index (so we build MemoryIndex).
-
-      // note: probably unwise to re-use TermsEnum on reset mem index so we don't. But we do re-use the
-      //   leaf reader, which is a bit more top level than in the guts.
-      memoryIndex.reset();
-
-      // Filter the tokenStream to applicable terms
-      if (preMemIndexFilterAutomaton != null) {
-        tokenStream = newKeepWordFilter(tokenStream, preMemIndexFilterAutomaton);
-      }
-      memoryIndex.addField(field, tokenStream);//note: calls tokenStream.reset() & close()
-      tokenStream = null; // it's consumed; done.
-      docId = 0;
-
-      if (automata.length > 0) {
-        Terms foundTerms = leafReader.terms(field);
-        if (foundTerms == null) {
-          return Collections.emptyList(); //No offsets for this field.
-        }
-        // Un-invert for the automata. Much more compact than a CachingTokenStream
-        tokenStream = MultiTermHighlighting.uninvertAndFilterTerms(foundTerms, 0, automata, content.length());
-      }
-
-    }
-
-    return createOffsetsEnums(leafReader, docId, tokenStream);
-  }
-
   protected TokenStream tokenStream(String content) throws IOException {
-    return MultiValueTokenStream.wrap(field, analyzer, content, UnifiedHighlighter.MULTIVAL_SEP_CHAR);
-  }
-
-  private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
-    CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
-    for (int i = 0; i < terms.length; i++) {
-      newAutomata[i] = MultiTermHighlighting.makeStringMatchAutomata(terms[i]);
+    // If there is no splitChar in content then we needn't wrap:
+    int splitCharIdx = content.indexOf(UnifiedHighlighter.MULTIVAL_SEP_CHAR);
+    if (splitCharIdx == -1) {
+      return analyzer.tokenStream(field, content);
     }
-    // Append existing automata (that which is used for MTQs)
-    System.arraycopy(automata, 0, newAutomata, terms.length, automata.length);
-    return newAutomata;
-  }
 
-  private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
-                                                        final CharacterRunAutomaton charRunAutomaton) {
-    // it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
-    return new FilteringTokenFilter(tokenStream) {
-      final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class);
+    TokenStream subTokenStream = analyzer.tokenStream(field, content.substring(0, splitCharIdx));
 
-      @Override
-      protected boolean accept() throws IOException {
-        return charRunAutomaton.run(charAtt.buffer(), 0, charAtt.length());
-      }
-    };
+    return new MultiValueTokenStream(subTokenStream, field, analyzer, content, UnifiedHighlighter.MULTIVAL_SEP_CHAR, splitCharIdx);
   }
 
-
   /**
-   * Build one {@link CharacterRunAutomaton} matching any term the query might match.
+   * Wraps an {@link Analyzer} and string text that represents multiple values delimited by a specified character. This
+   * exposes a TokenStream that matches what would get indexed considering the
+   * {@link Analyzer#getPositionIncrementGap(String)}. Currently this assumes {@link Analyzer#getOffsetGap(String)} is
+   * 1; an exception will be thrown if it isn't.
+   * <br />
+   * It would be more orthogonal for this to be an Analyzer since we're wrapping an Analyzer but doing so seems like
+   * more work.  The underlying components see a Reader not a String -- and the String is easy to
+   * split up without redundant buffering.
+   *
+   * @lucene.internal
    */
-  private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms,
-                                                              CharacterRunAutomaton[] automata,
-                                                              PhraseHelper strictPhrases,
-                                                              Function<Query, Collection<Query>> multiTermQueryRewrite) {
-    List<CharacterRunAutomaton> allAutomata = new ArrayList<>();
-    if (terms.length > 0) {
-      allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(terms))));
-    }
-    Collections.addAll(allAutomata, automata);
-    for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) {
-      Collections.addAll(allAutomata,
-          MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan
+  private static final class MultiValueTokenStream extends TokenFilter {
+
+    private final String fieldName;
+    private final Analyzer indexAnalyzer;
+    private final String content;
+    private final char splitChar;
+
+    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+    private int startValIdx = 0;
+    private int endValIdx;
+    private int remainingPosInc = 0;
+
+    private MultiValueTokenStream(TokenStream subTokenStream, String fieldName, Analyzer indexAnalyzer,
+                                  String content, char splitChar, int splitCharIdx) {
+      super(subTokenStream); // subTokenStream is already initialized to operate on the first value
+      this.fieldName = fieldName;
+      this.indexAnalyzer = indexAnalyzer;
+      this.content = content;
+      this.splitChar = splitChar;
+      this.endValIdx = splitCharIdx;
     }
 
-    if (allAutomata.size() == 1) {
-      return allAutomata.get(0);
+    @Override
+    public void reset() throws IOException {
+      if (startValIdx != 0) {
+        throw new IllegalStateException("This TokenStream wasn't developed to be re-used.");
+        // ... although we could if a need for it arises.
+      }
+      super.reset();
     }
-    //TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
-    //  could union them all. But it's not exposed, and note TermRangeQuery isn't modelled as an Automaton
-    //  by MultiTermHighlighting.
-
-    // Return an aggregate CharacterRunAutomaton of others
-    return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used
-      @Override
-      public boolean run(char[] chars, int offset, int length) {
-        for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
-          if (allAutomata.get(i).run(chars, offset, length)) {
-            return true;
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      while (true) {
+
+        if (input.incrementToken()) {
+          // Position tracking:
+          if (remainingPosInc > 0) {//usually true for the first token of each later value (not the first value)
+            posIncAtt.setPositionIncrement(remainingPosInc + posIncAtt.getPositionIncrement());
+            remainingPosInc = 0;//reset
           }
+          // Offset tracking:
+          offsetAtt.setOffset(
+              startValIdx + offsetAtt.startOffset(),
+              startValIdx + offsetAtt.endOffset()
+          );
+          return true;
+        }
+
+        if (endValIdx == content.length()) {//no more
+          return false;
         }
-        return false;
-      }
-    };
-  }
 
+        input.end(); // might adjust position increment
+        remainingPosInc += posIncAtt.getPositionIncrement();
+        input.close();
+        remainingPosInc += indexAnalyzer.getPositionIncrementGap(fieldName);
+
+        // Get new tokenStream based on next segment divided by the splitChar
+        startValIdx = endValIdx + 1;
+        endValIdx = content.indexOf(splitChar, startValIdx);
+        if (endValIdx == -1) {//EOF
+          endValIdx = content.length();
+        }
+        TokenStream tokenStream = indexAnalyzer.tokenStream(fieldName, content.substring(startValIdx, endValIdx));
+        if (tokenStream != input) {// (input is defined in TokenFilter set in the constructor)
+          // This is a grand trick we do -- knowing that the analyzer's re-use strategy is going to produce the
+          // very same tokenStream instance and thus have the same AttributeSource as this wrapping TokenStream
+          // since we used it as our input in the constructor.
+          // Were this not the case, we'd have to copy every attribute of interest since we can't alter the
+          // AttributeSource of this wrapping TokenStream post-construction (it's all private/final).
+          // If this is a problem, we could do that instead; maybe with a custom CharTermAttribute that allows
+          // us to easily set the char[] reference without literally copying char by char.
+          throw new IllegalStateException("Require TokenStream re-use.  Unsupported re-use strategy?: " +
+              indexAnalyzer.getReuseStrategy());
+        }
+        tokenStream.reset();
+      } // while loop to increment token of this new value
+    }
+
+    @Override
+    public void end() throws IOException {
+      super.end();
+      // Offset tracking:
+      offsetAtt.setOffset(
+          startValIdx + offsetAtt.startOffset(),
+          startValIdx + offsetAtt.endOffset());
+    }
+
+  }
 }
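
The "grand trick" comment above relies on a documented property of Analyzer re-use:
under the default re-use strategy, requesting another TokenStream for the same field
returns the same instance (hence the same AttributeSource) once the prior stream is
closed. A minimal sketch of that contract, using WhitespaceAnalyzer as a stand-in
for any re-using analyzer:

    Analyzer analyzer = new WhitespaceAnalyzer();
    TokenStream first = analyzer.tokenStream("body", "value one");
    first.reset();
    while (first.incrementToken()) { } // consume the first value
    first.end();
    first.close();
    // Same field, new content: the default re-use strategy hands back the same
    // instance, so a TokenFilter wrapping `first` keeps seeing valid attributes.
    TokenStream second = analyzer.tokenStream("body", "value two");
    assert first == second;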

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CompositeOffsetsPostingsEnum.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CompositeOffsetsPostingsEnum.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CompositeOffsetsPostingsEnum.java
new file mode 100644
index 0000000..356f553
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CompositeOffsetsPostingsEnum.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * Provides a view over several underlying PostingsEnums for the iteration of offsets on the current document only.
+ * It's not general purpose; the position returned is always -1 and it doesn't iterate the documents.
+ */
+final class CompositeOffsetsPostingsEnum extends PostingsEnum {
+
+  private final int docId;
+  private final int freq;
+  private final PriorityQueue<BoundsCheckingPostingsEnum> queue;
+  private boolean firstPositionConsumed = false;
+
+  /**
+   * This class is used to ensure we don't over-iterate the underlying
+   * postings enum by keeping track of the position relative to the
+   * frequency.
+   * Ideally this would've been an implementation of a PostingsEnum
+   * but it would have to delegate most methods and it seemed easier
+   * to just wrap the tweaked method.
+   */
+  private static final class BoundsCheckingPostingsEnum {
+
+    private final PostingsEnum postingsEnum;
+    private int remainingPositions;
+
+    BoundsCheckingPostingsEnum(PostingsEnum postingsEnum) throws IOException {
+      this.postingsEnum = postingsEnum;
+      this.remainingPositions = postingsEnum.freq();
+      nextPosition();
+    }
+
+    /** Advances to the next position and returns true, or returns false if it can't. */
+    private boolean nextPosition() throws IOException {
+      if (remainingPositions-- > 0) {
+        postingsEnum.nextPosition(); // ignore the actual position; we don't care.
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+  }
+
+  /** The provided {@link PostingsEnum}s must all be positioned to the same document, and must have offsets. */
+  CompositeOffsetsPostingsEnum(List<PostingsEnum> postingsEnums) throws IOException {
+    queue = new PriorityQueue<BoundsCheckingPostingsEnum>(postingsEnums.size()) {
+      @Override
+      protected boolean lessThan(BoundsCheckingPostingsEnum a, BoundsCheckingPostingsEnum b) {
+        try {
+          return a.postingsEnum.startOffset() < b.postingsEnum.startOffset();
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+      }
+    };
+
+    int freqAdd = 0;
+    for (PostingsEnum postingsEnum : postingsEnums) {
+      queue.add(new BoundsCheckingPostingsEnum(postingsEnum));
+      freqAdd += postingsEnum.freq();
+    }
+    freq = freqAdd;
+    this.docId = queue.top().postingsEnum.docID();
+  }
+
+  @Override
+  public int freq() throws IOException {
+    return freq;
+  }
+
+  /** Advances to the next position. Always returns -1; for the highlighter's purposes the caller doesn't need the actual position. */
+  @Override
+  public int nextPosition() throws IOException {
+    if (!firstPositionConsumed) {
+      firstPositionConsumed = true;
+    } else if (queue.size() == 0) {
+      throw new IllegalStateException("nextPosition called too many times");
+    } else if (queue.top().nextPosition()) { // advance head
+      queue.updateTop(); //the new position may be behind another postingsEnum in the queue
+    } else {
+      queue.pop(); //this postingsEnum is consumed; get rid of it. Another will take its place.
+    }
+    assert queue.size() > 0;
+    return -1;
+  }
+
+  @Override
+  public int startOffset() throws IOException {
+    return queue.top().postingsEnum.startOffset();
+  }
+
+  @Override
+  public int endOffset() throws IOException {
+    return queue.top().postingsEnum.endOffset();
+  }
+
+  @Override
+  public BytesRef getPayload() throws IOException {
+    return queue.top().postingsEnum.getPayload();
+  }
+
+  @Override
+  public int docID() {
+    return docId;
+  }
+
+  @Override
+  public int nextDoc() throws IOException {
+    return NO_MORE_DOCS;
+  }
+
+  @Override
+  public int advance(int target) throws IOException {
+    return NO_MORE_DOCS;
+  }
+
+  @Override
+  public long cost() {
+    return 1L; //at most 1 doc is returned
+  }
+}
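
The queue logic above is a classic k-way merge, keyed on startOffset and bounded by
freq(). Stripped of the Lucene plumbing, the idea reduces to the following sketch,
where plain int offsets stand in for each term's PostingsEnum:

    import java.util.Comparator;
    import java.util.PriorityQueue;

    int[][] perTermOffsets = { {6, 49}, {12} }; // e.g. "bravo" at 6 and 49, "bravado" at 12
    // Each queue entry is {listIndex, positionWithinList}, ordered by the offset it points at.
    PriorityQueue<int[]> queue =
        new PriorityQueue<>(Comparator.comparingInt(e -> perTermOffsets[e[0]][e[1]]));
    for (int i = 0; i < perTermOffsets.length; i++) {
      queue.add(new int[] {i, 0});
    }
    while (!queue.isEmpty()) {
      int[] head = queue.poll();
      System.out.println(perTermOffsets[head[0]][head[1]]); // prints 6, 12, 49
      if (head[1] + 1 < perTermOffsets[head[0]].length) {
        queue.add(new int[] {head[0], head[1] + 1}); // advance that list; re-enter the queue
      }
    }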

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
index 04df31e..155f0a7 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
@@ -14,16 +14,14 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.lucene.search.uhighlight;
 
-import java.io.Closeable;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.PostingsEnum;
@@ -31,6 +29,7 @@ import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.spans.Spans;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 
 /**
@@ -42,14 +41,14 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 public abstract class FieldOffsetStrategy {
 
   protected final String field;
-  protected BytesRef[] terms; // Query: free-standing terms
-  protected PhraseHelper strictPhrases; // Query: position-sensitive information TODO: rename
-  protected CharacterRunAutomaton[] automata; // Query: free-standing wildcards (multi-term query)
+  protected final PhraseHelper phraseHelper; // Query: position-sensitive information TODO: rename
+  protected final BytesRef[] terms; // Query: free-standing terms
+  protected final CharacterRunAutomaton[] automata; // Query: free-standing wildcards (multi-term query)
 
   public FieldOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
     this.field = field;
     this.terms = queryTerms;
-    this.strictPhrases = phraseHelper;
+    this.phraseHelper = phraseHelper;
     this.automata = automata;
   }
 
@@ -65,58 +64,90 @@ public abstract class FieldOffsetStrategy {
    */
   public abstract List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException;
 
-  protected List<OffsetsEnum> createOffsetsEnums(LeafReader leafReader, int doc, TokenStream tokenStream) throws IOException {
-    List<OffsetsEnum> offsetsEnums = createOffsetsEnumsFromReader(leafReader, doc);
-    if (automata.length > 0) {
-      offsetsEnums.add(createOffsetsEnumFromTokenStream(doc, tokenStream));
+  protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader leafReader, int doc) throws IOException {
+    final Terms termsIndex = leafReader.terms(field);
+    if (termsIndex == null) {
+      return Collections.emptyList();
     }
-    return offsetsEnums;
-  }
 
-  protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader atomicReader, int doc) throws IOException {
     // For strict positions, get a Map of term to Spans:
     //    note: ScriptPhraseHelper.NONE does the right thing for these method calls
     final Map<BytesRef, Spans> strictPhrasesTermToSpans =
-        strictPhrases.getTermToSpans(atomicReader, doc);
+        phraseHelper.getTermToSpans(leafReader, doc);
     // Usually simply wraps terms in a List; but if willRewrite() then can be expanded
     final List<BytesRef> sourceTerms =
-        strictPhrases.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);
+        phraseHelper.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);
 
-    final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + 1);
+    final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + automata.length);
 
-    Terms termsIndex = atomicReader == null || sourceTerms.isEmpty() ? null : atomicReader.terms(field);
-    if (termsIndex != null) {
+    // Handle sourceTerms:
+    if (!sourceTerms.isEmpty()) {
       TermsEnum termsEnum = termsIndex.iterator();//does not return null
       for (BytesRef term : sourceTerms) {
-        if (!termsEnum.seekExact(term)) {
-          continue; // term not found
-        }
-        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
-        if (postingsEnum == null) {
-          // no offsets or positions available
-          throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
-        }
-        if (doc != postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
-          continue;
+        if (termsEnum.seekExact(term)) {
+          PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
+
+          if (postingsEnum == null) {
+            // no offsets or positions available
+            throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
+          }
+
+          if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
+            postingsEnum = phraseHelper.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
+            if (postingsEnum != null) {
+              offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
+            }
+          }
         }
-        postingsEnum = strictPhrases.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
-        if (postingsEnum == null) {
-          continue;// completely filtered out
+      }
+    }
+
+    // Handle automata
+    if (automata.length > 0) {
+      offsetsEnums.addAll(createAutomataOffsetsFromTerms(termsIndex, doc));
+    }
+
+    return offsetsEnums;
+  }
+
+  protected List<OffsetsEnum> createAutomataOffsetsFromTerms(Terms termsIndex, int doc) throws IOException {
+    List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
+    for (int i = 0; i < automata.length; i++) {
+      automataPostings.add(new ArrayList<>());
+    }
+
+    TermsEnum termsEnum = termsIndex.iterator();
+    BytesRef term;
+    CharsRefBuilder refBuilder = new CharsRefBuilder();
+    while ((term = termsEnum.next()) != null) {
+      for (int i = 0; i < automata.length; i++) {
+        CharacterRunAutomaton automaton = automata[i];
+        refBuilder.copyUTF8Bytes(term);
+        if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
+          PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
+          if (doc == postings.advance(doc)) {
+            automataPostings.get(i).add(postings);
+          }
         }
+      }
+    }
 
-        offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
+    List<OffsetsEnum> offsetsEnums = new ArrayList<>(automata.length); //will be at most this long
+    for (int i = 0; i < automata.length; i++) {
+      CharacterRunAutomaton automaton = automata[i];
+      List<PostingsEnum> postingsEnums = automataPostings.get(i);
+      int size = postingsEnums.size();
+      if (size > 0) { //only add if we have offsets
+        BytesRef wildcardTerm = new BytesRef(automaton.toString());
+        if (size == 1) { //don't wrap in a composite if there's only one OffsetsEnum
+          offsetsEnums.add(new OffsetsEnum(wildcardTerm, postingsEnums.get(0)));
+        } else {
+          offsetsEnums.add(new OffsetsEnum(wildcardTerm, new CompositeOffsetsPostingsEnum(postingsEnums)));
+        }
       }
     }
+
     return offsetsEnums;
   }
 
-  protected OffsetsEnum createOffsetsEnumFromTokenStream(int doc, TokenStream tokenStream) throws IOException {
-    // if there are automata (MTQ), we have to initialize the "fake" enum wrapping them.
-    assert tokenStream != null;
-    // TODO Opt: we sometimes evaluate the automata twice when this TS isn't the original; can we avoid?
-    PostingsEnum mtqPostingsEnum = MultiTermHighlighting.getDocsEnum(tokenStream, automata);
-    assert mtqPostingsEnum instanceof Closeable; // FYI we propagate close() later.
-    mtqPostingsEnum.advance(doc);
-    return new OffsetsEnum(null, mtqPostingsEnum);
-  }
 }
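
For the automata path above, the matching primitive is just CharacterRunAutomaton.run
over each term's characters while walking the term dictionary once. A small sketch of
that check, with a hypothetical prefix automaton like the one a PrefixQuery on "bra"
would be extracted to:

    CharacterRunAutomaton bra = new CharacterRunAutomaton(new RegExp("bra.*").toAutomaton());
    for (String term : new String[] {"alpha", "bravado", "bravo"}) {
      if (bra.run(term)) { // matches "bravado" and "bravo"
        // collect this term's postings (PostingsEnum.OFFSETS) for the composite enum
      }
    }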

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
new file mode 100644
index 0000000..4028912
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.function.Function;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.memory.MemoryIndex;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+
+
+/**
+ * Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.
+ *
+ * @lucene.internal
+ */
+public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
+
+  private final MemoryIndex memoryIndex;
+  private final LeafReader leafReader;
+  private final CharacterRunAutomaton preMemIndexFilterAutomaton;
+
+  public MemoryIndexOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper,
+                                   CharacterRunAutomaton[] automata, Analyzer analyzer,
+                                   Function<Query, Collection<Query>> multiTermQueryRewrite) {
+    super(field, extractedTerms, phraseHelper, automata, analyzer);
+    boolean storePayloads = phraseHelper.hasPositionSensitivity(); // might be needed
+    memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets
+    leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // appears to be re-usable
+    // preFilter for MemoryIndex
+    preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, phraseHelper, multiTermQueryRewrite);
+  }
+
+  /**
+   * Build one {@link CharacterRunAutomaton} matching any term the query might match.
+   */
+  private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms,
+                                                              CharacterRunAutomaton[] automata,
+                                                              PhraseHelper strictPhrases,
+                                                              Function<Query, Collection<Query>> multiTermQueryRewrite) {
+    List<CharacterRunAutomaton> allAutomata = new ArrayList<>();
+    if (terms.length > 0) {
+      allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(terms))));
+    }
+    Collections.addAll(allAutomata, automata);
+    for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) {
+      Collections.addAll(allAutomata,
+          MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan
+    }
+
+    if (allAutomata.size() == 1) {
+      return allAutomata.get(0);
+    }
+    //TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
+    //  could union them all. But it's not exposed, and note TermRangeQuery isn't modelled as an Automaton
+    //  by MultiTermHighlighting.
+
+    // Return an aggregate CharacterRunAutomaton of others
+    return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used
+      @Override
+      public boolean run(char[] chars, int offset, int length) {
+        for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
+          if (allAutomata.get(i).run(chars, offset, length)) {
+            return true;
+          }
+        }
+        return false;
+      }
+    };
+  }
+
+  @Override
+  public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
+    // note: don't need LimitTokenOffsetFilter since content is already truncated to maxLength
+    TokenStream tokenStream = tokenStream(content);
+
+    // Filter the tokenStream to applicable terms
+    tokenStream = newKeepWordFilter(tokenStream, preMemIndexFilterAutomaton);
+    memoryIndex.reset();
+    memoryIndex.addField(field, tokenStream);//note: calls tokenStream.reset() & close()
+    docId = 0;
+
+    return createOffsetsEnumsFromReader(leafReader, docId);
+  }
+
+
+  private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
+                                                        final CharacterRunAutomaton charRunAutomaton) {
+    // it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
+    return new FilteringTokenFilter(tokenStream) {
+      final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class);
+
+      @Override
+      protected boolean accept() throws IOException {
+        return charRunAutomaton.run(charAtt.buffer(), 0, charAtt.length());
+      }
+    };
+  }
+
+}
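
The heart of this strategy is the MemoryIndex round-trip: analyze the (filtered)
content into a single-document, offset-storing in-memory index, then read postings
back out just as the postings-based strategies would. A minimal sketch of that
round-trip, with the pre-filtering omitted and an analyzer assumed:

    MemoryIndex memoryIndex = new MemoryIndex(true, false); // store offsets, no payloads
    memoryIndex.addField("body", "Alpha Bravo Bravado", analyzer);
    LeafReader leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader();
    TermsEnum termsEnum = leafReader.terms("body").iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
      PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
      postings.advance(0); // the one and only document in a MemoryIndex is docID 0
      for (int i = 0; i < postings.freq(); i++) {
        postings.nextPosition();
        System.out.println(term.utf8ToString()
            + " [" + postings.startOffset() + "," + postings.endOffset() + ")");
      }
    }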

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
index e85fa3b..fd6a26a 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
@@ -16,8 +16,6 @@
  */
 package org.apache.lucene.search.uhighlight;
 
-import java.io.Closeable;
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -25,15 +23,7 @@ import java.util.Comparator;
 import java.util.List;
 import java.util.function.Function;
 
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.index.FilterLeafReader;
-import org.apache.lucene.index.FilteredTermsEnum;
-import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.AutomatonQuery;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
@@ -48,9 +38,7 @@ import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanNotQuery;
 import org.apache.lucene.search.spans.SpanOrQuery;
 import org.apache.lucene.search.spans.SpanPositionCheckQuery;
-import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
@@ -210,182 +198,4 @@ class MultiTermHighlighting {
     return list.toArray(new CharacterRunAutomaton[list.size()]);
   }
 
-  /**
-   * Returns a "fake" DocsAndPositionsEnum over the tokenstream, returning offsets where {@code matchers}
-   * matches tokens.
-   * <p>
-   * This is solely used internally by PostingsHighlighter: <b>DO NOT USE THIS METHOD!</b>
-   */
-  public static PostingsEnum getDocsEnum(final TokenStream ts, final CharacterRunAutomaton[] matchers) throws IOException {
-    return new TokenStreamPostingsEnum(ts, matchers);
-  }
-
-  // TODO: we could use CachingWrapperFilter, (or consume twice) to allow us to have a true freq()
-  // but this would have a performance cost for likely little gain in the user experience, it
-  // would only serve to make this method less bogus.
-  // instead, we always return freq() = Integer.MAX_VALUE and let the highlighter terminate based on offset...
-  // TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?
-  private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
-    TokenStream stream; // becomes null when closed
-    final CharacterRunAutomaton[] matchers;
-    final CharTermAttribute charTermAtt;
-    final OffsetAttribute offsetAtt;
-
-    int currentDoc = -1;
-    int currentMatch = -1;
-    int currentStartOffset = -1;
-
-    int currentEndOffset = -1;
-
-    final BytesRef matchDescriptions[];
-
-    TokenStreamPostingsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
-      this.stream = ts;
-      this.matchers = matchers;
-      matchDescriptions = new BytesRef[matchers.length];
-      charTermAtt = ts.addAttribute(CharTermAttribute.class);
-      offsetAtt = ts.addAttribute(OffsetAttribute.class);
-      ts.reset();
-    }
-
-    @Override
-    public int nextPosition() throws IOException {
-      if (stream != null) {
-        while (stream.incrementToken()) {
-          for (int i = 0; i < matchers.length; i++) {
-            if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
-              currentStartOffset = offsetAtt.startOffset();
-              currentEndOffset = offsetAtt.endOffset();
-              currentMatch = i;
-              return 0;
-            }
-          }
-        }
-        stream.end();
-        close();
-      }
-      // exhausted
-      currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
-      return Integer.MAX_VALUE;
-    }
-
-    @Override
-    public int freq() throws IOException {
-      return Integer.MAX_VALUE; // lie
-    }
-
-    @Override
-    public int startOffset() throws IOException {
-      assert currentStartOffset >= 0;
-      return currentStartOffset;
-    }
-
-    @Override
-    public int endOffset() throws IOException {
-      assert currentEndOffset >= 0;
-      return currentEndOffset;
-    }
-
-    @Override
-    public BytesRef getPayload() throws IOException {
-      if (matchDescriptions[currentMatch] == null) {
-        matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
-      }
-      return matchDescriptions[currentMatch];
-    }
-
-    @Override
-    public int docID() {
-      return currentDoc;
-    }
-
-    @Override
-    public int nextDoc() throws IOException {
-      throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public int advance(int target) throws IOException {
-      return currentDoc = target;
-    }
-
-    @Override
-    public long cost() {
-      return 0;
-    }
-
-    @Override
-    public void close() throws IOException {
-      if (stream != null) {
-        stream.close();
-        stream = null;
-      }
-    }
-  }
-
-  /**
-   * Return a TokenStream un-inverted from the provided Terms, but filtered based on the automata. The
-   * Terms must have exactly one doc count (e.g. term vector or MemoryIndex).
-   */
-  //TODO: Alternatively, produce a list of OffsetsEnums from the Terms that match the automata.
-  public static TokenStream uninvertAndFilterTerms(Terms termsIndex,
-                                                      int doc,
-                                                      final CharacterRunAutomaton[] automata,
-                                                      int offsetLength)
-      throws IOException {
-    assert automata.length > 0;
-    //Note: if automata were plain Automaton (not CharacterRunAutomaton), we might instead use
-    // TermsEnum.intersect(compiledAutomaton).  But probably won't help due to O(N) TV impl so whatever.
-    FilterLeafReader.FilterTerms filteredTermsIndex = new FilterLeafReader.FilterTerms(termsIndex) {
-      @Override
-      public TermsEnum iterator() throws IOException {
-        return new FilteredTermsEnum(super.iterator(), false) {//false == no seek
-          CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//reuse only for UTF8->UTF16 call
-
-          @Override
-          protected AcceptStatus accept(BytesRef termBytesRef) throws IOException {
-            //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
-            tempCharsRefBuilder.grow(termBytesRef.length);
-            final int charLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
-            for (CharacterRunAutomaton runAutomaton : automata) {
-              if (runAutomaton.run(tempCharsRefBuilder.chars(), 0, charLen)) {
-                return AcceptStatus.YES;
-              }
-            }
-            return AcceptStatus.NO;
-          }
-        };
-      }
-
-      @Override
-      public long size() throws IOException {
-        return -1; // unknown
-      }
-
-      @Override
-      public long getSumTotalTermFreq() throws IOException {
-        return -1; // unknown
-      }
-
-      @Override
-      public long getSumDocFreq() throws IOException {
-        return -1; // unknown
-      }
-    };
-    float loadFactor = 1f / 64f;
-    return new TokenStreamFromTermVector(filteredTermsIndex, doc, offsetLength, loadFactor);
-  }
-
-  /**
-   * Returns a simple automata that matches the specified term.
-   */
-  public static CharacterRunAutomaton makeStringMatchAutomata(BytesRef term) {
-    String termString = term.utf8ToString();
-    return new CharacterRunAutomaton(Automata.makeString(termString)) {
-      @Override
-      public String toString() {
-        return termString;
-      }
-    };
-  }
 }

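Note that the uninverting helpers removed here are superseded: the fake PostingsEnum moved into the new TokenStreamOffsetStrategy (further below), and the automata now appear to be applied while reading postings (see the repeated createOffsetsEnumsFromReader calls). The surviving extractAutomata path turns each multi-term query into a CharacterRunAutomaton that can be run against analyzed terms; since extractAutomata is package-private, here is a rough public-API equivalent for a single wildcard (field and pattern are made up):

  import org.apache.lucene.index.Term;
  import org.apache.lucene.search.WildcardQuery;
  import org.apache.lucene.util.automaton.CharacterRunAutomaton;

  WildcardQuery query = new WildcardQuery(new Term("body", "high*"));
  CharacterRunAutomaton automaton = new CharacterRunAutomaton(WildcardQuery.toAutomaton(query.getTerm()));
  boolean matches = automaton.run("highlighting");// true: the analyzed term matches the wildcard
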
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiValueTokenStream.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiValueTokenStream.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiValueTokenStream.java
deleted file mode 100644
index 4cbf754..0000000
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiValueTokenStream.java
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.uhighlight;
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-
-/**
- * Wraps an {@link Analyzer} and string text that represents multiple values delimited by a specified character. This
- * exposes a TokenStream that matches what would get indexed considering the
- * {@link Analyzer#getPositionIncrementGap(String)}. Currently this assumes {@link Analyzer#getOffsetGap(String)} is
- * 1; an exception will be thrown if it isn't.
- * <br />
- * It would be more orthogonal for this to be an Analyzer since we're wrapping an Analyzer but doing so seems like
- * more work.  The underlying components see a Reader not a String -- and the String is easy to
- * split up without redundant buffering.
- *
- * @lucene.internal
- */
-final class MultiValueTokenStream extends TokenFilter {
-
-    private final String fieldName;
-    private final Analyzer indexAnalyzer;
-    private final String content;
-    private final char splitChar;
-
-    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-
-    private int startValIdx = 0;
-    private int endValIdx;
-    private int remainingPosInc = 0;
-
-    /** note: The caller must remember to close the TokenStream eventually. */
-    static TokenStream wrap(String fieldName, Analyzer indexAnalyzer, String content, char splitChar)
-            throws IOException {
-        if (indexAnalyzer.getOffsetGap(fieldName) != 1) { // note: 1 is the default. It is RARELY changed.
-            throw new IllegalArgumentException(
-                    "offset gap of the provided analyzer should be 1 (field " + fieldName + ")");
-        }
-        // If there is no splitChar in content then we needn't wrap:
-        int splitCharIdx = content.indexOf(splitChar);
-        if (splitCharIdx == -1) {
-            return indexAnalyzer.tokenStream(fieldName, content);
-        }
-
-        TokenStream subTokenStream = indexAnalyzer.tokenStream(fieldName, content.substring(0, splitCharIdx));
-
-        return new MultiValueTokenStream(subTokenStream, fieldName, indexAnalyzer, content, splitChar, splitCharIdx);
-    }
-
-    private MultiValueTokenStream(TokenStream subTokenStream, String fieldName, Analyzer indexAnalyzer,
-                                  String content, char splitChar, int splitCharIdx) {
-        super(subTokenStream); // subTokenStream is already initialized to operate on the first value
-        this.fieldName = fieldName;
-        this.indexAnalyzer = indexAnalyzer;
-        this.content = content;
-        this.splitChar = splitChar;
-        this.endValIdx = splitCharIdx;
-    }
-
-    @Override
-    public void reset() throws IOException {
-        if (startValIdx != 0) {
-            throw new IllegalStateException("This TokenStream wasn't developed to be re-used.");
-            // ... although we could if a need for it arises.
-        }
-        super.reset();
-    }
-
-    @Override
-    public boolean incrementToken() throws IOException {
-        while (true) {
-
-            if (input.incrementToken()) {
-                // Position tracking:
-                if (remainingPosInc > 0) {//usually true first token of additional values (not first val)
-                    posIncAtt.setPositionIncrement(remainingPosInc + posIncAtt.getPositionIncrement());
-                    remainingPosInc = 0;//reset
-                }
-                // Offset tracking:
-                offsetAtt.setOffset(
-                        startValIdx + offsetAtt.startOffset(),
-                        startValIdx + offsetAtt.endOffset()
-                                         );
-                return true;
-            }
-
-            if (endValIdx == content.length()) {//no more
-                return false;
-            }
-
-            input.end(); // might adjust position increment
-            remainingPosInc += posIncAtt.getPositionIncrement();
-            input.close();
-            remainingPosInc += indexAnalyzer.getPositionIncrementGap(fieldName);
-
-            // Get new tokenStream based on next segment divided by the splitChar
-            startValIdx = endValIdx + 1;
-            endValIdx = content.indexOf(splitChar, startValIdx);
-            if (endValIdx == -1) {//EOF
-                endValIdx = content.length();
-            }
-            TokenStream tokenStream = indexAnalyzer.tokenStream(fieldName, content.substring(startValIdx, endValIdx));
-            if (tokenStream != input) {// (input is defined in TokenFilter set in the constructor)
-                // This is a grand trick we do -- knowing that the analyzer's re-use strategy is going to produce the
-                // very same tokenStream instance and thus have the same AttributeSource as this wrapping TokenStream
-                // since we used it as our input in the constructor.
-                // Were this not the case, we'd have to copy every attribute of interest since we can't alter the
-                // AttributeSource of this wrapping TokenStream post-construction (it's all private/final).
-                // If this is a problem, we could do that instead; maybe with a custom CharTermAttribute that allows
-                // us to easily set the char[] reference without literally copying char by char.
-                throw new IllegalStateException("Require TokenStream re-use.  Unsupported re-use strategy?: " +
-                                                indexAnalyzer.getReuseStrategy());
-            }
-            tokenStream.reset();
-        } // while loop to increment token of this new value
-    }
-
-    @Override
-    public void end() throws IOException {
-        super.end();
-        // Offset tracking:
-        offsetAtt.setOffset(
-                startValIdx + offsetAtt.startOffset(),
-                startValIdx + offsetAtt.endOffset());
-    }
-
-}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
index af29ef1..cbaeb90 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
@@ -76,6 +76,7 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
   }
 
   void nextPosition() throws IOException {
+    assert hasMorePositions();
     pos++;
     postingsEnum.nextPosition();
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
index f4caaa0..de37d5d 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
@@ -40,7 +40,7 @@ public final class Passage {
     BytesRef matchTerms[] = new BytesRef[8];
     int numMatches = 0;
 
-    void addMatch(int startOffset, int endOffset, BytesRef term) {
+    public void addMatch(int startOffset, int endOffset, BytesRef term) {
         assert startOffset >= this.startOffset && startOffset <= this.endOffset;
         if (numMatches == matchStarts.length) {
             int newLength = ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
index 95d51c9..cde17ba 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
@@ -266,7 +266,7 @@ public class PhraseHelper {
   }
 
   /**
-   * Returns terms as a List, but expanded to any terms in strictPhrases' keySet if present.  That can only
+   * Returns terms as a List, but expanded to any terms in phraseHelper's keySet if present.  That can only
    * happen if willRewrite() is true.
    */
   List<BytesRef> expandTermsIfRewrite(BytesRef[] terms, Map<BytesRef, Spans> strictPhrasesTermToSpans) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java
index 4666906..975d3a1 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java
@@ -41,7 +41,7 @@ public class PostingsOffsetStrategy extends FieldOffsetStrategy {
 
   @Override
   public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
-    LeafReader leafReader;
+    final LeafReader leafReader;
     if (reader instanceof LeafReader) {
       leafReader = (LeafReader) reader;
     } else {
@@ -54,6 +54,7 @@ public class PostingsOffsetStrategy extends FieldOffsetStrategy {
     return createOffsetsEnumsFromReader(leafReader, docId);
   }
 
+
   @Override
   public UnifiedHighlighter.OffsetSource getOffsetSource() {
     return UnifiedHighlighter.OffsetSource.POSTINGS;

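When the incoming reader is not already a LeafReader, the strategy has to locate the leaf that holds the docId and translate it to a leaf-local id. The usual Lucene idiom for that lookup is sketched below; this is illustrative, not necessarily the elided branch verbatim:

  import java.util.List;
  import org.apache.lucene.index.LeafReader;
  import org.apache.lucene.index.LeafReaderContext;
  import org.apache.lucene.index.ReaderUtil;

  List<LeafReaderContext> leaves = reader.leaves();
  LeafReaderContext leafContext = leaves.get(ReaderUtil.subIndex(docId, leaves));
  LeafReader leafReader = leafContext.reader();
  docId -= leafContext.docBase;// convert the global docId to a leaf-local one
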
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java
index 81de379..b9086a7 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java
@@ -20,7 +20,6 @@ import java.io.IOException;
 import java.util.Collections;
 import java.util.List;
 
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
@@ -58,14 +57,11 @@ public class PostingsWithTermVectorsOffsetStrategy extends FieldOffsetStrategy {
     }
     leafReader = new TermVectorFilteredLeafReader(leafReader, docTerms);
 
-    TokenStream tokenStream = automata.length > 0 ? MultiTermHighlighting
-        .uninvertAndFilterTerms(leafReader.terms(field), docId, this.automata, content.length()) : null;
-
-    return createOffsetsEnums(leafReader, docId, tokenStream);
+    return createOffsetsEnumsFromReader(leafReader, docId);
   }
 
   @Override
   public UnifiedHighlighter.OffsetSource getOffsetSource() {
-    return UnifiedHighlighter.OffsetSource.POSTINGS;
+    return UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS;
   }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java
index 204679b..f6eedc4 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java
@@ -20,7 +20,6 @@ import java.io.IOException;
 import java.util.Collections;
 import java.util.List;
 
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.Terms;
@@ -51,18 +50,10 @@ public class TermVectorOffsetStrategy extends FieldOffsetStrategy {
       return Collections.emptyList();
     }
 
-    LeafReader leafReader = null;
-    if ((terms.length > 0) || strictPhrases.willRewrite()) {
-      leafReader = new TermVectorLeafReader(field, tvTerms);
-      docId = 0;
-    }
-
-    TokenStream tokenStream = null;
-    if (automata.length > 0) {
-      tokenStream = MultiTermHighlighting.uninvertAndFilterTerms(tvTerms, 0, automata, content.length());
-    }
+    LeafReader leafReader = new TermVectorLeafReader(field, tvTerms);
+    docId = 0;
 
-    return createOffsetsEnums(leafReader, docId, tokenStream);
+    return createOffsetsEnumsFromReader(leafReader, docId);
   }
 
 }

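The simplification above leans on TermVectorLeafReader, which presents a single document's term vector as a one-document index; that is why docId is rewritten to 0. A minimal sketch of that wrapping (indexReader, docId, and the field name are assumptions):

  import org.apache.lucene.index.LeafReader;
  import org.apache.lucene.index.Terms;

  Terms tvTerms = indexReader.getTermVector(docId, "body");// the vector must have been indexed with offsets
  if (tvTerms != null) {
    LeafReader oneDocReader = new TermVectorLeafReader("body", tvTerms);
    // postings and offsets are now read from oneDocReader at docId 0
  }
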
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamFromTermVector.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamFromTermVector.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamFromTermVector.java
deleted file mode 100644
index 980c566..0000000
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamFromTermVector.java
+++ /dev/null
@@ -1,395 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.uhighlight;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefArray;
-import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.CharsRefBuilder;
-import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.UnicodeUtil;
-
-/**
- * TokenStream created from a term vector field. The term vector requires positions and/or offsets (either). If you
- * want payloads add PayloadAttributeImpl (as you would normally) but don't assume the attribute is already added just
- * because you know the term vector has payloads, since the first call to incrementToken() will observe if you asked
- * for them and if not then won't get them.  This TokenStream supports an efficient {@link #reset()}, so there's
- * no need to wrap with a caching impl.
- *
- * @lucene.internal
- */
-final class TokenStreamFromTermVector extends TokenStream {
-  // note: differs from similar class in the standard highlighter. This one is optimized for sparse cases.
-
-  /**
-   * content length divided by distinct positions; an average of dense text.
-   */
-  private static final double AVG_CHARS_PER_POSITION = 6;
-
-  private static final int INSERTION_SORT_THRESHOLD = 16;
-
-  private final Terms vector;
-
-  private final int filteredDocId;
-
-  private final CharTermAttribute termAttribute;
-
-  private final PositionIncrementAttribute positionIncrementAttribute;
-
-  private final int offsetLength;
-
-  private final float loadFactor;
-
-  private OffsetAttribute offsetAttribute;//maybe null
-
-  private PayloadAttribute payloadAttribute;//maybe null
-
-  private CharsRefBuilder termCharsBuilder;//term data here
-
-  private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
-  private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null
-
-  private TokenLL firstToken = null; // the head of a linked-list
-
-  private TokenLL incrementToken = null;
-
-  private boolean initialized = false;//lazy
-
-  public TokenStreamFromTermVector(Terms vector, int offsetLength) throws IOException {
-    this(vector, 0, offsetLength, 1f);
-  }
-
-  /**
-   * Constructor.
-   *
-   * @param vector        Terms that contains the data for
-   *                      creating the TokenStream. Must have positions and/or offsets.
-   * @param filteredDocId The docID we will process.
-   * @param offsetLength  Supply the character length of the text being uninverted, or a lower value if you don't want
-   *                      to invert text beyond an offset (in so doing this will act as a filter).  If you don't
-   *                      know the length, pass -1.  In conjunction with {@code loadFactor}, it's used to
-   *                      determine how many buckets to create during uninversion.
-   *                      It's also used to filter out tokens with a start offset exceeding this value.
-   * @param loadFactor    The percent of tokens from the original terms (by position count) that are
-   *                      expected to be inverted.  If they are filtered (e.g.
-   *                      {@link org.apache.lucene.index.FilterLeafReader.FilterTerms})
-   *                      then consider using less than 1.0 to avoid wasting space.
-   *                      1.0 means all, 1/64th would suggest 1/64th of all tokens coming from vector.
-   */
-  TokenStreamFromTermVector(Terms vector, int filteredDocId, int offsetLength, float loadFactor) throws IOException {
-    super();
-    this.filteredDocId = filteredDocId;
-    this.offsetLength = offsetLength == Integer.MAX_VALUE ? -1 : offsetLength;
-    if (loadFactor <= 0f || loadFactor > 1f) {
-      throw new IllegalArgumentException("loadFactor should be > 0 and <= 1");
-    }
-    this.loadFactor = loadFactor;
-    assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
-    if (!vector.hasPositions() && !vector.hasOffsets()) {
-      throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
-    }
-    assert vector.hasFreqs();
-    this.vector = vector;
-    termAttribute = addAttribute(CharTermAttribute.class);
-    positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
-  }
-
-  public Terms getTermVectorTerms() {
-    return vector;
-  }
-
-  @Override
-  public void reset() throws IOException {
-    incrementToken = null;
-    super.reset();
-  }
-
-  //We delay initialization because we can see which attributes the consumer wants, particularly payloads
-  private void init() throws IOException {
-    assert !initialized;
-    int dpEnumFlags = 0;
-    if (vector.hasOffsets()) {
-      offsetAttribute = addAttribute(OffsetAttribute.class);
-      dpEnumFlags |= PostingsEnum.OFFSETS;
-    }
-    if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
-      payloadAttribute = getAttribute(PayloadAttribute.class);
-      payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
-      spareBytesRefBuilder = new BytesRefBuilder();
-      dpEnumFlags |= PostingsEnum.PAYLOADS;
-    }
-
-    // We put term data here
-    termCharsBuilder = new CharsRefBuilder();
-    termCharsBuilder.grow(initTotalTermCharLen());
-
-    // Step 1: iterate termsEnum and create a token, placing into a bucketed array (given a load factor)
-
-    final TokenLL[] tokenBuckets = initTokenBucketsArray();
-    final double OFFSET_TO_BUCKET_IDX = loadFactor / AVG_CHARS_PER_POSITION;
-    final double POSITION_TO_BUCKET_IDX = loadFactor;
-
-    final TermsEnum termsEnum = vector.iterator();
-    BytesRef termBytesRef;
-    PostingsEnum dpEnum = null;
-    final CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call
-
-    TERM_LOOP:
-    while ((termBytesRef = termsEnum.next()) != null) {
-      //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
-      // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
-      tempCharsRefBuilder.grow(termBytesRef.length);
-      final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
-      final int termCharsOff = termCharsBuilder.length();
-      termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
-      dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
-      assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
-      int currentDocId = dpEnum.advance(filteredDocId);
-      if (currentDocId != filteredDocId) {
-        continue; //Not expected
-      }
-      final int freq = dpEnum.freq();
-      for (int j = 0; j < freq; j++) {
-        TokenLL token = new TokenLL();
-        token.position = dpEnum.nextPosition(); // can be -1 if not in the TV
-        token.termCharsOff = termCharsOff;
-        token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
-        // copy offset (if it's there) and compute bucketIdx
-        int bucketIdx;
-        if (offsetAttribute != null) {
-          token.startOffset = dpEnum.startOffset();
-          if (offsetLength >= 0 && token.startOffset > offsetLength) {
-            continue TERM_LOOP;//filter this token out; exceeds threshold
-          }
-          token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
-          bucketIdx = (int) (token.startOffset * OFFSET_TO_BUCKET_IDX);
-        } else {
-          bucketIdx = (int) (token.position * POSITION_TO_BUCKET_IDX);
-        }
-        if (bucketIdx >= tokenBuckets.length) {
-          bucketIdx = tokenBuckets.length - 1;
-        }
-
-        if (payloadAttribute != null) {
-          final BytesRef payload = dpEnum.getPayload();
-          token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
-        }
-
-        //Add token to the head of the bucket linked list
-        token.next = tokenBuckets[bucketIdx];
-        tokenBuckets[bucketIdx] = token;
-      }
-    }
-
-    // Step 2:  Link all Tokens into a linked-list and sort all tokens at the same position
-
-    firstToken = initLinkAndSortTokens(tokenBuckets);
-
-    // If the term vector didn't have positions, synthesize them
-    if (!vector.hasPositions() && firstToken != null) {
-      TokenLL prevToken = firstToken;
-      prevToken.position = 0;
-      for (TokenLL token = prevToken.next; token != null; prevToken = token, token = token.next) {
-        if (prevToken.startOffset == token.startOffset) {
-          token.position = prevToken.position;
-        } else {
-          token.position = prevToken.position + 1;
-        }
-      }
-    }
-
-    initialized = true;
-  }
-
-  private static TokenLL initLinkAndSortTokens(TokenLL[] tokenBuckets) {
-    TokenLL firstToken = null;
-    List<TokenLL> scratchTokenArray = new ArrayList<>(); // declare here for re-use.  TODO use native array
-    TokenLL prevToken = null;
-    for (TokenLL tokenHead : tokenBuckets) {
-      if (tokenHead == null) {
-        continue;
-      }
-      //sort tokens at this position and link them; return the first
-      TokenLL tokenTail;
-      // just one token
-      if (tokenHead.next == null) {
-        tokenTail = tokenHead;
-      } else {
-        // add the linked list to a temporary array
-        for (TokenLL cur = tokenHead; cur != null; cur = cur.next) {
-          scratchTokenArray.add(cur);
-        }
-        // sort; and set tokenHead & tokenTail
-        if (scratchTokenArray.size() < INSERTION_SORT_THRESHOLD) {
-          // insertion sort by creating a linked list (leave scratchTokenArray alone)
-          tokenHead = tokenTail = scratchTokenArray.get(0);
-          tokenHead.next = null;
-          for (int i = 1; i < scratchTokenArray.size(); i++) {
-            TokenLL insertToken = scratchTokenArray.get(i);
-            if (insertToken.compareTo(tokenHead) <= 0) {
-              // takes the place of tokenHead
-              insertToken.next = tokenHead;
-              tokenHead = insertToken;
-            } else {
-              // goes somewhere after tokenHead
-              for (TokenLL prev = tokenHead; true; prev = prev.next) {
-                if (prev.next == null || insertToken.compareTo(prev.next) <= 0) {
-                  if (prev.next == null) {
-                    tokenTail = insertToken;
-                  }
-                  insertToken.next = prev.next;
-                  prev.next = insertToken;
-                  break;
-                }
-              }
-            }
-          }
-        } else {
-          Collections.sort(scratchTokenArray);
-          // take back out and create a linked list
-          TokenLL prev = tokenHead = scratchTokenArray.get(0);
-          for (int i = 1; i < scratchTokenArray.size(); i++) {
-            prev.next = scratchTokenArray.get(i);
-            prev = prev.next;
-          }
-          tokenTail = prev;
-          tokenTail.next = null;
-        }
-        scratchTokenArray.clear();//too bad ArrayList nulls it out; we don't actually need that
-      }
-
-      //link to previous
-      if (prevToken != null) {
-        assert prevToken.next == null;
-        prevToken.next = tokenHead; //concatenate linked-list
-        assert prevToken.compareTo(tokenHead) < 0 : "wrong offset / position ordering expectations";
-      } else {
-        assert firstToken == null;
-        firstToken = tokenHead;
-      }
-
-      prevToken = tokenTail;
-    }
-    return firstToken;
-  }
-
-  private int initTotalTermCharLen() throws IOException {
-    int guessNumTerms;
-    if (vector.size() != -1) {
-      guessNumTerms = (int) vector.size();
-    } else if (offsetLength != -1) {
-      guessNumTerms = (int) (offsetLength * 0.33);//guess 1/3rd
-    } else {
-      return 128;
-    }
-    return Math.max(64, (int) (guessNumTerms * loadFactor * 7.0));//7 is over-estimate of average term len
-  }
-
-  private TokenLL[] initTokenBucketsArray() throws IOException {
-    // Estimate the number of non-empty positions (number of tokens, excluding same-position synonyms).
-    int positionsEstimate;
-    if (offsetLength == -1) { // no clue what the char length is.
-      // Estimate the number of position slots we need from term stats based on Wikipedia.
-      int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
-      if (sumTotalTermFreq == -1) {//unfortunately term vectors seem to not have this stat
-        int size = (int) vector.size();
-        if (size == -1) {//doesn't happen with term vectors, it seems, but pick a default any way
-          size = 128;
-        }
-        sumTotalTermFreq = (int) (size * 2.4);
-      }
-      positionsEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
-    } else {
-      // guess number of token positions by this factor.
-      positionsEstimate = (int) (offsetLength / AVG_CHARS_PER_POSITION);
-    }
-    // apply the load factor.
-    return new TokenLL[Math.max(1, (int) (positionsEstimate * loadFactor))];
-  }
-
-  @Override
-  public boolean incrementToken() throws IOException {
-    int posInc;
-    if (incrementToken == null) {
-      if (!initialized) {
-        init();
-        assert initialized;
-      }
-      incrementToken = firstToken;
-      if (incrementToken == null) {
-        return false;
-      }
-      posInc = incrementToken.position + 1;//first token normally has pos 0; add 1 to get posInc
-    } else if (incrementToken.next != null) {
-      int lastPosition = incrementToken.position;
-      incrementToken = incrementToken.next;
-      posInc = incrementToken.position - lastPosition;
-    } else {
-      return false;
-    }
-    clearAttributes();
-    termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
-
-    positionIncrementAttribute.setPositionIncrement(posInc);
-    if (offsetAttribute != null) {
-      offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
-    }
-    if (payloadAttribute != null && incrementToken.payloadIndex >= 0) {
-      payloadAttribute.setPayload(payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex));
-    }
-    return true;
-  }
-
-  private static class TokenLL implements Comparable<TokenLL> {
-    // This class should weigh 32 bytes, including object header
-
-    int termCharsOff; // see termCharsBuilder
-    short termCharsLen;
-
-    int position;
-    int startOffset;
-    short endOffsetInc; // add to startOffset to get endOffset
-    int payloadIndex;
-
-    TokenLL next;
-
-    @Override
-    public int compareTo(TokenLL tokenB) {
-      int cmp = Integer.compare(this.position, tokenB.position);
-      if (cmp == 0) {
-        cmp = Integer.compare(this.startOffset, tokenB.startOffset);
-        if (cmp == 0) {
-          cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
-        }
-      }
-      return cmp;
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
new file mode 100644
index 0000000..966eeef
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
@@ -0,0 +1,174 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+
+/**
+ * Analyzes the text, producing a single {@link OffsetsEnum} wrapping the {@link TokenStream} filtered to terms
+ * in the query, including wildcards.  It can't handle position-sensitive queries (phrases). Passage accuracy suffers
+ * because the freq() is unknown -- it's always {@link Integer#MAX_VALUE} instead.
+ */
+public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
+
+  private static final BytesRef[] ZERO_LEN_BYTES_REF_ARRAY = new BytesRef[0];
+
+  public TokenStreamOffsetStrategy(String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Analyzer indexAnalyzer) {
+    super(field, ZERO_LEN_BYTES_REF_ARRAY, phraseHelper, convertTermsToAutomata(terms, automata), indexAnalyzer);
+    assert phraseHelper.hasPositionSensitivity() == false;
+  }
+
+  private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
+    CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
+    for (int i = 0; i < terms.length; i++) {
+      String termString = terms[i].utf8ToString();
+      newAutomata[i] = new CharacterRunAutomaton(Automata.makeString(termString)) {
+        @Override
+        public String toString() {
+          return termString;
+        }
+      };
+    }
+    // Append the existing automata (those used for MTQs)
+    System.arraycopy(automata, 0, newAutomata, terms.length, automata.length);
+    return newAutomata;
+  }
+
+  @Override
+  public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
+    TokenStream tokenStream = tokenStream(content);
+    PostingsEnum mtqPostingsEnum = new TokenStreamPostingsEnum(tokenStream, automata);
+    mtqPostingsEnum.advance(docId);
+    return Collections.singletonList(new OffsetsEnum(null, mtqPostingsEnum));
+  }
+
+  // TODO: we could use CachingWrapperFilter, (or consume twice) to allow us to have a true freq()
+  // but this would have a performance cost for likely little gain in the user experience, it
+  // would only serve to make this method less bogus.
+  // instead, we always return freq() = Integer.MAX_VALUE and let the highlighter terminate based on offset...
+  // TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?
+  private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
+    TokenStream stream; // becomes null when closed
+    final CharacterRunAutomaton[] matchers;
+    final CharTermAttribute charTermAtt;
+    final OffsetAttribute offsetAtt;
+
+    int currentDoc = -1;
+    int currentMatch = -1;
+    int currentStartOffset = -1;
+
+    int currentEndOffset = -1;
+
+    final BytesRef matchDescriptions[];
+
+    TokenStreamPostingsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
+      this.stream = ts;
+      this.matchers = matchers;
+      matchDescriptions = new BytesRef[matchers.length];
+      charTermAtt = ts.addAttribute(CharTermAttribute.class);
+      offsetAtt = ts.addAttribute(OffsetAttribute.class);
+      ts.reset();
+    }
+
+    @Override
+    public int nextPosition() throws IOException {
+      if (stream != null) {
+        while (stream.incrementToken()) {
+          for (int i = 0; i < matchers.length; i++) {
+            if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
+              currentStartOffset = offsetAtt.startOffset();
+              currentEndOffset = offsetAtt.endOffset();
+              currentMatch = i;
+              return 0;
+            }
+          }
+        }
+        stream.end();
+        close();
+      }
+      // exhausted
+      currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
+      return Integer.MAX_VALUE;
+    }
+
+    @Override
+    public int freq() throws IOException {
+      return Integer.MAX_VALUE; // lie
+    }
+
+    @Override
+    public int startOffset() throws IOException {
+      assert currentStartOffset >= 0;
+      return currentStartOffset;
+    }
+
+    @Override
+    public int endOffset() throws IOException {
+      assert currentEndOffset >= 0;
+      return currentEndOffset;
+    }
+
+    @Override
+    public BytesRef getPayload() throws IOException {
+      if (matchDescriptions[currentMatch] == null) {
+        matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
+      }
+      return matchDescriptions[currentMatch];
+    }
+
+    @Override
+    public int docID() {
+      return currentDoc;
+    }
+
+    @Override
+    public int nextDoc() throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int advance(int target) throws IOException {
+      return currentDoc = target;
+    }
+
+    @Override
+    public long cost() {
+      return 0;
+    }
+
+    @Override
+    public void close() throws IOException {
+      if (stream != null) {
+        stream.close();
+        stream = null;
+      }
+    }
+  }
+}
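
End-to-end, this strategy is exercised when highlighting without a searcher: the UnifiedHighlighter then has only analysis-based offsets available, and a multi-term query flows through TokenStreamOffsetStrategy (or MemoryIndexOffsetStrategy, depending on the relevancy-over-speed preference). A minimal usage sketch; the field name and text are invented, and the exact snippet markup depends on the formatter:

  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.search.WildcardQuery;
  import org.apache.lucene.search.uhighlight.UnifiedHighlighter;

  UnifiedHighlighter highlighter = new UnifiedHighlighter(null, new StandardAnalyzer());// null searcher => analysis offsets
  Object snippet = highlighter.highlightWithoutSearcher("body", new WildcardQuery(new Term("body", "high*")),
      "unified highlighting of multi-term queries", 1);
  System.out.println(snippet);// e.g. unified <b>highlighting</b> of multi-term queries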