You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2018/09/17 09:44:13 UTC
[27/47] lucene-solr:jira/solr-12709: SOLR-9418: Added a new (experimental) PhrasesIdentificationComponent for identifying potential phrases in query input based on overlapping shingles in the index

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/597bd5db/solr/core/src/test/org/apache/solr/handler/component/PhrasesIdentificationComponentTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/component/PhrasesIdentificationComponentTest.java b/solr/core/src/test/org/apache/solr/handler/component/PhrasesIdentificationComponentTest.java
new file mode 100644
index 0000000..c8d9edf
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/component/PhrasesIdentificationComponentTest.java
@@ -0,0 +1,796 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.component;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.function.BiConsumer;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.handler.component.PhrasesIdentificationComponent;
+import org.apache.solr.handler.component.PhrasesIdentificationComponent.Phrase;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.common.params.ShardParams;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.schema.SchemaField;
+
+import org.junit.After;
+import org.junit.BeforeClass;
+import org.junit.Before;
+import org.hamcrest.Description;
+import org.hamcrest.Matcher;
+import org.hamcrest.BaseMatcher;
+
+public class PhrasesIdentificationComponentTest extends SolrTestCaseJ4 {
+
+  private static final String HANDLER = "/phrases";
+
+  /** Initializes the test core from the phrase-identification specific config and schema files. */
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    final String configFile = "solrconfig-phrases-identification.xml";
+    final String schemaFile = "schema-phrases-identification.xml";
+    initCore(configFile, schemaFile);
+  }
+  
+  @Before
+  public void addSomeDocs() throws Exception {
+    assertU(adoc("id", "42",
+                 "title","Tale of the Brown Fox: was he lazy?",
+                 "body", "No. The quick brown fox was a very brown fox who liked to get into trouble."));
+    assertU(adoc("id", "43",
+                 "title","A fable in two acts",
+                 "body", "The brOwn fOx jumped. The lazy dog did not"));
+    assertU(adoc("id", "44",
+                 "title","Why the LazY dog was lazy",
+                 "body", "News flash: Lazy Dog was not actually lazy, it just seemd so compared to Fox"));
+    assertU(adoc("id", "45",
+                 "title","Why Are We Lazy?",
+                 "body", "Because we are. that's why"));
+    assertU((commit()));
+  }
+  
+  /** Runs after each test: deletes every document by query, then commits. */
+  @After
+  public void deleteAllDocs() throws Exception {
+    final String matchAllDocs = "*:*";
+    assertU(delQ(matchAllDocs));
+    assertU(commit());
+  }
+
+  /**
+   * White-box test of {@link Phrase#extractPhrases} against an 11 word input, using a max
+   * indexed phrase size of 3 and a max query phrase size of 7.
+   * <p>
+   * The hard-coded list indexes used for the spot checks rely on the ordering that the spot
+   * checks themselves demonstrate: phrases are grouped by start position, shortest first,
+   * with up to 7 (the max query phrase size) phrases per start position.
+   * </p>
+   */
+  public void testWhiteBoxPhraseParsingLongInput() throws Exception {
+    final SchemaField field = h.getCore().getLatestSchema().getField("multigrams_body");
+    assertNotNull(field);
+    final List<Phrase> phrases = Phrase.extractPhrases
+      (" did  a Quick    brown FOX perniciously jump over the lAZy dog", field, 3, 7);
+
+    // one phrase is expected per (start, length) pair: 11 of length 1, 10 of length 2, ...
+    // down to 5 of length 7 -- ie: 5 + 6 + ... + 11
+    assertEquals(IntStream.rangeClosed((11-7+1), 11).sum(), // 11 words, max query phrase size is 7
+                 phrases.size());
+    
+    // spot check a few explicitly chosen phrases of various lengths...
+    
+    { // single term, close to edge so not as many super phrases as other terms might have
+      // (third from the end; presumably followed only by "lAZy dog" and "dog")
+      final Phrase lazy = phrases.get(phrases.size() - 1 - 2);
+      final String debug = lazy.toString();
+
+      assertEquals(debug, "lAZy", lazy.getSubSequence());
+      assertEquals(debug, 10, lazy.getPositionStart());
+      assertEquals(debug, 11, lazy.getPositionEnd());
+      assertEquals(debug, 1, lazy.getPositionLength());
+      
+      assertEquals(debug, 54, lazy.getOffsetStart());
+      assertEquals(debug, 58, lazy.getOffsetEnd());
+
+      assertEquals(debug, 1, lazy.getIndividualIndexedTerms().size());
+      assertEquals(debug, 1, lazy.getLargestIndexedSubPhrases().size());
+      assertEquals(debug, lazy, lazy.getIndividualIndexedTerms().get(0));
+      assertEquals(debug, lazy, lazy.getLargestIndexedSubPhrases().get(0));
+      assertEquals(debug, 4, lazy.getIndexedSuperPhrases().size()); // (2 each: len=2, len=3)
+    }
+    { // length 2, middle of the pack
+      // skip the 3 earlier start positions (7 phrases each), then take the 2nd shortest phrase
+      final Phrase brown_fox = phrases.get((7 * 3) + 1);
+      final String debug = brown_fox.toString();
+
+      assertEquals(debug, "brown FOX", brown_fox.getSubSequence());
+      assertEquals(debug, 4, brown_fox.getPositionStart());
+      assertEquals(debug, 6, brown_fox.getPositionEnd());
+      assertEquals(debug, 2, brown_fox.getPositionLength());
+      
+      assertEquals(debug, 17, brown_fox.getOffsetStart());
+      assertEquals(debug, 26, brown_fox.getOffsetEnd());
+
+      assertEquals(debug, 2, brown_fox.getIndividualIndexedTerms().size());
+      assertEquals(debug, 1, brown_fox.getLargestIndexedSubPhrases().size());
+      assertEquals(debug, brown_fox, brown_fox.getLargestIndexedSubPhrases().get(0));
+      assertEquals(debug, 2, brown_fox.getIndexedSuperPhrases().size()); // (2 @ len=3)
+      
+    }
+    { // length 3 (which is the max indexed size) @ start of the string
+      final Phrase daq = phrases.get(2);
+      final String debug = daq.toString();
+
+      // NOTE: the sub sequence keeps the original whitespace & case of the input
+      assertEquals(debug, "did  a Quick", daq.getSubSequence());
+      assertEquals(debug, 1, daq.getPositionStart());
+      assertEquals(debug, 4, daq.getPositionEnd());
+      assertEquals(debug, 3, daq.getPositionLength());
+      
+      assertEquals(debug, 1, daq.getOffsetStart());
+      assertEquals(debug, 13, daq.getOffsetEnd());
+
+      assertEquals(debug, 3, daq.getIndividualIndexedTerms().size());
+      assertEquals(debug, 1, daq.getLargestIndexedSubPhrases().size());
+      assertEquals(debug, daq, daq.getLargestIndexedSubPhrases().get(0));
+      assertEquals(debug, 0, daq.getIndexedSuperPhrases().size());
+    }
+    { // length 4 phrase (larger than the max indexed size)
+      // skip the 2 earlier start positions (7 phrases each), then take the 4th shortest phrase
+      final Phrase qbfp = phrases.get((7 * 2) + 3);
+      final String debug = qbfp.toString();
+
+      assertEquals(debug, "Quick    brown FOX perniciously", qbfp.getSubSequence());
+      assertEquals(debug, 3, qbfp.getPositionStart());
+      assertEquals(debug, 7, qbfp.getPositionEnd());
+      assertEquals(debug, 4, qbfp.getPositionLength());
+      
+      assertEquals(debug, 8, qbfp.getOffsetStart());
+      assertEquals(debug, 39, qbfp.getOffsetEnd());
+
+      assertEquals(debug, 4, qbfp.getIndividualIndexedTerms().size());
+      assertEquals(debug, 2, qbfp.getLargestIndexedSubPhrases().size());
+      assertEquals(debug, 0, qbfp.getIndexedSuperPhrases().size());
+    }
+    
+    // some blanket assumptions about the results...
+    assertBasicSanityChecks(phrases, 11, 3, 7);
+  }
+
+  public void testWhiteBoxPhraseParsingShortInput() throws Exception {
+    // for input this short, either of these fields should be (mostly) equivilent
+    final Map<String,Integer> fields = new TreeMap<>();
+    fields.put("multigrams_body", 7); 
+    fields.put("multigrams_body_short", 3);
+    for (Map.Entry<String,Integer> entry : fields.entrySet()) {
+      try {
+        final int maxQ = entry.getValue();
+        final SchemaField field = h.getCore().getLatestSchema().getField(entry.getKey());
+        assertNotNull(field);
+        
+        // empty input shouldn't break anything
+        assertEquals(0, Phrase.extractPhrases(random().nextBoolean() ? "" : "  ", field, 3, maxQ).size());
+        
+        // input shorter them our index/query phrase sizes shouldn't break anything either....
+        final List<Phrase> phrases = Phrase.extractPhrases("brown FOX", field, 3, maxQ);
+        
+        assertEquals(3, phrases.size());
+        
+        { // length 2
+          final Phrase brown_fox = phrases.get(1);
+          final String debug = brown_fox.toString();
+          
+          assertEquals(debug, "brown FOX", brown_fox.getSubSequence());
+          assertEquals(debug, 1, brown_fox.getPositionStart());
+          assertEquals(debug, 3, brown_fox.getPositionEnd());
+          assertEquals(debug, 2, brown_fox.getPositionLength());
+          
+          assertEquals(debug, 0, brown_fox.getOffsetStart());
+          assertEquals(debug, 9, brown_fox.getOffsetEnd());
+          
+          assertEquals(debug, 2, brown_fox.getIndividualIndexedTerms().size());
+          assertEquals(debug, 1, brown_fox.getLargestIndexedSubPhrases().size());
+          assertEquals(debug, brown_fox, brown_fox.getLargestIndexedSubPhrases().get(0));
+          assertEquals(debug, 0, brown_fox.getIndexedSuperPhrases().size());
+        }
+        { // length 1
+          final Phrase fox = phrases.get(2);
+          final String debug = fox.toString();
+          
+          assertEquals(debug, "FOX", fox.getSubSequence());
+          assertEquals(debug, 2, fox.getPositionStart());
+          assertEquals(debug, 3, fox.getPositionEnd());
+          assertEquals(debug, 1, fox.getPositionLength());
+          
+          assertEquals(debug, 6, fox.getOffsetStart());
+          assertEquals(debug, 9, fox.getOffsetEnd());
+          
+          assertEquals(debug, 1, fox.getIndividualIndexedTerms().size());
+          assertEquals(debug, 1, fox.getLargestIndexedSubPhrases().size());
+          assertEquals(debug, fox, fox.getLargestIndexedSubPhrases().get(0));
+          assertEquals(debug, 1, fox.getIndexedSuperPhrases().size());
+        }
+        
+        assertBasicSanityChecks(phrases, 2, 3, maxQ);
+      } catch (AssertionError e) {
+        throw new AssertionError(entry.getKey() + " => " + e.getMessage(), e);
+      }
+    }
+  }
+
+  /** 
+   * Asserts some basic rules that should be enforced about all Phrases 
+   * &amp; their linkages to other phrases.
+   *
+   * @param phrases the phrases to check, must not be empty
+   * @param inputPositionLength number of positions in the input the phrases were extracted from
+   * @param maxIndexedPositionLength max indexed phrase size that was used to extract the phrases
+   * @param maxQueryPositionLength max query phrase size that was used to extract the phrases
+   */
+  private void assertBasicSanityChecks(final List<Phrase> phrases,
+                                       final int inputPositionLength,
+                                       final int maxIndexedPositionLength,
+                                       final int maxQueryPositionLength) throws Exception {
+    assert 0 < phrases.size() : "Don't use this method if phrases might be empty";
+    
+    assertEmptyStream("no phrase should be longer then "+maxQueryPositionLength+" positions",
+                      phrases.stream().filter(p -> p.getPositionLength() > maxQueryPositionLength));
+
+    assertEmptyStream("no phrase should have a start offset < 0",
+                      phrases.stream().filter(p -> p.getOffsetStart() < 0));
+    assertEmptyStream("no phrase should have a start position < 1",
+                      phrases.stream().filter(p -> p.getPositionStart() < 1));
+
+    assertEmptyStream("If a phrase has a start offset of 0, then it must have position 1",
+                      phrases.stream().filter(p -> (p.getOffsetStart() == 0)
+                                              && (p.getPositionStart() != 1)));
+    
+    // every other phrase must fall within the bounds set by the first & last phrases in the list
+    final Phrase first = phrases.get(0);
+    final Phrase last = phrases.get(phrases.size()-1);
+    
+    assertEmptyStream("no phrase should have a start offset < first phrase",
+                      phrases.stream().filter(p -> p.getOffsetStart() < first.getOffsetStart()));
+    assertEmptyStream("no phrase should have an end offset > last phrase",
+                      phrases.stream().filter(p -> last.getOffsetEnd() < p.getOffsetEnd()));
+    
+    assertEmptyStream("no phrase should have a start position < first phrase",
+                      phrases.stream().filter(p -> p.getPositionStart() < first.getPositionStart()));
+    assertEmptyStream("no phrase should have an end position > last phrase",
+                      phrases.stream().filter(p -> last.getPositionEnd() < p.getPositionEnd()));
+                 
+
+    // NOTE: stuff below this point may not be true for all analyzers (ie: stopwords)
+    // but should be valid for the analyzers used in this test...
+    // (if we expand test to cover analyzers w/stopwords, refactor this into a new method)
+        
+    // for each possible phrase length, there should be one phrase per valid start position
+    for (int n = 1; n <= maxQueryPositionLength; n++) {
+      final int len = n;
+      final int expected = Math.max(0, 1 + inputPositionLength - n);
+      final List<Phrase> sizeN = phrases.stream().filter(p -> p.getPositionLength() == len
+                                                         ).collect(Collectors.toList());
+      assertEquals("Expected # phrases of size " + n + ": " + sizeN, expected, sizeN.size());
+    }
+
+    // check the quantities of sub-terms/phrases...
+    // (each filter must inspect the phrase 'p' being streamed, not a fixed phrase, otherwise
+    // only a single phrase would ever actually be validated)
+    assertEmptyStream("no phrase should have num indexed terms != pos_len",
+                      phrases.stream().filter
+                      (p -> p.getPositionLength() != p.getIndividualIndexedTerms().size()));
+    assertEmptyStream("no phrase should have num sub-phrases != max(1, 1 + pos_len - "+maxIndexedPositionLength+")",
+                      phrases.stream().filter
+                      (p -> (Math.max(1, 1 + p.getPositionLength() - maxIndexedPositionLength)
+                             != p.getLargestIndexedSubPhrases().size())));
+    // NOTE: indexed super phrases can be of various lengths, and differing quantities near
+    // beginning/end of input so don't worry about an exact count, just check their properties (below)
+
+    // check the properties of our sub/super phrases
+    for (Phrase phrase : phrases) {
+      final String debug = phrase.toString();
+      
+      assertEmptyStream(debug + " should not have any indexed terms where pos_len != 1",
+                        phrase.getIndividualIndexedTerms().stream().filter
+                        (term -> 1 != term.getPositionLength()));
+      
+      assertEmptyStream(debug + " should not have any sub-phrases where pos_len > min(pos_len, "
+                        + maxIndexedPositionLength+")",
+                        phrase.getLargestIndexedSubPhrases().stream().filter
+                        (inner -> (Math.min(phrase.getPositionLength(), maxIndexedPositionLength)
+                                   < inner.getPositionLength())));
+      
+      assertEmptyStream(debug + " should not have any super-phrases where super.len <= phrase.len or " 
+                        + maxIndexedPositionLength + " < super.len",
+                        phrase.getIndexedSuperPhrases().stream().filter
+                        (outer -> (outer.getPositionLength() <= phrase.getPositionLength() ||
+                                   maxIndexedPositionLength < outer.getPositionLength())));
+    }
+  }
+
+  /**
+   * White-box test of the per-field stats (TTF, doc freq, conjunction doc count) on phrases:
+   * first on freshly extracted phrases, then after populating from the local index, and
+   * finally after merging in one and then two copies of a formatted shard response.
+   */
+  public void testWhiteboxStats() throws Exception {
+    final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_body");
+    assertNotNull(analysisField);
+    final String input = "BROWN fox lAzY  dog xxxyyyzzz";
+
+    // a function we'll re-use on phrases generated from the above input
+    // the multiplier lets us simulate multiple shards returning the same values
+    // (a multiplier of 0 corresponds to phrases whose stats have not been populated at all)
+    BiConsumer<Integer,List<Phrase>> assertions = (mult, phrases) -> {
+      final Phrase brown_fox = phrases.get(1);
+      assertEquals("BROWN fox", brown_fox.getSubSequence());
+      
+      assertEquals(mult * 1, brown_fox.getTTF("multigrams_title"));
+      assertEquals(mult * 1, brown_fox.getDocFreq("multigrams_title"));
+      assertEquals(mult * 1, brown_fox.getConjunctionDocCount("multigrams_title"));
+      
+      assertEquals(mult * 3, brown_fox.getTTF("multigrams_body"));
+      assertEquals(mult * 2, brown_fox.getDocFreq("multigrams_body"));
+      assertEquals(mult * 2, brown_fox.getConjunctionDocCount("multigrams_body"));
+      
+      // never indexed as a phrase (TTF & doc freq are 0), but the conjunction count is not 0
+      final Phrase fox_lazy = phrases.get(6);
+      assertEquals("fox lAzY", fox_lazy.getSubSequence());
+      
+      assertEquals(mult * 0, fox_lazy.getTTF("multigrams_title"));
+      assertEquals(mult * 0, fox_lazy.getDocFreq("multigrams_title"));
+      assertEquals(mult * 1, fox_lazy.getConjunctionDocCount("multigrams_title"));
+      
+      assertEquals(mult * 0, fox_lazy.getTTF("multigrams_body"));
+      assertEquals(mult * 0, fox_lazy.getDocFreq("multigrams_body"));
+      assertEquals(mult * 2, fox_lazy.getConjunctionDocCount("multigrams_body"));
+      
+      // a 4 word phrase: longer than the max indexed phrase size (3) used for extraction below,
+      // asking for its TTF or doc freq is expected to throw, only the conjunction count is usable
+      final Phrase bfld = phrases.get(3);
+      assertEquals("BROWN fox lAzY  dog", bfld.getSubSequence());
+      
+      expectThrows(SolrException.class, () -> { bfld.getTTF("multigrams_title"); });
+      expectThrows(SolrException.class, () -> { bfld.getDocFreq("multigrams_title"); });
+      assertEquals(mult * 0, bfld.getConjunctionDocCount("multigrams_title"));
+      
+      expectThrows(SolrException.class, () -> { bfld.getTTF("multigrams_body"); });
+      expectThrows(SolrException.class, () -> { bfld.getDocFreq("multigrams_body"); });
+      assertEquals(mult * 1, bfld.getConjunctionDocCount("multigrams_body"));
+      
+      // a term with no stats at all in either field
+      final Phrase xyz = phrases.get(phrases.size()-1);
+      
+      assertEquals("xxxyyyzzz", xyz.getSubSequence());
+      assertEquals(mult * 0, xyz.getTTF("multigrams_title"));
+      assertEquals(mult * 0, xyz.getDocFreq("multigrams_title"));
+      assertEquals(mult * 0, xyz.getConjunctionDocCount("multigrams_title"));
+      
+      assertEquals(mult * 0, xyz.getTTF("multigrams_body"));
+      assertEquals(mult * 0, xyz.getDocFreq("multigrams_body"));
+      assertEquals(mult * 0, xyz.getConjunctionDocCount("multigrams_body"));
+      return;
+    };
+
+
+    final List<Phrase> phrasesLocal = Phrase.extractPhrases(input, analysisField, 3, 7);
+    
+    // freshly parsed phrases, w/o any stats populated, all the stats should be 0
+    assertions.accept(0, phrasesLocal);
+
+    // If we populate with our index stats, we should get the basic values in our BiConsumer
+    try (SolrQueryRequest req = req()) {
+      Phrase.populateStats(phrasesLocal, Arrays.asList("multigrams_body","multigrams_title"),
+                           req.getSearcher());
+    }
+    assertions.accept(1, phrasesLocal);
+
+    // likewise, if we create a new freshly parsed set of phrases, and "merge" in the previous index stats
+    // (ie: merge results from one shard) we should get the same results
+    final List<Phrase> phrasesMerged = Phrase.extractPhrases(input, analysisField, 3, 7);
+    Phrase.populateStats(phrasesMerged, Phrase.formatShardResponse(phrasesLocal));
+    assertions.accept(1, phrasesMerged);
+
+    // if we merge in a second copy of the same results (ie: two identical shards)
+    // our results should be double what we had before
+    Phrase.populateStats(phrasesMerged, Phrase.formatShardResponse(phrasesLocal));
+    assertions.accept(2, phrasesMerged);
+    
+  }
+  
+  public void testWhiteboxScores() throws Exception {
+    final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_body");
+    assertNotNull(analysisField);
+    final Map<String,Double> fieldWeights = new TreeMap<>();
+    fieldWeights.put("multigrams_title", 1.0D);
+    fieldWeights.put("multigrams_body", 0.0D); // NOTE: 0 weighting should only affect total score
+    
+    final String input = "xxxyyyzzz BROWN fox why are we lAzY";
+    final List<Phrase> phrases = Phrase.extractPhrases(input, analysisField, 3, 7);
+    try (SolrQueryRequest req = req()) {
+      Phrase.populateStats(phrases, fieldWeights.keySet(), req.getSearcher());
+    }
+    Phrase.populateScores(phrases, fieldWeights, 3, 7);
+
+    // do some basic sanity checks of the field & total scores...
+
+    for (Phrase xyz : phrases.subList(0, 7)) {
+      // first 7 all start with xyz which isn't in index (in either field) so all scores should be -1
+      assertEquals(xyz.toString(), -1.0D, xyz.getTotalScore(), 0.0D);
+      assertEquals(xyz.toString(), -1.0D, xyz.getFieldScore("multigrams_title"), 0.0D);
+      assertEquals(xyz.toString(), -1.0D, xyz.getFieldScore("multigrams_body"), 0.0D);
+    }
+    
+    // any individual terms (past xyz) should score 0.0 because they are all actually in the index
+    // (in both fields)
+    for (Phrase term : phrases.subList(7, phrases.size()).stream().filter
+           ((p -> 1 == p.getPositionLength())).collect(Collectors.toList())) {
+      
+      assertEquals(term.toString(), 0.0D, term.getFieldScore("multigrams_title"), 0.0D);
+      assertEquals(term.toString(), 0.0D, term.getFieldScore("multigrams_body"), 0.0D);
+      assertEquals(term.toString(), 0.0D, term.getTotalScore(), 0.0D);
+    }
+
+    // "brown fox" should score positively in both fields, and overall...
+    final Phrase brown_fox = phrases.get(8);
+    assertEquals("BROWN fox", brown_fox.getSubSequence());
+    assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title"), greaterThan(0.0D));
+    assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_body"), greaterThan(0.0D) );
+    assertThat(brown_fox.toString(), brown_fox.getTotalScore(), greaterThan(0.0D));
+    
+    // "we lazy" does appear in a title value, but should score poorly given how often the terms
+    // are used in other contexts, and should score -1 against body -- but because of our weights,
+    // that shouldn't bring down the total
+    final Phrase we_lazy = phrases.get(phrases.size()-2);
+    assertEquals("we lAzY", we_lazy.getSubSequence());
+    assertEquals(we_lazy.toString(), -1.0D, we_lazy.getFieldScore("multigrams_body"), 0.0D);
+    assertThat(we_lazy.toString(), we_lazy.getFieldScore("multigrams_title"), lessThan(0.0D));
+    assertThat(we_lazy.toString(), we_lazy.getTotalScore(), lessThan(0.0D));
+    assertEquals(we_lazy.toString(), we_lazy.getFieldScore("multigrams_title"), we_lazy.getTotalScore(),
+                 0.0D);
+
+    // "why are we lazy" is longer then the max indexed phrase size & appears verbatim in a title value
+    // it should score -1 against body -- but because of our weights, that shouldn't bring down the total
+    final Phrase wawl = phrases.get(phrases.size()-7);
+    assertEquals("why are we lAzY", wawl.getSubSequence());
+    assertEquals(wawl.toString(), -1.0D, wawl.getFieldScore("multigrams_body"), 0.0D);
+    assertThat(wawl.toString(), wawl.getFieldScore("multigrams_title"), greaterThan(0.0D));
+    assertThat(wawl.toString(), wawl.getTotalScore(), greaterThan(0.0D));
+    assertEquals(wawl.toString(), wawl.getFieldScore("multigrams_title"), wawl.getTotalScore(),
+                 0.0D);
+
+    // "brown fox why are we" is longer then the max indexed phrase, and none of it's
+    // (longest) sub phrases exists in either field -- so all of it's scores should be -1
+    final Phrase bfwaw = phrases.get(11);
+    assertEquals("BROWN fox why are we", bfwaw.getSubSequence());
+    assertEquals(bfwaw.toString(), -1.0D, bfwaw.getFieldScore("multigrams_title"), 0.0D);
+    assertEquals(bfwaw.toString(), -1.0D, bfwaw.getFieldScore("multigrams_body"), 0.0D);
+    assertEquals(bfwaw.toString(), -1.0D, bfwaw.getTotalScore(), 0.0D);
+    
+  }
+  
+  /**
+   * White-box test of phrase scores when one of the scored fields filters stopwords and the
+   * other does not, run twice: once with each of those fields acting as the analysis field
+   * used to extract the candidate phrases.
+   */
+  public void testWhiteboxScorcesStopwords() throws Exception {
+    final String input = "why the lazy dog brown fox";
+    final Map<String,Double> fieldWeights = new TreeMap<>();
+    fieldWeights.put("multigrams_title", 1.0D); 
+    fieldWeights.put("multigrams_title_stop", 1.0D);
+    
+    { // If our analysisField uses all terms,
+      // but we also generate scores from a field that filters stopwords...
+      final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_title");
+      assertNotNull(analysisField);
+      
+      final List<Phrase> phrases = Phrase.extractPhrases(input, analysisField, 3, 7);
+      try (SolrQueryRequest req = req()) {
+        Phrase.populateStats(phrases, fieldWeights.keySet(), req.getSearcher());
+      }
+      Phrase.populateScores(phrases, fieldWeights, 3, 7);
+
+      // phrases that span the stop word should have valid scores from the field that doesn't care
+      // about stop words, but the stopword field should reject them
+      final Phrase why_the_lazy = phrases.get(2);
+      assertEquals("why the lazy", why_the_lazy.getSubSequence());
+      assertThat(why_the_lazy.toString(), why_the_lazy.getFieldScore("multigrams_title"), greaterThan(0.0D) );
+      assertEquals(why_the_lazy.toString(), -1.0D, why_the_lazy.getFieldScore("multigrams_title_stop"), 0.0D);
+      
+      final Phrase the_lazy_dog = phrases.get(8);
+      assertEquals("the lazy dog", the_lazy_dog.getSubSequence());
+      assertThat(the_lazy_dog.toString(), the_lazy_dog.getFieldScore("multigrams_title"), greaterThan(0.0D) );
+      assertEquals(the_lazy_dog.toString(), -1.0D, the_lazy_dog.getFieldScore("multigrams_title_stop"), 0.0D);
+      
+      // sanity check that good scores are still possible with stopwords
+      // "brown fox" should score positively in both fields, and overall...
+      final Phrase brown_fox = phrases.get(phrases.size()-2);
+      assertEquals("brown fox", brown_fox.getSubSequence());
+      assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title"), greaterThan(0.0D));
+      assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title_stop"), greaterThan(0.0D) );
+      assertThat(brown_fox.toString(), brown_fox.getTotalScore(), greaterThan(0.0D));
+    }
+    
+    { // now flip things: our analysisField filters stopwords, 
+      // but we also generate scores from a field that doesn't know about them...
+      //
+      // (NOTE: the parser will still generate _some_ candidate phrases spanning the stop word position,
+      // but not ones that start with the stopword)
+      final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_title_stop");
+      assertNotNull(analysisField);
+      
+      final List<Phrase> phrases = Phrase.extractPhrases(input, analysisField, 3, 7);
+      try (SolrQueryRequest req = req()) {
+        Phrase.populateStats(phrases, fieldWeights.keySet(), req.getSearcher());
+      }
+      Phrase.populateScores(phrases, fieldWeights, 3, 7);
+      assertTrue(phrases.toString(), 0 < phrases.size());
+
+      for (Phrase p : phrases) {
+        // position 2 is where the stopword ("the") sits in the input
+        if (p.getPositionStart() <= 2 && 2 < p.getPositionEnd()) {
+          // unlike the previous case, phrases that span the stop word position are expected
+          // to be rejected (score of -1) by *both* fields
+          // (presumably because the stopword was dropped when the candidate phrases were
+          // analyzed, so neither field can match them -- TODO confirm)
+          assertEquals(p.toString(), -1.0D, p.getFieldScore("multigrams_title"), 0.0D);
+          assertEquals(p.toString(), -1.0D, p.getFieldScore("multigrams_title_stop"), 0.0D);
+        }
+      }
+      
+      // sanity check that good scores are still possible with stopwords
+      // "brown fox" should score positively in both fields, and overall...
+      final Phrase brown_fox = phrases.get(phrases.size()-2);
+      assertEquals("brown fox", brown_fox.getSubSequence());
+      assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title"), greaterThan(0.0D));
+      assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title_stop"), greaterThan(0.0D) );
+      assertThat(brown_fox.toString(), brown_fox.getTotalScore(), greaterThan(0.0D));
+    }
+    
+  }
+  
+  /**
+   * Verifies that assorted invalid phrase requests each fail with a BAD_REQUEST error
+   * whose message contains the expected text.
+   */
+  public void testExpectedUserErrors() throws Exception {
+    final ErrorCode badRequest = ErrorCode.BAD_REQUEST;
+
+    assertQEx("empty field list should error",
+              "must specify a (weighted) list of fields",
+              req("q", "foo",
+                  "phrases", "true",
+                  "phrases.fields", " "),
+              badRequest);
+
+    assertQEx("bogus field name should error",
+              "does not exist",
+              req("q", "foo",
+                  "phrases", "true",
+                  "phrases.fields", "bogus1 bogus2"),
+              badRequest);
+
+    assertQEx("lack of shingles should cause error",
+              "Unable to determine max position length",
+              req("q", "foo",
+                  "phrases", "true",
+                  "phrases.fields", "title"),
+              badRequest);
+
+    assertQEx("analyzer missmatch should cause error",
+              "must have the same fieldType",
+              req("q", "foo",
+                  "phrases", "true",
+                  "phrases.fields", "multigrams_title multigrams_title_short"),
+              badRequest);
+
+    assertQEx("analysis field must exist",
+              "does not exist",
+              req("q", "foo",
+                  "phrases", "true",
+                  "phrases.analysis.field", "bogus",
+                  "phrases.fields", "multigrams_title multigrams_title_short"),
+              badRequest);
+
+    // this one goes through the dedicated handler, and deliberately supplies no query at all
+    assertQEx("no query param should error",
+              "requires a query string",
+              req("qt", HANDLER,
+                  "phrases.fields", "multigrams_title"),
+              badRequest);
+  }
+  
+  /**
+   * Checks the value {@link PhrasesIdentificationComponent#getMaxShingleSize} reports for the
+   * index and query analyzers of three field types from the test schema.
+   */
+  public void testMaxShingleSizeHelper() throws Exception {
+    final IndexSchema schema = h.getCore().getLatestSchema();
+    final String indexThreeQuerySeven = "multigrams_3_7";
+    final String threeForBoth = "multigrams_3";
+    final String plainText = "text";
+
+    // different sizes at index time vs query time
+    assertEquals(3, PhrasesIdentificationComponent.getMaxShingleSize(
+                     schema.getFieldTypeByName(indexThreeQuerySeven).getIndexAnalyzer()));
+    assertEquals(7, PhrasesIdentificationComponent.getMaxShingleSize(
+                     schema.getFieldTypeByName(indexThreeQuerySeven).getQueryAnalyzer()));
+
+    // same size for both analyzers
+    assertEquals(3, PhrasesIdentificationComponent.getMaxShingleSize(
+                     schema.getFieldTypeByName(threeForBoth).getIndexAnalyzer()));
+    assertEquals(3, PhrasesIdentificationComponent.getMaxShingleSize(
+                     schema.getFieldTypeByName(threeForBoth).getQueryAnalyzer()));
+
+    // -1 is what's expected back for this type from either analyzer
+    assertEquals(-1, PhrasesIdentificationComponent.getMaxShingleSize(
+                      schema.getFieldTypeByName(plainText).getIndexAnalyzer()));
+    assertEquals(-1, PhrasesIdentificationComponent.getMaxShingleSize(
+                      schema.getFieldTypeByName(plainText).getQueryAnalyzer()));
+  }
+  
+  /**
+   * Sends phrase identification requests to the {@link #HANDLER} handler, supplying the input
+   * via either "q" or "phrases.q", and checks the returned summary and per-phrase details;
+   * also checks that empty input produces empty phrase info rather than an error.
+   */
+  public void testSimplePhraseRequest() throws Exception {
+    final String input = " did  a Quick    brown FOX perniciously jump over the lazy dog";
+    final String expected = " did  a Quick    {brown FOX} perniciously jump over {the lazy dog}";
+
+    // xpath prefixes shared by the checks below
+    final String phrasesLst = "//lst[@name='phrases']";
+    final String detail = phrasesLst + "/arr[@name='details']/lst";
+
+    // should get same behavior regardless of whether we use "q" or "phrases.q"
+    for (String inputParam : Arrays.asList("q", "phrases.q")) {
+      // basic request...
+      assertQ(req("qt", HANDLER, inputParam, input),
+              // expect no search results...
+              "count(//result)=0",
+
+              // just phrase info...
+              phrasesLst + "/str[@name='input'][.='" + input + "']",
+              phrasesLst + "/str[@name='summary'][.='" + expected + "']",
+              "count(" + detail + ") = 2",
+              //
+              detail + "[1]/str[@name='text'][.='the lazy dog']",
+              detail + "[1]/int[@name='offset_start'][.='50']",
+              detail + "[1]/int[@name='offset_end'][.='62']",
+              detail + "[1]/double[@name='score'][number(.) > 0]",
+              //
+              detail + "[2]/str[@name='text'][.='brown FOX']",
+              detail + "[2]/int[@name='offset_start'][.='17']",
+              detail + "[2]/int[@name='offset_end'][.='26']",
+              detail + "[2]/double[@name='score'][number(.) > 0]");
+
+      // empty input, empty phrases (and no error)...
+      assertQ(req("qt", HANDLER, inputParam, ""),
+              // expect no search results...
+              "count(//result)=0",
+              // just empty phrase info for our empty input...
+              phrasesLst + "/str[@name='input'][.='']",
+              phrasesLst + "/str[@name='summary'][.='']",
+              "count(" + phrasesLst + "/arr[@name='details']) = 1",
+              "count(" + detail + ") = 0");
+    }
+  }
+  
+  public void testSimpleSearchRequests() throws Exception {
+    // Runs searches and checks that phrase info is attached to the response only when
+    // the "phrases" param asks for it.
+    final String quoted = "\"brown fox\"";
+
+    // xpath prefixes shared by the assertions below
+    final String phrases = "//lst[@name='phrases']";
+    final String details = phrases + "/arr[@name='details']";
+    final String detail = details + "/lst";
+    final String noPhrases = "0=count(" + phrases + ")";
+
+    // "phrases" param absent: the search works ...
+    assertQ(req("q", quoted),
+            "//result[@numFound='2']",
+            "//result/doc/str[@name='id'][.='42']",
+            "//result/doc/str[@name='id'][.='43']",
+            // ... but phrases were not requested, so none are reported
+            noPhrases);
+
+    // "phrases" explicitly false: same expectations
+    assertQ(req("phrases", "false", "q", quoted),
+            "//result[@numFound='2']",
+            "//result/doc/str[@name='id'][.='42']",
+            "//result/doc/str[@name='id'][.='43']",
+            noPhrases);
+
+    // the input is so short that each of these request variants must give the same output
+    final List<SolrQueryRequest> variants = Arrays.asList(
+        // simple, relying on the 3/7 defaults
+        req("phrases","true", "q", quoted),
+        // simple, restricted to the 3/3 'short' fields
+        req("phrases","true", "q", quoted,
+            "phrases.fields", "multigrams_body_short multigrams_title_short^2"),
+        // differing analyzers, with an explicit override naming a 3/3 "short" field
+        req("phrases","true", "q", quoted,
+            "phrases.fields", "multigrams_body multigrams_title_short^2",
+            "phrases.analysis.field", "multigrams_title_short"));
+    for (final SolrQueryRequest variant : variants) {
+      assertQ(variant,
+              // the search itself still works ...
+              "//result[@numFound='2']",
+              "//result/doc/str[@name='id'][.='42']",
+              "//result/doc/str[@name='id'][.='43']",
+              // ... and a single phrase is reported for the input
+              phrases + "/str[@name='input'][.='" + quoted + "']",
+              phrases + "/str[@name='summary'][.='\"{brown fox}\"']",
+              "count(" + detail + ")=1",
+              detail + "/str[@name='text'][.='brown fox']",
+              detail + "/int[@name='offset_start'][.='1']",
+              detail + "/int[@name='offset_end'][.='10']",
+              detail + "/double[@name='score'][number(.) > 0]");
+    }
+
+    // "phrases.q" overrides "q" as the text examined for phrases
+    assertQ(req("phrases","true", "q", "*:*", "phrases.q",  quoted),
+            // the search should now have found all docs ...
+            "//result[@numFound='4']",
+            // ... while the phrase info describes the alternative text
+            phrases + "/str[@name='input'][.='" + quoted + "']",
+            phrases + "/str[@name='summary'][.='\"{brown fox}\"']",
+            "count(" + detail + ")=1",
+            detail + "/str[@name='text'][.='brown fox']",
+            detail + "/int[@name='offset_start'][.='1']",
+            detail + "/int[@name='offset_end'][.='10']",
+            detail + "/double[@name='score'][number(.) > 0]");
+
+    // empty "phrases.q": no error, just empty phrase info
+    assertQ(req("phrases","true", "q", "*:*", "phrases.q", ""),
+            // the search should still have found all docs ...
+            "//result[@numFound='4']",
+            // ... and the details array is present but has no entries
+            phrases + "/str[@name='input'][.='']",
+            phrases + "/str[@name='summary'][.='']",
+            "count(" + details + ")     = 1",
+            "count(" + details + "/lst) = 0");
+  }
+  
+  public void testGreyboxShardSearchRequests() throws Exception {
+    // Sends requests flagged as shard requests, varying "phrases" and the shard purpose.
+    final String input = "quick brown fox ran";
+    final String phrase_xpath = "//lst[@name='phrases']";
+    final String all_phrase_xpath = phrase_xpath + "/arr[@name='_all']";
+
+    // phrases requested, and correct request stage / shard purpose ...
+    assertQ(req("q", input,
+                "phrases","true",
+                ShardParams.IS_SHARD, "true",
+                ShardParams.SHARDS_PURPOSE, ""+PhrasesIdentificationComponent.SHARD_PURPOSE)
+            // this shard request should have caused stats to be returned about all phrases...
+            , "10=count("+ all_phrase_xpath +"/lst)"
+            // "quick" ...
+            , all_phrase_xpath + "/lst[1]/lst[@name='ttf']/long[@name='multigrams_body'][.='1']"
+            , all_phrase_xpath + "/lst[1]/lst[@name='ttf']/long[@name='multigrams_title'][.='0']"
+            // ...
+            // "brown fox"
+            , all_phrase_xpath + "/lst[6]/lst[@name='ttf']/long[@name='multigrams_body'][.='3']"
+            , all_phrase_xpath + "/lst[6]/lst[@name='ttf']/long[@name='multigrams_title'][.='1']"
+            , all_phrase_xpath + "/lst[6]/lst[@name='df']/long[@name='multigrams_body'][.='2']"
+            , all_phrase_xpath + "/lst[6]/lst[@name='df']/long[@name='multigrams_title'][.='1']"
+            , all_phrase_xpath + "/lst[6]/lst[@name='conj_dc']/long[@name='multigrams_body'][.='2']"
+            , all_phrase_xpath + "/lst[6]/lst[@name='conj_dc']/long[@name='multigrams_title'][.='1']"
+            // but no computed "scores"...
+            , "0=count("+phrase_xpath+"//*[@name='score'])"
+            );
+
+    // checked up front: the "incorrect purpose" requests below assume these constants differ
+    assertTrue("sanity check failed, stage was modified in code w/o updating test",
+               PhrasesIdentificationComponent.SHARD_PURPOSE != ShardRequest.PURPOSE_GET_FIELDS);
+    // phrases requested, but incorrect request stage / shard purpose ...
+    // (count the "phrases" section itself: shard data sits under arr[@name='_all'], not a direct lst)
+    assertQ(req("q", input,
+                "phrases","true",
+                ShardParams.IS_SHARD, "true",
+                ShardParams.SHARDS_PURPOSE, ""+ShardRequest.PURPOSE_GET_FIELDS)
+            , "0=count("+ phrase_xpath +")");
+
+    // phrases disabled, regardless of request stage / shard purpose ...
+    assertQ(req("q", input,
+                "phrases","false",
+                ShardParams.IS_SHARD, "true",
+                ShardParams.SHARDS_PURPOSE, ""+ShardRequest.PURPOSE_GET_FIELDS)
+            , "0=count("+ phrase_xpath +")");
+    assertQ(req("q", input,
+                "phrases","false",
+                ShardParams.IS_SHARD, "true",
+                ShardParams.SHARDS_PURPOSE, ""+PhrasesIdentificationComponent.SHARD_PURPOSE)
+            , "0=count("+ phrase_xpath +")");
+  }
+
+
+  
+  // ////////////////////////////////////////////////////////////////
+
+
+
+  
+  /**
+   * Small helper asserting that <code>stream</code> yields no elements.  The elements are
+   * collected into a List and compared against an empty List, so that a failure message
+   * shows whichever elements were unexpectedly present.
+   */
+  public <T> void assertEmptyStream(final String msg, final Stream<? extends T> stream) {
+    final List<T> unexpected = stream.collect(Collectors.toList());
+    assertEquals(msg, Collections.emptyList(), unexpected);
+  }
+
+  /** Matches Doubles below <code>expected</code>; null or any non-Double is a mismatch, not an exception */
+  public static Matcher lessThan(double expected) { // docs for future junit/hamcrest seem to have something similar
+    return new BaseMatcher() {
+      @Override public boolean matches(Object actual) {
+        return actual instanceof Double && Double.compare((Double) actual, expected) < 0;
+      }
+      @Override public void describeTo(Description d) {
+        d.appendText("should be less than " + expected);
+      }
+    };
+  }
+  /** Matches Doubles above <code>expected</code>; null or any non-Double is a mismatch, not an exception */
+  public static Matcher greaterThan(double expected) { // docs for future junit/hamcrest seem to have something similar
+    return new BaseMatcher() {
+      @Override public boolean matches(Object actual) {
+        return actual instanceof Double && 0 < Double.compare((Double) actual, expected);
+      }
+      @Override public void describeTo(Description d) {
+        d.appendText("should be greater than " + expected);
+      }
+    };
+  }
+}