You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by ho...@apache.org on 2021/06/21 21:47:43 UTC

[solr] branch main updated: SOLR-15487: fix CommonGramsFilter docs in ref-guide, add solr tests

This is an automated email from the ASF dual-hosted git repository.

hossman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new 498f9c4  SOLR-15487: fix CommonGramsFilter docs in ref-guide, add solr tests
498f9c4 is described below

commit 498f9c4c42a0824eaaef1e312992a5eb41da4e3e
Author: Chris Hostetter <ho...@apache.org>
AuthorDate: Mon Jun 21 14:47:35 2021 -0700

    SOLR-15487: fix CommonGramsFilter docs in ref-guide, add solr tests
    
    The new test assertions have some comments related to unexpected behavior from CommonGramsQueryFilter, currently being tracked in LUCENE-10007
---
 .../test-files/solr/collection1/conf/schema.xml    |  28 ++++++
 .../solr/analysis/CommonGramsPhraseQueryTest.java  | 101 +++++++++++++++++++++
 .../handler/FieldAnalysisRequestHandlerTest.java   |  70 ++++++++++++++
 solr/solr-ref-guide/src/filter-descriptions.adoc   |  32 +++++--
 4 files changed, 223 insertions(+), 8 deletions(-)

diff --git a/solr/core/src/test-files/solr/collection1/conf/schema.xml b/solr/core/src/test-files/solr/collection1/conf/schema.xml
index 5c3d483..c55d86b 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema.xml
@@ -497,6 +497,34 @@
       <tokenizer class="solr.WhitespaceTokenizerFactory"/>
     </analyzer>
   </fieldType>
+
+  <!-- CommonGrams for phrase queries -->
+  <dynamicField name="*_commongrams" type="commongrams" />
+  <fieldType name="commongrams" class="solr.TextField" indexed="true" stored="true">
+    <analyzer type="index">
+      <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+      <filter class="solr.CommonGramsFilterFactory"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+      <filter class="solr.CommonGramsQueryFilterFactory"/>
+    </analyzer>
+  </fieldType>
+  <dynamicField name="*_commongrams_stop" type="commongrams_stop" />
+  <fieldType name="commongrams_stop" class="solr.TextField" indexed="true" stored="true">
+    <analyzer type="index">
+      <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+      <filter class="solr.CommonGramsFilterFactory"/>
+      <filter class="solr.StopFilterFactory"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+      <filter class="solr.CommonGramsQueryFilterFactory"/>
+      <filter class="solr.StopFilterFactory"/>
+    </analyzer>
+  </fieldType>
+  <copyField source="*_commongrams" dest="*_commongrams_stop" />
+  
   <fieldType name="severityType" class="${solr.tests.EnumFieldType}" enumsConfig="enumsConfig.xml" enumName="severity"/>
   
   <fieldType name="binary" class="solr.BinaryField" />
diff --git a/solr/core/src/test/org/apache/solr/analysis/CommonGramsPhraseQueryTest.java b/solr/core/src/test/org/apache/solr/analysis/CommonGramsPhraseQueryTest.java
new file mode 100644
index 0000000..6f2d014
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/analysis/CommonGramsPhraseQueryTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.util.Arrays;
+
+import org.apache.solr.SolrTestCaseJ4;
+
+import org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory;
+import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilterFactory;
+
+import org.junit.BeforeClass;
+
+/**
+ * Validate that using {@link CommonGramsFilterFactory} at index time with {@link CommonGramsQueryFilterFactory} at query time 
+ * has the expected results when doing phrase queries - even if stop words are removed from the index.
+ */
+public class CommonGramsPhraseQueryTest extends SolrTestCaseJ4 {
+
+  @BeforeClass
+  public static void setupIndex() throws Exception {
+    initCore("solrconfig.xml","schema.xml");
+
+    assertU(adoc("id", "1", 
+                 "x_commongrams", "the quick and the dead man"));
+    assertU(adoc("id", "2", 
+                 "x_commongrams", "a longer field that also mentions the quick and the dead man plus extra stuff"));
+    assertU(adoc("id", "3", 
+                 "x_commongrams", "not a dead man"));
+
+    assertU(commit());
+  }
+
+  public void testCommonGrams() throws Exception {
+    testCommonQueries("x_commongrams");
+    // individual stop words should also match in this field....
+    for (String word : Arrays.asList("the", "and")) {
+      assertQ(req("x_commongrams:" + word)
+              ,"//*[@numFound='2']"
+              ,"//str[@name='id' and .='1']"
+              ,"//str[@name='id' and .='2']"
+              );
+    }
+    assertQ(req("x_commongrams:a")
+            ,"//*[@numFound='2']"
+            ,"//str[@name='id' and .='2']"
+            ,"//str[@name='id' and .='3']"
+            );
+    assertQ(req("x_commongrams:not")
+            ,"//*[@numFound='1']"
+            ,"//str[@name='id' and .='3']"
+            );
+  }
+  public void testCommonGramsStop() throws Exception {
+    testCommonQueries("x_commongrams_stop");
+    // individual stop words should not match anything in this field...
+    for (String word : Arrays.asList("the", "and", "not", "a")) {
+      assertQ(req("x_commongrams_stop:" + word)
+              ,"//*[@numFound='0']");
+    }
+  }
+  
+  protected void testCommonQueries(final String f) throws Exception {
+    // match 2...
+    for (String phrase : Arrays.asList("the quick and the dead", "the quick", "and the dead", "and the", "the dead man", "quick")) {
+      assertQ(req(f + ":\""+phrase+"\"")
+              ,"//*[@numFound='2']"
+              ,"//str[@name='id' and .='1']"
+              ,"//str[@name='id' and .='2']"
+              );
+    }
+    assertQ(req(f + ":quick") // just for the hell of it, let's also check this as a term and not a phase
+            ,"//*[@numFound='2']"
+            ,"//str[@name='id' and .='1']"
+            ,"//str[@name='id' and .='2']"
+            );
+    // match all...
+    for (String qs : Arrays.asList("dead", "man", "\"dead man\"")) {
+      assertQ(req(f + ":" + qs)
+            ,"//*[@numFound='3']"
+            ,"//str[@name='id' and .='1']"
+            ,"//str[@name='id' and .='2']"
+            ,"//str[@name='id' and .='3']"
+            );
+    }
+  }
+}
diff --git a/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java b/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
index 5ab5a18..8f5f449 100644
--- a/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
@@ -27,6 +27,8 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
+import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 import org.apache.lucene.analysis.tokenattributes.FlagsAttributeImpl;
@@ -442,6 +444,74 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
     assertToken(tokenList.get(5), new TokenInfo("test", null, "word", 14, 18, 5, new int[]{4,5,5}, null, false));
   }
 
+  @SuppressWarnings({"unchecked"})
+  public void testCommonGrams() throws Exception {
+
+    final FieldAnalysisRequest request = new FieldAnalysisRequest();
+    final String fieldType = "commongrams";
+    request.addFieldType(fieldType);
+    request.setFieldValue("the quick and the dead");
+    request.setQuery("and the dead man");
+    request.setShowMatch(true);
+
+    @SuppressWarnings({"rawtypes"})
+    NamedList<NamedList> result = handler.handleAnalysisRequest(request, h.getCore().getLatestSchema());
+    assertTrue("result is null and it shouldn't be", result != null);
+
+    @SuppressWarnings({"rawtypes"})
+    NamedList<NamedList> fieldTypes = result.get("field_types");
+    assertNotNull("field_types should never be null", fieldTypes);
+    @SuppressWarnings({"rawtypes"})
+    NamedList<NamedList> type = fieldTypes.get(fieldType);
+    assertNotNull("expecting result for field type: " + fieldType, type);
+
+    @SuppressWarnings({"rawtypes"})
+    NamedList<List<NamedList>> indexPart = type.get("index");
+    assertNotNull("expecting an index token analysis for field type: " + fieldType, indexPart);
+
+    @SuppressWarnings({"rawtypes"})
+    List<NamedList> tokenList = indexPart.get(WhitespaceTokenizer.class.getName());
+    assertNotNull("Expcting WhitespaceTokenizer analysis breakdown", tokenList);
+    assertEquals(tokenList.size(), 5);
+    assertToken(tokenList.get(0), new TokenInfo("the", null, "word", 0, 3, 1, new int[]{1}, null, false));
+    assertToken(tokenList.get(1), new TokenInfo("quick", null, "word", 4, 9, 2, new int[]{2}, null, false));
+    assertToken(tokenList.get(2), new TokenInfo("and", null, "word", 10, 13, 3, new int[]{3}, null, false));
+    assertToken(tokenList.get(3), new TokenInfo("the", null, "word", 14, 17, 4, new int[]{4}, null, false));
+    assertToken(tokenList.get(4), new TokenInfo("dead", null, "word", 18, 22, 5, new int[]{5}, null, false));
+    tokenList = indexPart.get(CommonGramsFilter.class.getName());
+    assertNotNull("Expcting CommonGramsFilter analysis breakdown", tokenList);
+    assertEquals(tokenList.size(), 9);
+    assertToken(tokenList.get(0), new TokenInfo("the", null, "word", 0, 3, 1, new int[]{1,1}, null, false));
+    assertToken(tokenList.get(1), new TokenInfo("the_quick", null, "gram", 0, 9, 1, new int[]{2,1}, null, false));
+    assertToken(tokenList.get(2), new TokenInfo("quick", null, "word", 4, 9, 2, new int[]{2,2}, null, false));
+    assertToken(tokenList.get(3), new TokenInfo("quick_and", null, "gram", 4, 13, 2, new int[]{3,2}, null, false));
+    assertToken(tokenList.get(4), new TokenInfo("and", null, "word", 10, 13, 3, new int[]{3,3}, null, false));
+    assertToken(tokenList.get(5), new TokenInfo("and_the", null, "gram", 10, 17, 3, new int[]{4,3}, null, true));
+    assertToken(tokenList.get(6), new TokenInfo("the", null, "word", 14, 17, 4, new int[]{4,4}, null, false));
+    assertToken(tokenList.get(7), new TokenInfo("the_dead", null, "gram", 14, 22, 4, new int[]{5,4}, null, true));
+    assertToken(tokenList.get(8), new TokenInfo("dead", null, "word", 18, 22, 5, new int[]{5,5}, null, true));
+    
+    @SuppressWarnings({"rawtypes"})
+    NamedList<List<NamedList>> queryPart = type.get("query");
+    assertNotNull("expecting a query token analysis for field type: " + fieldType, queryPart);
+
+    tokenList = queryPart.get(WhitespaceTokenizer.class.getName());
+    assertNotNull("Expecting WhitespaceTokenizer analysis breakdown", tokenList);
+    assertEquals(tokenList.size(), 4);
+    assertToken(tokenList.get(0), new TokenInfo("and", null, "word", 0, 3, 1, new int[]{1}, null, false));
+    assertToken(tokenList.get(1), new TokenInfo("the", null, "word", 4, 7, 2, new int[]{2}, null, false));
+    assertToken(tokenList.get(2), new TokenInfo("dead", null, "word", 8, 12, 3, new int[]{3}, null, false));
+    assertToken(tokenList.get(3), new TokenInfo("man", null, "word", 13, 16, 4, new int[]{4}, null, false));
+    tokenList = queryPart.get(CommonGramsQueryFilter.class.getName());
+    assertNotNull("Expcting CommonGramsQueryFilter analysis breakdown", tokenList);
+    // Hmmm... Not clear if "dead" should really be here, but it's what the filter currently produces, see: LUCENE-10007 
+    assertEquals(4, tokenList.size()); // LUCENE-10007 
+    assertToken(tokenList.get(0), new TokenInfo("and_the", null, "gram", 0, 7, 1, new int[]{2,1}, null, false));
+    assertToken(tokenList.get(1), new TokenInfo("the_dead", null, "gram", 4, 12, 2, new int[]{3,2}, null, false));
+    assertToken(tokenList.get(2), new TokenInfo("dead", null, "word", 8, 12, 3, new int[]{3,3}, null, false)); // LUCENE-10007
+    assertToken(tokenList.get(3), new TokenInfo("man", null, "word", 13, 16, 4, new int[]{4,4}, null, false));
+  }
+  
   @Test
   public void testSpatial() throws Exception {
     FieldAnalysisRequest request = new FieldAnalysisRequest();
diff --git a/solr/solr-ref-guide/src/filter-descriptions.adoc b/solr/solr-ref-guide/src/filter-descriptions.adoc
index 399cf04..7d811ba 100644
--- a/solr/solr-ref-guide/src/filter-descriptions.adoc
+++ b/solr/solr-ref-guide/src/filter-descriptions.adoc
@@ -240,7 +240,9 @@ This filter takes the output of the <<tokenizers.adoc#classic-tokenizer,Classic
 
 == Common Grams Filter
 
-This filter creates word shingles by combining common tokens such as stop words with regular tokens. This is useful for creating phrase queries containing common words, such as "the cat." Solr normally ignores stop words in queried phrases, so searching for "the cat" would return all matches for the word "cat."
+This filter for use in `index` time analysis creates word shingles by combining common tokens such as stop words with regular tokens.  This can result in an index with more unique terms, but is useful for creating phrase queries containing common words, such as "the cat", in a way that will typically be much faster than if the combined tokens are not used, because only the term positions of documents containing both terms in sequence have to be considered.  Correct usage requires being paired with <<#common-grams-query-filter,Common Grams Query Filter>> during `query` time analysis. [NOTE(review): sentence ending reconstructed — original line truncated by the mail archive; confirm against the commit.]
+
+These filters can also be combined with <<#stop-filter,Stop Filter>> so searching for `"the cat"` would match different documents than `"a cat"`, while pathological searches for either `"the"` or `"a"` would not match any documents.
 
 *Factory class:* `solr.CommonGramsFilterFactory`
 
@@ -261,10 +263,14 @@ This filter creates word shingles by combining common tokens such as stop words
 [.tab-label]*With name*
 [source,xml]
 ----
-<analyzer>
-  <tokenizer name="standard"/>
+<analyzer type="index">
+  <tokenizer name="whitespace"/>
   <filter name="commonGrams" words="stopwords.txt" ignoreCase="true"/>
 </analyzer>
+<analyzer type="query">
+  <tokenizer name="whitespace"/>
+  <filter name="commonGramsQuery" words="stopwords.txt" ignoreCase="true"/>
+</analyzer>
 ----
 ====
 [example.tab-pane#byclass-filter-commongrams]
@@ -272,19 +278,29 @@ This filter creates word shingles by combining common tokens such as stop words
 [.tab-label]*With class name (legacy)*
 [source,xml]
 ----
-<analyzer>
-  <tokenizer class="solr.StandardTokenizerFactory"/>
+<analyzer type="index">
+  <tokenizer class="solr.WhitespaceTokenizerFactory"/>
   <filter class="solr.CommonGramsFilterFactory" words="stopwords.txt" ignoreCase="true"/>
 </analyzer>
+<analyzer type="query">
+  <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+  <filter class="solr.CommonGramsQueryFilterFactory" words="stopwords.txt" ignoreCase="true"/>
+</analyzer>
 ----
 ====
 --
 
-*In:* "the Cat"
+*In:* "the cat in the hat"
+
+*Tokenizer to Filter(s):* "the", "cat", "in", "the", "hat"
+
+*(Index) Out:* "the"(1), "the_cat"(1), "cat"(2), "cat_in"(2), "in"(3), "in_the"(3), "the"(4), "the_hat"(4), "hat"(5)
+
+*(Query) Out:* "the_cat"(1), "cat_in"(2), "in_the"(3), "the_hat"(4)
 
-*Tokenizer to Filter:* "the", "Cat"
+== Common Grams Query Filter
 
-*Out:* "the_cat"
+This filter is used for the `query` time analysis aspect of <<#common-grams-filter,Common Grams Filter>> -- see that filter for a description of arguments, example configuration, and sample input/output.
 
 == Collation Key Filter