Posted to commits@lucene.apache.org by ho...@apache.org on 2020/08/13 00:59:44 UTC

[lucene-solr] branch jira/SOLR-14383 updated: SOLR-14383: minor typo/inconsistency fix in sample doc ids

This is an automated email from the ASF dual-hosted git repository.

hossman pushed a commit to branch jira/SOLR-14383
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/jira/SOLR-14383 by this push:
     new 3095dd2  SOLR-14383: minor typo/inconsistency fix in sample doc ids
3095dd2 is described below

commit 3095dd23958aa3dd9190ca698e1258f994ad374c
Author: Chris Hostetter <ho...@apache.org>
AuthorDate: Wed Aug 12 17:40:49 2020 -0700

    SOLR-14383: minor typo/inconsistency fix in sample doc ids
    
    Also add a unit test of the heuristic for 'safe' of/which params based on _nest_path_ before trying to figure out how to document it clearly
    
    (This is basically the manual 'long form' way of writing child/parent queries using the (hypothetical) syntactic sugar described in SOLR-14687)
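    
    (Roughly, the "long form" is q={!parent which=$parent_filt v=$child_q} for parents and
    q={!child of=$parent_filt v=$parent_q} for children, with $parent_filt built from a
    {!prefix} query on _nest_path_ -- see the parentQueryMaker/childQueryMaker helpers in
    the new test for the exact params)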
---
 .../solr/update/TestNestedUpdateProcessor.java     | 309 +++++++++++++++++++++
 .../src/indexing-nested-documents.adoc             |   2 +-
 solr/solr-ref-guide/src/other-parsers.adoc         |   4 +-
 3 files changed, 313 insertions(+), 2 deletions(-)

diff --git a/solr/core/src/test/org/apache/solr/update/TestNestedUpdateProcessor.java b/solr/core/src/test/org/apache/solr/update/TestNestedUpdateProcessor.java
index 49177a9..83305a8 100644
--- a/solr/core/src/test/org/apache/solr/update/TestNestedUpdateProcessor.java
+++ b/solr/core/src/test/org/apache/solr/update/TestNestedUpdateProcessor.java
@@ -17,7 +17,15 @@
 
 package org.apache.solr.update;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.util.TestUtil;
 
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.common.SolrException;
@@ -25,6 +33,7 @@ import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.update.processor.NestedUpdateProcessorFactory;
 import org.apache.solr.update.processor.UpdateRequestProcessor;
+import org.apache.solr.common.params.SolrParams;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Rule;
@@ -209,4 +218,304 @@ public class TestNestedUpdateProcessor extends SolrTestCaseJ4 {
     updateJ(cmd, null);
     assertU(commit());
   }
+
+  /**
+   * Randomized test to look for flaws in the documented approach for building "safe" values of the 
+   * <code>of</code> / <code>which</code> params in the <code>child</code> / <code>parent</code> QParsers
+   * when a specific <code>_nest_path_</code> is desired
+   *
+   * @see <a href="https://issues.apache.org/jira/browse/SOLR-14687">SOLR-14687</a>
+   */
+  public void testRandomNestPathQueryFiltering() throws Exception {
+
+    // First: build a bunch of complex randomly nested documents, with random "nest paths"
+    // re-use the same "path segments" at various levels of nesting, so as to confuse things even more
+    final RandomNestedDocModel docs = new RandomNestedDocModel();
+    for (int i = 0; i < 50; i++) {
+      final SolrInputDocument rootDoc = docs.buildRandomDoc();
+      assertU(adoc(rootDoc));
+    }
+    assertU(commit());
+
+    // now do some systematic parent/child queries.
+    // we're checking both for "parser errors" (ie: children matching "parent filter")
+    // as well as that the test_path_s of all matching docs meets our expectations
+
+    // *:* w/ parent parser...
+    // starts at "root" parent_path and recurses until we get no (expected) results
+    assertTrue(// we expect at least one query for every "real" path,
+               // but there will be more because we'll try lots of sub-paths that have no docs
+               docs.numDocsDescendentFromPath.keySet().size()
+               < docs.recursiveCheckParentQueryOfAllChildren(Collections.<String>emptyList()));
+    // sanity check: path that is guaranteed not to exist...
+    assertEquals(1, docs.recursiveCheckParentQueryOfAllChildren(Arrays.asList("xxx", "yyy")));
+
+    // *:* w/ child parser...
+    // starts at "root" parent_path and recurses until we get no (expected) results
+    assertTrue(// we expect at least one query for every "real" path,
+               // but there will be more because we'll try lots of sub-paths that have no docs
+               docs.numDocsWithPathWithKids.keySet().size()
+               < docs.recursiveCheckChildQueryOfAllParents(Collections.<String>emptyList()));
+    // sanity check: path that is guaranteed not to exist...
+    assertEquals(1, docs.recursiveCheckChildQueryOfAllParents(Arrays.asList("xxx", "yyy")));
+
+    // querying against individual child ids w/ both parent & child parsers...
+    docs.checkParentAndChildQueriesOfEachDocument();
+  }
+  
+  private static class RandomNestedDocModel {
+    public static final List<String> PATH_ELEMENTS = Arrays.asList("aa", "bb", "cc", "dd");
+
+    private final Map<String,SolrInputDocument> allDocs = new HashMap<>();
+    
+    public final Map<String,Integer> numDocsDescendentFromPath = new HashMap<>();
+    public final Map<String,Integer> numDocsWithPathWithKids = new HashMap<>();
+      
+    private int idCounter = 0;
+
+    public synchronized SolrInputDocument buildRandomDoc() {
+      return buildRandomDoc(null, Collections.<String>emptyList(), 15);
+    }
+    private static String joinPath(List<String> test_path) {
+      return "/" + String.join("/", test_path);
+    }
+    private synchronized SolrInputDocument buildRandomDoc(SolrInputDocument parent,
+                                                          List<String> test_path,
+                                                          int maxDepthAndBreadth) {
+      final String path_string = joinPath(test_path);
+      final String id = "" + (++idCounter);
+      maxDepthAndBreadth--;
+      final SolrInputDocument doc = sdoc
+        ("id", id,
+         // may change, but we want it to be 0 even if we never add any kids
+         "num_direct_kids_s", "0", 
+         // conceptually matches _nest_path_ but should be easier to make assertions about (no inline position #s)
+         "test_path_s", path_string);
+      if (null != parent) {
+        // order matters: if we add the Collection first, SolrInputDocument will try to reuse it
+        doc.addField("ancestor_ids_ss", parent.getFieldValue("id"));
+        if (parent.containsKey("ancestor_ids_ss")) { // sigh: getFieldValues returns null, not empty collection
+          doc.addField("ancestor_ids_ss", parent.getFieldValues("ancestor_ids_ss"));
+        }
+      }
+      
+      for (int i = 0; i < test_path.size(); i++) {
+        // NOTE: '<' not '<=' ... we only include paths we are descendants of, not our full path...
+        numDocsDescendentFromPath.merge(joinPath(test_path.subList(0, i)), 1, Math::addExact);
+      }
+      
+      if (0 < maxDepthAndBreadth) {
+        final int numDirectKids = TestUtil.nextInt(random(), 0, Math.min(4, maxDepthAndBreadth));
+        doc.setField("num_direct_kids_s", "" + numDirectKids);
+        if (0 < numDirectKids) {
+          numDocsWithPathWithKids.merge(path_string, 1, Math::addExact);
+        }
+        maxDepthAndBreadth -= numDirectKids;
+        for (int i = 0; i < numDirectKids; i++) {
+          final String kidType = PATH_ELEMENTS.get(random().nextInt(PATH_ELEMENTS.size()));
+          final List<String> kid_path = new ArrayList<>(test_path);
+          kid_path.add(kidType);
+          final SolrInputDocument kid = buildRandomDoc(doc, kid_path, maxDepthAndBreadth);
+          doc.addField(kidType, kid);
+          // order matters: if we add the Collection first, SolrInputDocument will try to reuse it
+          doc.addField("descendent_ids_ss", kid.getFieldValue("id"));
+          if (kid.containsKey("descendent_ids_ss")) {  // sigh: getFieldValues returns null, not empty collection
+            doc.addField("descendent_ids_ss", kid.getFieldValues("descendent_ids_ss"));
+          }
+        }
+      }
+      allDocs.put(id, doc);
+      return doc;
+    }
+
+
+    /** 
+     * Loops over the 'model' of every document we've indexed, asserting that
+     * parent/child queries wrapping an <code>id:foo</code> query, using various paths,
+     * match the expected ancestors/descendants
+     */
+    public void checkParentAndChildQueriesOfEachDocument() {
+      assertFalse("You didn't build any docs", allDocs.isEmpty());
+      
+      for (String doc_id : allDocs.keySet()) {
+        final String doc_path = allDocs.get(doc_id).getFieldValue("test_path_s").toString();
+        
+        if ( ! doc_path.equals("/") ) {
+          
+          // doc_id -> descendentId must have at least one ancestor (since it's not a root level document)
+          final String descendentId = doc_id;
+          assert allDocs.get(descendentId).containsKey("ancestor_ids_ss");
+          final List<Object> allAncestorIds = new ArrayList<>(allDocs.get(descendentId).getFieldValues("ancestor_ids_ss"));
+          
+          // pick a random ancestor to use in our testing...
+          final String ancestorId = allAncestorIds.get(random().nextInt(allAncestorIds.size())).toString();
+          final String ancestor_path = allDocs.get(ancestorId).getFieldValue("test_path_s").toString();
+          
+          final Collection<Object> allOfAncestorsDescendentIds
+            = allDocs.get(ancestorId).getFieldValues("descendent_ids_ss");
+          
+          assertTrue("Sanity check " + ancestorId + " ancestor of " + descendentId,
+                     allOfAncestorsDescendentIds.contains(descendentId));
+          
+          // now we should be able to assert that a 'parent' query wrapped around a query for the descendentId
+          // using the ancestor_path should match exactly one doc: our ancestorId...
+          assertQ(req(parentQueryMaker(ancestor_path, "id:" + descendentId),
+                      "_trace_path_tested", ancestor_path,
+                      "fl", "id",
+                      "indent", "true")
+                  , "//result/@numFound=1"
+                  , "//doc/str[@name='id'][.='"+ancestorId+"']"
+                  );
+          
+          // meanwhile, a 'child' query wrapped around a query for the ancestorId, using the ancestor_path,
+          // should match all of its descendants (for simplicity we'll check just the numFound and the
+          // 'descendentId' we started with)
+          assertQ(req(childQueryMaker(ancestor_path, "id:" + ancestorId),
+                      "_trace_path_tested", ancestor_path,
+                      "rows", "9999",
+                      "fl", "id",
+                      "indent", "true")
+                  , "//result/@numFound="+allOfAncestorsDescendentIds.size()
+                  , "//doc/str[@name='id'][.='"+descendentId+"']"
+                  );
+          
+        }
+        
+        // regardless of whether doc_id has an ancestor or not, a 'parent' query with a path that isn't a
+        // prefix of the path of the (child) doc_id in the wrapped query should match 0 docs w/o failing
+        assertQ(req(parentQueryMaker("/xxx/yyy", "id:" + doc_id),
+                    "_trace_path_tested", "/xxx/yyy",
+                    "indent", "true")
+                , "//result/@numFound=0");
+        
+        // likewise: a 'child' query wrapped around a query for our doc_id (regardless of whether it has
+        // any kids), using a path that doesn't start with the same prefix as doc_id, should match 0
+        // docs w/o failing
+        assertQ(req(childQueryMaker("/xxx/yyy", "id:" + doc_id),
+                    "_trace_path_tested", "/xxx/yyy",
+                    "indent", "true")
+                , "//result/@numFound=0");
+        
+        // lastly: wrapping a child query around a query for our doc_id, using a path that "extends" 
+        // the doc_id's path should always get 0 results if that path doesn't match any actual kids
+        // (regardless of whether doc_id has any children/descendants)
+        assertQ(req(childQueryMaker(doc_path + "/xxx/yyy", "id:" + doc_id),
+                    "_trace_path_tested", doc_path + "/xxx/yyy",
+                    "indent", "true")
+                , "//result/@numFound=0");
+      }
+    }
+
+    
+    /** 
+     * Recursively check path permutations using a <code>*:*</code> inner query, asserting that the
+     * only docs matched have the expected path and at least one kid (since this is the "parent" parser).
+     *
+     * (Using <code>*:*</code> as our inner query keeps the validation simple and also helps expose the
+     * risk of matching incorrect docs if the 'which' param is wrong.)
+     *
+     * @return total number of queries checked (assuming no assertion failures)
+     */
+    public int recursiveCheckParentQueryOfAllChildren(List<String> parent_path) {
+      final String p = joinPath(parent_path);
+      final int expectedParents = numDocsWithPathWithKids.getOrDefault(p, 0);
+      assertQ(req(parentQueryMaker(p, "*:*"),
+                  "rows", "9999",
+                  "_trace_path_tested", p,
+                  "fl", "test_path_s,num_direct_kids_s",
+                  "indent", "true")
+              , "//result/@numFound="+expectedParents
+              , "count(//doc)="+expectedParents
+              , "count(//doc/str[@name='test_path_s'][.='"+p+"'])="+expectedParents
+              , "0=count(//doc/str[@name='num_direct_kids_s'][.='0'])"
+              );
+      int numChecked = 1;
+
+      // no point in recursing on the current path if we already found no results...
+      if (0 < expectedParents) {
+        for (String next : PATH_ELEMENTS) {
+          final List<String> next_path = new ArrayList<>(parent_path);
+          next_path.add(next);
+          numChecked += recursiveCheckParentQueryOfAllChildren(next_path);
+        }
+      }
+      return numChecked;
+    }
+    
+    /** 
+     * This implements the "safe query based on parent path" rules we're sanity checking.
+     *
+     * @param parent_path the nest path of the parents to consider
+     * @param inner_child_query the specific children whose ancestors we are looking for; must be a simple query string such as <code>*:*</code> or <code>id:foo</code>
+     */
+    private SolrParams parentQueryMaker(String parent_path, String inner_child_query) {
+      assertValidPathSyntax(parent_path);
+
+      if (parent_path.equals("/")) {
+        return params("q", "{!parent which=$parent_filt v=$child_q}",
+                      "parent_filt", "(*:* -_nest_path_:*)",
+                      "child_q", "(+" + inner_child_query + " +_nest_path_:*)");
+      } // else...
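+      // Illustrative expansion (sample values only): with parent_path="/aa/bb" and
+      // inner_child_query="id:42" the returned params are equivalent to:
+      //   q           = {!parent which=$parent_filt v=$child_q}
+      //   parent_filt = (*:* -{!prefix f='_nest_path_' v='/aa/bb/'})
+      //   child_q     = (+id:42 +{!prefix f='_nest_path_' v='/aa/bb/'})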
+      return params("q", "{!parent which=$parent_filt v=$child_q})",
+                    "parent_filt", "(*:* -{!prefix f='_nest_path_' v='"+parent_path+"/'})",
+                    "child_q", "(+" + inner_child_query + " +{!prefix f='_nest_path_' v='"+parent_path+"/'})");
+    }
+
+    /** 
+     * Recursively check path permutations using a <code>*:*</code> inner query, asserting that the
+     * only docs matched have paths that include the specified path as a (strict) prefix.
+     *
+     * (Using <code>*:*</code> as our inner query keeps the validation simple and also helps expose the
+     * risk of matching incorrect docs if the 'of' param is wrong.)
+     *
+     * @return total number of queries checked (assuming no assertion failures)
+     */
+    public int recursiveCheckChildQueryOfAllParents(List<String> parent_path) {
+      final String p = joinPath(parent_path);
+      final int expectedMatches = numDocsDescendentFromPath.getOrDefault(p, 0);
+      assertQ(req(childQueryMaker(p, "*:*"),
+                  "rows", "9999",
+                  "_trace_path_tested", p,
+                  "fl", "test_path_s",
+                  "indent", "true")
+              , "//result/@numFound="+expectedMatches
+              , "count(//doc)="+expectedMatches
+              , "count(//doc/str[@name='test_path_s'][starts-with(., '"+p+"')])="+expectedMatches
+              );
+      int numChecked = 1;
+
+      // no point in recursing on the current path if we already found no results...
+      if (0 < expectedMatches) {
+        for (String next : PATH_ELEMENTS) {
+          final List<String> next_path = new ArrayList<>(parent_path);
+          next_path.add(next);
+          numChecked += recursiveCheckChildQueryOfAllParents(next_path);
+        }
+      }
+      return numChecked;
+    }
+
+    /** 
+     * This implements the "safe query based on parent path" rules we're sanity checking.
+     *
+     * @param parent_path the nest path of the parents to consider
+     * @param inner_parent_query the specific parents whose descendants we are looking for; must be a simple query string such as <code>*:*</code> or <code>id:foo</code>
+     */
+    private SolrParams childQueryMaker(String parent_path, String inner_parent_query) {
+      assertValidPathSyntax(parent_path);
+      if (parent_path.equals("/")) {
+        return params("q", "{!child of=$parent_filt v=$parent_q})",
+                      "parent_filt", "(*:* -_nest_path_:*)",
+                      "parent_q", "(+" + inner_parent_query + " -_nest_path_:*)");
+      } // else...
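+      // Illustrative expansion (sample values only): with parent_path="/aa/bb" and
+      // inner_parent_query="id:42" the returned params are equivalent to:
+      //   q           = {!child of=$parent_filt v=$parent_q}
+      //   parent_filt = (*:* -{!prefix f='_nest_path_' v='/aa/bb/'})
+      //   parent_q    = (+id:42 +{!field f='_nest_path_' v='/aa/bb'})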
+      return params("q", "{!child of=$parent_filt v=$parent_q})",
+                    "parent_filt", "(*:* -{!prefix f='_nest_path_' v='"+parent_path+"/'})",
+                    "parent_q", "(+" + inner_parent_query + " +{!field f='_nest_path_' v='"+parent_path+"'})");
+    }
+
+    private void assertValidPathSyntax(String path) {
+      assert path.startsWith("/");
+      assert (1 == path.length()) ^ ! path.endsWith("/");
+    }
+  }
 }
diff --git a/solr/solr-ref-guide/src/indexing-nested-documents.adoc b/solr/solr-ref-guide/src/indexing-nested-documents.adoc
index 85c6943..4c8c9ce 100644
--- a/solr/solr-ref-guide/src/indexing-nested-documents.adoc
+++ b/solr/solr-ref-guide/src/indexing-nested-documents.adoc
@@ -85,7 +85,7 @@ Even though the child documents in these examples are provided syntactically as
    "skus": [ { "id": "P22!S22",
                "color_s": "RED",
                "price_i": 89,
-               "manuals": [ { "id": "P21!D41",
+               "manuals": [ { "id": "P22!D42",
                               "name_s": "Red Mont Blanc Brochure",
                               "content_t": "..."
                             } ]
diff --git a/solr/solr-ref-guide/src/other-parsers.adoc b/solr/solr-ref-guide/src/other-parsers.adoc
index d0bc731..788cdb3 100644
--- a/solr/solr-ref-guide/src/other-parsers.adoc
+++ b/solr/solr-ref-guide/src/other-parsers.adoc
@@ -32,7 +32,9 @@ Many of these parsers are expressed the same way as <<local-parameters-in-querie
 // nocommit: so people can understand them (and so other sections of the ref-guide can link here for
 // nocommit: actual explanation)
 
-
+// nocommit: find a way to write up a succinct & clear description of the process/heuristic demonstrated in
+// nocommit: TestNestedUpdateProcessor.testRandomNestPathQueryFiltering
+// nocommit: AKA: the "manual" form of the syntactic sugar proposed in SOLR-14687
 
 There are two query parsers that support block joins. These parsers allow indexing and searching for relational content that has been <<indexing-nested-documents.adoc#indexing-nested-documents, indexed as Nested Documents>>.