You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2018/05/22 19:59:19 UTC
[42/50] [abbrv] lucene-solr:jira/solr-11779: SOLR-9480: A new
'relatedness()' aggregate function for JSON Faceting to enable building
Semantic Knowledge Graphs
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/669b9e7a/solr/core/src/test/org/apache/solr/cloud/TestCloudJSONFacetSKG.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudJSONFacetSKG.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudJSONFacetSKG.java
new file mode 100644
index 0000000..35f23d0
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudJSONFacetSKG.java
@@ -0,0 +1,654 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.cloud;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.lang.StringUtils;
+
+import org.apache.lucene.util.LuceneTestCase.Slow;
+import org.apache.lucene.util.TestUtil;
+import org.apache.solr.client.solrj.SolrClient;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.embedded.JettySolrRunner;
+import org.apache.solr.client.solrj.impl.CloudSolrClient;
+import org.apache.solr.client.solrj.impl.HttpSolrClient;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.QueryRequest;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.search.facet.FacetField;
+import static org.apache.solr.search.facet.RelatednessAgg.computeRelatedness;
+import static org.apache.solr.search.facet.RelatednessAgg.roundTo5Digits;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>
+ * A randomized test of nested facets using the <code>relatedness()</code> function, that asserts the
+ * accuracy of the results for all the buckets returned, using verification queries of the (expected)
+ * foreground &amp; background queries based on the nested facet terms.
+ * <p>
+ * Note that unlike normal facet "count" verification, using a high limit + overrequest isn't a substitute
+ * for refinement in order to ensure accurate "skg" computation across shards. For that reason, this
+ * test forces <code>refine: true</code> (unlike {@link TestCloudJSONFacetJoinDomain}) and specifies a
+ * <code>domain: { 'query':'{!v=$back}' }</code> for every facet, in order to guarantee that all popularity
+ * &amp; relatedness values returned can be proven with validation requests.
+ * </p>
+ * <p>
+ * (Refinement alone is not enough. Using the background query as the facet domain is necessary to
+ * prevent situations where a single shardX may return a candidate bucket with no child-buckets due to
+ * the normal facet intersections, but when refined on other shardY(s), can produce "high scoring"
+ * SKG child-buckets, which would then be missing the foreground/background "size" contributions from
+ * shardX.)
+ * </p>
+ *
+ *
+ *
+ * @see TestCloudJSONFacetJoinDomain
+ */
+@Slow
+public class TestCloudJSONFacetSKG extends SolrCloudTestCase {
+
+ private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+ private static final String DEBUG_LABEL = MethodHandles.lookup().lookupClass().getName();
+ private static final String COLLECTION_NAME = DEBUG_LABEL + "_collection";
+
+ private static final int DEFAULT_LIMIT = FacetField.DEFAULT_FACET_LIMIT;
+ private static final int MAX_FIELD_NUM = 15;
+ private static final int UNIQUE_FIELD_VALS = 50;
+
+ /** Multivalued string field suffixes that can be randomized for testing diff facet/join code paths */
+ private static final String[] STR_FIELD_SUFFIXES = new String[] { "_ss", "_sds", "_sdsS" };
+ /** Multivalued int field suffixes that can be randomized for testing diff facet/join code paths */
+ private static final String[] INT_FIELD_SUFFIXES = new String[] { "_is", "_ids", "_idsS" };
+
+ /** A basic client for operations at the cloud level, default collection will be set */
+ private static CloudSolrClient CLOUD_CLIENT;
+ /** One client per node */
+ private static ArrayList<HttpSolrClient> CLIENTS = new ArrayList<>(5);
+
+ /**
+ * One-time class setup: configures a mini SolrCloud cluster, creates the test collection,
+ * opens one HTTP client per Jetty node, and indexes a randomized set of documents.
+ * <p>
+ * Every document gets, for each field number below {@link #MAX_FIELD_NUM}, between zero and ten
+ * random values; each value is added to both the string field and the int field for that number.
+ * </p>
+ */
+ @BeforeClass
+ private static void createMiniSolrCloudCluster() throws Exception {
+ // sanity check constants
+ assertTrue("bad test constants: some suffixes will never be tested",
+ (STR_FIELD_SUFFIXES.length < MAX_FIELD_NUM) && (INT_FIELD_SUFFIXES.length < MAX_FIELD_NUM));
+
+ // we need DVs on point fields to compute stats & facets
+ if (Boolean.getBoolean(NUMERIC_POINTS_SYSPROP)) System.setProperty(NUMERIC_DOCVALUES_SYSPROP,"true");
+
+ // multi replicas should not matter...
+ final int repFactor = usually() ? 1 : 2;
+ // ... but we definitely want to test multiple shards
+ final int numShards = TestUtil.nextInt(random(), 1, (usually() ? 2 :3));
+ // node count is (shards x replicas)
+ final int numNodes = (numShards * repFactor);
+
+ final String configName = DEBUG_LABEL + "_config-set";
+ final Path configDir = Paths.get(TEST_HOME(), "collection1", "conf");
+
+ configureCluster(numNodes).addConfig(configName, configDir).configure();
+
+ // collection level overrides of which solrconfig / schema files (from the config set) to use
+ Map<String, String> collectionProperties = new LinkedHashMap<>();
+ collectionProperties.put("config", "solrconfig-tlog.xml");
+ collectionProperties.put("schema", "schema_latest.xml");
+ CollectionAdminRequest.createCollection(COLLECTION_NAME, configName, numShards, repFactor)
+ .setProperties(collectionProperties)
+ .process(cluster.getSolrClient());
+
+ // NOTE: this is the cluster's own client instance, not a new one
+ CLOUD_CLIENT = cluster.getSolrClient();
+ CLOUD_CLIENT.setDefaultCollection(COLLECTION_NAME);
+
+ waitForRecoveriesToFinish(CLOUD_CLIENT);
+
+ // one collection-specific HTTP client per jetty node
+ for (JettySolrRunner jetty : cluster.getJettySolrRunners()) {
+ CLIENTS.add(getHttpSolrClient(jetty.getBaseUrl() + "/" + COLLECTION_NAME + "/"));
+ }
+
+ final int numDocs = atLeast(100);
+ for (int id = 0; id < numDocs; id++) {
+ SolrInputDocument doc = sdoc("id", ""+id);
+ for (int fieldNum = 0; fieldNum < MAX_FIELD_NUM; fieldNum++) {
+ // NOTE: some docs may have no value in a field
+ final int numValsThisDoc = TestUtil.nextInt(random(), 0, (usually() ? 5 : 10));
+ for (int v = 0; v < numValsThisDoc; v++) {
+ final String fieldValue = randFieldValue(fieldNum);
+
+ // for each fieldNum, there are actually two fields: one string, and one integer
+ doc.addField(field(STR_FIELD_SUFFIXES, fieldNum), fieldValue);
+ doc.addField(field(INT_FIELD_SUFFIXES, fieldNum), fieldValue);
+ }
+ }
+ CLOUD_CLIENT.add(doc);
+ if (random().nextInt(100) < 1) {
+ CLOUD_CLIENT.commit(); // commit 1% of the time to create new segments
+ }
+ if (random().nextInt(100) < 5) {
+ CLOUD_CLIENT.add(doc); // duplicate the doc 5% of the time to create deleted docs
+ }
+ }
+ // final commit so everything added above is searchable
+ CLOUD_CLIENT.commit();
+ }
+
+ /**
+ * Builds the (deterministic) field name for a field number: <code>"field_" + fieldNum</code>
+ * followed by one of the specified suffixes, chosen by taking the field number modulo the
+ * number of suffixes.
+ *
+ * @param suffixes the (static) array of candidate suffixes
+ * @param fieldNum the (random) field number, must be less than {@link #MAX_FIELD_NUM}
+ * @see #STR_FIELD_SUFFIXES
+ * @see #INT_FIELD_SUFFIXES
+ * @see #MAX_FIELD_NUM
+ * @see #randFieldValue
+ */
+ private static String field(final String[] suffixes, final int fieldNum) {
+   assert fieldNum < MAX_FIELD_NUM;
+
+   final int suffixIdx = fieldNum % suffixes.length;
+   return "field_" + fieldNum + suffixes[suffixIdx];
+ }
+ private static String strfield(final int fieldNum) {
+ return field(STR_FIELD_SUFFIXES, fieldNum);
+ }
+ private static String intfield(final int fieldNum) {
+ return field(INT_FIELD_SUFFIXES, fieldNum);
+ }
+
+ /**
+ * Given a (random) field number, returns a random (integer based) value for that field.
+ * NOTE: The number of unique values in each field is constant acording to {@link #UNIQUE_FIELD_VALS}
+ * but the precise <em>range</em> of values will vary for each unique field number, such that cross field joins
+ * will match fewer documents based on how far apart the field numbers are.
+ *
+ * @see #UNIQUE_FIELD_VALS
+ * @see #field
+ */
+ private static String randFieldValue(final int fieldNum) {
+ return "" + (fieldNum + TestUtil.nextInt(random(), 1, UNIQUE_FIELD_VALS));
+ }
+
+
+ /**
+ * Closes and forgets every client opened during class setup.
+ * <p>
+ * This is deliberately tolerant of a setup that failed part way through: the cloud client
+ * is only closed if it was actually assigned, so a cleanup NPE can not mask the original failure.
+ * The per-node client list is emptied rather than set to null, because the list is only ever
+ * instantiated by its field initializer and must remain usable afterwards.
+ * </p>
+ */
+ @AfterClass
+ private static void afterClass() throws Exception {
+   if (null != CLOUD_CLIENT) {
+     CLOUD_CLIENT.close();
+     CLOUD_CLIENT = null;
+   }
+   try {
+     for (HttpSolrClient client : CLIENTS) {
+       client.close();
+     }
+   } finally {
+     // even if a close() failed, don't leave stale clients behind
+     CLIENTS.clear();
+   }
+ }
+
+ /**
+ * Runs a handful of small, hand crafted (but non-trivial) facet requests that are much
+ * easier to trace/debug than a pure random monstrosity.
+ * (ie: if something obvious gets broken, this test may fail faster and in a more obvious way than testRandom)
+ */
+ public void testBespoke() throws Exception {
+   // trivial single level facet
+   assertBespokeFacet("top1", new TermFacet(strfield(9), UNIQUE_FIELD_VALS, 0, null));
+
+   // trivial single level facet w/sorting on skg
+   assertBespokeFacet("top2", new TermFacet(strfield(9), UNIQUE_FIELD_VALS, 0, "skg desc"));
+
+   // trivial single level facet w/ 2 diff ways to request "limit = (effectively) Infinite"
+   // to sanity check refinement of buckets missing from other shard in both cases
+   //
+   // NOTE that these two requests *should* be effectively identical, given that the very large
+   // limit value is big enough that no shard will ever return that many terms, but the "limit=-1"
+   // case actually triggers slightly different code paths because it causes
+   // FacetField.returnsPartial() to be "true"
+   final int[] unlimitedLimits = new int[] { 999999999, -1 };
+   for (int i = 0; i < unlimitedLimits.length; i++) {
+     final int limit = unlimitedLimits[i];
+     assertBespokeFacet("top_facet_limit__" + limit, new TermFacet(strfield(9), limit, 0, "skg desc"));
+   }
+ }
+
+ /**
+ * Helper for {@link #testBespoke}: verifies a single top level facet, registered under the
+ * specified key, using the fixed query/foreground/background used by all of the bespoke checks,
+ * and asserts that at least one bucket was actually verified.
+ */
+ private void assertBespokeFacet(final String facetKey, final TermFacet facet) throws Exception {
+   final Map<String,TermFacet> facets = new LinkedHashMap<>();
+   facets.put(facetKey, facet);
+   final AtomicInteger maxBuckets = new AtomicInteger(UNIQUE_FIELD_VALS);
+   assertFacetSKGsAreCorrect(maxBuckets, facets, strfield(7)+":11", strfield(5)+":9", "*:*");
+   assertTrue("Didn't check a single bucket???", maxBuckets.get() < UNIQUE_FIELD_VALS);
+ }
+
+ public void testRandom() throws Exception {
+
+ // since the "cost" of verifying the stats for each bucket is so high (see TODO in verifySKGResults())
+ // we put a safety valve in place on the maximum number of buckets that we are willing to verify
+ // across *all* the queries that we do.
+ // that way if the randomized queries we build all have relatively small facets, so be it, but if
+ // we get a really big one early on, we can test as much as possible, skip other iterations.
+ //
+ // (deeply nested facets may contain more buckets then the max, but we won't *check* all of them)
+ final int maxBucketsAllowed = atLeast(2000);
+ final AtomicInteger maxBucketsToCheck = new AtomicInteger(maxBucketsAllowed);
+
+ final int numIters = atLeast(10);
+ for (int iter = 0; iter < numIters && 0 < maxBucketsToCheck.get(); iter++) {
+
+ assertFacetSKGsAreCorrect(maxBucketsToCheck, TermFacet.buildRandomFacets(),
+ buildRandomQuery(), buildRandomQuery(), buildRandomQuery());
+ }
+ assertTrue("Didn't check a single bucket???", maxBucketsToCheck.get() < maxBucketsAllowed);
+
+
+ }
+
+ /**
+ * Produces a random query string over the randomized fields/values in the index: occasionally
+ * the match-all query, otherwise a disjunction of 3 to 10 random <code>field:value</code> clauses.
+ *
+ * @see #randFieldValue
+ * @see #field
+ */
+ private static String buildRandomQuery() {
+   final boolean matchAll = (0 == TestUtil.nextInt(random(), 0,10));
+   if (matchAll) {
+     return "*:*";
+   }
+   final String[] clauses = new String[TestUtil.nextInt(random(), 3, 10)];
+   for (int i = 0; i < clauses.length; i++) {
+     final int fieldNum = random().nextInt(MAX_FIELD_NUM);
+     // keep queries simple, just use str fields - not point of test
+     clauses[i] = strfield(fieldNum) + ":" + randFieldValue(fieldNum);
+   }
+   return buildORQuery(clauses);
+ }
+
+ private static String buildORQuery(String... clauses) {
+ assert 0 < clauses.length;
+ return "(" + StringUtils.join(clauses, " OR ") + ")";
+ }
+
+ /**
+ * Executes the specified query with a <code>json.facet</code> built from the expected term facets
+ * (using the specified foreground/background queries), then asserts that the SKG stats of every
+ * facet term returned match the results of executing the equivalent queries in isolation.
+ *
+ * @param maxBucketsToCheck shared budget of buckets that may still be verified
+ * @param expected the facets to request, keyed by facet name
+ * @param query the main query string
+ * @param foreQ the foreground query string (sent as the <code>fore</code> param)
+ * @param backQ the background query string (sent as the <code>back</code> param)
+ * @see #verifySKGResults
+ */
+ private void assertFacetSKGsAreCorrect(final AtomicInteger maxBucketsToCheck,
+                                        Map<String,TermFacet> expected,
+                                        final String query,
+                                        final String foreQ,
+                                        final String backQ) throws SolrServerException, IOException {
+   final SolrParams baseParams = params("rows","0", "fore", foreQ, "back", backQ);
+
+   final String jsonFacet = ""+TermFacet.toJSONFacetParamValue(expected,null);
+   final SolrParams initParams
+     = SolrParams.wrapAppended(params("q", query, "json.facet", jsonFacet), baseParams);
+
+   log.info("Doing full run: {}", initParams);
+
+   QueryResponse rsp = null;
+   // JSON Facets not (currently) available from QueryResponse...
+   NamedList topNamedList = null;
+   try {
+     rsp = (new QueryRequest(initParams)).process(getRandClient(random()));
+     assertNotNull(initParams + " is null rsp?", rsp);
+     topNamedList = rsp.getResponse();
+     assertNotNull(initParams + " is null topNamedList?", topNamedList);
+   } catch (Exception e) {
+     throw new RuntimeException("init query failed: " + initParams + ": " +
+                                e.getMessage(), e);
+   }
+   try {
+     final NamedList facetResponse = (NamedList) topNamedList.get("facets");
+     assertNotNull("null facet results?", facetResponse);
+     final long numFound = rsp.getResults().getNumFound();
+     assertEquals("numFound mismatch with top count?",
+                  numFound, ((Number)facetResponse.get("count")).longValue());
+     // when the query matches nothing, we should expect no top level facets
+     final Map<String,TermFacet> toVerify
+       = (0 == numFound) ? Collections.<String,TermFacet>emptyMap() : expected;
+     assertFacetSKGsAreCorrect(maxBucketsToCheck, toVerify, baseParams, facetResponse);
+   } catch (AssertionError e) {
+     throw new AssertionError(initParams + " ===> " + topNamedList + " --> " + e.getMessage(), e);
+   } finally {
+     log.info("Ending full run");
+   }
+ }
+
+ /**
+ * Recursive helper method that walks the actual facet response, comparing the SKG results to
+ * the expected output based on the equivalent filters generated from the original TermFacet.
+ *
+ * @param maxBucketsToCheck shared budget, decremented once per verified bucket; this invocation
+ * returns as soon as it drops to zero or below
+ * @param expected the facets expected at this level of the response, keyed by facet name
+ * @param baseParams the fore/back params, plus one <code>fq</code> per ancestor bucket
+ * @param actualFacetResponse the portion of the response in which to find the expected facet keys
+ */
+ private void assertFacetSKGsAreCorrect(final AtomicInteger maxBucketsToCheck,
+ final Map<String,TermFacet> expected,
+ final SolrParams baseParams,
+ final NamedList actualFacetResponse) throws SolrServerException, IOException {
+
+ for (Map.Entry<String,TermFacet> entry : expected.entrySet()) {
+ final String facetKey = entry.getKey();
+ final TermFacet facet = entry.getValue();
+ final NamedList results = (NamedList) actualFacetResponse.get(facetKey);
+ assertNotNull(facetKey + " key missing from: " + actualFacetResponse, results);
+ final List<NamedList> buckets = (List<NamedList>) results.get("buckets");
+ assertNotNull(facetKey + " has null buckets: " + actualFacetResponse, buckets);
+
+ if (buckets.isEmpty()) {
+ // should only happen if the background query does not match any docs with field X
+ final long docsWithField = getNumFound(params("_trace", "noBuckets",
+ "rows", "0",
+ "q", facet.field+":[* TO *]",
+ "fq", baseParams.get("back")));
+
+ assertEquals(facetKey + " has no buckets, but docs in background exist with field: " + facet.field,
+ 0, docsWithField);
+ }
+
+ // NOTE: it's important that we do this depth first -- not just because it's the easiest way to do it,
+ // but because it means that our maxBucketsToCheck will ensure we do a lot of deep sub-bucket checking,
+ // not just all the buckets of the top level(s) facet(s)
+ for (NamedList bucket : buckets) {
+ final String fieldVal = bucket.get("val").toString(); // int or stringified int
+
+ verifySKGResults(facetKey, facet, baseParams, fieldVal, bucket);
+ // NOTE: this only exits the current level of recursion; each caller further up will verify
+ // one more of its own buckets before reaching this same check and returning as well
+ if (maxBucketsToCheck.decrementAndGet() <= 0) {
+ return;
+ }
+
+ // sub-facets of this bucket are verified with this bucket's term added as an extra filter
+ final SolrParams verifyParams = SolrParams.wrapAppended(baseParams,
+ params("fq", facet.field + ":" + fieldVal));
+
+ // recursively check subFacets
+ if (! facet.subFacets.isEmpty()) {
+ assertFacetSKGsAreCorrect(maxBucketsToCheck, facet.subFacets, verifyParams, bucket);
+ }
+ }
+ }
+ }
+
+ /**
+ * Verifies that the popularity & relatedness values contained in a single SKG bucket
+ * match the expected values based on the facet field & bucket value, as well the existing
+ * filterParams.
+ * <p>
+ * Four independent count requests are issued (foreground size, background size, foreground count
+ * and background count); the expected values are then computed from those counts and compared to
+ * the <code>skg</code> entry of the bucket.
+ * </p>
+ *
+ * @param facetKey name of the facet the bucket belongs to (only used in assertion messages)
+ * @param facet the facet the bucket belongs to, supplies the field name
+ * @param filterParams supplies the <code>fore</code> and <code>back</code> params, layered under the foreground requests
+ * @param fieldVal the term of the bucket being verified
+ * @param bucket the bucket being verified, must contain an <code>skg</code> entry
+ * @see #assertFacetSKGsAreCorrect
+ */
+ private void verifySKGResults(String facetKey, TermFacet facet, SolrParams filterParams,
+ String fieldVal, NamedList<Object> bucket)
+ throws SolrServerException, IOException {
+
+ final String bucketQ = facet.field+":"+fieldVal;
+ final NamedList<Object> skgBucket = (NamedList<Object>) bucket.get("skg");
+ assertNotNull(facetKey + "/bucket:" + bucket.toString(), skgBucket);
+
+ // TODO: make this more efficient?
+ // ideally we'd do a single query w/4 facet.queries, one for each count
+ // but formatting the queries is a pain, currently we leverage the accumulated fq's
+
+ // foreground requests are layered over filterParams (which is where $fore is defined)
+ final long fgSize = getNumFound(SolrParams.wrapAppended(params("_trace", "fgSize",
+ "rows","0",
+ "q","{!query v=$fore}"),
+ filterParams));
+ // background requests are built from scratch: only the "back" query is taken from filterParams
+ final long bgSize = getNumFound(params("_trace", "bgSize",
+ "rows","0",
+ "q", filterParams.get("back")));
+
+ final long fgCount = getNumFound(SolrParams.wrapAppended(params("_trace", "fgCount",
+ "rows","0",
+ "q","{!query v=$fore}",
+ "fq", bucketQ),
+ filterParams));
+ final long bgCount = getNumFound(params("_trace", "bgCount",
+ "rows","0",
+ "q", bucketQ,
+ "fq", filterParams.get("back")));
+
+ // NOTE(review): foreground popularity is expected to be relative to the *background* size (the
+ // assertion message says the same), presumably mirroring RelatednessAgg -- confirm
+ assertEquals(facetKey + "/bucket:" + bucket + " => fgPop should be: " + fgCount + " / " + bgSize,
+ roundTo5Digits((double) fgCount / bgSize),
+ skgBucket.get("foreground_popularity"));
+ assertEquals(facetKey + "/bucket:" + bucket + " => bgPop should be: " + bgCount + " / " + bgSize,
+ roundTo5Digits((double) bgCount / bgSize),
+ skgBucket.get("background_popularity"));
+ assertEquals(facetKey + "/bucket:" + bucket + " => relatedness is wrong",
+ roundTo5Digits(computeRelatedness(fgCount, fgSize, bgCount, bgSize)),
+ skgBucket.get("relatedness"));
+
+ }
+
+
+ /**
+ * Trivial data structure for modeling a simple terms facet that can be written out as a json.facet param.
+ *
+ * Doesn't do any string escaping or quoting, so don't use whitespace or reserved json characters
+ */
+ private static final class TermFacet {
+   /** name of the field to facet on, never null */
+   public final String field;
+   /** nested facets keyed by facet name (insertion ordered), empty if there are none */
+   public final Map<String,TermFacet> subFacets = new LinkedHashMap<>();
+   public final Integer limit; // may be null
+   public final Integer overrequest; // may be null
+   public final String sort; // may be null
+
+   /** Simplified constructor asks for limit = # unique vals */
+   public TermFacet(String field) {
+     this(field, UNIQUE_FIELD_VALS, 0, "skg desc");
+   }
+
+   /**
+    * @param field name of the field to facet on, must not be null
+    * @param limit value for the "limit" option, or null to leave it unspecified
+    * @param overrequest value for the "overrequest" option, or null to leave it unspecified
+    * @param sort value for the "sort" option, or null to leave it unspecified
+    */
+   public TermFacet(String field, Integer limit, Integer overrequest, String sort) {
+     assert null != field;
+     this.field = field;
+     this.limit = limit;
+     this.overrequest = overrequest;
+     this.sort = sort;
+   }
+
+   /**
+    * recursively generates the <code>json.facet</code> param value to use for testing this facet
+    */
+   private CharSequence toJSONFacetParamValue() {
+     // options that are null are left out of the request entirely
+     final String limitStr = (null == limit) ? "" : (", limit:" + limit);
+     final String overrequestStr = (null == overrequest) ? "" : (", overrequest:" + overrequest);
+     final String sortStr = (null == sort) ? "" : (", sort: '" + sort + "'");
+     final StringBuilder sb
+       = new StringBuilder("{ type:terms, field:" + field + limitStr + overrequestStr + sortStr);
+
+     // see class javadocs for why we always use refine:true & the query:$back domain for this test.
+     sb.append(", refine: true, domain: { query: '{!v=$back}' }, facet:");
+     sb.append(toJSONFacetParamValue(subFacets, "skg : 'relatedness($fore,$back)'"));
+     sb.append("}");
+     return sb;
+   }
+
+   /**
+    * Given a set of (possibly nested) facets, generates a suitable <code>json.facet</code> param value to
+    * use for testing them against in a solr request.
+    *
+    * @param facets the facets to write out, keyed by facet name; must not be null
+    * @param extraJson additional JSON to append after the facets, or null for none
+    * @return the param value, or the empty string if there are no facets and no extra JSON
+    */
+   public static CharSequence toJSONFacetParamValue(final Map<String,TermFacet> facets,
+                                                    final String extraJson) {
+     assert null != facets;
+     if (0 == facets.size() && null == extraJson) {
+       return "";
+     }
+
+     StringBuilder sb = new StringBuilder("{ processEmpty: true, ");
+     for (Map.Entry<String,TermFacet> entry : facets.entrySet()) {
+       sb.append(entry.getKey()).append(" : ").append(entry.getValue().toJSONFacetParamValue());
+       sb.append(" ,");
+     }
+     if (null == extraJson) {
+       // nothing else follows, so drop the trailing comma left by the last facet
+       sb.setLength(sb.length() - 1);
+     } else {
+       sb.append(extraJson);
+     }
+     sb.append("}");
+     return sb;
+   }
+
+   /**
+    * Factory method for generating some random facets.
+    *
+    * For simplicity, each facet will have a unique key name.
+    */
+   public static Map<String,TermFacet> buildRandomFacets() {
+     // for simplicity, use a unique facet key regardless of depth - simplifies verification
+     // and lets us enforce a hard limit on the total number of facets in a request
+     AtomicInteger keyCounter = new AtomicInteger(0);
+
+     final int maxDepth = TestUtil.nextInt(random(), 0, (usually() ? 2 : 3));
+     return buildRandomFacets(keyCounter, maxDepth);
+   }
+
+   /**
+    * picks a random value for the "sort" param, biased in favor of interesting test cases
+    *
+    * @param r the source of randomness used for every choice made by this method
+    * @return a sort string (w/direction), or null to specify nothing (trigger default behavior)
+    * @see #randomLimitParam
+    */
+   public static String randomSortParam(Random r) {
+
+     // IMPORTANT!!!
+     // if this method is modified to produce new sorts, make sure to update
+     // randomLimitParam to account for them if they are impacted by SOLR-12343
+     final String dir = r.nextBoolean() ? "asc" : "desc";
+     switch(r.nextInt(4)) {
+       case 0: return null;
+       case 1: return "count " + dir;
+       case 2: return "skg " + dir;
+       case 3: return "index " + dir;
+       default: throw new RuntimeException("Broken case statement");
+     }
+   }
+
+   /**
+    * picks a random value for the "limit" param, biased in favor of interesting test cases
+    *
+    * <p>
+    * <b>NOTE:</b> Due to SOLR-12343, we have to force an overrequest of "all" possible terms for
+    * some sort values.
+    * </p>
+    *
+    * @param r the source of randomness
+    * @param sort the sort that will be used with the limit, may be null
+    * @return a number to specify in the request, or null to specify nothing (trigger default behavior)
+    * @see #UNIQUE_FIELD_VALS
+    * @see #randomSortParam
+    */
+   public static Integer randomLimitParam(Random r, final String sort) {
+     if (null != sort) {
+       if (sort.equals("count asc") || sort.startsWith("skg")) {
+         // of the known types of sorts produced, these are at risk of SOLR-12343
+         // so request (effectively) unlimited num buckets
+         return r.nextBoolean() ? UNIQUE_FIELD_VALS : -1;
+       }
+     }
+     final int limit = 1 + r.nextInt((int) (UNIQUE_FIELD_VALS * 1.5F));
+     if (limit >= UNIQUE_FIELD_VALS && r.nextBoolean()) {
+       return -1; // unlimited
+     } else if (limit == DEFAULT_LIMIT && r.nextBoolean()) {
+       return null; // sometimes, don't specify limit if it's the default
+     }
+     return limit;
+   }
+
+   /**
+    * picks a random value for the "overrequest" param, biased in favor of interesting test cases.
+    *
+    * @param r the source of randomness
+    * @return a number to specify in the request, or null to specify nothing (trigger default behavior)
+    * @see #UNIQUE_FIELD_VALS
+    */
+   public static Integer randomOverrequestParam(Random r) {
+     switch(r.nextInt(10)) {
+       case 0:
+       case 1:
+       case 2:
+       case 3:
+         return 0; // 40% of the time, disable overrequest to better stress refinement
+       case 4:
+       case 5:
+         return r.nextInt(UNIQUE_FIELD_VALS); // 20% ask for less than what's needed
+       case 6:
+         return r.nextInt(Integer.MAX_VALUE); // 10%: completely random value, statistically more than enough
+       default: break;
+     }
+     // else.... either leave param unspecified (or redundantly specify the -1 default)
+     return r.nextBoolean() ? null : -1;
+   }
+
+   /**
+    * recursive helper method for building random facets
+    *
+    * @param keyCounter used to ensure every generated facet has a unique key name
+    * @param maxDepth max possible depth allowed for the recursion, a lower value may be used depending on how many facets are returned at the current level.
+    */
+   private static Map<String,TermFacet> buildRandomFacets(AtomicInteger keyCounter, int maxDepth) {
+     final int numFacets = Math.max(1, TestUtil.nextInt(random(), -1, 3)); // 3/5th chance of being '1'
+     Map<String,TermFacet> results = new LinkedHashMap<>();
+     for (int i = 0; i < numFacets; i++) {
+       if (keyCounter.get() < 3) { // a hard limit on the total number of facets (regardless of depth) to reduce OOM risk
+
+         final String sort = randomSortParam(random());
+         final Integer limit = randomLimitParam(random(), sort);
+         final Integer overrequest = randomOverrequestParam(random());
+         final TermFacet facet = new TermFacet(field((random().nextBoolean()
+                                                      ? STR_FIELD_SUFFIXES : INT_FIELD_SUFFIXES),
+                                                     random().nextInt(MAX_FIELD_NUM)),
+                                               limit, overrequest, sort);
+         results.put("facet_" + keyCounter.incrementAndGet(), facet);
+         if (0 < maxDepth) {
+           // if we're going wide, don't go deep
+           final int nextMaxDepth = Math.max(0, maxDepth - numFacets);
+           facet.subFacets.putAll(buildRandomFacets(keyCounter, TestUtil.nextInt(random(), 0, nextMaxDepth)));
+         }
+       }
+     }
+     return results;
+   }
+ }
+
+ /**
+ * Picks a SolrClient at random: either one of the per-node HttpSolrClients, or the
+ * CloudSolrClient (each being equally likely).
+ */
+ public static SolrClient getRandClient(Random rand) {
+   final int numClients = CLIENTS.size();
+   // the upper bound is inclusive: the one index past the end of the list selects the cloud client
+   final int idx = TestUtil.nextInt(rand, 0, numClients);
+   if (idx != numClients) {
+     return CLIENTS.get(idx);
+   }
+   return CLOUD_CLIENT;
+ }
+
+ /**
+ * Executes a request using a random SolrClient, and returns only the numFound of the results
+ * @see #getRandClient
+ */
+ public static long getNumFound(final SolrParams req) throws SolrServerException, IOException {
+   final SolrClient client = getRandClient(random());
+   final QueryResponse rsp = client.query(req);
+   return rsp.getResults().getNumFound();
+ }
+
+ /**
+ * Delegates to {@link AbstractDistribZkTestBase#waitForRecoveriesToFinish} for the default
+ * collection of the specified client (which must have one set).
+ */
+ public static void waitForRecoveriesToFinish(CloudSolrClient client) throws Exception {
+   final String collection = client.getDefaultCollection();
+   assert null != collection;
+   AbstractDistribZkTestBase.waitForRecoveriesToFinish(collection, client.getZkStateReader(),
+                                                       true, true, 330);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/669b9e7a/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java b/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java
index d562076..84c34e1 100644
--- a/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java
+++ b/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java
@@ -1015,6 +1015,16 @@ public class QueryEqualityTest extends SolrTestCaseJ4 {
"currency(amount,USD)",
"currency('amount',USD)");
}
+ /**
+ * The relatedness function must be considered equal whether its foreground and background
+ * queries are specified inline, or via param dereferencing.
+ */
+ public void testFuncRelatedness() throws Exception {
+   final SolrQueryRequest req = req("fore","foo_s:front", "back","foo_s:back");
+   try {
+     final String inline = "agg_relatedness({!query v='foo_s:front'}, {!query v='foo_s:back'})";
+     final String viaParams = "agg_relatedness($fore, $back)";
+     assertFuncEquals(req, inline, viaParams);
+   } finally {
+     req.close();
+   }
+ }
public void testTestFuncs() throws Exception {
assertFuncEquals("sleep(1,5)", "sleep(1,5)");
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/669b9e7a/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java b/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java
index 8eed4c7..c7ce7c3 100644
--- a/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java
+++ b/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java
@@ -19,6 +19,7 @@ package org.apache.solr.search.facet;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;
+import java.util.function.IntFunction;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queries.function.ValueSource;
@@ -88,8 +89,8 @@ public class DebugAgg extends AggValueSource {
}
@Override
- public void collect(int doc, int slot) throws IOException {
- sub.collect(doc, slot);
+ public void collect(int doc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
+ sub.collect(doc, slot, slotContext);
}
@Override
@@ -126,8 +127,8 @@ public class DebugAgg extends AggValueSource {
}
@Override
- public int collect(DocSet docs, int slot) throws IOException {
- return sub.collect(docs, slot);
+ public int collect(DocSet docs, int slot, IntFunction<SlotContext> slotContext) throws IOException {
+ return sub.collect(docs, slot, slotContext);
}
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/669b9e7a/solr/core/src/test/org/apache/solr/search/facet/DistributedFacetSimpleRefinementLongTailTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/search/facet/DistributedFacetSimpleRefinementLongTailTest.java b/solr/core/src/test/org/apache/solr/search/facet/DistributedFacetSimpleRefinementLongTailTest.java
index 0612755..ea3b5ef 100644
--- a/solr/core/src/test/org/apache/solr/search/facet/DistributedFacetSimpleRefinementLongTailTest.java
+++ b/solr/core/src/test/org/apache/solr/search/facet/DistributedFacetSimpleRefinementLongTailTest.java
@@ -373,7 +373,10 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute
NamedList<NamedList> all_facets = (NamedList) queryServer
( params( "q", "*:*", "shards", getShardsString(), "rows" , "0", "json.facet",
"{ foo : { " + commonJson + " field: foo_s, facet: { " +
- ALL_STATS_JSON + " bar: { " + commonJson + " field: bar_s, facet: { " + ALL_STATS_JSON + "} } } } }"
+ ALL_STATS_JSON + " bar: { " + commonJson + " field: bar_s, facet: { " + ALL_STATS_JSON +
+ // under bar, in addition to "ALL" simple stats, we also ask for skg...
+ ", skg : 'relatedness($skg_fore,$skg_back)' } } } } }",
+ "skg_fore", STAT_FIELD+":[0 TO 40]", "skg_back", STAT_FIELD+":[-10000 TO 10000]"
) ).getResponse().get("facets");
assertNotNull(all_facets);
@@ -411,7 +414,7 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute
List<NamedList> tail_bar_buckets = (List) ((NamedList)tail_Bucket.get("bar")).get("buckets");
NamedList tailB_Bucket = tail_bar_buckets.get(0);
- assertEquals(ALL_STATS.size() + 2, tailB_Bucket.size()); // val,count ... NO SUB FACETS
+ assertEquals(ALL_STATS.size() + 3, tailB_Bucket.size()); // val,count,skg ... NO SUB FACETS
assertEquals("tailB", tailB_Bucket.get("val"));
assertEquals(17L, tailB_Bucket.get("count"));
assertEquals(35L, tailB_Bucket.get("min"));
@@ -423,6 +426,18 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute
assertEquals(16910.0D, (double) tailB_Bucket.get("sumsq"), 0.1E-7);
// assertEquals(1.78376517D, (double) tailB_Bucket.get("stddev"), 0.1E-7); // TODO: SOLR-11725
assertEquals(1.70782513D, (double) tailB_Bucket.get("stddev"), 0.1E-7); // json.facet is using the "uncorrected stddev"
+
+ // check the SKG stats on our tailB bucket
+ NamedList tailB_skg = (NamedList) tailB_Bucket.get("skg");
+ assertEquals(tailB_skg.toString(),
+ 3, tailB_skg.size());
+ assertEquals(0.19990D, tailB_skg.get("relatedness"));
+ assertEquals(0.00334D, tailB_skg.get("foreground_popularity"));
+ assertEquals(0.00334D, tailB_skg.get("background_popularity"));
+ //assertEquals(12L, tailB_skg.get("foreground_count"));
+ //assertEquals(82L, tailB_skg.get("foreground_size"));
+ //assertEquals(12L, tailB_skg.get("background_count"));
+ //assertEquals(3591L, tailB_skg.get("background_size"));
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/669b9e7a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacetRefinement.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacetRefinement.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacetRefinement.java
index 1e62cb2..687adde 100644
--- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacetRefinement.java
+++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacetRefinement.java
@@ -407,6 +407,75 @@ public class TestJsonFacetRefinement extends SolrTestCaseHS {
"}"
);
+ // test that SKG stat reflects merged refinement
+ client.testJQ(params(p, "rows", "0", "q", "*:*", "fore", "${xy_s}:X", "back", "${num_d}:[0 TO 100]",
+ "json.facet", "{"
+ + " cat0:{ ${terms} type:terms, field: ${cat_s}, "
+ + " sort:'count desc', limit:1, overrequest:0, refine:true, "
+ + " facet:{ s:'relatedness($fore,$back)'} } }")
+ , "facets=={ count:8, cat0:{ buckets:[ "
+ + " { val:A, count:4, "
+ + " s : { relatedness: 0.00496, "
+ //+ " foreground_count: 3, "
+ //+ " foreground_size: 5, "
+ //+ " background_count: 2, "
+ //+ " background_size: 4, "
+ + " foreground_popularity: 0.75, "
+ + " background_popularity: 0.5, "
+ + " } } ] }" +
+ "}"
+ );
+
+ // SKG under nested facet where some terms only exist on one shard
+ {
+ // sub-bucket order should change as sort direction changes
+ final String jsonFacet = ""
+ + "{ processEmpty:true, "
+ + " cat0:{ ${terms} type:terms, field: ${cat_s}, "
+ + " sort:'count desc', limit:1, overrequest:0, refine:true, "
+ + " facet:{ processEmpty:true, "
+ + " qw1: { ${terms} type:terms, field: ${qw_s}, mincount:0, "
+ + " sort:'${skg_sort}', limit:100, overrequest:0, refine:true, "
+ + " facet:{ processEmpty:true, skg:'relatedness($fore,$back)' } } } } }";
+ final String bucketQ = ""
+ + " { val:Q, count:1, "
+ + " skg : { relatedness: 1.0, "
+ + " foreground_popularity: 0.25, "
+ + " background_popularity: 0.0, "
+ // + " foreground_count: 1, "
+ // + " foreground_size: 3, "
+ // + " background_count: 0, "
+ // + " background_size: 4, "
+ + " } },";
+ final String bucketW = ""
+ + " { val:W, count:1, "
+ + " skg : { relatedness: 0.0037, "
+ + " foreground_popularity: 0.25, "
+ + " background_popularity: 0.25, "
+ // + " foreground_count: 1, "
+ // + " foreground_size: 3, "
+ // + " background_count: 1, "
+ // + " background_size: 4, "
+ + " } },";
+
+ client.testJQ(params(p, "rows", "0", "q", "*:*", "fore", "${xy_s}:X", "back", "${num_d}:[0 TO 100]",
+ "skg_sort", "skg desc", "json.facet", jsonFacet)
+ , "facets=={ count:8, cat0:{ buckets:[ "
+ + " { val:A, count:4, "
+ + " qw1 : { buckets:["
+ + bucketQ
+ + bucketW
+ + " ] } } ] } }");
+ client.testJQ(params(p, "rows", "0", "q", "*:*", "fore", "${xy_s}:X", "back", "${num_d}:[0 TO 100]",
+ "skg_sort", "skg asc", "json.facet", jsonFacet)
+ , "facets=={ count:8, cat0:{ buckets:[ "
+ + " { val:A, count:4, "
+ + " qw1 : { buckets:["
+ + bucketW
+ + bucketQ
+ + " ] } } ] } }");
+ }
+
// test partial buckets (field facet within field facet)
client.testJQ(params(p, "q", "*:*",
"json.facet", "{" +
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/669b9e7a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
index b6afdb8..4402d78 100644
--- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
+++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
@@ -213,6 +213,245 @@ public class TestJsonFacets extends SolrTestCaseHS {
}
@Test
+ public void testExplicitQueryDomain() throws Exception {
+ Client client = Client.localClient();
+ indexSimple(client);
+
+ { // simple 'query' domain
+
+ // the facet buckets for all of the requests below should be identical
+ // only the numFound & top level facet count should differ
+ final String expectedFacets
+ = "facets/w=={ buckets:["
+ + " { val:'NJ', count:2}, "
+ + " { val:'NY', count:1} ] }";
+
+ assertJQ(req("rows", "0", "q", "cat_s:B", "json.facet",
+ "{w: {type:terms, field:'where_s'}}"),
+ "response/numFound==3",
+ "facets/count==3",
+ expectedFacets);
+ assertJQ(req("rows", "0", "q", "id:3", "json.facet",
+ "{w: {type:terms, field:'where_s', domain: { query:'cat_s:B' }}}"),
+ "response/numFound==1",
+ "facets/count==1",
+ expectedFacets);
+ assertJQ(req("rows", "0", "q", "*:*", "fq", "-*:*", "json.facet",
+ "{w: {type:terms, field:'where_s', domain: { query:'cat_s:B' }}}"),
+ "response/numFound==0",
+ "facets/count==0",
+ expectedFacets);
+ assertJQ(req("rows", "0", "q", "*:*", "fq", "-*:*", "domain_q", "cat_s:B", "json.facet",
+ "{w: {type:terms, field:'where_s', domain: { query:{param:domain_q} }}}"),
+ "response/numFound==0",
+ "facets/count==0",
+ expectedFacets);
+ }
+
+ { // a nested explicit query domain
+
+ // for all of the "top" buckets, the subfacet should have identical sub-buckets
+ final String expectedSubBuckets = "{ buckets:[ { val:'B', count:3}, { val:'A', count:2} ] }";
+ assertJQ(req("rows", "0", "q", "num_i:[0 TO *]", "json.facet",
+ "{w: {type:terms, field:'where_s', " +
+ " facet: { c: { type:terms, field:'cat_s', domain: { query:'*:*' }}}}}")
+ , "facets/w=={ buckets:["
+ + " { val:'NJ', count:2, c: " + expectedSubBuckets + "}, "
+ + " { val:'NY', count:1, c: " + expectedSubBuckets + "} "
+ + "] }"
+ );
+ }
+
+ { // an (effectively) empty query should produce an error
+ ignoreException("'query' domain can not be null");
+ ignoreException("'query' domain must not evaluate to an empty list");
+ for (String raw : Arrays.asList("null", "[ ]", "{param:bogus}")) {
+ expectThrows(SolrException.class, () -> {
+ assertJQ(req("rows", "0", "q", "num_i:[0 TO *]", "json.facet",
+ "{w: {type:terms, field:'where_s', " +
+ " facet: { c: { type:terms, field:'cat_s', domain: { query: "+raw+" }}}}}"));
+ });
+ }
+ }
+ }
+
+
+ @Test
+ public void testSimpleSKG() throws Exception {
+ Client client = Client.localClient();
+ indexSimple(client);
+
+ // using relatedness() as a top level stat, not nested under any facet
+ // (not particularly useful, but shouldn't error either)
+ assertJQ(req("q", "cat_s:[* TO *]", "rows", "0",
+ "fore", "where_s:NY", "back", "*:*",
+ "json.facet", " { skg: 'relatedness($fore,$back)' }")
+ , "facets=={"
+ + " count:5, "
+ + " skg : { relatedness: 0.00699,"
+ + " foreground_popularity: 0.33333,"
+ + " background_popularity: 0.83333,"
+ + " } }"
+ );
+
+ // simple single level facet w/skg stat & sorting
+ for (String sort : Arrays.asList("index asc", "skg desc")) {
+ // the relatedness score of each of our cat_s values is (conveniently) also in alphabetical order
+ // so both of these sort options should produce identical output
+ // and testing "index" sort allows the randomized use of "stream" processor as default to be tested
+ assertJQ(req("q", "cat_s:[* TO *]", "rows", "0",
+ "fore", "where_s:NY", "back", "*:*",
+ "json.facet", ""
+ + "{x: { type: terms, field: 'cat_s', sort: '"+sort+"', "
+ + " facet: { skg: 'relatedness($fore,$back)' } } }")
+ , "facets=={count:5, x:{ buckets:["
+ + " { val:'A', count:2, "
+ + " skg : { relatedness: 0.00554, "
+ //+ " foreground_count: 1, "
+ //+ " foreground_size: 2, "
+ //+ " background_count: 2, "
+ //+ " background_size: 6,"
+ + " foreground_popularity: 0.16667,"
+ + " background_popularity: 0.33333, },"
+ + " }, "
+ + " { val:'B', count:3, "
+ + " skg : { relatedness: 0.0, " // perfectly average and uncorrelated
+ //+ " foreground_count: 1, "
+ //+ " foreground_size: 2, "
+ //+ " background_count: 3, "
+ //+ " background_size: 6,"
+ + " foreground_popularity: 0.16667,"
+ + " background_popularity: 0.5 },"
+ + " } ] } } "
+ );
+ }
+
+ // SKG used in multiple nested facets
+ //
+ // we'll re-use these params in 2 requests, one will simulate a shard request
+ final SolrParams nestedSKG = params
+ ("q", "cat_s:[* TO *]", "rows", "0", "fore", "num_i:[-1000 TO 0]", "back", "*:*", "json.facet"
+ , "{x: { type: terms, field: 'cat_s', sort: 'skg desc', "
+ + " facet: { skg: 'relatedness($fore,$back)', "
+ + " y: { type: terms, field: 'where_s', sort: 'skg desc', "
+ + " facet: { skg: 'relatedness($fore,$back)' } } } } }");
+
+ // plain old request
+ assertJQ(req(nestedSKG)
+ , "facets=={count:5, x:{ buckets:["
+ + " { val:'B', count:3, "
+ + " skg : { relatedness: 0.01539, "
+ //+ " foreground_count: 2, "
+ //+ " foreground_size: 2, "
+ //+ " background_count: 3, "
+ //+ " background_size: 6, "
+ + " foreground_popularity: 0.33333,"
+ + " background_popularity: 0.5 },"
+ + " y : { buckets:["
+ + " { val:'NY', count: 1, "
+ + " skg : { relatedness: 0.00554, "
+ //+ " foreground_count: 1, "
+ //+ " foreground_size: 2, "
+ //+ " background_count: 2, "
+ //+ " background_size: 6, "
+ + " foreground_popularity: 0.16667, "
+ + " background_popularity: 0.33333, "
+ + " } }, "
+ + " { val:'NJ', count: 2, "
+ + " skg : { relatedness: 0.0, " // perfectly average and uncorrelated
+ //+ " foreground_count: 1, "
+ //+ " foreground_size: 2, "
+ //+ " background_count: 3, "
+ //+ " background_size: 6, "
+ + " foreground_popularity: 0.16667, "
+ + " background_popularity: 0.5, "
+ + " } }, "
+ + " ] } "
+ + " }, "
+ + " { val:'A', count:2, "
+ + " skg : { relatedness:-0.01097, "
+ //+ " foreground_count: 0, "
+ //+ " foreground_size: 2, "
+ //+ " background_count: 2, "
+ //+ " background_size: 6,"
+ + " foreground_popularity: 0.0,"
+ + " background_popularity: 0.33333 },"
+ + " y : { buckets:["
+ + " { val:'NJ', count: 1, "
+ + " skg : { relatedness: 0.0, " // perfectly average and uncorrelated
+ //+ " foreground_count: 0, "
+ //+ " foreground_size: 0, "
+ //+ " background_count: 3, "
+ //+ " background_size: 6, "
+ + " foreground_popularity: 0.0, "
+ + " background_popularity: 0.5, "
+ + " } }, "
+ + " { val:'NY', count: 1, "
+ + " skg : { relatedness: 0.0, " // perfectly average and uncorrelated
+ //+ " foreground_count: 0, "
+ //+ " foreground_size: 0, "
+ //+ " background_count: 2, "
+ //+ " background_size: 6, "
+ + " foreground_popularity: 0.0, "
+ + " background_popularity: 0.33333, "
+ + " } }, "
+ + " ] } } ] } } ");
+
+ // same request, but with whitebox params testing isShard
+ // to verify the raw counts/sizes
+ assertJQ(req(nestedSKG,
+ // fake an initial shard request
+ "distrib", "false", "isShard", "true", "_facet_", "{}", "shards.purpose", "2097216")
+ , "facets=={count:5, x:{ buckets:["
+ + " { val:'B', count:3, "
+ + " skg : { "
+ + " foreground_count: 2, "
+ + " foreground_size: 2, "
+ + " background_count: 3, "
+ + " background_size: 6 }, "
+ + " y : { buckets:["
+ + " { val:'NY', count: 1, "
+ + " skg : { "
+ + " foreground_count: 1, "
+ + " foreground_size: 2, "
+ + " background_count: 2, "
+ + " background_size: 6, "
+ + " } }, "
+ + " { val:'NJ', count: 2, "
+ + " skg : { "
+ + " foreground_count: 1, "
+ + " foreground_size: 2, "
+ + " background_count: 3, "
+ + " background_size: 6, "
+ + " } }, "
+ + " ] } "
+ + " }, "
+ + " { val:'A', count:2, "
+ + " skg : { "
+ + " foreground_count: 0, "
+ + " foreground_size: 2, "
+ + " background_count: 2, "
+ + " background_size: 6 },"
+ + " y : { buckets:["
+ + " { val:'NJ', count: 1, "
+ + " skg : { "
+ + " foreground_count: 0, "
+ + " foreground_size: 0, "
+ + " background_count: 3, "
+ + " background_size: 6, "
+ + " } }, "
+ + " { val:'NY', count: 1, "
+ + " skg : { "
+ + " foreground_count: 0, "
+ + " foreground_size: 0, "
+ + " background_count: 2, "
+ + " background_size: 6, "
+ + " } }, "
+ + " ] } } ] } } ");
+
+ }
+
+ @Test
public void testRepeatedNumerics() throws Exception {
Client client = Client.localClient();
String field = "num_is"; // docValues of multi-valued points field can contain duplicate values... make sure they don't mess up our counts.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/669b9e7a/solr/solr-ref-guide/src/json-facet-api.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/src/json-facet-api.adoc b/solr/solr-ref-guide/src/json-facet-api.adoc
index 4f0ec7d..8250117 100644
--- a/solr/solr-ref-guide/src/json-facet-api.adoc
+++ b/solr/solr-ref-guide/src/json-facet-api.adoc
@@ -1,4 +1,5 @@
= JSON Facet API
+:page-tocclass: right
[[JSONFacetAPI]]
== Facet & Analytics Module
@@ -338,17 +339,18 @@ Aggregation functions, also called *facet functions, analytic functions,* or **m
[width="100%",cols="10%,30%,60%",options="header",]
|===
|Aggregation |Example |Description
-|sum |sum(sales) |summation of numeric values
-|avg |avg(popularity) |average of numeric values
-|min |min(salary) |minimum value
-|max |max(mul(price,popularity)) |maximum value
-|unique |unique(author) |number of unique values of the given field. Beyond 100 values it yields not exact estimate
-|uniqueBlock |uniqueBlock(\_root_) |same as above with smaller footprint strictly requires <<uploading-data-with-index-handlers.adoc#nested-child-documents, block index>>. The given field is expected to be unique across blocks, now only singlevalued string fields are supported, docValues are recommended.
-|hll |hll(author) |distributed cardinality estimate via hyper-log-log algorithm
-|percentile |percentile(salary,50,75,99,99.9) |Percentile estimates via t-digest algorithm. When sorting by this metric, the first percentile listed is used as the sort value.
-|sumsq |sumsq(rent) |sum of squares of field or function
-|variance |variance(rent) |variance of numeric field or function
-|stddev |stddev(rent) |standard deviation of field or function
+|sum |`sum(sales)` |summation of numeric values
+|avg |`avg(popularity)` |average of numeric values
+|min |`min(salary)` |minimum value
+|max |`max(mul(price,popularity))` |maximum value
+|unique |`unique(author)` |number of unique values of the given field. Beyond 100 values it yields an estimate, not an exact count
+|uniqueBlock |`uniqueBlock(\_root_)` |same as above with smaller footprint strictly requires <<uploading-data-with-index-handlers.adoc#nested-child-documents, block index>>. The given field is expected to be unique across blocks, now only singlevalued string fields are supported, docValues are recommended.
+|hll |`hll(author)` |distributed cardinality estimate via hyper-log-log algorithm
+|percentile |`percentile(salary,50,75,99,99.9)` |Percentile estimates via t-digest algorithm. When sorting by this metric, the first percentile listed is used as the sort value.
+|sumsq |`sumsq(rent)` |sum of squares of field or function
+|variance |`variance(rent)` |variance of numeric field or function
+|stddev |`stddev(rent)` |standard deviation of field or function
+|relatedness |`relatedness('popularity:[100 TO \*]','inStock:true')`|A function for computing a relatedness score of the documents in the domain to a Foreground set, relative to a Background set (both defined as queries). This is primarily for use when building <<Semantic Knowledge Graphs>>.
|===
Numeric aggregation functions such as `avg` can be on any numeric field, or on another function of multiple numeric fields such as `avg(mul(price,popularity))`.
@@ -514,6 +516,126 @@ Aggregation `uniqueBlock(\_root_)` is functionally equivalent to `unique(\_root_
It's recommended to define `limit: -1` for `uniqueBlock` calculation, like in above example,
since default value of `limit` parameter is `10`, while `uniqueBlock` is supposed to be much faster with `-1`.
+== Semantic Knowledge Graphs
+
+The `relatedness(...)` aggregation function allows for sets of documents to be scored relative to Foreground and Background sets of documents, for the purposes of finding ad-hoc relationships that make up a "Semantic Knowledge Graph":
+
+[quote, Grainger et al., 'https://arxiv.org/abs/1609.00464[The Semantic Knowledge Graph]']
+____
+At its heart, the Semantic Knowledge Graph leverages an inverted index, along with a complementary uninverted index, to represent nodes (terms) and edges (the documents within intersecting postings lists for multiple terms/nodes). This provides a layer of indirection between each pair of nodes and their corresponding edge, enabling edges to materialize dynamically from underlying corpus statistics. As a result, any combination of nodes can have edges to any other nodes materialize and be scored to reveal latent relationships between the nodes.
+____
+
+The `relatedness(...)` function is used to "score" these relationships, relative to "Foreground" and "Background" sets of documents, specified in the function params as queries.
+
+Unlike most aggregation functions, the `relatedness(...)` function is aware of if/how it's used in <<NestedFacets,Nested Facets>>. It evaluates the query defining the current bucket _independently_ from its parent/ancestor buckets, and intersects those documents with a "Foreground Set" defined by the foreground query _combined with the ancestor buckets_. The result is then compared to a similar intersection done against the "Background Set" (defined exclusively by background query) to see if there is a positive, or negative, correlation between the current bucket and the Foreground Set, relative to the Background Set.
+
+=== Semantic Knowledge Graph Example
+
+
+.Sample Documents
+[source,bash,subs="verbatim,callouts"]
+----
+curl -sS -X POST 'http://localhost:8983/solr/gettingstarted/update?commit=true' -d '[
+{"id":"01",age:15,"state":"AZ","hobbies":["soccer","painting","cycling"]},
+{"id":"02",age:22,"state":"AZ","hobbies":["swimming","darts","cycling"]},
+{"id":"03",age:27,"state":"AZ","hobbies":["swimming","frisbee","painting"]},
+{"id":"04",age:33,"state":"AZ","hobbies":["darts"]},
+{"id":"05",age:42,"state":"AZ","hobbies":["swimming","golf","painting"]},
+{"id":"06",age:54,"state":"AZ","hobbies":["swimming","golf"]},
+{"id":"07",age:67,"state":"AZ","hobbies":["golf","painting"]},
+{"id":"08",age:71,"state":"AZ","hobbies":["painting"]},
+{"id":"09",age:14,"state":"CO","hobbies":["soccer","frisbee","skiing","swimming","skating"]},
+{"id":"10",age:23,"state":"CO","hobbies":["skiing","darts","cycling","swimming"]},
+{"id":"11",age:26,"state":"CO","hobbies":["skiing","golf"]},
+{"id":"12",age:35,"state":"CO","hobbies":["golf","frisbee","painting","skiing"]},
+{"id":"13",age:47,"state":"CO","hobbies":["skiing","darts","painting","skating"]},
+{"id":"14",age:51,"state":"CO","hobbies":["skiing","golf"]},
+{"id":"15",age:64,"state":"CO","hobbies":["skating","cycling"]},
+{"id":"16",age:73,"state":"CO","hobbies":["painting"]},
+]'
+----
+
+.Example Query
+[source,bash,subs="verbatim,callouts"]
+----
+curl -sS -X POST http://localhost:8983/solr/gettingstarted/query -d 'rows=0&q=*:*
+&back=*:* # <1>
+&fore=age:[35 TO *] # <2>
+&json.facet={
+ hobby : {
+ type : terms,
+ field : hobbies,
+ limit : 5,
+ sort : { r1: desc }, # <3>
+ facet : {
+ r1 : "relatedness($fore,$back)", # <4>
+ location : {
+ type : terms,
+ field : state,
+ limit : 2,
+ sort : { r2: desc }, # <3>
+ facet : {
+ r2 : "relatedness($fore,$back)" # <4>
+ }
+ }
+ }
+ }
+}'
+----
+<1> Use the entire collection as our "Background Set"
+<2> Use a query for "age >= 35" to define our (initial) "Foreground Set"
+<3> For both the top level `hobbies` facet & the sub-facet on `state` we will be sorting on the `relatedness(...)` values
+<4> In both calls to the `relatedness(...)` function, we use <<local-parameters-in-queries.adoc#parameter-dereferencing,Parameter Variables>> to refer to the previously defined `fore` and `back` queries.
+
+.The Facet Response
+[source,javascript,subs="verbatim,callouts"]
+----
+"facets":{
+ "count":16,
+ "hobby":{
+ "buckets":[{
+ "val":"golf",
+ "count":6, // <1>
+ "r1":{
+ "relatedness":0.01225,
+ "foreground_popularity":0.3125, // <2>
+ "background_popularity":0.375}, // <3>
+ "location":{
+ "buckets":[{
+ "val":"az",
+ "count":3,
+ "r2":{
+ "relatedness":0.00496, // <4>
+ "foreground_popularity":0.1875, // <6>
+ "background_popularity":0.5}}, // <7>
+ {
+ "val":"co",
+ "count":3,
+ "r2":{
+ "relatedness":-0.00496, // <5>
+ "foreground_popularity":0.125,
+ "background_popularity":0.5}}]}},
+ {
+ "val":"painting",
+ "count":8, // <1>
+ "r1":{
+ "relatedness":0.01097,
+ "foreground_popularity":0.375,
+ "background_popularity":0.5},
+ "location":{
+ "buckets":[{
+ ...
+----
+<1> Even though `hobbies:golf` has a lower total facet `count` than `hobbies:painting`, it has a higher `relatedness` score, indicating that relative to the Background Set (the entire collection) Golf has a stronger correlation to our Foreground Set (people age 35+) than Painting.
+<2> The number of documents matching `age:[35 TO *]` _and_ `hobbies:golf` is 31.25% of the total number of documents in the Background Set
+<3> 37.5% of the documents in the Background Set match `hobbies:golf`
+<4> The state of Arizona (AZ) has a _positive_ relatedness correlation with the _nested_ Foreground Set (people ages 35+ who play Golf) compared to the Background Set -- ie: "People in Arizona are statistically more likely to be '35+ year old Golfers' than the country as a whole."
+<5> The state of Colorado (CO) has a _negative_ correlation with the nested Foreground Set -- ie: "People in Colorado are statistically less likely to be '35+ year old Golfers' than the country as a whole."
+<6> The number of documents matching `age:[35 TO *]` _and_ `hobbies:golf` _and_ `state:AZ` is 18.75% of the total number of documents in the Background Set
+<7> 50% of the documents in the Background Set match `state:AZ`
+
+NOTE: While it's very common to define the Background Set as `\*:*`, or some other super-set of the Foreground Query, it is not strictly required. The `relatedness(...)` function can be used to compare the statistical relatedness of sets of documents to orthogonal foreground/background queries.
+
[[References]]
== References