You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ho...@apache.org on 2021/03/26 16:59:53 UTC

[lucene-solr] branch branch_8x updated: SOLR-15291: ref-guide note clarifying 'safe' way to do De-Duplication w/SignatureUpdateProcessorFactory in SolrCloud

This is an automated email from the ASF dual-hosted git repository.

hossman pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new 2557026  SOLR-15291: ref-guide note clarifying 'safe' way to do De-Duplication w/SignatureUpdateProcessorFactory in SolrCloud
2557026 is described below

commit 255702603f442fc612872740757b104dea3fb81c
Author: Chris Hostetter <ho...@apache.org>
AuthorDate: Fri Mar 26 09:23:34 2021 -0700

    SOLR-15291: ref-guide note clarifying 'safe' way to do De-Duplication w/SignatureUpdateProcessorFactory in SolrCloud
    
    Include test case demonstrating this works
    
    (cherry picked from commit 507f79158458c450e1f3d2e8ad6ab3e1c3902403)
    (cherry picked from commit 85828e37f3492cde2c19205d3f99c52330902e43)
---
 .../solr/configsets/dedup/conf/schema.xml          |  31 +++++
 .../solr/configsets/dedup/conf/solrconfig.xml      |  53 ++++++++
 .../update/processor/TestCloudDeduplication.java   | 146 +++++++++++++++++++++
 solr/solr-ref-guide/src/de-duplication.adoc        |  40 ++----
 4 files changed, 242 insertions(+), 28 deletions(-)

diff --git a/solr/core/src/test-files/solr/configsets/dedup/conf/schema.xml b/solr/core/src/test-files/solr/configsets/dedup/conf/schema.xml
new file mode 100644
index 0000000..1a4ef0d
--- /dev/null
+++ b/solr/core/src/test-files/solr/configsets/dedup/conf/schema.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<schema name="dedup" version="1.6">
+  
+  <fieldType name="string" class="solr.StrField"/>
+  <fieldType name="long" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+  
+  <!-- for versioning -->
+  <field name="_version_" type="long" indexed="true" stored="true"/>
+  <field name="_root_" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
+  <field name="id" type="string" indexed="true" stored="true"/>
+
+  <dynamicField name="*_s" type="string" indexed="true" stored="true" docValues="true"/>
+
+  <uniqueKey>id</uniqueKey>
+</schema>
diff --git a/solr/core/src/test-files/solr/configsets/dedup/conf/solrconfig.xml b/solr/core/src/test-files/solr/configsets/dedup/conf/solrconfig.xml
new file mode 100644
index 0000000..5639a47
--- /dev/null
+++ b/solr/core/src/test-files/solr/configsets/dedup/conf/solrconfig.xml
@@ -0,0 +1,53 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- For testing deduplication using SignatureUpdateProcessorFactory -->
+
+<config>
+
+  <dataDir>${solr.data.dir:}</dataDir>
+
+  <directoryFactory name="DirectoryFactory"
+                    class="${solr.directoryFactory:solr.NRTCachingDirectoryFactory}"/>
+  <schemaFactory class="ClassicIndexSchemaFactory"/>
+
+  <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
+  
+  <updateHandler class="solr.DirectUpdateHandler2">
+    <updateLog class="${solr.ulog:solr.UpdateLog}"></updateLog>
+  </updateHandler>
+  
+  <indexConfig>
+    <mergeScheduler class="${solr.mscheduler:org.apache.lucene.index.ConcurrentMergeScheduler}"/>
+  </indexConfig>
+  
+  <updateRequestProcessorChain default="true" >
+    <processor class="solr.LogUpdateProcessorFactory" />
+    <processor class="solr.processor.SignatureUpdateProcessorFactory">
+      <str name="signatureField">id</str>
+      <str name="fields">data_s</str>
+      <str name="signatureClass">solr.processor.Lookup3Signature</str>
+    </processor>
+    <processor class="solr.RunUpdateProcessorFactory" />
+  </updateRequestProcessorChain>
+
+  <requestHandler name="/select" class="solr.SearchHandler" />
+
+</config>
+
diff --git a/solr/core/src/test/org/apache/solr/update/processor/TestCloudDeduplication.java b/solr/core/src/test/org/apache/solr/update/processor/TestCloudDeduplication.java
new file mode 100644
index 0000000..f945ec7
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/update/processor/TestCloudDeduplication.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.solr.client.solrj.SolrClient;
+import org.apache.solr.client.solrj.embedded.JettySolrRunner;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.UpdateRequest;
+import org.apache.solr.client.solrj.request.json.JsonQueryRequest;
+import org.apache.solr.client.solrj.request.json.TermsFacetMap;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.client.solrj.response.json.BucketBasedJsonFacet;
+import org.apache.solr.client.solrj.response.json.BucketJsonFacet;
+import org.apache.solr.cloud.SolrCloudTestCase;
+
+import org.apache.lucene.util.IOUtils;
+
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+/**
+ * Tests the ability to use {@link SignatureUpdateProcessorFactory} to generate uniqueKeys for "duplicate" documents
+ * in cloud mode.
+ */
+public class TestCloudDeduplication extends SolrCloudTestCase {
+  public final static String COLLECTION = "dedup_col";
+  
+  /** One client per node */
+  private static final List<SolrClient> NODE_CLIENTS = new ArrayList<>(7);
+  /** clients (including cloud client) for easy randomization and looping of collection level requests */
+  private static final List<SolrClient> CLIENTS = new ArrayList<>(7);
+  
+  @BeforeClass
+  public static void setupCluster() throws Exception {
+    final int numShards = usually() ? 2 : 1;
+    final int numReplicas = usually() ? 2 : 1;
+    final int numNodes = 1 + (numShards * numReplicas);  // at least one node w/o any replicas
+    configureCluster(numNodes) 
+      .addConfig("conf", configset("dedup"))
+      .configure();
+
+    CLIENTS.add(cluster.getSolrClient());
+    for (JettySolrRunner jetty : cluster.getJettySolrRunners()) {
+      final SolrClient c = getHttpSolrClient(jetty.getBaseUrl().toString());
+      NODE_CLIENTS.add(c);
+      CLIENTS.add(c);
+    }
+    
+    assertEquals("failed to create collection", 0,
+                 CollectionAdminRequest
+                 .createCollection(COLLECTION, "conf", numShards, numReplicas)
+                 .process(cluster.getSolrClient()).getStatus());
+
+    cluster.waitForActiveCollection(COLLECTION, numShards, numShards * numReplicas);
+
+  }
+  
+  @AfterClass
+  private static void closeClients() throws Exception {
+    try {
+      IOUtils.close(NODE_CLIENTS);
+    } finally {
+      NODE_CLIENTS.clear();
+      CLIENTS.clear();
+    }
+  }
+
+  @After
+  public void clearCollection() throws Exception {
+    assertEquals("DBQ failed", 0, cluster.getSolrClient().deleteByQuery(COLLECTION, "*:*").getStatus());
+    assertEquals("commit failed", 0, cluster.getSolrClient().commit(COLLECTION).getStatus());
+  }
+
+  
+  public void testRandomDocs() throws Exception {
+
+    // index some random documents, using a mix-match of batches, to various SolrClients
+    
+    final int uniqueMod = atLeast(43);         // the number of unique sig values expected
+    final int numBatches = atLeast(uniqueMod); // we'll add at least one doc per batch
+    int docCounter = 0;
+    for (int batchId = 0; batchId < numBatches; batchId++) {
+      final UpdateRequest ureq = new UpdateRequest();
+      final int batchSize = atLeast(2);
+      for (int i = 0; i < batchSize; i++) {
+        docCounter++;
+        ureq.add(sdoc(// NOTE: No 'id' field, SignatureUpdateProcessor fills it in for us
+                      "data_s", (docCounter % uniqueMod)));
+      }
+      assertEquals("add failed", 0, ureq.process(getRandClient(), COLLECTION).getStatus());
+    }
+    assertEquals("commit failed", 0, getRandClient().commit(COLLECTION).getStatus());
+    
+    assert docCounter > uniqueMod;
+    
+    // query our collection and confirm no duplicates on the signature field (using faceting)
+    // Check every (node) for consistency...
+    final JsonQueryRequest req = new JsonQueryRequest()
+      .setQuery("*:*")
+      .setLimit(0)
+      .withFacet("data_facet", new TermsFacetMap("data_s").setLimit(uniqueMod + 1));
+    for (SolrClient client : CLIENTS) {
+      final QueryResponse rsp = req.process(client, COLLECTION);
+      try {
+        assertEquals(0, rsp.getStatus());
+        assertEquals(uniqueMod, rsp.getResults().getNumFound());
+        
+        final BucketBasedJsonFacet facet = rsp.getJsonFacetingResponse().getBucketBasedFacets("data_facet");
+        assertEquals(uniqueMod, facet.getBuckets().size());
+        for (BucketJsonFacet bucket : facet.getBuckets()) {
+          assertEquals("Bucket " + bucket.getVal(),
+                       1, bucket.getCount());
+        }
+      } catch (AssertionError e) {
+        throw new AssertionError(rsp + " + " + client + " => " + e.getMessage(), e);
+      }
+    }
+  }
+  
+  /** 
+   * returns a random SolrClient -- either a CloudSolrClient, or an HttpSolrClient pointed 
+   * at a node in our cluster.
+   */
+  private static SolrClient getRandClient() {
+    return CLIENTS.get(random().nextInt(CLIENTS.size()));
+  }
+}
diff --git a/solr/solr-ref-guide/src/de-duplication.adoc b/solr/solr-ref-guide/src/de-duplication.adoc
index 282f4a9..cb12bca 100644
--- a/solr/solr-ref-guide/src/de-duplication.adoc
+++ b/solr/solr-ref-guide/src/de-duplication.adoc
@@ -43,15 +43,13 @@ The `SignatureUpdateProcessorFactory` has to be registered in `solrconfig.xml` a
 
 [source,xml]
 ----
-<updateRequestProcessorChain name="dedupe">
+<updateRequestProcessorChain default="true">
+  <processor class="solr.LogUpdateProcessorFactory" />
   <processor class="solr.processor.SignatureUpdateProcessorFactory">
-    <bool name="enabled">true</bool>
     <str name="signatureField">id</str>
-    <bool name="overwriteDupes">false</bool>
     <str name="fields">name,features,cat</str>
     <str name="signatureClass">solr.processor.Lookup3Signature</str>
   </processor>
-  <processor class="solr.LogUpdateProcessorFactory" />
   <processor class="solr.RunUpdateProcessorFactory" />
 </updateRequestProcessorChain>
 ----
@@ -77,32 +75,18 @@ enabled::
 Set to *false* to disable de-duplication processing. The default is *true*.
 
 overwriteDupes::
-If true, the default, when a document exists that already matches this signature, it will be overwritten.
-
-=== In schema.xml
-
-If you are using a separate field for storing the signature, you must have it indexed:
-
-[source,xml]
-----
-<field name="signatureField" type="string" stored="true" indexed="true" multiValued="false" />
-----
+If *true*, the default, when a document exists that already matches this signature, it will be overwritten.  If you are using `overwriteDupes=true`, the `signatureField` must be `indexed="true"` in your Schema.
 
-Be sure to change your update handlers to use the defined chain, as below:
+.Using `SignatureUpdateProcessorFactory` in SolrCloud
+[WARNING]
+====
+// https://issues.apache.org/jira/browse/SOLR-3473
 
-[source,xml]
-----
-<requestHandler name="/update" class="solr.UpdateRequestHandler" >
-  <lst name="defaults">
-    <str name="update.chain">dedupe</str>
-  </lst>
-...
-</requestHandler>
-----
+There are 2 important things to keep in mind when using `SignatureUpdateProcessorFactory` with SolrCloud:
 
-This example assumes you have other sections of your request handler defined.
+1. The `overwriteDupes=true` setting does not work _except_ in the special case of using the uniqueKey field as the `signatureField`.  Attempting De-duplication on any other `signatureField` will not work correctly because of how updates are forwarded to replicas.
+1. When using the uniqueKey field as the `signatureField`, `SignatureUpdateProcessorFactory` must be run prior to the `<<update-request-processors.adoc#update-processors-in-solrcloud,DistributedUpdateProcessor>>` to ensure that documents can be routed to the correct shard leader based on the (generated) uniqueKey field.
 
-[TIP]
-====
-The update processor can also be specified per request with a parameter of `update.chain=dedupe`.
+(Using any other `signatureField` with `overwriteDupes=false` -- to generate a Signature for each document without De-duplication -- has no limitations.)
 ====
+