You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by ge...@apache.org on 2022/09/28 15:20:00 UTC

[solr] branch main updated: SOLR-16428: Add "permissive" mode to IgnoreLargeDocumentProcessorFactory (#1040)

This is an automated email from the ASF dual-hosted git repository.

gerlowskija pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new 708c4619924 SOLR-16428: Add "permissive" mode to IgnoreLargeDocumentProcessorFactory (#1040)
708c4619924 is described below

commit 708c46199241a55408cde856d6a489e6249d571e
Author: Jason Gerlowski <ge...@apache.org>
AuthorDate: Wed Sep 28 11:19:53 2022 -0400

    SOLR-16428: Add "permissive" mode to IgnoreLargeDocumentProcessorFactory (#1040)
    
    Prior to this commit, IgnoreLargeDocumentProcessorFactory only had a
    single way to handle documents that exceeded its configurable size
    limit.  The first violation would throw a SolrException: in effect,
    short-circuiting any other documents in the "batch" being processed.
    
    This approach is very dependent on the ordering of docs within a
    batch.  A 100-doc batch with only 1 "size offender" might index 99 docs
    or none at all, depending on where the offender is in the list.  This is
    great for end users whose clients are built to handle the resulting 400
    response, and resubmit their whole batch.  But it's not ideal for every
    use case.
    
    This commit introduces an alternate approach for handling these
    violations: quietly log the ID and size of the offending doc, but
    don't throw any exception that would short-circuit the remainder of
    the batch.
    
    The desired error-handling can be chosen using the URP's new config
    parameter 'permissiveMode'.  When false (the default), the legacy
    behavior of short-circuiting the batch and surfacing a 400 error is
    used.  Otherwise, the new "just log things out and continue" behavior is
    used.
---
 solr/CHANGES.txt                                   |  6 +++-
 .../IgnoreLargeDocumentProcessorFactory.java       | 40 ++++++++++++++++++----
 .../IgnoreLargeDocumentProcessorFactoryTest.java   | 28 +++++++++++++--
 .../pages/update-request-processors.adoc           |  5 +++
 4 files changed, 69 insertions(+), 10 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 0c3e5f3a5d4..46e0429d905 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -36,7 +36,11 @@ New Features
 
 Improvements
 ---------------------
-(No changes)
+
+* SOLR-16428: IgnoreLargeDocumentProcessorFactory now supports a "permissive" mode, where it logs and skips
+  offending documents but doesn't short-circuit the entire batch or return a 4xx error.  This mode can be enabled
+  by setting the `permissiveMode` boolean option to `true` in your solrconfig.xml's
+  IgnoreLargeDocumentProcessorFactory declaration. (Jason Gerlowski)
 
 Optimizations
 ---------------------
diff --git a/solr/core/src/java/org/apache/solr/update/processor/IgnoreLargeDocumentProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/IgnoreLargeDocumentProcessorFactory.java
index fea5fe6b225..16508dd864b 100644
--- a/solr/core/src/java/org/apache/solr/update/processor/IgnoreLargeDocumentProcessorFactory.java
+++ b/solr/core/src/java/org/apache/solr/update/processor/IgnoreLargeDocumentProcessorFactory.java
@@ -21,16 +21,20 @@ import static org.apache.solr.common.SolrException.ErrorCode.BAD_REQUEST;
 import static org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;
 
 import java.io.IOException;
+import java.lang.invoke.MethodHandles;
 import java.util.Collection;
 import java.util.IdentityHashMap;
 import java.util.Map;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.SolrInputField;
+import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.update.AddUpdateCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Gives system administrators a way to ignore very large update from clients. When an update goes
@@ -41,14 +45,20 @@ import org.apache.solr.update.AddUpdateCommand;
  */
 public class IgnoreLargeDocumentProcessorFactory extends UpdateRequestProcessorFactory {
   public static final String LIMIT_SIZE_PARAM = "limit";
+  public static final String PERMISSIVE_MODE_PARAM = "permissiveMode";
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
   // limit of a SolrInputDocument size (in kb)
-  private long maxDocumentSize = 1024 * 1024;
+  private long maxDocumentSizeKb = 1024 * 1024;
+  private boolean permissiveModeEnabled = false;
 
   @Override
   public void init(NamedList<?> args) {
-    maxDocumentSize = args.toSolrParams().required().getLong(LIMIT_SIZE_PARAM);
+    final SolrParams params = args.toSolrParams();
+    maxDocumentSizeKb = params.required().getLong(LIMIT_SIZE_PARAM);
     args.remove(LIMIT_SIZE_PARAM);
+    permissiveModeEnabled = params.getBool(PERMISSIVE_MODE_PARAM, false);
+    args.remove(PERMISSIVE_MODE_PARAM);
 
     if (args.size() > 0) {
       throw new SolrException(SERVER_ERROR, "Unexpected init param(s): '" + args.getName(0) + "'");
@@ -59,15 +69,33 @@ public class IgnoreLargeDocumentProcessorFactory extends UpdateRequestProcessorF
   public UpdateRequestProcessor getInstance(
       SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) {
     return new UpdateRequestProcessor(next) {
+
       @Override
       public void processAdd(AddUpdateCommand cmd) throws IOException {
-        long docSize = ObjectSizeEstimator.estimate(cmd.getSolrInputDocument());
-        if (docSize / 1024 > maxDocumentSize) {
+        long docSizeBytes = ObjectSizeEstimator.estimate(cmd.getSolrInputDocument());
+        if (docSizeBytes > maxDocumentSizeKb * 1024) {
+          handleViolatingDoc(cmd, docSizeBytes);
+        } else {
+          super.processAdd(cmd);
+        }
+      }
+
+      private void handleViolatingDoc(AddUpdateCommand cmd, long estimatedSizeBytes) {
+        if (permissiveModeEnabled) {
+          log.warn(
+              "Skipping doc because estimated size exceeds limit. [docId={}, estimatedSize={} bytes, limitSize={}kb]",
+              cmd.getPrintableId(),
+              estimatedSizeBytes,
+              maxDocumentSizeKb);
+        } else {
           throw new SolrException(
               BAD_REQUEST,
-              "Size of the document " + cmd.getPrintableId() + " is too large, around:" + docSize);
+              "Size of the document "
+                  + cmd.getPrintableId()
+                  + " is too large, around: "
+                  + estimatedSizeBytes
+                  + " bytes");
         }
-        super.processAdd(cmd);
       }
     };
   }
diff --git a/solr/core/src/test/org/apache/solr/update/processor/IgnoreLargeDocumentProcessorFactoryTest.java b/solr/core/src/test/org/apache/solr/update/processor/IgnoreLargeDocumentProcessorFactoryTest.java
index e99cb770f69..6ef37a62578 100644
--- a/solr/core/src/test/org/apache/solr/update/processor/IgnoreLargeDocumentProcessorFactoryTest.java
+++ b/solr/core/src/test/org/apache/solr/update/processor/IgnoreLargeDocumentProcessorFactoryTest.java
@@ -18,6 +18,7 @@
 package org.apache.solr.update.processor;
 
 import static org.apache.solr.update.processor.IgnoreLargeDocumentProcessorFactory.ObjectSizeEstimator.estimate;
+import static org.hamcrest.Matchers.containsString;
 
 import java.io.IOException;
 import java.nio.charset.Charset;
@@ -31,6 +32,7 @@ import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.util.LogListener;
 import org.junit.Test;
 
 public class IgnoreLargeDocumentProcessorFactoryTest extends SolrTestCase {
@@ -54,10 +56,30 @@ public class IgnoreLargeDocumentProcessorFactoryTest extends SolrTestCase {
     requestProcessor.processAdd(getUpdate(1024));
   }
 
-  public AddUpdateCommand getUpdate(int size) {
+  @Test
+  public void testProcessorInPermissiveMode() throws IOException {
+    NamedList<Object> args = new NamedList<>();
+    args.add(IgnoreLargeDocumentProcessorFactory.LIMIT_SIZE_PARAM, 1);
+    args.add(IgnoreLargeDocumentProcessorFactory.PERMISSIVE_MODE_PARAM, true);
+
+    IgnoreLargeDocumentProcessorFactory factory = new IgnoreLargeDocumentProcessorFactory();
+    factory.init(args);
+
+    UpdateRequestProcessor processor = factory.getInstance(null, null, null);
+    try (LogListener listener = LogListener.warn(IgnoreLargeDocumentProcessorFactory.class)) {
+      processor.processAdd(getUpdate(1024));
+
+      assertThat(
+          listener.pollMessage(),
+          containsString("Skipping doc because estimated size exceeds limit"));
+    }
+  }
+
+  public AddUpdateCommand getUpdate(int sizeBytes) {
     SolrInputDocument document = new SolrInputDocument();
-    document.addField(new String(new byte[size], Charset.defaultCharset()), 1L);
-    assertTrue(IgnoreLargeDocumentProcessorFactory.ObjectSizeEstimator.estimate(document) > size);
+    document.addField(new String(new byte[sizeBytes], Charset.defaultCharset()), 1L);
+    assertTrue(
+        IgnoreLargeDocumentProcessorFactory.ObjectSizeEstimator.estimate(document) > sizeBytes);
 
     AddUpdateCommand cmd = new AddUpdateCommand(null);
     cmd.solrDoc = document;
diff --git a/solr/solr-ref-guide/modules/configuration-guide/pages/update-request-processors.adoc b/solr/solr-ref-guide/modules/configuration-guide/pages/update-request-processors.adoc
index 7faa3f86702..ee160af4136 100644
--- a/solr/solr-ref-guide/modules/configuration-guide/pages/update-request-processors.adoc
+++ b/solr/solr-ref-guide/modules/configuration-guide/pages/update-request-processors.adoc
@@ -331,6 +331,11 @@ See https://cwiki.apache.org/confluence/display/solr/SolrClassification for more
 
 {solr-javadocs}/core/org/apache/solr/update/processor/IgnoreLargeDocumentProcessorFactory.html[IgnoreLargeDocumentProcessorFactory]:: Allows you to prevent large documents with size more than `limit` (in KB) from getting indexed.
 It can help to prevent unexpected problems on indexing as well as on recovering because of very large documents.
++
+By default, this processor will abort the update request and send an error back to users if it encounters a document that exceeds its configured limit.
+Documents processed prior to the offender are indexed by Solr; documents following the offender are left unprocessed.
++
+Alternatively, the processor offers a "permissive" mode (`permissiveMode=true`) which skips the offending document and logs a warning, but doesn't abort the remainder of the batch or return an error to users.
 
 {solr-javadocs}/core/org/apache/solr/update/processor/RegexpBoostProcessorFactory.html[RegexpBoostProcessorFactory]:: A processor which will match content of "inputField" against regular expressions found in "boostFilename", and if it matches will return the corresponding boost value from the file and output this to "boostField" as a double value.