You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/29 17:36:17 UTC

[tika] branch main updated: TIKA-4149 -- PipesServer should include throwOnWriteLimitReached in the constructor of the BasicContentHandlerFactory (#1374)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 8bf7e977d TIKA-4149 -- PipesServer should include throwOnWriteLimitReached in the constructor of the BasicContentHandlerFactory (#1374)
8bf7e977d is described below

commit 8bf7e977db6e08fc84c2e72a7419f39d9fcc97bf
Author: Tim Allison <ta...@apache.org>
AuthorDate: Fri Sep 29 13:36:11 2023 -0400

    TIKA-4149 -- PipesServer should include throwOnWriteLimitReached in the constructor of the BasicContentHandlerFactory (#1374)
---
 .../src/main/java/org/apache/tika/pipes/PipesServer.java   | 14 ++++++++++----
 .../org/apache/tika/pipes/pipesiterator/PipesIterator.java | 13 ++++++++++---
 .../org/apache/tika/sax/BasicContentHandlerFactory.java    |  1 +
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
index 64d0d602e..ed1e5bb5e 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
@@ -501,10 +501,14 @@ public class PipesServer implements Runnable {
     private List<Metadata> parseConcatenated(FetchEmitTuple fetchEmitTuple,
                                              HandlerConfig handlerConfig, InputStream stream,
                                              Metadata metadata) {
+        ParseContext parseContext = new ParseContext();
+
         ContentHandlerFactory contentHandlerFactory =
-                new BasicContentHandlerFactory(handlerConfig.getType(), handlerConfig.getWriteLimit());
+                new BasicContentHandlerFactory(handlerConfig.getType(),
+                        handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(),
+                        parseContext);
+
         ContentHandler handler = contentHandlerFactory.getNewContentHandler();
-        ParseContext parseContext = new ParseContext();
         parseContext.set(DocumentSelector.class, new DocumentSelector() {
             final int maxEmbedded = handlerConfig.maxEmbeddedResources;
             int embedded = 0;
@@ -549,12 +553,14 @@ public class PipesServer implements Runnable {
     private List<Metadata> parseRecursive(FetchEmitTuple fetchEmitTuple,
                                           HandlerConfig handlerConfig, InputStream stream,
                                           Metadata metadata) {
+        ParseContext parseContext = new ParseContext();
         //Intentionally do not add the metadata filter here!
         //We need to let stacktraces percolate
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
-                new BasicContentHandlerFactory(handlerConfig.getType(), handlerConfig.getWriteLimit()),
+                new BasicContentHandlerFactory(handlerConfig.getType(),
+                        handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), parseContext),
                 handlerConfig.getMaxEmbeddedResources());
-        ParseContext parseContext = new ParseContext();
+
         long start = System.currentTimeMillis();
         preParse(fetchEmitTuple, stream, metadata, parseContext);
         try {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java
index 98b766ce7..34706f7e8 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java
@@ -73,6 +73,7 @@ public abstract class PipesIterator extends ConfigBase
 
     private HandlerConfig.PARSE_MODE parseMode = HandlerConfig.PARSE_MODE.RMETA;
 
+    private boolean throwOnWriteLimitReached = false;
     private int writeLimit = -1;
     private int maxEmbeddedResources = -1;
 
@@ -146,6 +147,11 @@ public abstract class PipesIterator extends ConfigBase
         this.writeLimit = writeLimit;
     }
 
+    @Field
+    public void setThrowOnWriteLimitReached(boolean throwOnWriteLimitReached) {
+        this.throwOnWriteLimitReached = throwOnWriteLimitReached;
+    }
+
     @Field
     public void setMaxEmbeddedResources(int maxEmbeddedResources) {
         this.maxEmbeddedResources = maxEmbeddedResources;
@@ -156,8 +162,8 @@ public abstract class PipesIterator extends ConfigBase
         setParseMode(HandlerConfig.PARSE_MODE.parseMode(parseModeString));
     }
 
-    public void setParseMode(HandlerConfig.PARSE_MODE parsePARSEMode) {
-        this.parseMode = parsePARSEMode;
+    public void setParseMode(HandlerConfig.PARSE_MODE parseMode) {
+        this.parseMode = parseMode;
     }
 
     public Integer call() throws Exception {
@@ -168,7 +174,8 @@ public abstract class PipesIterator extends ConfigBase
 
     protected HandlerConfig getHandlerConfig() {
         //TODO: make throwOnWriteLimitReached configurable
-        return new HandlerConfig(handlerType, parseMode, writeLimit, maxEmbeddedResources, false);
+        return new HandlerConfig(handlerType, parseMode, writeLimit, maxEmbeddedResources,
+                throwOnWriteLimitReached);
     }
 
     protected abstract void enqueue() throws IOException, TimeoutException, InterruptedException;
diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
index 9de0d4071..d10e7adf5 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
@@ -41,6 +41,7 @@ public class BasicContentHandlerFactory implements ContentHandlerFactory, WriteL
     private final ParseContext parseContext;
 
     /**
+     * Create a BasicContentHandlerFactory with {@link #throwOnWriteLimitReached} set to true
      * @param type       basic type of handler
      * @param writeLimit max number of characters to store; if < 0,
      *                   the handler will store all characters