You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/29 17:30:05 UTC

[tika] branch TIKA-4149 created (now 2f40f2ad0)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4149
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 2f40f2ad0 TIKA-4149 -- PipesServer should include throwOnWriteLimitReached in the constructor of the BasicContentHandlerFactory

This branch includes the following new commits:

     new 2f40f2ad0 TIKA-4149 -- PipesServer should include throwOnWriteLimitReached in the constructor of the BasicContentHandlerFactory

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-4149 -- PipesServer should include throwOnWriteLimitReached in the constructor of the BasicContentHandlerFactory

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4149
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 2f40f2ad0aa2311c45374f0b8472bb18f07cfae5
Author: tballison <ta...@apache.org>
AuthorDate: Fri Sep 29 13:29:57 2023 -0400

    TIKA-4149 -- PipesServer should include throwOnWriteLimitReached in the constructor of the BasicContentHandlerFactory
---
 .../src/main/java/org/apache/tika/pipes/PipesServer.java   | 14 ++++++++++----
 .../org/apache/tika/pipes/pipesiterator/PipesIterator.java | 13 ++++++++++---
 .../org/apache/tika/sax/BasicContentHandlerFactory.java    |  1 +
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
index 64d0d602e..ed1e5bb5e 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
@@ -501,10 +501,14 @@ public class PipesServer implements Runnable {
     private List<Metadata> parseConcatenated(FetchEmitTuple fetchEmitTuple,
                                              HandlerConfig handlerConfig, InputStream stream,
                                              Metadata metadata) {
+        ParseContext parseContext = new ParseContext();
+
         ContentHandlerFactory contentHandlerFactory =
-                new BasicContentHandlerFactory(handlerConfig.getType(), handlerConfig.getWriteLimit());
+                new BasicContentHandlerFactory(handlerConfig.getType(),
+                        handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(),
+                        parseContext);
+
         ContentHandler handler = contentHandlerFactory.getNewContentHandler();
-        ParseContext parseContext = new ParseContext();
         parseContext.set(DocumentSelector.class, new DocumentSelector() {
             final int maxEmbedded = handlerConfig.maxEmbeddedResources;
             int embedded = 0;
@@ -549,12 +553,14 @@ public class PipesServer implements Runnable {
     private List<Metadata> parseRecursive(FetchEmitTuple fetchEmitTuple,
                                           HandlerConfig handlerConfig, InputStream stream,
                                           Metadata metadata) {
+        ParseContext parseContext = new ParseContext();
         //Intentionally do not add the metadata filter here!
         //We need to let stacktraces percolate
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
-                new BasicContentHandlerFactory(handlerConfig.getType(), handlerConfig.getWriteLimit()),
+                new BasicContentHandlerFactory(handlerConfig.getType(),
+                        handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), parseContext),
                 handlerConfig.getMaxEmbeddedResources());
-        ParseContext parseContext = new ParseContext();
+
         long start = System.currentTimeMillis();
         preParse(fetchEmitTuple, stream, metadata, parseContext);
         try {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java
index 98b766ce7..34706f7e8 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java
@@ -73,6 +73,7 @@ public abstract class PipesIterator extends ConfigBase
 
     private HandlerConfig.PARSE_MODE parseMode = HandlerConfig.PARSE_MODE.RMETA;
 
+    private boolean throwOnWriteLimitReached = false;
     private int writeLimit = -1;
     private int maxEmbeddedResources = -1;
 
@@ -146,6 +147,11 @@ public abstract class PipesIterator extends ConfigBase
         this.writeLimit = writeLimit;
     }
 
+    @Field
+    public void setThrowOnWriteLimitReached(boolean throwOnWriteLimitReached) {
+        this.throwOnWriteLimitReached = throwOnWriteLimitReached;
+    }
+
     @Field
     public void setMaxEmbeddedResources(int maxEmbeddedResources) {
         this.maxEmbeddedResources = maxEmbeddedResources;
@@ -156,8 +162,8 @@ public abstract class PipesIterator extends ConfigBase
         setParseMode(HandlerConfig.PARSE_MODE.parseMode(parseModeString));
     }
 
-    public void setParseMode(HandlerConfig.PARSE_MODE parsePARSEMode) {
-        this.parseMode = parsePARSEMode;
+    public void setParseMode(HandlerConfig.PARSE_MODE parseMode) {
+        this.parseMode = parseMode;
     }
 
     public Integer call() throws Exception {
@@ -168,7 +174,8 @@ public abstract class PipesIterator extends ConfigBase
 
     protected HandlerConfig getHandlerConfig() {
         //TODO: make throwOnWriteLimitReached configurable
-        return new HandlerConfig(handlerType, parseMode, writeLimit, maxEmbeddedResources, false);
+        return new HandlerConfig(handlerType, parseMode, writeLimit, maxEmbeddedResources,
+                throwOnWriteLimitReached);
     }
 
     protected abstract void enqueue() throws IOException, TimeoutException, InterruptedException;
diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
index 9de0d4071..d10e7adf5 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
@@ -41,6 +41,7 @@ public class BasicContentHandlerFactory implements ContentHandlerFactory, WriteL
     private final ParseContext parseContext;
 
     /**
+     * Create a BasicContentHandlerFactory with {@link #throwOnWriteLimitReached} set to true
      * @param type       basic type of handler
      * @param writeLimit max number of characters to store; if < 0,
      *                   the handler will store all characters