You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/29 17:36:17 UTC
[tika] branch main updated: TIKA-4149 -- PipesServer should include throwOnWriteLimitReached in the constructor of the BasicContentHandlerFactory (#1374)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 8bf7e977d TIKA-4149 -- PipesServer should include throwOnWriteLimitReached in the constructor of the BasicContentHandlerFactory (#1374)
8bf7e977d is described below
commit 8bf7e977db6e08fc84c2e72a7419f39d9fcc97bf
Author: Tim Allison <ta...@apache.org>
AuthorDate: Fri Sep 29 13:36:11 2023 -0400
TIKA-4149 -- PipesServer should include throwOnWriteLimitReached in the constructor of the BasicContentHandlerFactory (#1374)
---
.../src/main/java/org/apache/tika/pipes/PipesServer.java | 14 ++++++++++----
.../org/apache/tika/pipes/pipesiterator/PipesIterator.java | 13 ++++++++++---
.../org/apache/tika/sax/BasicContentHandlerFactory.java | 1 +
3 files changed, 21 insertions(+), 7 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
index 64d0d602e..ed1e5bb5e 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
@@ -501,10 +501,14 @@ public class PipesServer implements Runnable {
private List<Metadata> parseConcatenated(FetchEmitTuple fetchEmitTuple,
HandlerConfig handlerConfig, InputStream stream,
Metadata metadata) {
+ ParseContext parseContext = new ParseContext();
+
ContentHandlerFactory contentHandlerFactory =
- new BasicContentHandlerFactory(handlerConfig.getType(), handlerConfig.getWriteLimit());
+ new BasicContentHandlerFactory(handlerConfig.getType(),
+ handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(),
+ parseContext);
+
ContentHandler handler = contentHandlerFactory.getNewContentHandler();
- ParseContext parseContext = new ParseContext();
parseContext.set(DocumentSelector.class, new DocumentSelector() {
final int maxEmbedded = handlerConfig.maxEmbeddedResources;
int embedded = 0;
@@ -549,12 +553,14 @@ public class PipesServer implements Runnable {
private List<Metadata> parseRecursive(FetchEmitTuple fetchEmitTuple,
HandlerConfig handlerConfig, InputStream stream,
Metadata metadata) {
+ ParseContext parseContext = new ParseContext();
//Intentionally do not add the metadata filter here!
//We need to let stacktraces percolate
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(handlerConfig.getType(), handlerConfig.getWriteLimit()),
+ new BasicContentHandlerFactory(handlerConfig.getType(),
+ handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), parseContext),
handlerConfig.getMaxEmbeddedResources());
- ParseContext parseContext = new ParseContext();
+
long start = System.currentTimeMillis();
preParse(fetchEmitTuple, stream, metadata, parseContext);
try {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java
index 98b766ce7..34706f7e8 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java
@@ -73,6 +73,7 @@ public abstract class PipesIterator extends ConfigBase
private HandlerConfig.PARSE_MODE parseMode = HandlerConfig.PARSE_MODE.RMETA;
+ private boolean throwOnWriteLimitReached = false;
private int writeLimit = -1;
private int maxEmbeddedResources = -1;
@@ -146,6 +147,11 @@ public abstract class PipesIterator extends ConfigBase
this.writeLimit = writeLimit;
}
+ @Field
+ public void setThrowOnWriteLimitReached(boolean throwOnWriteLimitReached) {
+ this.throwOnWriteLimitReached = throwOnWriteLimitReached;
+ }
+
@Field
public void setMaxEmbeddedResources(int maxEmbeddedResources) {
this.maxEmbeddedResources = maxEmbeddedResources;
@@ -156,8 +162,8 @@ public abstract class PipesIterator extends ConfigBase
setParseMode(HandlerConfig.PARSE_MODE.parseMode(parseModeString));
}
- public void setParseMode(HandlerConfig.PARSE_MODE parsePARSEMode) {
- this.parseMode = parsePARSEMode;
+ public void setParseMode(HandlerConfig.PARSE_MODE parseMode) {
+ this.parseMode = parseMode;
}
public Integer call() throws Exception {
@@ -168,7 +174,8 @@ public abstract class PipesIterator extends ConfigBase
protected HandlerConfig getHandlerConfig() {
//TODO: make throwOnWriteLimitReached configurable
- return new HandlerConfig(handlerType, parseMode, writeLimit, maxEmbeddedResources, false);
+ return new HandlerConfig(handlerType, parseMode, writeLimit, maxEmbeddedResources,
+ throwOnWriteLimitReached);
}
protected abstract void enqueue() throws IOException, TimeoutException, InterruptedException;
diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
index 9de0d4071..d10e7adf5 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
@@ -41,6 +41,7 @@ public class BasicContentHandlerFactory implements ContentHandlerFactory, WriteL
private final ParseContext parseContext;
/**
+ * Create a BasicContentHandlerFactory with {@link #throwOnWriteLimitReached} set to true
* @param type basic type of handler
* @param writeLimit max number of characters to store; if < 0,
* the handler will store all characters