You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/11/15 22:28:45 UTC
[tika] branch main updated: TIKA-3928 -- add workaround to extract container parse exception even if metadatafilter renames field or removes it.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 6a14cd785 TIKA-3928 -- add workaround to extract container parse exception even if metadatafilter renames field or removes it.
6a14cd785 is described below
commit 6a14cd785233a6dfa29f98f73b18f3f3a074f093
Author: tballison <ta...@apache.org>
AuthorDate: Tue Nov 15 17:28:35 2022 -0500
TIKA-3928 -- add workaround to extract container parse exception even if metadatafilter renames field or removes it.
---
.../java/org/apache/tika/pipes/PipesClient.java | 10 +-------
.../java/org/apache/tika/pipes/PipesServer.java | 28 ++++++++++++++--------
.../org/apache/tika/pipes/emitter/EmitData.java | 23 +++++++++++++++---
3 files changed, 39 insertions(+), 22 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java
index cb4f3ffd9..137388102 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java
@@ -44,7 +44,6 @@ import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.pipes.emitter.EmitData;
import org.apache.tika.utils.ProcessUtils;
import org.apache.tika.utils.StringUtils;
@@ -334,7 +333,7 @@ public class PipesClient implements Closeable {
new UnsynchronizedByteArrayInputStream(bytes))) {
EmitData emitData = (EmitData) objectInputStream.readObject();
- String stack = getStack(emitData);
+ String stack = emitData.getContainerStackTrace();
if (StringUtils.isBlank(stack)) {
return new PipesResult(emitData);
} else {
@@ -347,13 +346,6 @@ public class PipesClient implements Closeable {
}
}
- private String getStack(EmitData emitData) {
- if (emitData.getMetadataList() == null || emitData.getMetadataList().size() < 1) {
- return StringUtils.EMPTY;
- }
- return emitData.getMetadataList().get(0).get(TikaCoreProperties.CONTAINER_EXCEPTION);
- }
-
private void restart() throws IOException, InterruptedException, TimeoutException {
if (process != null) {
LOG.debug("process still alive; trying to destroy it");
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
index d0d85dd83..c97a4e39e 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
@@ -259,10 +259,10 @@ public class PipesServer implements Runnable {
*/
private String getContainerStacktrace(FetchEmitTuple t, List<Metadata> metadataList) {
if (metadataList == null || metadataList.size() < 1) {
- return "";
+ return StringUtils.EMPTY;
}
String stack = metadataList.get(0).get(TikaCoreProperties.CONTAINER_EXCEPTION);
- return (stack != null) ? stack : "";
+ return (stack != null) ? stack : StringUtils.EMPTY;
}
@@ -354,6 +354,8 @@ public class PipesServer implements Runnable {
private void emitIt(FetchEmitTuple t, List<Metadata> metadataList) {
long start = System.currentTimeMillis();
String stack = getContainerStacktrace(t, metadataList);
+ //we need to apply this after we pull out the stacktrace
+ filterMetadata(metadataList);
if (StringUtils.isBlank(stack) || t.getOnParseException() == FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT) {
injectUserMetadata(t.getMetadata(), metadataList);
EmitKey emitKey = t.getEmitKey();
@@ -361,14 +363,13 @@ public class PipesServer implements Runnable {
emitKey = new EmitKey(emitKey.getEmitterName(), t.getFetchKey().getFetchKey());
t.setEmitKey(emitKey);
}
- EmitData emitData = new EmitData(t.getEmitKey(), metadataList);
+ EmitData emitData = new EmitData(t.getEmitKey(), metadataList, stack);
if (maxForEmitBatchBytes >= 0 && emitData.getEstimatedSizeBytes() >= maxForEmitBatchBytes) {
emit(t.getId(), emitData, stack);
if (LOG.isTraceEnabled()) {
LOG.trace("timer -- emitted: {} ms", System.currentTimeMillis() - start);
}
} else {
- //ignore the stack, it is stored in the emit data
write(emitData);
if (LOG.isTraceEnabled()) {
LOG.trace("timer -- to write data: {} ms", System.currentTimeMillis() - start);
@@ -379,6 +380,16 @@ public class PipesServer implements Runnable {
}
}
+ private void filterMetadata(List<Metadata> metadataList) {
+ for (Metadata m : metadataList) {
+ try {
+ tikaConfig.getMetadataFilter().filter(m);
+ } catch (TikaException e) {
+ LOG.warn("failed to filter metadata", e);
+ }
+ }
+ }
+
private Fetcher getFetcher(FetchEmitTuple t) {
try {
return fetcherManager.getFetcher(t.getFetchKey().getFetcherName());
@@ -516,11 +527,6 @@ public class PipesServer implements Runnable {
if (containerException != null) {
metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION, containerException);
}
- try {
- tikaConfig.getMetadataFilter().filter(metadata);
- } catch (TikaException e) {
- LOG.warn("exception mapping metadata", e);
- }
if (LOG.isTraceEnabled()) {
LOG.trace("timer -- parse only time: {} ms", System.currentTimeMillis() - start);
}
@@ -531,9 +537,11 @@ public class PipesServer implements Runnable {
private List<Metadata> parseRecursive(FetchEmitTuple fetchEmitTuple,
HandlerConfig handlerConfig, InputStream stream,
Metadata metadata) {
+ //Intentionally do not add the metadata filter here!
+ //We need to let stacktraces percolate
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
new BasicContentHandlerFactory(handlerConfig.getType(), handlerConfig.getWriteLimit()),
- handlerConfig.getMaxEmbeddedResources(), tikaConfig.getMetadataFilter());
+ handlerConfig.getMaxEmbeddedResources());
ParseContext parseContext = new ParseContext();
long start = System.currentTimeMillis();
try {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java
index c74414060..95376a9fa 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java
@@ -20,6 +20,7 @@ import java.io.Serializable;
import java.util.List;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.utils.StringUtils;
public class EmitData implements Serializable {
/**
@@ -30,9 +31,17 @@ public class EmitData implements Serializable {
private final EmitKey emitKey;
private final List<Metadata> metadataList;
+ private final String containerStackTrace;
+
public EmitData(EmitKey emitKey, List<Metadata> metadataList) {
+ this(emitKey, metadataList, StringUtils.EMPTY);
+ }
+
+ public EmitData(EmitKey emitKey, List<Metadata> metadataList, String containerStackTrace) {
this.emitKey = emitKey;
this.metadataList = metadataList;
+ this.containerStackTrace = (containerStackTrace == null) ? StringUtils.EMPTY :
+ containerStackTrace;
}
public EmitKey getEmitKey() {
@@ -43,12 +52,18 @@ public class EmitData implements Serializable {
return metadataList;
}
+ public String getContainerStackTrace() {
+ return containerStackTrace;
+ }
+
public long getEstimatedSizeBytes() {
- return estimateSizeInBytes(getEmitKey().getEmitKey(), getMetadataList());
+ return estimateSizeInBytes(getEmitKey().getEmitKey(), getMetadataList(), containerStackTrace);
}
- private static long estimateSizeInBytes(String id, List<Metadata> metadataList) {
+ private static long estimateSizeInBytes(String id, List<Metadata> metadataList,
+ String containerStackTrace) {
long sz = 36 + id.length() * 2;
+ sz += 36 + containerStackTrace.length() * 2;
for (Metadata m : metadataList) {
for (String n : m.names()) {
sz += 36 + n.length() * 2;
@@ -59,8 +74,10 @@ public class EmitData implements Serializable {
}
return sz;
}
+
@Override
public String toString() {
- return "EmitData{" + "emitKey=" + emitKey + ", metadataList=" + metadataList + '}';
+ return "EmitData{" + "emitKey=" + emitKey + ", metadataList=" + metadataList +
+ ", containerStackTrace='" + containerStackTrace + '\'' + '}';
}
}