You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/11/15 22:28:45 UTC

[tika] branch main updated: TIKA-3928 -- add workaround to extract container parse exception even if metadatafilter renames field or removes it.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 6a14cd785 TIKA-3928 -- add workaround to extract container parse exception even if metadatafilter renames field or removes it.
6a14cd785 is described below

commit 6a14cd785233a6dfa29f98f73b18f3f3a074f093
Author: tballison <ta...@apache.org>
AuthorDate: Tue Nov 15 17:28:35 2022 -0500

    TIKA-3928 -- add workaround to extract container parse exception even if metadatafilter renames field or removes it.
---
 .../java/org/apache/tika/pipes/PipesClient.java    | 10 +-------
 .../java/org/apache/tika/pipes/PipesServer.java    | 28 ++++++++++++++--------
 .../org/apache/tika/pipes/emitter/EmitData.java    | 23 +++++++++++++++---
 3 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java
index cb4f3ffd9..137388102 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java
@@ -44,7 +44,6 @@ import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.pipes.emitter.EmitData;
 import org.apache.tika.utils.ProcessUtils;
 import org.apache.tika.utils.StringUtils;
@@ -334,7 +333,7 @@ public class PipesClient implements Closeable {
                 new UnsynchronizedByteArrayInputStream(bytes))) {
             EmitData emitData = (EmitData) objectInputStream.readObject();
 
-            String stack = getStack(emitData);
+            String stack = emitData.getContainerStackTrace();
             if (StringUtils.isBlank(stack)) {
                 return new PipesResult(emitData);
             } else {
@@ -347,13 +346,6 @@ public class PipesClient implements Closeable {
         }
     }
 
-    private String getStack(EmitData emitData) {
-        if (emitData.getMetadataList() == null || emitData.getMetadataList().size() < 1) {
-            return StringUtils.EMPTY;
-        }
-        return emitData.getMetadataList().get(0).get(TikaCoreProperties.CONTAINER_EXCEPTION);
-    }
-
     private void restart() throws IOException, InterruptedException, TimeoutException {
         if (process != null) {
             LOG.debug("process still alive; trying to destroy it");
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
index d0d85dd83..c97a4e39e 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
@@ -259,10 +259,10 @@ public class PipesServer implements Runnable {
      */
     private String getContainerStacktrace(FetchEmitTuple t, List<Metadata> metadataList) {
         if (metadataList == null || metadataList.size() < 1) {
-            return "";
+            return StringUtils.EMPTY;
         }
         String stack = metadataList.get(0).get(TikaCoreProperties.CONTAINER_EXCEPTION);
-        return (stack != null) ? stack : "";
+        return (stack != null) ? stack : StringUtils.EMPTY;
     }
 
 
@@ -354,6 +354,8 @@ public class PipesServer implements Runnable {
     private void emitIt(FetchEmitTuple t, List<Metadata> metadataList) {
         long start = System.currentTimeMillis();
         String stack = getContainerStacktrace(t, metadataList);
+        //we need to apply this after we pull out the stacktrace
+        filterMetadata(metadataList);
         if (StringUtils.isBlank(stack) || t.getOnParseException() == FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT) {
             injectUserMetadata(t.getMetadata(), metadataList);
             EmitKey emitKey = t.getEmitKey();
@@ -361,14 +363,13 @@ public class PipesServer implements Runnable {
                 emitKey = new EmitKey(emitKey.getEmitterName(), t.getFetchKey().getFetchKey());
                 t.setEmitKey(emitKey);
             }
-            EmitData emitData = new EmitData(t.getEmitKey(), metadataList);
+            EmitData emitData = new EmitData(t.getEmitKey(), metadataList, stack);
             if (maxForEmitBatchBytes >= 0 && emitData.getEstimatedSizeBytes() >= maxForEmitBatchBytes) {
                 emit(t.getId(), emitData, stack);
                 if (LOG.isTraceEnabled()) {
                     LOG.trace("timer -- emitted: {} ms", System.currentTimeMillis() - start);
                 }
             } else {
-                //ignore the stack, it is stored in the emit data
                 write(emitData);
                 if (LOG.isTraceEnabled()) {
                     LOG.trace("timer -- to write data: {} ms", System.currentTimeMillis() - start);
@@ -379,6 +380,16 @@ public class PipesServer implements Runnable {
         }
     }
 
+    private void filterMetadata(List<Metadata> metadataList) {
+        for (Metadata m : metadataList) {
+            try {
+                tikaConfig.getMetadataFilter().filter(m);
+            } catch (TikaException e) {
+                LOG.warn("failed to filter metadata", e);
+            }
+        }
+    }
+
     private Fetcher getFetcher(FetchEmitTuple t) {
         try {
             return fetcherManager.getFetcher(t.getFetchKey().getFetcherName());
@@ -516,11 +527,6 @@ public class PipesServer implements Runnable {
             if (containerException != null) {
                 metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION, containerException);
             }
-            try {
-                tikaConfig.getMetadataFilter().filter(metadata);
-            } catch (TikaException e) {
-                LOG.warn("exception mapping metadata", e);
-            }
             if (LOG.isTraceEnabled()) {
                 LOG.trace("timer -- parse only time: {} ms", System.currentTimeMillis() - start);
             }
@@ -531,9 +537,11 @@ public class PipesServer implements Runnable {
     private List<Metadata> parseRecursive(FetchEmitTuple fetchEmitTuple,
                                           HandlerConfig handlerConfig, InputStream stream,
                                           Metadata metadata) {
+        //Intentionally do not add the metadata filter here!
+        //We need to let stacktraces percolate
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
                 new BasicContentHandlerFactory(handlerConfig.getType(), handlerConfig.getWriteLimit()),
-                handlerConfig.getMaxEmbeddedResources(), tikaConfig.getMetadataFilter());
+                handlerConfig.getMaxEmbeddedResources());
         ParseContext parseContext = new ParseContext();
         long start = System.currentTimeMillis();
         try {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java
index c74414060..95376a9fa 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java
@@ -20,6 +20,7 @@ import java.io.Serializable;
 import java.util.List;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.utils.StringUtils;
 
 public class EmitData implements Serializable {
     /**
@@ -30,9 +31,17 @@ public class EmitData implements Serializable {
     private final EmitKey emitKey;
     private final List<Metadata> metadataList;
 
+    private final String containerStackTrace;
+
     public EmitData(EmitKey emitKey, List<Metadata> metadataList) {
+        this(emitKey, metadataList, StringUtils.EMPTY);
+    }
+
+    public EmitData(EmitKey emitKey, List<Metadata> metadataList, String containerStackTrace) {
         this.emitKey = emitKey;
         this.metadataList = metadataList;
+        this.containerStackTrace = (containerStackTrace == null) ? StringUtils.EMPTY :
+                containerStackTrace;
     }
 
     public EmitKey getEmitKey() {
@@ -43,12 +52,18 @@ public class EmitData implements Serializable {
         return metadataList;
     }
 
+    public String getContainerStackTrace() {
+        return containerStackTrace;
+    }
+
     public long getEstimatedSizeBytes() {
-        return estimateSizeInBytes(getEmitKey().getEmitKey(), getMetadataList());
+        return estimateSizeInBytes(getEmitKey().getEmitKey(), getMetadataList(), containerStackTrace);
     }
 
-    private static long estimateSizeInBytes(String id, List<Metadata> metadataList) {
+    private static long estimateSizeInBytes(String id, List<Metadata> metadataList,
+                                            String containerStackTrace) {
         long sz = 36 + id.length() * 2;
+        sz += 36 + containerStackTrace.length() * 2;
         for (Metadata m : metadataList) {
             for (String n : m.names()) {
                 sz += 36 + n.length() * 2;
@@ -59,8 +74,10 @@ public class EmitData implements Serializable {
         }
         return sz;
     }
+
     @Override
     public String toString() {
-        return "EmitData{" + "emitKey=" + emitKey + ", metadataList=" + metadataList + '}';
+        return "EmitData{" + "emitKey=" + emitKey + ", metadataList=" + metadataList +
+                ", containerStackTrace='" + containerStackTrace + '\'' + '}';
     }
 }