You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2018/03/13 18:15:32 UTC
[tika] 09/13: Move logic for recording embedded parser failures in
the metadata to utils, and use for multiple parsers
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 97b97b345b49b7dd510af560598e6d1ab7baf28c
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 15:24:41 2018 +0000
Move logic for recording embedded parser failures in the metadata to utils, and use for multiple parsers
---
.../apache/tika/parser/RecursiveParserWrapper.java | 10 +++-------
.../parser/multiple/AbstractMultipleParser.java | 1 +
.../tika/parser/multiple/FallbackParser.java | 3 ---
.../tika/parser/multiple/SupplementingParser.java | 3 ---
.../java/org/apache/tika/utils/ParserUtils.java | 22 +++++++++++++++++++++-
5 files changed, 25 insertions(+), 14 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 1e8e5b1..c426a42 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -31,7 +31,6 @@ import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.ContentHandlerFactory;
-import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.ParserUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -85,8 +84,7 @@ public class RecursiveParserWrapper implements Parser {
public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED =
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached");
- public final static Property EMBEDDED_EXCEPTION =
- Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
+ public final static Property EMBEDDED_EXCEPTION = ParserUtils.EMBEDDED_EXCEPTION;
//move this to TikaCoreProperties?
public final static Property EMBEDDED_RESOURCE_PATH =
Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"embedded_resource_path");
@@ -304,16 +302,14 @@ public class RecursiveParserWrapper implements Parser {
metadata.add(WRITE_LIMIT_REACHED, "true");
} else {
if (catchEmbeddedExceptions) {
- String trace = ExceptionUtils.getStackTrace(e);
- metadata.set(EMBEDDED_EXCEPTION, trace);
+ ParserUtils.recordParserFailure(this, e, metadata);
} else {
throw e;
}
}
} catch (TikaException e) {
if (catchEmbeddedExceptions) {
- String trace = ExceptionUtils.getStackTrace(e);
- metadata.set(EMBEDDED_EXCEPTION, trace);
+ ParserUtils.recordParserFailure(this, e, metadata);
} else {
throw e;
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 4695e0a..d857b35 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -216,6 +216,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
try {
p.parse(parserStream, handler, metadata, context);
} catch (Exception e) {
+ recordParserFailure(p, e, metadata);
failure = e;
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
index 9b6a0bf..97a8aaf 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
@@ -61,9 +61,6 @@ public class FallbackParser extends AbstractMultipleParser {
// If there was no exception, abort further parsers
if (exception == null) return false;
- // Record the details of this exception in the metadata
- // TODO Share logic with the Recursive Parser Wrapper
-
// Have the next parser tried
return true;
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
index fd5d037..c1dec34 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
@@ -72,9 +72,6 @@ public class SupplementingParser extends AbstractMultipleParser {
// If there was no exception, just carry on to the next
if (exception == null) return true;
- // Record the details of this exception in the metadata
- // TODO Share logic with the Recursive Parser Wrapper
-
// Have the next parser tried
return true;
}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index 58105a6..c3c63ba 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -17,6 +17,8 @@
package org.apache.tika.utils;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
@@ -24,6 +26,11 @@ import org.apache.tika.parser.ParserDecorator;
* Helper util methods for Parsers themselves.
*/
public class ParserUtils {
+ public final static Property EMBEDDED_PARSER =
+ Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_parser");
+ public final static Property EMBEDDED_EXCEPTION =
+ Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
+
/**
* Does a deep clone of a Metadata object.
*/
@@ -56,11 +63,24 @@ public class ParserUtils {
}
/**
- * Records details of the {@link Parser} used to the Metadata,
+ * Records details of the {@link Parser} used to the {@link Metadata},
* typically wanted where multiple parsers could be picked between
* or used.
*/
public static void recordParserDetails(Parser parser, Metadata metadata) {
metadata.add("X-Parsed-By", getParserClassname(parser));
}
+
+ /**
+ * Records details of a {@link Parser}'s failure to the
+ * {@link Metadata}, so you can check what went wrong even if the
+ * {@link Exception} wasn't immediately thrown (eg when several different
+ * Parsers are used)
+ */
+ public static void recordParserFailure(Parser parser, Exception failure,
+ Metadata metadata) {
+ String trace = ExceptionUtils.getStackTrace(failure);
+ metadata.add(EMBEDDED_EXCEPTION, trace);
+ metadata.add(EMBEDDED_PARSER, getParserClassname(parser));
+ }
}
--
To stop receiving notification emails like this one, please contact
nick@apache.org.