You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2018/03/14 17:35:15 UTC
[tika] 01/04: Correct Metadata merging by policy,
and get (incomplete) unit tests to pass
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit d181c3c05a70163597dd40a9e1b140303ec2739f
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Wed Mar 14 16:53:26 2018 +0000
Correct Metadata merging by policy, and get (incomplete) unit tests to pass
---
.../parser/multiple/AbstractMultipleParser.java | 37 ++++++++++++++++++----
.../java/org/apache/tika/utils/ParserUtils.java | 3 +-
.../tika/parser/multiple/MultipleParserTest.java | 17 +++++++---
3 files changed, 46 insertions(+), 11 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index f99ad1a..c471826 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -37,6 +37,7 @@ import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.utils.ParserUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -194,7 +195,8 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// Force the stream to be a Tika one
// Force the stream to be file-backed, so we can re-read safely
// later if required for parser 2+
- // TODO Should we use RereadableInputStream instead?
+ // TODO Should we support RereadableInputStream as well?
+ // TODO Can we put this re-read logic in a utils method?
TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
taggedStream.getPath();
@@ -223,6 +225,8 @@ public abstract class AbstractMultipleParser extends AbstractParser {
try {
p.parse(taggedStream, handler, metadata, context);
} catch (Exception e) {
+ // Record the failure such that it can't get lost / overwritten
+ recordParserFailure(p, e, originalMetadata);
recordParserFailure(p, e, metadata);
failure = e;
}
@@ -255,7 +259,10 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// Finally, copy the latest metadata back onto their supplied object
for (String n : metadata.names()) {
- originalMetadata.set(n, metadata.get(n));
+ originalMetadata.remove(n);
+ for (String val : metadata.getValues(n)) {
+ originalMetadata.add(n, val);
+ }
}
}
@@ -268,13 +275,31 @@ public abstract class AbstractMultipleParser extends AbstractParser {
}
for (String n : lastMetadata.names()) {
- if (newMetadata.get(n) == null) {
- newMetadata.set(n, lastMetadata.get(n));
+ // If this is one of the metadata keys we're setting ourselves
+ // for tracking/errors, then always keep the latest one!
+ if (n.equals(ParserUtils.X_PARSED_BY)) continue;
+ if (n.equals(ParserUtils.EMBEDDED_PARSER.getName())) continue;
+ if (n.equals(ParserUtils.EMBEDDED_EXCEPTION.getName())) continue;
+
+ // Merge as per policy
+ String[] newVals = newMetadata.getValues(n);
+ String[] oldVals = lastMetadata.getValues(n);
+ if (newVals == null) {
+ // Metadata only in previous run, keep old values
+ for (String val : oldVals) {
+ newMetadata.add(n, val);
+ }
+ } else if (Arrays.deepEquals(oldVals, newVals)) {
+ // Metadata is the same, nothing to do
+ continue;
} else {
switch (policy) {
case FIRST_WINS:
- // Use the earlier value
- newMetadata.set(n, lastMetadata.get(n));
+ // Use the earlier value(s) in place of this/these one/s
+ newMetadata.remove(n);
+ for (String val : oldVals) {
+ newMetadata.add(n, val);
+ }
continue;
case LAST_WINS:
// Most recent (last) parser has already won
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index c3c63ba..4005217 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -26,6 +26,7 @@ import org.apache.tika.parser.ParserDecorator;
* Helper util methods for Parsers themselves.
*/
public class ParserUtils {
+ public final static String X_PARSED_BY = "X-Parsed-By";
public final static Property EMBEDDED_PARSER =
Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_parser");
public final static Property EMBEDDED_EXCEPTION =
@@ -68,7 +69,7 @@ public class ParserUtils {
* or used.
*/
public static void recordParserDetails(Parser parser, Metadata metadata) {
- metadata.add("X-Parsed-By", getParserClassname(parser));
+ metadata.add(X_PARSED_BY, getParserClassname(parser));
}
/**
diff --git a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
index 9de0a99..21d9724 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
@@ -17,6 +17,7 @@
package org.apache.tika.parser.multiple;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
import java.io.ByteArrayInputStream;
import java.util.Arrays;
@@ -96,7 +97,8 @@ public class MultipleParserTest {
usedParsers = metadata.getValues("X-Parsed-By");
assertEquals(2, usedParsers.length);
- assertEquals(DummyParser.class.getName(), usedParsers[0]);
+ assertEquals(ErrorParser.class.getName(), usedParsers[0]);
+ assertEquals(DummyParser.class.getName(), usedParsers[1]);
// TODO Check we got an exception
@@ -134,8 +136,15 @@ public class MultipleParserTest {
EmptyParser pNothing = new EmptyParser();
+ // Supplemental doesn't support DISCARD
+ try {
+ new SupplementingParser(null, MetadataPolicy.DISCARD_ALL, new Parser[0]);
+ fail("Discard shouldn't be supported");
+ } catch (IllegalArgumentException e) {}
+
+
// With only one parser defined, works as normal
- p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pContent1);
+ p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pContent1);
metadata = new Metadata();
handler = new BodyContentHandler();
@@ -151,12 +160,12 @@ public class MultipleParserTest {
// Check the First, Last and All policies
- p = new FallbackParser(null, MetadataPolicy.FIRST_WINS, pFail, pContent1, pContent2);
+ p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pFail, pContent1, pContent2);
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
- assertEquals("Fell back 1!", handler.toString());
+ assertEquals("Fell back 1!Fell back 2!", handler.toString());
assertEquals("Test1", metadata.get("T1"));
assertEquals("Test1", metadata.get("TBoth"));
--
To stop receiving notification emails like this one, please contact
nick@apache.org.