You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2018/03/14 17:35:15 UTC

[tika] 01/04: Correct Metadata merging by policy, and get (incomplete) unit tests to pass

This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d181c3c05a70163597dd40a9e1b140303ec2739f
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Wed Mar 14 16:53:26 2018 +0000

    Correct Metadata merging by policy, and get (incomplete) unit tests to pass
---
 .../parser/multiple/AbstractMultipleParser.java    | 37 ++++++++++++++++++----
 .../java/org/apache/tika/utils/ParserUtils.java    |  3 +-
 .../tika/parser/multiple/MultipleParserTest.java   | 17 +++++++---
 3 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index f99ad1a..c471826 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -37,6 +37,7 @@ import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.utils.ParserUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -194,7 +195,8 @@ public abstract class AbstractMultipleParser extends AbstractParser {
             // Force the stream to be a Tika one
             // Force the stream to be file-backed, so we can re-read safely
             //  later if required for parser 2+
-            // TODO Should we use RereadableInputStream instead?
+            // TODO Should we support RereadableInputStream as well?
+            // TODO Can we put this re-read logic in a utils method?
             TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
             taggedStream.getPath();
             
@@ -223,6 +225,8 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 try {
                     p.parse(taggedStream, handler, metadata, context);
                 } catch (Exception e) {
+                    // Record the failure such that it can't get lost / overwritten
+                    recordParserFailure(p, e, originalMetadata);
                     recordParserFailure(p, e, metadata);
                     failure = e;
                 }
@@ -255,7 +259,10 @@ public abstract class AbstractMultipleParser extends AbstractParser {
         
         // Finally, copy the latest metadata back onto their supplied object
         for (String n : metadata.names()) {
-            originalMetadata.set(n, metadata.get(n));
+            originalMetadata.remove(n);
+            for (String val : metadata.getValues(n)) {
+                originalMetadata.add(n, val);
+            }
         }
     }
     
@@ -268,13 +275,31 @@ public abstract class AbstractMultipleParser extends AbstractParser {
         }
         
         for (String n : lastMetadata.names()) {
-            if (newMetadata.get(n) == null) {
-                newMetadata.set(n, lastMetadata.get(n));
+            // If this is one of the metadata keys we're setting ourselves
+            //  for tracking/errors, then always keep the latest one!
+            if (n.equals(ParserUtils.X_PARSED_BY)) continue;
+            if (n.equals(ParserUtils.EMBEDDED_PARSER.getName())) continue;
+            if (n.equals(ParserUtils.EMBEDDED_EXCEPTION.getName())) continue;
+            
+            // Merge as per policy 
+            String[] newVals = newMetadata.getValues(n);
+            String[] oldVals = lastMetadata.getValues(n);
+            if (newVals == null) {
+                // Metadata only in previous run, keep old values
+                for (String val : oldVals) {
+                    newMetadata.add(n, val);
+                }
+            } else if (Arrays.deepEquals(oldVals, newVals)) {
+                // Metadata is the same, nothing to do
+                continue;
             } else {
                 switch (policy) {
                 case FIRST_WINS:
-                    // Use the earlier value 
-                    newMetadata.set(n, lastMetadata.get(n));
+                    // Use the earlier value(s) in place of this/these one/s
+                    newMetadata.remove(n);
+                    for (String val : oldVals) {
+                        newMetadata.add(n, val);
+                    }
                     continue;
                 case LAST_WINS:
                     // Most recent (last) parser has already won
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index c3c63ba..4005217 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -26,6 +26,7 @@ import org.apache.tika.parser.ParserDecorator;
  * Helper util methods for Parsers themselves.
  */
 public class ParserUtils {
+    public final static String X_PARSED_BY = "X-Parsed-By"; 
     public final static Property EMBEDDED_PARSER =
             Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_parser");
     public final static Property EMBEDDED_EXCEPTION =
@@ -68,7 +69,7 @@ public class ParserUtils {
      *  or used.
      */
     public static void recordParserDetails(Parser parser, Metadata metadata) {
-        metadata.add("X-Parsed-By", getParserClassname(parser));
+        metadata.add(X_PARSED_BY, getParserClassname(parser));
     }
 
     /**
diff --git a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
index 9de0a99..21d9724 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.multiple;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
 
 import java.io.ByteArrayInputStream;
 import java.util.Arrays;
@@ -96,7 +97,8 @@ public class MultipleParserTest {
        
         usedParsers = metadata.getValues("X-Parsed-By");
         assertEquals(2, usedParsers.length);
-        assertEquals(DummyParser.class.getName(), usedParsers[0]);
+        assertEquals(ErrorParser.class.getName(), usedParsers[0]);
+        assertEquals(DummyParser.class.getName(), usedParsers[1]);
         
         // TODO Check we got an exception
         
@@ -134,8 +136,15 @@ public class MultipleParserTest {
         EmptyParser pNothing = new EmptyParser();
         
         
+        // Supplemental doesn't support DISCARD
+        try {
+            new SupplementingParser(null, MetadataPolicy.DISCARD_ALL, new Parser[0]);
+            fail("Discard shouldn't be supported");
+        } catch (IllegalArgumentException e) {}
+        
+        
         // With only one parser defined, works as normal
-        p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pContent1);
+        p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pContent1);
 
         metadata = new Metadata();
         handler = new BodyContentHandler();
@@ -151,12 +160,12 @@ public class MultipleParserTest {
         
         
         // Check the First, Last and All policies
-        p = new FallbackParser(null, MetadataPolicy.FIRST_WINS, pFail, pContent1, pContent2);
+        p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pFail, pContent1, pContent2);
 
         metadata = new Metadata();
         handler = new BodyContentHandler();
         p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
-        assertEquals("Fell back 1!", handler.toString());
+        assertEquals("Fell back 1!Fell back 2!", handler.toString());
         
         assertEquals("Test1", metadata.get("T1"));
         assertEquals("Test1", metadata.get("TBoth"));

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.