You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2018/03/13 18:15:23 UTC

[tika] branch multiple-parsers updated (bc8a75e -> 348bfb9)

This is an automated email from the ASF dual-hosted git repository.

nick pushed a change to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from bc8a75e  Sample fallback and supplemental config files based on https://wiki.apache.org/tika/CompositeParserDiscussion
     new 217a9ce  Name sample config files based on issue number
     new 62cf6f6  Add TODOs for code to be shared/copied with other areas
     new 3555745  Ignore vim temp files
     new d5a06ba  Pull out deep Metadata clone to a utils method for re-use
     new 427417c  Prepare to track metadata between parsers
     new c3897db  Fix exception handling
     new d229ab6  Pull common "Real Parser" identification logic out to utils
     new f4a926c  Use utils for recording details of the parser used
     new 97b97b3  Move logic for recording embedded parser failures in the metadata to utils, and use for multiple parsers
     new 9be93c6  TODO updates, enforce allowed policies
     new 82f6f5f  Bring over stream reset logic from ParserDecorator and update comments
     new ee60f5e  Implement some metadata policies for merging values from multiple parsers
     new 348bfb9  More metadata handling between parsers, start on unit testing

The 13 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .gitignore                                         |   1 +
 .../org/apache/tika/parser/CompositeParser.java    |   7 +-
 .../apache/tika/parser/RecursiveParserWrapper.java |  34 ++-----
 .../parser/multiple/AbstractMultipleParser.java    | 110 ++++++++++++++------
 .../tika/parser/multiple/FallbackParser.java       |   3 -
 .../tika/parser/multiple/SupplementingParser.java  |   9 +-
 .../java/org/apache/tika/utils/ParserUtils.java    |  86 ++++++++++++++++
 .../tika/parser/multiple/MultipleParserTest.java   | 111 +++++++++++++++++++++
 ...allback.xml => TIKA-1509-multiple-fallback.xml} |   0
 ...tal.xml => TIKA-1509-multiple-supplemental.xml} |   0
 10 files changed, 291 insertions(+), 70 deletions(-)
 create mode 100644 tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
 create mode 100644 tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
 rename tika-core/src/test/resources/org/apache/tika/config/{multiple-fallback.xml => TIKA-1509-multiple-fallback.xml} (100%)
 rename tika-core/src/test/resources/org/apache/tika/config/{multiple-supplemental.xml => TIKA-1509-multiple-supplemental.xml} (100%)

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 13/13: More metadata handling between parsers, start on unit testing

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 348bfb9be46036833bbfda38c1912c9bf9eeb06e
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 18:15:14 2018 +0000

    More metadata handling between parsers, start on unit testing
---
 .../parser/multiple/AbstractMultipleParser.java    |  19 ++--
 .../tika/parser/multiple/MultipleParserTest.java   | 111 +++++++++++++++++++++
 2 files changed, 123 insertions(+), 7 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 9781f49..0aded0c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -175,11 +175,11 @@ public abstract class AbstractMultipleParser extends AbstractParser {
      */
     public void parse(
             InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
+            Metadata originalMetadata, ParseContext context)
             throws IOException, SAXException, TikaException {
         // Track the metadata between parsers, so we can apply our policy
-        Metadata originalMetadata = cloneMetadata(metadata);
-        Metadata lastMetadata = originalMetadata;
+        Metadata lastMetadata = cloneMetadata(originalMetadata);
+        Metadata metadata = lastMetadata;
         
         // Start tracking resources, so we can clean up when done
         TemporaryResources tmp = new TemporaryResources();
@@ -203,7 +203,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 taggedStream.mark(-1);
                 
                 // Record that we used this parser
-                recordParserDetails(p, metadata);
+                recordParserDetails(p, originalMetadata);
 
                 // Prepare an near-empty Metadata, will merge after
                 metadata = cloneMetadata(originalMetadata);
@@ -220,6 +220,9 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 // Notify the implementation how it went
                 boolean tryNext = parserCompleted(p, metadata, handler, failure);
                 
+                // Handle metadata merging / clashes
+                metadata = mergeMetadata(metadata, lastMetadata, policy);
+                
                 // Abort if requested, with the exception if there was one
                 if (!tryNext) {
                    if (failure != null) {
@@ -232,9 +235,6 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                    break;
                 }
                 
-                // Handle metadata merging / clashes
-                metadata = mergeMetadata(metadata, lastMetadata, policy);
-                
                 // Prepare for the next parser, if present
                 lastMetadata = cloneMetadata(metadata);
                 taggedStream.reset();
@@ -242,6 +242,11 @@ public abstract class AbstractMultipleParser extends AbstractParser {
         } finally {
             tmp.dispose();
         }
+        
+        // Finally, copy the latest metadata back onto their supplied object
+        for (String n : metadata.names()) {
+            originalMetadata.set(n, metadata.get(n));
+        }
     }
     
     // TODO Provide a method that takes an InputStreamSource as well,
diff --git a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
new file mode 100644
index 0000000..b3166eb
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.multiple;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.parser.DummyParser;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ErrorParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+
+public class MultipleParserTest {
+    /**
+     * Tests how {@link AbstractMultipleParser} works out which
+     *  mime types to offer, based on the types of the parsers
+     */
+    @Test
+    public void testMimeTypeSupported() {
+        // TODO
+    }
+    
+    /**
+     * Test {@link FallbackParser}
+     */
+    @Test
+    public void testFallback() throws Exception {
+        ParseContext context = new ParseContext();
+        BodyContentHandler handler;
+        Metadata metadata;
+        Parser p;
+        String[] usedParsers;
+        
+        // Some media types
+        Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);
+        Set<MediaType> octAndText = new HashSet<MediaType>(Arrays.asList(
+                MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
+        
+        // Some parsers
+        ErrorParser pFail = new ErrorParser();
+        DummyParser pContent = new DummyParser(onlyOct, new HashMap<String,String>(),
+                                               "Fell back!");
+        EmptyParser pNothing = new EmptyParser();
+        
+        
+        // With only one parser defined, works as normal
+        p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pContent);
+
+        metadata = new Metadata();
+        handler = new BodyContentHandler();
+        p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+        assertEquals("Fell back!", handler.toString());
+       
+        usedParsers = metadata.getValues("X-Parsed-By");
+        assertEquals(1, usedParsers.length);
+        assertEquals(DummyParser.class.getName(), usedParsers[0]);
+        
+        
+        // With a failing parser, will go to the working one
+        p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pFail, pContent);
+
+        metadata = new Metadata();
+        handler = new BodyContentHandler();
+        p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+        assertEquals("Fell back!", handler.toString());
+       
+        usedParsers = metadata.getValues("X-Parsed-By");
+        assertEquals(2, usedParsers.length);
+        assertEquals(DummyParser.class.getName(), usedParsers[0]);
+        
+        // TODO Check we got an exception
+        
+        
+        // Won't go past the working one
+        // TODO
+    }
+    
+    /**
+     * Test for {@link SupplementingParser}
+     */
+    @Test
+    public void testSupplemental() throws Exception {
+        // TODO 
+    }
+}

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 08/13: Use utils for recording details of the parser used

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit f4a926ca94c50a6158891c7746e725cd720a2faa
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 15:13:19 2018 +0000

    Use utils for recording details of the parser used
---
 .../src/main/java/org/apache/tika/parser/CompositeParser.java    | 2 +-
 .../org/apache/tika/parser/multiple/AbstractMultipleParser.java  | 4 ++--
 tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java   | 9 +++++++++
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
index 0098468..c5c95a6 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
@@ -272,7 +272,7 @@ public class CompositeParser extends AbstractParser {
             TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
             TaggedContentHandler taggedHandler = 
                 handler != null ? new TaggedContentHandler(handler) : null;
-            metadata.add("X-Parsed-By", ParserUtils.getParserClassname(parser));
+            ParserUtils.recordParserDetails(parser, metadata);
             try {
                 parser.parse(taggedStream, taggedHandler, metadata, context);
             } catch (RuntimeException e) {
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index d66c541..4695e0a 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -205,8 +205,8 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 // TODO What's the best way to reset each time?
                 TikaInputStream parserStream = TikaInputStream.get(path);
                 
-                // Record this parser
-                metadata.add("X-Parsed-By", getParserClassname(p));
+                // Record that we used this parser
+                recordParserDetails(p, metadata);
                 
                 // TODO Handle metadata clashes based on the Policy
                 
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index bdbb04c..58105a6 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -54,4 +54,13 @@ public class ParserUtils {
             return parser.getClass().getName();
         }
     }
+
+    /**
+     * Records details of the {@link Parser} used to the Metadata,
+     *  typically wanted where multiple parsers could be picked between
+     *  or used.
+     */
+    public static void recordParserDetails(Parser parser, Metadata metadata) {
+        metadata.add("X-Parsed-By", getParserClassname(parser));
+    }
 }

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 05/13: Prepare to track metadata between parsers

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 427417c5d17f1e03724f3e6ded64779bf7366677
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 15:04:43 2018 +0000

    Prepare to track metadata between parsers
---
 .../org/apache/tika/parser/multiple/AbstractMultipleParser.java    | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index c47e762..46cd064 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -34,6 +34,7 @@ import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.utils.ParserUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -178,7 +179,11 @@ public abstract class AbstractMultipleParser extends AbstractParser {
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
+        // Track the metadata between parsers, so we can apply our policy
+        Metadata originalMetadata = ParserUtils.cloneMetadata(metadata);
+        Metadata lastMetadata = originalMetadata;
         
+        // Start tracking resources, so we can clean up when done
         TemporaryResources tmp = new TemporaryResources();
         try {
             // Force the stream to be a Tika one
@@ -187,6 +192,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
             // TODO Support an InputStreamFactory as an alternative to
             //  Files, see TIKA-2585
             // TODO Rewind support copy from ParserDecorator.withFallbacks
+            // TODO Should we use RereadableInputStream instead?
             TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
             Path path = taggedStream.getPath();
             
@@ -222,6 +228,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 }
                 
                 // TODO Handle metadata clashes based on the Policy
+                lastMetadata = ParserUtils.cloneMetadata(metadata);
             }
         } finally {
             tmp.dispose();

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 09/13: Move logic for recording embedded parser failures in the metadata to utils, and use for multiple parsers

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 97b97b345b49b7dd510af560598e6d1ab7baf28c
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 15:24:41 2018 +0000

    Move logic for recording embedded parser failures in the metadata to utils, and use for multiple parsers
---
 .../apache/tika/parser/RecursiveParserWrapper.java | 10 +++-------
 .../parser/multiple/AbstractMultipleParser.java    |  1 +
 .../tika/parser/multiple/FallbackParser.java       |  3 ---
 .../tika/parser/multiple/SupplementingParser.java  |  3 ---
 .../java/org/apache/tika/utils/ParserUtils.java    | 22 +++++++++++++++++++++-
 5 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 1e8e5b1..c426a42 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -31,7 +31,6 @@ import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.sax.ContentHandlerFactory;
-import org.apache.tika.utils.ExceptionUtils;
 import org.apache.tika.utils.ParserUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -85,8 +84,7 @@ public class RecursiveParserWrapper implements Parser {
     public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED = 
                 Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached");
 
-    public final static Property EMBEDDED_EXCEPTION =
-            Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
+    public final static Property EMBEDDED_EXCEPTIONx = ParserUtils.EMBEDDED_EXCEPTION;
     //move this to TikaCoreProperties?
     public final static Property EMBEDDED_RESOURCE_PATH = 
                 Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"embedded_resource_path");
@@ -304,16 +302,14 @@ public class RecursiveParserWrapper implements Parser {
                     metadata.add(WRITE_LIMIT_REACHED, "true");
                 } else {
                     if (catchEmbeddedExceptions) {
-                        String trace = ExceptionUtils.getStackTrace(e);
-                        metadata.set(EMBEDDED_EXCEPTION, trace);
+                        ParserUtils.recordParserFailure(this, e, metadata);
                     } else {
                         throw e;
                     }
                 }
             } catch (TikaException e) {
                 if (catchEmbeddedExceptions) {
-                    String trace = ExceptionUtils.getStackTrace(e);
-                    metadata.set(EMBEDDED_EXCEPTION, trace);
+                    ParserUtils.recordParserFailure(this, e, metadata);
                 } else {
                     throw e;
                 }
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 4695e0a..d857b35 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -216,6 +216,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 try {
                     p.parse(parserStream, handler, metadata, context);
                 } catch (Exception e) {
+                    recordParserFailure(p, e, metadata);
                     failure = e;
                 }
                 
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
index 9b6a0bf..97a8aaf 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
@@ -61,9 +61,6 @@ public class FallbackParser extends AbstractMultipleParser {
         // If there was no exception, abort further parsers
         if (exception == null) return false;
         
-        // Record the details of this exception in the metadata
-        // TODO Share logic with the Recursive Parser Wrapper
-        
         // Have the next parser tried
         return true;
     }
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
index fd5d037..c1dec34 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
@@ -72,9 +72,6 @@ public class SupplementingParser extends AbstractMultipleParser {
         // If there was no exception, just carry on to the next
         if (exception == null) return true;
         
-        // Record the details of this exception in the metadata
-        // TODO Share logic with the Recursive Parser Wrapper
-        
         // Have the next parser tried
         return true;
     }
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index 58105a6..c3c63ba 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -17,6 +17,8 @@
 package org.apache.tika.utils;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
 
@@ -24,6 +26,11 @@ import org.apache.tika.parser.ParserDecorator;
  * Helper util methods for Parsers themselves.
  */
 public class ParserUtils {
+    public final static Property EMBEDDED_PARSER =
+            Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_parser");
+    public final static Property EMBEDDED_EXCEPTION =
+            Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
+    
     /**
      * Does a deep clone of a Metadata object.
      */
@@ -56,11 +63,24 @@ public class ParserUtils {
     }
 
     /**
-     * Records details of the {@link Parser} used to the Metadata,
+     * Records details of the {@link Parser} used to the {@link Metadata},
      *  typically wanted where multiple parsers could be picked between
      *  or used.
      */
     public static void recordParserDetails(Parser parser, Metadata metadata) {
         metadata.add("X-Parsed-By", getParserClassname(parser));
     }
+
+    /**
+     * Records details of a {@link Parser}'s failure to the
+     *  {@link Metadata}, so you can check what went wrong even if the
+     *  {@link Exception} wasn't immediately thrown (eg when several different
+     *  Parsers are used)
+     */
+    public static void recordParserFailure(Parser parser, Exception failure, 
+                                           Metadata metadata) {
+        String trace = ExceptionUtils.getStackTrace(failure);
+        metadata.add(EMBEDDED_EXCEPTION, trace);
+        metadata.add(EMBEDDED_PARSER, getParserClassname(parser));
+    }
 }

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 07/13: Pull common "Real Parser" identification logic out to utils

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d229ab6f666cde8b007f568b13001a2c780ff477
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 15:10:16 2018 +0000

    Pull common "Real Parser" identification logic out to utils
---
 .../java/org/apache/tika/parser/CompositeParser.java    |  7 ++-----
 .../tika/parser/multiple/AbstractMultipleParser.java    | 17 ++++-------------
 .../main/java/org/apache/tika/utils/ParserUtils.java    | 14 ++++++++++++++
 3 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
index ea3968e..0098468 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
@@ -23,6 +23,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MediaTypeRegistry;
 import org.apache.tika.sax.TaggedContentHandler;
+import org.apache.tika.utils.ParserUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -271,11 +272,7 @@ public class CompositeParser extends AbstractParser {
             TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
             TaggedContentHandler taggedHandler = 
                 handler != null ? new TaggedContentHandler(handler) : null;
-            if (parser instanceof ParserDecorator){
-                metadata.add("X-Parsed-By", ((ParserDecorator) parser).getWrappedParser().getClass().getName());
-            } else {
-                metadata.add("X-Parsed-By", parser.getClass().getName());
-            }
+            metadata.add("X-Parsed-By", ParserUtils.getParserClassname(parser));
             try {
                 parser.parse(taggedStream, taggedHandler, metadata, context);
             } catch (RuntimeException e) {
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 02d7e51..d66c541 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -34,7 +34,7 @@ import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
-import org.apache.tika.utils.ParserUtils;
+import static org.apache.tika.utils.ParserUtils.*;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -180,7 +180,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
         // Track the metadata between parsers, so we can apply our policy
-        Metadata originalMetadata = ParserUtils.cloneMetadata(metadata);
+        Metadata originalMetadata = cloneMetadata(metadata);
         Metadata lastMetadata = originalMetadata;
         
         // Start tracking resources, so we can clean up when done
@@ -206,7 +206,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 TikaInputStream parserStream = TikaInputStream.get(path);
                 
                 // Record this parser
-                metadata.add("X-Parsed-By", getParserName(p));
+                metadata.add("X-Parsed-By", getParserClassname(p));
                 
                 // TODO Handle metadata clashes based on the Policy
                 
@@ -234,20 +234,11 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 }
                 
                 // TODO Handle metadata clashes based on the Policy
-                lastMetadata = ParserUtils.cloneMetadata(metadata);
+                lastMetadata = cloneMetadata(metadata);
             }
         } finally {
             tmp.dispose();
         }
     }
-    
-    private String getParserName(Parser parser) {
-        // TODO Share this logic with CompositeParser
-        if (parser instanceof ParserDecorator){
-            return ((ParserDecorator) parser).getWrappedParser().getClass().getName();
-        } else {
-            return parser.getClass().getName();
-        }
-    }
 }
 
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index 289cbc2..bdbb04c 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -17,6 +17,8 @@
 package org.apache.tika.utils;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
 
 /**
  * Helper util methods for Parsers themselves.
@@ -40,4 +42,16 @@ public class ParserUtils {
         }
         return clone;
     }
+
+    /**
+     * Identifies the real class name of the {@link Parser}, unwrapping
+     *  any {@link ParserDecorator} decorations on top of it.
+     */
+    public static String getParserClassname(Parser parser) {
+        if (parser instanceof ParserDecorator){
+            return ((ParserDecorator) parser).getWrappedParser().getClass().getName();
+        } else {
+            return parser.getClass().getName();
+        }
+    }
 }

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 12/13: Implement some metadata policies for merging values from multiple parsers

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ee60f5e8ac4002cb6a296adc24cbcb7183cb1f8e
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 17:43:30 2018 +0000

    Implement some metadata policies for merging values from multiple parsers
---
 .../parser/multiple/AbstractMultipleParser.java    | 48 ++++++++++++++++++----
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 6262dc1..9781f49 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -16,9 +16,12 @@
  */
 package org.apache.tika.parser.multiple;
 
+import static org.apache.tika.utils.ParserUtils.cloneMetadata;
+import static org.apache.tika.utils.ParserUtils.recordParserDetails;
+import static org.apache.tika.utils.ParserUtils.recordParserFailure;
+
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.file.Path;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
@@ -34,7 +37,6 @@ import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
-import static org.apache.tika.utils.ParserUtils.*;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -187,7 +189,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
             //  later if required for parser 2+
             // TODO Should we use RereadableInputStream instead?
             TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
-            Path path = taggedStream.getPath();
+            taggedStream.getPath();
             
             // TODO Somehow shield/wrap the Handler, so that we can
             //  avoid failures if multiple parsers want to do content
@@ -202,8 +204,9 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 
                 // Record that we used this parser
                 recordParserDetails(p, metadata);
-                
-                // TODO Handle metadata clashes based on the Policy
+
+                // Prepare a near-empty Metadata, will merge after
+                metadata = cloneMetadata(originalMetadata);
                 
                 // Process if possible
                 Exception failure = null;
@@ -229,14 +232,45 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                    break;
                 }
                 
-                // TODO Handle metadata clashes based on the Policy
-                lastMetadata = cloneMetadata(metadata);
+                // Handle metadata merging / clashes
+                metadata = mergeMetadata(metadata, lastMetadata, policy);
                 
                 // Prepare for the next parser, if present
+                lastMetadata = cloneMetadata(metadata);
                 taggedStream.reset();
             }
         } finally {
             tmp.dispose();
         }
     }
+    
+    // TODO Provide a method that takes an InputStreamSource as well,
+    //  and a ContentHandlerFactory. Will need wrappers to convert standard
+    
+    /**
+     * Merges the metadata built up by the earlier parser(s) into the
+     *  metadata produced by the most recent parser, as directed by the
+     *  given policy.
+     * <p>
+     * Multi-valued entries are always carried across in full, rather
+     *  than just their first value.
+     *
+     * @param newMetadata the metadata from the most recent parser; for
+     *         every policy other than DISCARD_ALL it is updated in place
+     * @param lastMetadata the metadata from the earlier parser(s), which
+     *         is only read and never modified
+     * @param policy how to resolve names that are present in both
+     * @return <code>newMetadata</code>, holding the merged values
+     */
+    protected static Metadata mergeMetadata(Metadata newMetadata, Metadata lastMetadata, MetadataPolicy policy) {
+        if (policy == MetadataPolicy.DISCARD_ALL) {
+            // Earlier results are thrown away, so there is nothing to merge
+            return newMetadata;
+        }
+
+        for (String n : lastMetadata.names()) {
+            String[] oldVals = lastMetadata.getValues(n);
+
+            if (newMetadata.get(n) == null) {
+                // Only the earlier run had this, carry over every value
+                for (String val : oldVals) {
+                    newMetadata.add(n, val);
+                }
+                continue;
+            }
+
+            switch (policy) {
+            case FIRST_WINS:
+                // Use the earlier value(s) in place of the newer one(s)
+                newMetadata.remove(n);
+                for (String val : oldVals) {
+                    newMetadata.add(n, val);
+                }
+                break;
+            case LAST_WINS:
+                // Most recent (last) parser has already won
+                break;
+            case KEEP_ALL: {
+                // Earlier values first, followed by any values which
+                //  have not been seen so far
+                String[] newVals = newMetadata.getValues(n);
+                newMetadata.remove(n);
+                for (String val : oldVals) {
+                    newMetadata.add(n, val);
+                }
+                for (String val : newVals) {
+                    if (!Arrays.asList(newMetadata.getValues(n)).contains(val)) {
+                        newMetadata.add(n, val);
+                    }
+                }
+                break;
+            }
+            default:
+                // DISCARD_ALL was handled above, leave the new values alone
+                break;
+            }
+        }
+        return newMetadata;
+    }
 }

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 04/13: Pull out deep Metadata clone to a utils method for re-use

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d5a06ba6d17b0846cfc58b2e3c0a3df6abc31b0c
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 15:02:31 2018 +0000

    Pull out deep Metadata clone to a utils method for re-use
---
 .../apache/tika/parser/RecursiveParserWrapper.java | 24 ++----------
 .../java/org/apache/tika/utils/ParserUtils.java    | 43 ++++++++++++++++++++++
 2 files changed, 46 insertions(+), 21 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 3cba1f1..1e8e5b1 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -32,6 +32,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.sax.ContentHandlerFactory;
 import org.apache.tika.utils.ExceptionUtils;
+import org.apache.tika.utils.ParserUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
@@ -169,7 +170,7 @@ public class RecursiveParserWrapper implements Parser {
             if (hitMaxEmbeddedResources) {
                 metadata.set(EMBEDDED_RESOURCE_LIMIT_REACHED, "true");
             }
-            metadatas.add(0, deepCopy(metadata));
+            metadatas.add(0, ParserUtils.cloneMetadata(metadata));
         }
     }
 
@@ -226,23 +227,6 @@ public class RecursiveParserWrapper implements Parser {
         }
     }
     
-    //defensive copy
-    private Metadata deepCopy(Metadata m) {
-        Metadata clone = new Metadata();
-        
-        for (String n : m.names()){
-            if (! m.isMultiValued(n)) {
-                clone.set(n, m.get(n));
-            } else {
-                String[] vals = m.getValues(n);
-                for (int i = 0; i < vals.length; i++) {
-                    clone.add(n, vals[i]);
-                }
-            }
-        }
-        return clone;
-    }
-    
     private String getResourceName(Metadata metadata) {
         String objectName = "";
         if (metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY) != null) {
@@ -348,9 +332,7 @@ public class RecursiveParserWrapper implements Parser {
                 return;
             }
             addContent(localHandler, metadata);
-            metadatas.add(deepCopy(metadata));
+            metadatas.add(ParserUtils.cloneMetadata(metadata));
         }        
     }
-
-
 }
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
new file mode 100644
index 0000000..289cbc2
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Static helper methods intended for use by Parser implementations.
+ */
+public class ParserUtils {
+    /**
+     * Builds a new Metadata object holding a copy of every name in the
+     *  given one, along with all of the values held against each name.
+     *
+     * @param m the Metadata to copy from, which is not modified
+     * @return a separate Metadata object with the same names and values
+     */
+    public static Metadata cloneMetadata(Metadata m) {
+        Metadata clone = new Metadata();
+        
+        for (String name : m.names()) {
+            if (m.isMultiValued(name)) {
+                // Copy each value across in turn, in the original order
+                for (String value : m.getValues(name)) {
+                    clone.add(name, value);
+                }
+                continue;
+            }
+            clone.set(name, m.get(name));
+        }
+        return clone;
+    }
+}

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 03/13: Ignore vim temp files

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 3555745fcbb6a8601dcd1af27a6a9ab07fa40250
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 14:54:02 2018 +0000

    Ignore vim temp files
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index d8e7384..7c3e3e6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@ nb-configuration.xml
 *.DS_Store
 *.tmp-inception
 *.snap
+.*.swp
 tika-deployment/tika-snap-app/parts/
 tika-deployment/tika-snap-app/prime/
 tika-deployment/tika-snap-app/snap/

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 11/13: Bring over stream reset logic from ParserDecorator and update comments

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 82f6f5f6068d72b2afcb6c47840b9124554afdbf
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 17:12:34 2018 +0000

    Bring over stream reset logic from ParserDecorator and update comments
---
 .../parser/multiple/AbstractMultipleParser.java    | 28 ++++++++++------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 4d3ff0c..6262dc1 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -167,13 +167,9 @@ public abstract class AbstractMultipleParser extends AbstractParser {
             ContentHandler handler, Exception exception);
     
     /**
-     * Delegates the call to one or more Parsers, 
-     * Delegates the call to the matching component parser.
-     * <p>
-     * Potential {@link RuntimeException}s, {@link IOException}s and
-     * {@link SAXException}s unrelated to the given input stream and content
-     * handler are automatically wrapped into {@link TikaException}s to better
-     * honor the {@link Parser} contract.
+     * Processes the given Stream through one or more parsers, 
+     *  resetting things between parsers as requested by policy.
+     * The actual processing is delegated to one or more {@link Parser}s
      */
     public void parse(
             InputStream stream, ContentHandler handler,
@@ -187,11 +183,8 @@ public abstract class AbstractMultipleParser extends AbstractParser {
         TemporaryResources tmp = new TemporaryResources();
         try {
             // Force the stream to be a Tika one
-            // Force the stream to be file-backed, so we can
-            //  re-wind it safely if required
-            // TODO Support an InputStreamFactory as an alternative to
-            //  Files, see TIKA-2585
-            // TODO Rewind support copy from ParserDecorator.withFallbacks
+            // Force the stream to be file-backed, so we can re-read safely
+            //  later if required for parser 2+
             // TODO Should we use RereadableInputStream instead?
             TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
             Path path = taggedStream.getPath();
@@ -202,8 +195,10 @@ public abstract class AbstractMultipleParser extends AbstractParser {
             // TODO Provide a way to supply a ContentHandlerFactory?
 
             for (Parser p : parsers) {
-                // TODO What's the best way to reset each time?
-                TikaInputStream parserStream = TikaInputStream.get(path);
+                // Indicate we may need to re-read the stream later
+                // TODO Support an InputStreamFactory as an alternative to
+                //  Files, see TIKA-2585
+                taggedStream.mark(-1);
                 
                 // Record that we used this parser
                 recordParserDetails(p, metadata);
@@ -213,7 +208,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 // Process if possible
                 Exception failure = null;
                 try {
-                    p.parse(parserStream, handler, metadata, context);
+                    p.parse(taggedStream, handler, metadata, context);
                 } catch (Exception e) {
                     recordParserFailure(p, e, metadata);
                     failure = e;
@@ -236,6 +231,9 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 
                 // TODO Handle metadata clashes based on the Policy
                 lastMetadata = cloneMetadata(metadata);
+                
+                // Prepare for the next parser, if present
+                taggedStream.reset();
             }
         } finally {
             tmp.dispose();

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 02/13: Add TODOs for code to be shared/copied with other areas

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 62cf6f6cb3539ffbdb2886ff5485a997b0fe6773
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 07:17:41 2018 +0000

    Add TODOs for code to be shared/copied with other areas
---
 .../apache/tika/parser/multiple/AbstractMultipleParser.java   | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 08a90fd..c47e762 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -83,6 +83,8 @@ public abstract class AbstractMultipleParser extends AbstractParser {
     
     // TODO Figure out some sort of Content Policy and how
     //  it might possibly work
+    // TODO Is an overridden method that takes a 
+    //  ContentHandlerFactory the best way?
 
     /**
      * Media type registry.
@@ -184,12 +186,14 @@ public abstract class AbstractMultipleParser extends AbstractParser {
             //  re-wind it safely if required
             // TODO Support an InputStreamFactory as an alternative to
             //  Files, see TIKA-2585
+            // TODO Rewind support copy from ParserDecorator.withFallbacks
             TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
             Path path = taggedStream.getPath();
             
             // TODO Somehow shield/wrap the Handler, so that we can
             //  avoid failures if multiple parsers want to do content
             // TODO Solve the multiple-content problem!
+            // TODO Provide a way to supply a ContentHandlerFactory?
 
             for (Parser p : parsers) {
                 // TODO What's the best way to reset each time?
@@ -201,6 +205,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 // TODO Handle metadata clashes based on the Policy
                 
                 // Process if possible
+                // TODO Share error recording logic with RecursiveParserWrapper
                 Exception failure = null;
                 try {
                     p.parse(parserStream, handler, metadata, context);
@@ -210,7 +215,11 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 
                 // Notify the implementation how it went
                 boolean tryNext = parserCompleted(p, metadata, handler, failure);
-                if (!tryNext) break;
+                // Abort if requested, with the exception if there was one
+                if (!tryNext) {
+                   if (failure != null) throw failure;
+                   break;
+                }
                 
                 // TODO Handle metadata clashes based on the Policy
             }

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 06/13: Fix exception handling

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit c3897db807970e7eb39c87840e4e040713eb759c
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 15:06:42 2018 +0000

    Fix exception handling
---
 .../org/apache/tika/parser/multiple/AbstractMultipleParser.java   | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 46cd064..02d7e51 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -223,7 +223,13 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 boolean tryNext = parserCompleted(p, metadata, handler, failure);
                 // Abort if requested, with the exception if there was one
                 if (!tryNext) {
-                   if (failure != null) throw failure;
+                   if (failure != null) {
+                       if (failure instanceof IOException) throw (IOException)failure;
+                       if (failure instanceof SAXException) throw (SAXException)failure;
+                       if (failure instanceof TikaException) throw (TikaException)failure;
+                       throw new TikaException("Unexpected RuntimeException from " + p, failure);
+                   }
+                   // Abort processing, don't try any more parsers
                    break;
                 }
                 

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 01/13: Name sample config files based on issue number

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 217a9cef62eae3bfdc23882f4483a00baea259fb
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 07:15:11 2018 +0000

    Name sample config files based on issue number
---
 .../config/{multiple-fallback.xml => TIKA-1509-multiple-fallback.xml}     | 0
 .../{multiple-supplemental.xml => TIKA-1509-multiple-supplemental.xml}    | 0
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/tika-core/src/test/resources/org/apache/tika/config/multiple-fallback.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1509-multiple-fallback.xml
similarity index 100%
rename from tika-core/src/test/resources/org/apache/tika/config/multiple-fallback.xml
rename to tika-core/src/test/resources/org/apache/tika/config/TIKA-1509-multiple-fallback.xml
diff --git a/tika-core/src/test/resources/org/apache/tika/config/multiple-supplemental.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1509-multiple-supplemental.xml
similarity index 100%
rename from tika-core/src/test/resources/org/apache/tika/config/multiple-supplemental.xml
rename to tika-core/src/test/resources/org/apache/tika/config/TIKA-1509-multiple-supplemental.xml

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 10/13: TODO updates, enforce allowed policies

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 9be93c6bef2eabfb5ea93f60549762a2510b2dce
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 17:03:50 2018 +0000

    TODO updates, enforce allowed policies
---
 .../org/apache/tika/parser/multiple/AbstractMultipleParser.java     | 3 +--
 .../java/org/apache/tika/parser/multiple/SupplementingParser.java   | 6 +++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index d857b35..4d3ff0c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -211,7 +211,6 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 // TODO Handle metadata clashes based on the Policy
                 
                 // Process if possible
-                // TODO Share error recording logic with RecursiveParserWrapper
                 Exception failure = null;
                 try {
                     p.parse(parserStream, handler, metadata, context);
@@ -222,6 +221,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 
                 // Notify the implementation how it went
                 boolean tryNext = parserCompleted(p, metadata, handler, failure);
+                
                 // Abort if requested, with the exception if there was one
                 if (!tryNext) {
                    if (failure != null) {
@@ -242,4 +242,3 @@ public abstract class AbstractMultipleParser extends AbstractParser {
         }
     }
 }
-
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
index c1dec34..7eab004 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
@@ -63,7 +63,11 @@ public class SupplementingParser extends AbstractMultipleParser {
     public SupplementingParser(MediaTypeRegistry registry, MetadataPolicy policy,
                                List<Parser> parsers) {
         super(registry, policy, parsers);
-        // TODO Check the policy is one we support
+        
+        // Ensure it's a supported policy
+        if (!allowedPolicies.contains(policy)) {
+            throw new IllegalArgumentException("Unsupported policy for SupplementingParser: " + policy);
+        }
     }
 
     @Override

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.