You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2018/03/13 18:15:36 UTC

[tika] 13/13: More metadata handling between parsers, start on unit testing

This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 348bfb9be46036833bbfda38c1912c9bf9eeb06e
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 18:15:14 2018 +0000

    More metadata handling between parsers, start on unit testing
---
 .../parser/multiple/AbstractMultipleParser.java    |  19 ++--
 .../tika/parser/multiple/MultipleParserTest.java   | 111 +++++++++++++++++++++
 2 files changed, 123 insertions(+), 7 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 9781f49..0aded0c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -175,11 +175,11 @@ public abstract class AbstractMultipleParser extends AbstractParser {
      */
     public void parse(
             InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
+            Metadata originalMetadata, ParseContext context)
             throws IOException, SAXException, TikaException {
         // Track the metadata between parsers, so we can apply our policy
-        Metadata originalMetadata = cloneMetadata(metadata);
-        Metadata lastMetadata = originalMetadata;
+        Metadata lastMetadata = cloneMetadata(originalMetadata);
+        Metadata metadata = lastMetadata;
         
         // Start tracking resources, so we can clean up when done
         TemporaryResources tmp = new TemporaryResources();
@@ -203,7 +203,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 taggedStream.mark(-1);
                 
                 // Record that we used this parser
-                recordParserDetails(p, metadata);
+                recordParserDetails(p, originalMetadata);
 
                 // Prepare an near-empty Metadata, will merge after
                 metadata = cloneMetadata(originalMetadata);
@@ -220,6 +220,9 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 // Notify the implementation how it went
                 boolean tryNext = parserCompleted(p, metadata, handler, failure);
                 
+                // Handle metadata merging / clashes
+                metadata = mergeMetadata(metadata, lastMetadata, policy);
+                
                 // Abort if requested, with the exception if there was one
                 if (!tryNext) {
                    if (failure != null) {
@@ -232,9 +235,6 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                    break;
                 }
                 
-                // Handle metadata merging / clashes
-                metadata = mergeMetadata(metadata, lastMetadata, policy);
-                
                 // Prepare for the next parser, if present
                 lastMetadata = cloneMetadata(metadata);
                 taggedStream.reset();
@@ -242,6 +242,11 @@ public abstract class AbstractMultipleParser extends AbstractParser {
         } finally {
             tmp.dispose();
         }
+        
+        // Finally, copy the latest metadata back onto their supplied object
+        for (String n : metadata.names()) {
+            originalMetadata.set(n, metadata.get(n));
+        }
     }
     
     // TODO Provide a method that takes an InputStreamSource as well,
diff --git a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
new file mode 100644
index 0000000..b3166eb
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.multiple;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.parser.DummyParser;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ErrorParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+
+public class MultipleParserTest {
+    /**
+     * Tests how {@link AbstractMultipleParser} works out which
+     *  mime types to offer, based on the types of the parsers
+     */
+    @Test
+    public void testMimeTypeSupported() {
+        // TODO Not implemented yet, so this test currently asserts nothing
+    }
+    
+    /**
+     * Test {@link FallbackParser}: one working parser alone, then a failing parser followed by a working one
+     */
+    @Test
+    public void testFallback() throws Exception {
+        ParseContext context = new ParseContext();
+        BodyContentHandler handler;
+        Metadata metadata;
+        Parser p;
+        String[] usedParsers;
+        
+        // Some media types (octAndText is not referenced yet by this test)
+        Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);
+        Set<MediaType> octAndText = new HashSet<MediaType>(Arrays.asList(
+                MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
+        
+        // Some parsers (pNothing is not referenced yet by this test)
+        ErrorParser pFail = new ErrorParser();
+        DummyParser pContent = new DummyParser(onlyOct, new HashMap<String,String>(),
+                                               "Fell back!");
+        EmptyParser pNothing = new EmptyParser();
+        
+        
+        // With only one parser defined, works as normal
+        p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pContent);
+
+        metadata = new Metadata();
+        handler = new BodyContentHandler();
+        p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+        assertEquals("Fell back!", handler.toString());
+       
+        usedParsers = metadata.getValues("X-Parsed-By");
+        assertEquals(1, usedParsers.length);
+        assertEquals(DummyParser.class.getName(), usedParsers[0]);
+        
+        
+        // With a failing parser, will go to the working one
+        p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pFail, pContent);
+
+        metadata = new Metadata();
+        handler = new BodyContentHandler();
+        p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+        assertEquals("Fell back!", handler.toString());
+       
+        usedParsers = metadata.getValues("X-Parsed-By");
+        assertEquals(2, usedParsers.length);
+        assertEquals(DummyParser.class.getName(), usedParsers[0]);
+        // NOTE(review): pFail is passed before pContent, so [0] may really be ErrorParser - confirm
+        // TODO Check we got an exception
+        
+        
+        // Won't go past the working one
+        // TODO
+    }
+    
+    /**
+     * Test for {@link SupplementingParser}
+     */
+    @Test
+    public void testSupplemental() throws Exception {
+        // TODO 
+    }
+}

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.