You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2018/03/13 18:15:36 UTC
[tika] 13/13: More metadata handling between parsers,
start on unit testing
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 348bfb9be46036833bbfda38c1912c9bf9eeb06e
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 18:15:14 2018 +0000
More metadata handling between parsers, start on unit testing
---
.../parser/multiple/AbstractMultipleParser.java | 19 ++--
.../tika/parser/multiple/MultipleParserTest.java | 111 +++++++++++++++++++++
2 files changed, 123 insertions(+), 7 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 9781f49..0aded0c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -175,11 +175,11 @@ public abstract class AbstractMultipleParser extends AbstractParser {
*/
public void parse(
InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
+ Metadata originalMetadata, ParseContext context)
throws IOException, SAXException, TikaException {
// Track the metadata between parsers, so we can apply our policy
- Metadata originalMetadata = cloneMetadata(metadata);
- Metadata lastMetadata = originalMetadata;
+ Metadata lastMetadata = cloneMetadata(originalMetadata);
+ Metadata metadata = lastMetadata;
// Start tracking resources, so we can clean up when done
TemporaryResources tmp = new TemporaryResources();
@@ -203,7 +203,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
taggedStream.mark(-1);
// Record that we used this parser
- recordParserDetails(p, metadata);
+ recordParserDetails(p, originalMetadata);
// Prepare an near-empty Metadata, will merge after
metadata = cloneMetadata(originalMetadata);
@@ -220,6 +220,9 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// Notify the implementation how it went
boolean tryNext = parserCompleted(p, metadata, handler, failure);
+ // Handle metadata merging / clashes
+ metadata = mergeMetadata(metadata, lastMetadata, policy);
+
// Abort if requested, with the exception if there was one
if (!tryNext) {
if (failure != null) {
@@ -232,9 +235,6 @@ public abstract class AbstractMultipleParser extends AbstractParser {
break;
}
- // Handle metadata merging / clashes
- metadata = mergeMetadata(metadata, lastMetadata, policy);
-
// Prepare for the next parser, if present
lastMetadata = cloneMetadata(metadata);
taggedStream.reset();
@@ -242,6 +242,11 @@ public abstract class AbstractMultipleParser extends AbstractParser {
} finally {
tmp.dispose();
}
+
+ // Finally, copy the latest metadata back onto their supplied object
+ for (String n : metadata.names()) {
+ originalMetadata.set(n, metadata.get(n));
+ }
}
// TODO Provide a method that takes an InputStreamSource as well,
diff --git a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
new file mode 100644
index 0000000..b3166eb
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.multiple;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.parser.DummyParser;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ErrorParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+
+public class MultipleParserTest {
+    /**
+     * Tests how {@link AbstractMultipleParser} works out which
+     *  mime types to offer, based on the types of the parsers
+     */
+    @Test
+    public void testMimeTypeSupported() {
+        // TODO
+    }
+
+    /**
+     * Tests that {@link FallbackParser} moves on to the next parser when one fails
+     */
+    @Test
+    public void testFallback() throws Exception {
+        ParseContext context = new ParseContext();
+        BodyContentHandler handler;
+        Metadata metadata;
+        Parser p;
+        String[] usedParsers;
+
+        // Some media types
+        Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);
+        Set<MediaType> octAndText = new HashSet<MediaType>(Arrays.asList(
+                MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
+
+        // Some parsers
+        ErrorParser pFail = new ErrorParser();
+        DummyParser pContent = new DummyParser(onlyOct, new HashMap<String,String>(),
+                                               "Fell back!");
+        EmptyParser pNothing = new EmptyParser();
+
+
+        // With only one parser defined, works as normal
+        p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pContent);
+
+        metadata = new Metadata();
+        handler = new BodyContentHandler();
+        p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+        assertEquals("Fell back!", handler.toString());
+
+        usedParsers = metadata.getValues("X-Parsed-By");
+        assertEquals(1, usedParsers.length);
+        assertEquals(DummyParser.class.getName(), usedParsers[0]);
+
+
+        // With a failing parser, will go to the working one; both get recorded, in the order tried
+        p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pFail, pContent);
+
+        metadata = new Metadata();
+        handler = new BodyContentHandler();
+        p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+        assertEquals("Fell back!", handler.toString());
+
+        usedParsers = metadata.getValues("X-Parsed-By");
+        assertEquals(2, usedParsers.length);
+        assertEquals(ErrorParser.class.getName(), usedParsers[0]);
+        assertEquals(DummyParser.class.getName(), usedParsers[1]);
+
+        // TODO Check we got an exception
+
+        // Won't go past the working one
+        // TODO
+    }
+
+    /**
+     * Test for {@link SupplementingParser}
+     */
+    @Test
+    public void testSupplemental() throws Exception {
+        // TODO
+    }
+}
--
To stop receiving notification emails like this one, please contact
nick@apache.org.