You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2018/03/13 18:15:23 UTC
[tika] branch multiple-parsers updated (bc8a75e -> 348bfb9)
This is an automated email from the ASF dual-hosted git repository.
nick pushed a change to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git.
from bc8a75e Sample fallback and supplemental config files based on https://wiki.apache.org/tika/CompositeParserDiscussion
new 217a9ce Name sample config files based on issue number
new 62cf6f6 Add TODOs for code to be shared/copied with other areas
new 3555745 Ignore vim temp files
new d5a06ba Pull out deep Metadata clone to a utils method for re-use
new 427417c Prepare to track metadata between parsers
new c3897db Fix exception handling
new d229ab6 Pull common "Real Parser" identification logic out to utils
new f4a926c Use utils for recording details of the parser used
new 97b97b3 Move logic for recording embedded parser failures in the metadata to utils, and use for multiple parsers
new 9be93c6 TODO updates, enforce allowed policies
new 82f6f5f Bring over stream reset logic from ParserDecorator and update comments
new ee60f5e Implement some metadata policies for merging values from multiple parsers
new 348bfb9 More metadata handling between parsers, start on unit testing
The 13 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
.gitignore | 1 +
.../org/apache/tika/parser/CompositeParser.java | 7 +-
.../apache/tika/parser/RecursiveParserWrapper.java | 34 ++-----
.../parser/multiple/AbstractMultipleParser.java | 110 ++++++++++++++------
.../tika/parser/multiple/FallbackParser.java | 3 -
.../tika/parser/multiple/SupplementingParser.java | 9 +-
.../java/org/apache/tika/utils/ParserUtils.java | 86 ++++++++++++++++
.../tika/parser/multiple/MultipleParserTest.java | 111 +++++++++++++++++++++
...allback.xml => TIKA-1509-multiple-fallback.xml} | 0
...tal.xml => TIKA-1509-multiple-supplemental.xml} | 0
10 files changed, 291 insertions(+), 70 deletions(-)
create mode 100644 tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
create mode 100644 tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
rename tika-core/src/test/resources/org/apache/tika/config/{multiple-fallback.xml => TIKA-1509-multiple-fallback.xml} (100%)
rename tika-core/src/test/resources/org/apache/tika/config/{multiple-supplemental.xml => TIKA-1509-multiple-supplemental.xml} (100%)
--
To stop receiving notification emails like this one, please contact
nick@apache.org.
[tika] 13/13: More metadata handling between parsers,
start on unit testing
Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 348bfb9be46036833bbfda38c1912c9bf9eeb06e
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 18:15:14 2018 +0000
More metadata handling between parsers, start on unit testing
---
.../parser/multiple/AbstractMultipleParser.java | 19 ++--
.../tika/parser/multiple/MultipleParserTest.java | 111 +++++++++++++++++++++
2 files changed, 123 insertions(+), 7 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 9781f49..0aded0c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -175,11 +175,11 @@ public abstract class AbstractMultipleParser extends AbstractParser {
*/
public void parse(
InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
+ Metadata originalMetadata, ParseContext context)
throws IOException, SAXException, TikaException {
// Track the metadata between parsers, so we can apply our policy
- Metadata originalMetadata = cloneMetadata(metadata);
- Metadata lastMetadata = originalMetadata;
+ Metadata lastMetadata = cloneMetadata(originalMetadata);
+ Metadata metadata = lastMetadata;
// Start tracking resources, so we can clean up when done
TemporaryResources tmp = new TemporaryResources();
@@ -203,7 +203,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
taggedStream.mark(-1);
// Record that we used this parser
- recordParserDetails(p, metadata);
+ recordParserDetails(p, originalMetadata);
// Prepare a near-empty Metadata, will merge after
metadata = cloneMetadata(originalMetadata);
@@ -220,6 +220,9 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// Notify the implementation how it went
boolean tryNext = parserCompleted(p, metadata, handler, failure);
+ // Handle metadata merging / clashes
+ metadata = mergeMetadata(metadata, lastMetadata, policy);
+
// Abort if requested, with the exception if there was one
if (!tryNext) {
if (failure != null) {
@@ -232,9 +235,6 @@ public abstract class AbstractMultipleParser extends AbstractParser {
break;
}
- // Handle metadata merging / clashes
- metadata = mergeMetadata(metadata, lastMetadata, policy);
-
// Prepare for the next parser, if present
lastMetadata = cloneMetadata(metadata);
taggedStream.reset();
@@ -242,6 +242,11 @@ public abstract class AbstractMultipleParser extends AbstractParser {
} finally {
tmp.dispose();
}
+
+ // Finally, copy the latest metadata back onto their supplied object
+ for (String n : metadata.names()) {
+ originalMetadata.set(n, metadata.get(n));
+ }
}
// TODO Provide a method that takes an InputStreamSource as well,
diff --git a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
new file mode 100644
index 0000000..b3166eb
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.multiple;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.parser.DummyParser;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ErrorParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+
+public class MultipleParserTest {
+ /**
+ * Tests how {@link AbstractMultipleParser} works out which
+ * mime types to offer, based on the types of the parsers
+ */
+ @Test
+ public void testMimeTypeSupported() {
+ // TODO
+ }
+
+ /**
+ * Test {@link FallbackParser}
+ */
+ @Test
+ public void testFallback() throws Exception {
+ ParseContext context = new ParseContext();
+ BodyContentHandler handler;
+ Metadata metadata;
+ Parser p;
+ String[] usedParsers;
+
+ // Some media types
+ Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);
+ Set<MediaType> octAndText = new HashSet<MediaType>(Arrays.asList(
+ MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
+
+ // Some parsers
+ ErrorParser pFail = new ErrorParser();
+ DummyParser pContent = new DummyParser(onlyOct, new HashMap<String,String>(),
+ "Fell back!");
+ EmptyParser pNothing = new EmptyParser();
+
+
+ // With only one parser defined, works as normal
+ p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pContent);
+
+ metadata = new Metadata();
+ handler = new BodyContentHandler();
+ p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+ assertEquals("Fell back!", handler.toString());
+
+ usedParsers = metadata.getValues("X-Parsed-By");
+ assertEquals(1, usedParsers.length);
+ assertEquals(DummyParser.class.getName(), usedParsers[0]);
+
+
+ // With a failing parser, will go to the working one
+ p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pFail, pContent);
+
+ metadata = new Metadata();
+ handler = new BodyContentHandler();
+ p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+ assertEquals("Fell back!", handler.toString());
+
+ usedParsers = metadata.getValues("X-Parsed-By");
+ assertEquals(2, usedParsers.length);
+ assertEquals(DummyParser.class.getName(), usedParsers[0]);
+
+ // TODO Check we got an exception
+
+
+ // Won't go past the working one
+ // TODO
+ }
+
+ /**
+ * Test for {@link SupplementingParser}
+ */
+ @Test
+ public void testSupplemental() throws Exception {
+ // TODO
+ }
+}
--
To stop receiving notification emails like this one, please contact
nick@apache.org.
[tika] 08/13: Use utils for recording details of the parser used
Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit f4a926ca94c50a6158891c7746e725cd720a2faa
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 15:13:19 2018 +0000
Use utils for recording details of the parser used
---
.../src/main/java/org/apache/tika/parser/CompositeParser.java | 2 +-
.../org/apache/tika/parser/multiple/AbstractMultipleParser.java | 4 ++--
tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java | 9 +++++++++
3 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
index 0098468..c5c95a6 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
@@ -272,7 +272,7 @@ public class CompositeParser extends AbstractParser {
TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
TaggedContentHandler taggedHandler =
handler != null ? new TaggedContentHandler(handler) : null;
- metadata.add("X-Parsed-By", ParserUtils.getParserClassname(parser));
+ ParserUtils.recordParserDetails(parser, metadata);
try {
parser.parse(taggedStream, taggedHandler, metadata, context);
} catch (RuntimeException e) {
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index d66c541..4695e0a 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -205,8 +205,8 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// TODO What's the best way to reset each time?
TikaInputStream parserStream = TikaInputStream.get(path);
- // Record this parser
- metadata.add("X-Parsed-By", getParserClassname(p));
+ // Record that we used this parser
+ recordParserDetails(p, metadata);
// TODO Handle metadata clashes based on the Policy
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index bdbb04c..58105a6 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -54,4 +54,13 @@ public class ParserUtils {
return parser.getClass().getName();
}
}
+
+ /**
+ * Records details of the {@link Parser} used to the Metadata,
+ * typically wanted where multiple parsers could be picked between
+ * or used.
+ */
+ public static void recordParserDetails(Parser parser, Metadata metadata) {
+ metadata.add("X-Parsed-By", getParserClassname(parser));
+ }
}
--
To stop receiving notification emails like this one, please contact
nick@apache.org.
[tika] 05/13: Prepare to track metadata between parsers
Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 427417c5d17f1e03724f3e6ded64779bf7366677
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 15:04:43 2018 +0000
Prepare to track metadata between parsers
---
.../org/apache/tika/parser/multiple/AbstractMultipleParser.java | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index c47e762..46cd064 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -34,6 +34,7 @@ import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.utils.ParserUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -178,7 +179,11 @@ public abstract class AbstractMultipleParser extends AbstractParser {
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
+ // Track the metadata between parsers, so we can apply our policy
+ Metadata originalMetadata = ParserUtils.cloneMetadata(metadata);
+ Metadata lastMetadata = originalMetadata;
+ // Start tracking resources, so we can clean up when done
TemporaryResources tmp = new TemporaryResources();
try {
// Force the stream to be a Tika one
@@ -187,6 +192,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// TODO Support an InputStreamFactory as an alternative to
// Files, see TIKA-2585
// TODO Rewind support copy from ParserDecorator.withFallbacks
+ // TODO Should we use RereadableInputStream instead?
TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
Path path = taggedStream.getPath();
@@ -222,6 +228,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
}
// TODO Handle metadata clashes based on the Policy
+ lastMetadata = ParserUtils.cloneMetadata(metadata);
}
} finally {
tmp.dispose();
--
To stop receiving notification emails like this one, please contact
nick@apache.org.
[tika] 09/13: Move logic for recording embedded parser failures in
the metadata to utils, and use for multiple parsers
Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 97b97b345b49b7dd510af560598e6d1ab7baf28c
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 15:24:41 2018 +0000
Move logic for recording embedded parser failures in the metadata to utils, and use for multiple parsers
---
.../apache/tika/parser/RecursiveParserWrapper.java | 10 +++-------
.../parser/multiple/AbstractMultipleParser.java | 1 +
.../tika/parser/multiple/FallbackParser.java | 3 ---
.../tika/parser/multiple/SupplementingParser.java | 3 ---
.../java/org/apache/tika/utils/ParserUtils.java | 22 +++++++++++++++++++++-
5 files changed, 25 insertions(+), 14 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 1e8e5b1..c426a42 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -31,7 +31,6 @@ import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.ContentHandlerFactory;
-import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.ParserUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -85,8 +84,7 @@ public class RecursiveParserWrapper implements Parser {
public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED =
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached");
- public final static Property EMBEDDED_EXCEPTION =
- Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
+ public final static Property EMBEDDED_EXCEPTIONx = ParserUtils.EMBEDDED_EXCEPTION;
//move this to TikaCoreProperties?
public final static Property EMBEDDED_RESOURCE_PATH =
Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"embedded_resource_path");
@@ -304,16 +302,14 @@ public class RecursiveParserWrapper implements Parser {
metadata.add(WRITE_LIMIT_REACHED, "true");
} else {
if (catchEmbeddedExceptions) {
- String trace = ExceptionUtils.getStackTrace(e);
- metadata.set(EMBEDDED_EXCEPTION, trace);
+ ParserUtils.recordParserFailure(this, e, metadata);
} else {
throw e;
}
}
} catch (TikaException e) {
if (catchEmbeddedExceptions) {
- String trace = ExceptionUtils.getStackTrace(e);
- metadata.set(EMBEDDED_EXCEPTION, trace);
+ ParserUtils.recordParserFailure(this, e, metadata);
} else {
throw e;
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 4695e0a..d857b35 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -216,6 +216,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
try {
p.parse(parserStream, handler, metadata, context);
} catch (Exception e) {
+ recordParserFailure(p, e, metadata);
failure = e;
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
index 9b6a0bf..97a8aaf 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
@@ -61,9 +61,6 @@ public class FallbackParser extends AbstractMultipleParser {
// If there was no exception, abort further parsers
if (exception == null) return false;
- // Record the details of this exception in the metadata
- // TODO Share logic with the Recursive Parser Wrapper
-
// Have the next parser tried
return true;
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
index fd5d037..c1dec34 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
@@ -72,9 +72,6 @@ public class SupplementingParser extends AbstractMultipleParser {
// If there was no exception, just carry on to the next
if (exception == null) return true;
- // Record the details of this exception in the metadata
- // TODO Share logic with the Recursive Parser Wrapper
-
// Have the next parser tried
return true;
}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index 58105a6..c3c63ba 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -17,6 +17,8 @@
package org.apache.tika.utils;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
@@ -24,6 +26,11 @@ import org.apache.tika.parser.ParserDecorator;
* Helper util methods for Parsers themselves.
*/
public class ParserUtils {
+ public final static Property EMBEDDED_PARSER =
+ Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_parser");
+ public final static Property EMBEDDED_EXCEPTION =
+ Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
+
/**
* Does a deep clone of a Metadata object.
*/
@@ -56,11 +63,24 @@ public class ParserUtils {
}
/**
- * Records details of the {@link Parser} used to the Metadata,
+ * Records details of the {@link Parser} used to the {@link Metadata},
* typically wanted where multiple parsers could be picked between
* or used.
*/
public static void recordParserDetails(Parser parser, Metadata metadata) {
metadata.add("X-Parsed-By", getParserClassname(parser));
}
+
+ /**
+ * Records details of a {@link Parser}'s failure to the
+ * {@link Metadata}, so you can check what went wrong even if the
+ * {@link Exception} wasn't immediately thrown (eg when several different
+ * Parsers are used)
+ */
+ public static void recordParserFailure(Parser parser, Exception failure,
+ Metadata metadata) {
+ String trace = ExceptionUtils.getStackTrace(failure);
+ metadata.add(EMBEDDED_EXCEPTION, trace);
+ metadata.add(EMBEDDED_PARSER, getParserClassname(parser));
+ }
}
--
To stop receiving notification emails like this one, please contact
nick@apache.org.
[tika] 07/13: Pull common "Real Parser" identification logic out to
utils
Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit d229ab6f666cde8b007f568b13001a2c780ff477
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 15:10:16 2018 +0000
Pull common "Real Parser" identification logic out to utils
---
.../java/org/apache/tika/parser/CompositeParser.java | 7 ++-----
.../tika/parser/multiple/AbstractMultipleParser.java | 17 ++++-------------
.../main/java/org/apache/tika/utils/ParserUtils.java | 14 ++++++++++++++
3 files changed, 20 insertions(+), 18 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
index ea3968e..0098468 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
@@ -23,6 +23,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.sax.TaggedContentHandler;
+import org.apache.tika.utils.ParserUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -271,11 +272,7 @@ public class CompositeParser extends AbstractParser {
TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
TaggedContentHandler taggedHandler =
handler != null ? new TaggedContentHandler(handler) : null;
- if (parser instanceof ParserDecorator){
- metadata.add("X-Parsed-By", ((ParserDecorator) parser).getWrappedParser().getClass().getName());
- } else {
- metadata.add("X-Parsed-By", parser.getClass().getName());
- }
+ metadata.add("X-Parsed-By", ParserUtils.getParserClassname(parser));
try {
parser.parse(taggedStream, taggedHandler, metadata, context);
} catch (RuntimeException e) {
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 02d7e51..d66c541 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -34,7 +34,7 @@ import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
-import org.apache.tika.utils.ParserUtils;
+import static org.apache.tika.utils.ParserUtils.*;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -180,7 +180,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
// Track the metadata between parsers, so we can apply our policy
- Metadata originalMetadata = ParserUtils.cloneMetadata(metadata);
+ Metadata originalMetadata = cloneMetadata(metadata);
Metadata lastMetadata = originalMetadata;
// Start tracking resources, so we can clean up when done
@@ -206,7 +206,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
TikaInputStream parserStream = TikaInputStream.get(path);
// Record this parser
- metadata.add("X-Parsed-By", getParserName(p));
+ metadata.add("X-Parsed-By", getParserClassname(p));
// TODO Handle metadata clashes based on the Policy
@@ -234,20 +234,11 @@ public abstract class AbstractMultipleParser extends AbstractParser {
}
// TODO Handle metadata clashes based on the Policy
- lastMetadata = ParserUtils.cloneMetadata(metadata);
+ lastMetadata = cloneMetadata(metadata);
}
} finally {
tmp.dispose();
}
}
-
- private String getParserName(Parser parser) {
- // TODO Share this logic with CompositeParser
- if (parser instanceof ParserDecorator){
- return ((ParserDecorator) parser).getWrappedParser().getClass().getName();
- } else {
- return parser.getClass().getName();
- }
- }
}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index 289cbc2..bdbb04c 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -17,6 +17,8 @@
package org.apache.tika.utils;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
/**
* Helper util methods for Parsers themselves.
@@ -40,4 +42,16 @@ public class ParserUtils {
}
return clone;
}
+
+ /**
+ * Identifies the real class name of the {@link Parser}, unwrapping
+ * any {@link ParserDecorator} decorations on top of it.
+ */
+ public static String getParserClassname(Parser parser) {
+ if (parser instanceof ParserDecorator){
+ return ((ParserDecorator) parser).getWrappedParser().getClass().getName();
+ } else {
+ return parser.getClass().getName();
+ }
+ }
}
--
To stop receiving notification emails like this one, please contact
nick@apache.org.
[tika] 12/13: Implement some metadata policies for merging values
from multiple parsers
Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit ee60f5e8ac4002cb6a296adc24cbcb7183cb1f8e
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 17:43:30 2018 +0000
Implement some metadata policies for merging values from multiple parsers
---
.../parser/multiple/AbstractMultipleParser.java | 48 ++++++++++++++++++----
1 file changed, 41 insertions(+), 7 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 6262dc1..9781f49 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -16,9 +16,12 @@
*/
package org.apache.tika.parser.multiple;
+import static org.apache.tika.utils.ParserUtils.cloneMetadata;
+import static org.apache.tika.utils.ParserUtils.recordParserDetails;
+import static org.apache.tika.utils.ParserUtils.recordParserFailure;
+
import java.io.IOException;
import java.io.InputStream;
-import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
@@ -34,7 +37,6 @@ import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
-import static org.apache.tika.utils.ParserUtils.*;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -187,7 +189,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// later if required for parser 2+
// TODO Should we use RereadableInputStream instead?
TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
- Path path = taggedStream.getPath();
+ taggedStream.getPath();
// TODO Somehow shield/wrap the Handler, so that we can
// avoid failures if multiple parsers want to do content
@@ -202,8 +204,9 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// Record that we used this parser
recordParserDetails(p, metadata);
-
- // TODO Handle metadata clashes based on the Policy
+
+ // Prepare a near-empty Metadata, will merge after
+ metadata = cloneMetadata(originalMetadata);
// Process if possible
Exception failure = null;
@@ -229,14 +232,45 @@ public abstract class AbstractMultipleParser extends AbstractParser {
break;
}
- // TODO Handle metadata clashes based on the Policy
- lastMetadata = cloneMetadata(metadata);
+ // Handle metadata merging / clashes
+ metadata = mergeMetadata(metadata, lastMetadata, policy);
// Prepare for the next parser, if present
+ lastMetadata = cloneMetadata(metadata);
taggedStream.reset();
}
} finally {
tmp.dispose();
}
}
+
+ // TODO Provide a method that takes an InputStreamSource as well,
+ // and a ContentHandlerFactory. Will need wrappers to convert standard
+
+ protected static Metadata mergeMetadata(Metadata newMetadata, Metadata lastMetadata, MetadataPolicy policy) {
+ if (policy == MetadataPolicy.DISCARD_ALL) {
+ return newMetadata;
+ }
+
+ for (String n : lastMetadata.names()) {
+ if (newMetadata.get(n) == null) {
+ newMetadata.set(n, lastMetadata.get(n));
+ } else {
+ switch (policy) {
+ case FIRST_WINS:
+ // Use the earlier value
+ newMetadata.set(n, lastMetadata.get(n));
+ continue;
+ case LAST_WINS:
+ // Most recent (last) parser has already won
+ continue;
+ case KEEP_ALL:
+ // TODO Find unique values to add
+ // TODO Implement
+ continue;
+ }
+ }
+ }
+ return newMetadata;
+ }
}
--
To stop receiving notification emails like this one, please contact
nick@apache.org.
[tika] 04/13: Pull out deep Metadata clone to a utils method for
re-use
Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit d5a06ba6d17b0846cfc58b2e3c0a3df6abc31b0c
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 15:02:31 2018 +0000
Pull out deep Metadata clone to a utils method for re-use
---
.../apache/tika/parser/RecursiveParserWrapper.java | 24 ++----------
.../java/org/apache/tika/utils/ParserUtils.java | 43 ++++++++++++++++++++++
2 files changed, 46 insertions(+), 21 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 3cba1f1..1e8e5b1 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -32,6 +32,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.utils.ExceptionUtils;
+import org.apache.tika.utils.ParserUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
@@ -169,7 +170,7 @@ public class RecursiveParserWrapper implements Parser {
if (hitMaxEmbeddedResources) {
metadata.set(EMBEDDED_RESOURCE_LIMIT_REACHED, "true");
}
- metadatas.add(0, deepCopy(metadata));
+ metadatas.add(0, ParserUtils.cloneMetadata(metadata));
}
}
@@ -226,23 +227,6 @@ public class RecursiveParserWrapper implements Parser {
}
}
- //defensive copy
- private Metadata deepCopy(Metadata m) {
- Metadata clone = new Metadata();
-
- for (String n : m.names()){
- if (! m.isMultiValued(n)) {
- clone.set(n, m.get(n));
- } else {
- String[] vals = m.getValues(n);
- for (int i = 0; i < vals.length; i++) {
- clone.add(n, vals[i]);
- }
- }
- }
- return clone;
- }
-
private String getResourceName(Metadata metadata) {
String objectName = "";
if (metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY) != null) {
@@ -348,9 +332,7 @@ public class RecursiveParserWrapper implements Parser {
return;
}
addContent(localHandler, metadata);
- metadatas.add(deepCopy(metadata));
+ metadatas.add(ParserUtils.cloneMetadata(metadata));
}
}
-
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
new file mode 100644
index 0000000..289cbc2
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Utility methods shared by Parser implementations.
+ */
+public class ParserUtils {
+    /**
+     * Builds a new Metadata holding every name and value found in m.
+     */
+    public static Metadata cloneMetadata(Metadata m) {
+        Metadata copy = new Metadata();
+        for (String name : m.names()) {
+            if (m.isMultiValued(name)) {
+                // Several values: re-add each one in order
+                for (String value : m.getValues(name)) {
+                    copy.add(name, value);
+                }
+            } else {
+                // Not multi-valued: set the value directly
+                copy.set(name, m.get(name));
+            }
+        }
+        return copy;
+    }
+}
--
To stop receiving notification emails like this one, please contact
nick@apache.org.
[tika] 03/13: Ignore vim temp files
Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 3555745fcbb6a8601dcd1af27a6a9ab07fa40250
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 14:54:02 2018 +0000
Ignore vim temp files
---
.gitignore | 1 +
1 file changed, 1 insertion(+)
diff --git a/.gitignore b/.gitignore
index d8e7384..7c3e3e6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@ nb-configuration.xml
*.DS_Store
*.tmp-inception
*.snap
+.*.swp
tika-deployment/tika-snap-app/parts/
tika-deployment/tika-snap-app/prime/
tika-deployment/tika-snap-app/snap/
--
To stop receiving notification emails like this one, please contact
nick@apache.org.
[tika] 11/13: Bring over stream reset logic from ParserDecorator
and update comments
Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 82f6f5f6068d72b2afcb6c47840b9124554afdbf
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 17:12:34 2018 +0000
Bring over stream reset logic from ParserDecorator and update comments
---
.../parser/multiple/AbstractMultipleParser.java | 28 ++++++++++------------
1 file changed, 13 insertions(+), 15 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 4d3ff0c..6262dc1 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -167,13 +167,9 @@ public abstract class AbstractMultipleParser extends AbstractParser {
ContentHandler handler, Exception exception);
/**
- * Delegates the call to one or more Parsers,
- * Delegates the call to the matching component parser.
- * <p>
- * Potential {@link RuntimeException}s, {@link IOException}s and
- * {@link SAXException}s unrelated to the given input stream and content
- * handler are automatically wrapped into {@link TikaException}s to better
- * honor the {@link Parser} contract.
+ * Processes the given Stream through one or more parsers,
+ * resetting things between parsers as requested by policy.
+ * The actual processing is delegated to one or more {@link Parser}s.
*/
public void parse(
InputStream stream, ContentHandler handler,
@@ -187,11 +183,8 @@ public abstract class AbstractMultipleParser extends AbstractParser {
TemporaryResources tmp = new TemporaryResources();
try {
// Force the stream to be a Tika one
- // Force the stream to be file-backed, so we can
- // re-wind it safely if required
- // TODO Support an InputStreamFactory as an alternative to
- // Files, see TIKA-2585
- // TODO Rewind support copy from ParserDecorator.withFallbacks
+ // Force the stream to be file-backed, so we can re-read safely
+ // later if required for parser 2+
// TODO Should we use RereadableInputStream instead?
TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
Path path = taggedStream.getPath();
@@ -202,8 +195,10 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// TODO Provide a way to supply a ContentHandlerFactory?
for (Parser p : parsers) {
- // TODO What's the best way to reset each time?
- TikaInputStream parserStream = TikaInputStream.get(path);
+ // Indicate we may need to re-read the stream later
+ // TODO Support an InputStreamFactory as an alternative to
+ // Files, see TIKA-2585
+ taggedStream.mark(-1);
// Record that we used this parser
recordParserDetails(p, metadata);
@@ -213,7 +208,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// Process if possible
Exception failure = null;
try {
- p.parse(parserStream, handler, metadata, context);
+ p.parse(taggedStream, handler, metadata, context);
} catch (Exception e) {
recordParserFailure(p, e, metadata);
failure = e;
@@ -236,6 +231,9 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// TODO Handle metadata clashes based on the Policy
lastMetadata = cloneMetadata(metadata);
+
+ // Prepare for the next parser, if present
+ taggedStream.reset();
}
} finally {
tmp.dispose();
--
To stop receiving notification emails like this one, please contact
nick@apache.org.
[tika] 02/13: Add TODOs for code to be shared/copied with other
areas
Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 62cf6f6cb3539ffbdb2886ff5485a997b0fe6773
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 07:17:41 2018 +0000
Add TODOs for code to be shared/copied with other areas
---
.../apache/tika/parser/multiple/AbstractMultipleParser.java | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 08a90fd..c47e762 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -83,6 +83,8 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// TODO Figure out some sort of Content Policy and how
// it might possibly work
+ // TODO Is an overridden method that takes a
+ // ContentHandlerFactory the best way?
/**
* Media type registry.
@@ -184,12 +186,14 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// re-wind it safely if required
// TODO Support an InputStreamFactory as an alternative to
// Files, see TIKA-2585
+ // TODO Rewind support copy from ParserDecorator.withFallbacks
TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
Path path = taggedStream.getPath();
// TODO Somehow shield/wrap the Handler, so that we can
// avoid failures if multiple parsers want to do content
// TODO Solve the multiple-content problem!
+ // TODO Provide a way to supply a ContentHandlerFactory?
for (Parser p : parsers) {
// TODO What's the best way to reset each time?
@@ -201,6 +205,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// TODO Handle metadata clashes based on the Policy
// Process if possible
+ // TODO Share error recording logic with RecursiveParserWrapper
Exception failure = null;
try {
p.parse(parserStream, handler, metadata, context);
@@ -210,7 +215,11 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// Notify the implementation how it went
boolean tryNext = parserCompleted(p, metadata, handler, failure);
- if (!tryNext) break;
+ // Abort if requested, with the exception if there was one
+ if (!tryNext) {
+ if (failure != null) throw failure;
+ break;
+ }
// TODO Handle metadata clashes based on the Policy
}
--
To stop receiving notification emails like this one, please contact
nick@apache.org.
[tika] 06/13: Fix exception handling
Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit c3897db807970e7eb39c87840e4e040713eb759c
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 15:06:42 2018 +0000
Fix exception handling
---
.../org/apache/tika/parser/multiple/AbstractMultipleParser.java | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 46cd064..02d7e51 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -223,7 +223,13 @@ public abstract class AbstractMultipleParser extends AbstractParser {
boolean tryNext = parserCompleted(p, metadata, handler, failure);
// Abort if requested, with the exception if there was one
if (!tryNext) {
- if (failure != null) throw failure;
+ if (failure != null) {
+ if (failure instanceof IOException) throw (IOException)failure;
+ if (failure instanceof SAXException) throw (SAXException)failure;
+ if (failure instanceof TikaException) throw (TikaException)failure;
+ throw new TikaException("Unexpected RuntimeException from " + p, failure);
+ }
+ // Abort processing, don't try any more parsers
break;
}
--
To stop receiving notification emails like this one, please contact
nick@apache.org.
[tika] 01/13: Name sample config files based on issue number
Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 217a9cef62eae3bfdc23882f4483a00baea259fb
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 07:15:11 2018 +0000
Name sample config files based on issue number
---
.../config/{multiple-fallback.xml => TIKA-1509-multiple-fallback.xml} | 0
.../{multiple-supplemental.xml => TIKA-1509-multiple-supplemental.xml} | 0
2 files changed, 0 insertions(+), 0 deletions(-)
diff --git a/tika-core/src/test/resources/org/apache/tika/config/multiple-fallback.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1509-multiple-fallback.xml
similarity index 100%
rename from tika-core/src/test/resources/org/apache/tika/config/multiple-fallback.xml
rename to tika-core/src/test/resources/org/apache/tika/config/TIKA-1509-multiple-fallback.xml
diff --git a/tika-core/src/test/resources/org/apache/tika/config/multiple-supplemental.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1509-multiple-supplemental.xml
similarity index 100%
rename from tika-core/src/test/resources/org/apache/tika/config/multiple-supplemental.xml
rename to tika-core/src/test/resources/org/apache/tika/config/TIKA-1509-multiple-supplemental.xml
--
To stop receiving notification emails like this one, please contact
nick@apache.org.
[tika] 10/13: TODO updates, enforce allowed policies
Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 9be93c6bef2eabfb5ea93f60549762a2510b2dce
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 17:03:50 2018 +0000
TODO updates, enforce allowed policies
---
.../org/apache/tika/parser/multiple/AbstractMultipleParser.java | 3 +--
.../java/org/apache/tika/parser/multiple/SupplementingParser.java | 6 +++++-
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index d857b35..4d3ff0c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -211,7 +211,6 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// TODO Handle metadata clashes based on the Policy
// Process if possible
- // TODO Share error recording logic with RecursiveParserWrapper
Exception failure = null;
try {
p.parse(parserStream, handler, metadata, context);
@@ -222,6 +221,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// Notify the implementation how it went
boolean tryNext = parserCompleted(p, metadata, handler, failure);
+
// Abort if requested, with the exception if there was one
if (!tryNext) {
if (failure != null) {
@@ -242,4 +242,3 @@ public abstract class AbstractMultipleParser extends AbstractParser {
}
}
}
-
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
index c1dec34..7eab004 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
@@ -63,7 +63,11 @@ public class SupplementingParser extends AbstractMultipleParser {
public SupplementingParser(MediaTypeRegistry registry, MetadataPolicy policy,
List<Parser> parsers) {
super(registry, policy, parsers);
- // TODO Check the policy is one we support
+
+ // Ensure it's a supported policy
+ if (!allowedPolicies.contains(policy)) {
+ throw new IllegalArgumentException("Unsupported policy for SupplementingParser: " + policy);
+ }
}
@Override
--
To stop receiving notification emails like this one, please contact
nick@apache.org.