You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2018/03/13 18:15:35 UTC
[tika] 12/13: Implement some metadata policies for merging values
from multiple parsers
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit ee60f5e8ac4002cb6a296adc24cbcb7183cb1f8e
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Tue Mar 13 17:43:30 2018 +0000
Implement some metadata policies for merging values from multiple parsers
---
.../parser/multiple/AbstractMultipleParser.java | 48 ++++++++++++++++++----
1 file changed, 41 insertions(+), 7 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 6262dc1..9781f49 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -16,9 +16,12 @@
*/
package org.apache.tika.parser.multiple;
+import static org.apache.tika.utils.ParserUtils.cloneMetadata;
+import static org.apache.tika.utils.ParserUtils.recordParserDetails;
+import static org.apache.tika.utils.ParserUtils.recordParserFailure;
+
import java.io.IOException;
import java.io.InputStream;
-import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
@@ -34,7 +37,6 @@ import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
-import static org.apache.tika.utils.ParserUtils.*;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -187,7 +189,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// later if required for parser 2+
// TODO Should we use RereadableInputStream instead?
TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
- Path path = taggedStream.getPath();
+ taggedStream.getPath();
// TODO Somehow shield/wrap the Handler, so that we can
// avoid failures if multiple parsers want to do content
@@ -202,8 +204,9 @@ public abstract class AbstractMultipleParser extends AbstractParser {
// Record that we used this parser
recordParserDetails(p, metadata);
-
- // TODO Handle metadata clashes based on the Policy
+
+ // Prepare a near-empty Metadata, will merge after
+ metadata = cloneMetadata(originalMetadata);
// Process if possible
Exception failure = null;
@@ -229,14 +232,45 @@ public abstract class AbstractMultipleParser extends AbstractParser {
break;
}
- // TODO Handle metadata clashes based on the Policy
- lastMetadata = cloneMetadata(metadata);
+ // Handle metadata merging / clashes
+ metadata = mergeMetadata(metadata, lastMetadata, policy);
// Prepare for the next parser, if present
+ lastMetadata = cloneMetadata(metadata);
taggedStream.reset();
}
} finally {
tmp.dispose();
}
}
+
+ // TODO Provide a method that takes an InputStreamSource as well,
+ // and a ContentHandlerFactory. Will need wrappers to convert standard
+
+    /**
+     * Merges the metadata left over from the previous parser run into the
+     * metadata produced by the most recent parser, following the given policy.
+     * <p>
+     * For {@code DISCARD_ALL} the new metadata is returned untouched. Otherwise
+     * every key present in {@code lastMetadata} is considered:
+     * <ul>
+     *  <li>if the new metadata has no value for the key, all of the earlier
+     *      values are carried over;</li>
+     *  <li>{@code FIRST_WINS}: the earlier values replace the new ones;</li>
+     *  <li>{@code LAST_WINS}: the new values are left as they are;</li>
+     *  <li>{@code KEEP_ALL}: the earlier values are kept, followed by any new
+     *      values that were not already present.</li>
+     * </ul>
+     * All values of a multi-valued key are handled, not just the first one.
+     *
+     * @param newMetadata the metadata from the most recent parser; modified
+     *                    in place and returned
+     * @param lastMetadata the metadata as it stood after the previous parser
+     * @param policy how clashing keys should be resolved
+     * @return {@code newMetadata}, after merging
+     */
+    protected static Metadata mergeMetadata(Metadata newMetadata, Metadata lastMetadata, MetadataPolicy policy) {
+        if (policy == MetadataPolicy.DISCARD_ALL) {
+            return newMetadata;
+        }
+
+        for (String n : lastMetadata.names()) {
+            String[] oldVals = lastMetadata.getValues(n);
+
+            if (newMetadata.get(n) == null) {
+                // Only the earlier run supplied this key, keep all its values
+                replaceValues(newMetadata, n, oldVals);
+                continue;
+            }
+
+            switch (policy) {
+                case FIRST_WINS:
+                    // Use the earlier value(s) in place of the newer one(s)
+                    replaceValues(newMetadata, n, oldVals);
+                    break;
+                case LAST_WINS:
+                    // Most recent (last) parser has already won
+                    break;
+                case KEEP_ALL:
+                    // Grab the new values before they get replaced below
+                    String[] newVals = newMetadata.getValues(n);
+                    HashSet<String> seen = new HashSet<>(Arrays.asList(oldVals));
+
+                    // Earlier values first, then any new values not yet seen
+                    replaceValues(newMetadata, n, oldVals);
+                    for (String val : newVals) {
+                        if (seen.add(val)) {
+                            newMetadata.add(n, val);
+                        }
+                    }
+                    break;
+            }
+        }
+        return newMetadata;
+    }
+
+    /**
+     * Replaces whatever values the target holds for the given key with
+     * the supplied ones, preserving their order.
+     */
+    private static void replaceValues(Metadata target, String name, String[] values) {
+        target.remove(name);
+        for (String val : values) {
+            target.add(name, val);
+        }
+    }
}
--
To stop receiving notification emails like this one, please contact
nick@apache.org.