You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/03/24 16:35:24 UTC
svn commit: r1085003 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
tika-parsers/src/test/java/org/apache/tika/parser/CompositeParserTest.java
tika-parsers/src/test/java/org/apache/tika/parser/DummyParser.java
Author: nick
Date: Thu Mar 24 15:35:24 2011
New Revision: 1085003
URL: http://svn.apache.org/viewvc?rev=1085003&view=rev
Log:
TIKA-620 - Have CompositeParser always use the canonical mimetype internally, via suitable calls to registry.normalize, rather than trying to handle the aliases individually
Added:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/CompositeParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DummyParser.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=1085003&r1=1085002&r2=1085003&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java Thu Mar 24 15:35:24 2011
@@ -25,7 +25,8 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
-import java.util.SortedSet;
+import java.util.logging.Level;
+import java.util.logging.Logger;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TaggedInputStream;
@@ -79,7 +80,16 @@ public class CompositeParser implements
Map<MediaType, Parser> map = new HashMap<MediaType, Parser>();
for (Parser parser : parsers) {
for (MediaType type : parser.getSupportedTypes(context)) {
- map.put(type, parser);
+ MediaType canonicalType = registry.normalize(type);
+ if (map.containsKey(canonicalType)) {
+ if (map.get(canonicalType) != parser) {
+ Logger.getLogger(getClass().getName()).log(
+ Level.INFO, "Duplicate parser definition for " + type +
+ " (" + canonicalType + "), using " + parser
+ );
+ }
+ }
+ map.put(canonicalType, parser);
}
}
return map;
@@ -165,6 +175,11 @@ public class CompositeParser implements
protected Parser getParser(Metadata metadata, ParseContext context) {
Map<MediaType, Parser> map = getParsers(context);
MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
+ if (type != null) {
+ // We always work on the normalised, canonical form
+ type = registry.normalize(type);
+ }
+
while (type != null) {
// Try finding a parser for the type
Parser parser = map.get(type);
@@ -172,15 +187,6 @@ public class CompositeParser implements
return parser;
}
- // Next up, look for one for its aliases
- SortedSet<MediaType> aliases = registry.getAliases(type);
- for (MediaType alias : aliases) {
- parser = map.get(alias);
- if (parser != null) {
- return parser;
- }
- }
-
// Failing that, try for the parent of the type
type = registry.getSupertype(type);
}
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/CompositeParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/CompositeParserTest.java?rev=1085003&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/CompositeParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/CompositeParserTest.java Thu Mar 24 15:35:24 2011
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+public class CompositeParserTest extends TestCase { // exercises CompositeParser's registry wiring and alias handling
+ public void testDefaultParser() throws Exception { // the default config's parser must share the config's registry
+ TikaConfig config = TikaConfig.getDefaultConfig();
+
+ CompositeParser parser = (CompositeParser)config.getParser(); // a ClassCastException here means the default parser is not a CompositeParser
+
+ // Check it has the full registry
+ assertEquals(config.getMediaTypeRegistry(), parser.getMediaTypeRegistry());
+ }
+
+ public void testMimeTypeAliases() throws Exception { // a type and its alias must resolve to the same registered parser
+ MediaType bmpCanonical = MediaType.image("x-ms-bmp"); // used by this test as the canonical BMP type
+ Map<String,String> bmpCanonicalMetadata = new HashMap<String, String>();
+ bmpCanonicalMetadata.put("BMP", "True"); // marker written by both dummy parsers
+ bmpCanonicalMetadata.put("Canonical", "True"); // marker written only by the canonical-type parser
+ Parser bmpCanonicalParser = new DummyParser(
+ new HashSet<MediaType>(Arrays.asList(bmpCanonical)),
+ bmpCanonicalMetadata, null // null xmlText: DummyParser then sends no characters to the handler
+ );
+
+ MediaType bmpAlias = MediaType.image("bmp"); // presumably an alias of image/x-ms-bmp in the default registry; the assertions below depend on that
+ Map<String,String> bmpAliasMetadata = new HashMap<String, String>();
+ bmpAliasMetadata.put("BMP", "True");
+ bmpAliasMetadata.put("Alias", "True"); // marker written only by the alias-type parser
+ Parser bmpAliasParser = new DummyParser(
+ new HashSet<MediaType>(Arrays.asList(bmpAlias)),
+ bmpAliasMetadata, null
+ );
+
+ TikaConfig config = TikaConfig.getDefaultConfig();
+ CompositeParser canonical = new CompositeParser( // knows only the parser declared for the canonical type
+ config.getMediaTypeRegistry(), bmpCanonicalParser
+ );
+ CompositeParser alias = new CompositeParser( // knows only the parser declared for the alias type
+ config.getMediaTypeRegistry(), bmpAliasParser
+ );
+ CompositeParser both = new CompositeParser( // registration order matters: the alias parser is passed last
+ config.getMediaTypeRegistry(), bmpCanonicalParser, bmpAliasParser
+ );
+
+ ContentHandler handler = new BodyContentHandler(); // one handler instance is reused by every parse call below
+ Metadata metadata; // re-created before each scenario so markers from an earlier parse cannot satisfy a later assert
+
+ // Canonical and Canonical
+ metadata = new Metadata();
+ metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString());
+ canonical.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext()); // empty stream is enough: DummyParser never reads it
+ assertEquals("True", metadata.get("BMP"));
+ assertEquals("True", metadata.get("Canonical"));
+
+
+ // Alias and Alias
+ metadata = new Metadata();
+ metadata.add(Metadata.CONTENT_TYPE, bmpAlias.toString());
+ alias.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
+ assertEquals("True", metadata.get("BMP"));
+ assertEquals("True", metadata.get("Alias"));
+
+
+ // Alias type and Canonical parser
+ metadata = new Metadata();
+ metadata.add(Metadata.CONTENT_TYPE, bmpAlias.toString()); // document is labelled with the alias type...
+ canonical.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext()); // ...but only a canonical-type parser is registered
+ assertEquals("True", metadata.get("BMP"));
+ assertEquals("True", metadata.get("Canonical"));
+
+
+ // Canonical type and Alias parser
+ metadata = new Metadata();
+ metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString()); // document is labelled with the canonical type...
+ alias.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext()); // ...but only an alias-type parser is registered
+ assertEquals("True", metadata.get("BMP"));
+ assertEquals("True", metadata.get("Alias"));
+
+
+ // And when both are there, will go for the last one
+ // to be registered (which is the alias one)
+ metadata = new Metadata();
+ metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString());
+ both.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
+ assertEquals("True", metadata.get("BMP"));
+ assertEquals("True", metadata.get("Alias")); // NOTE(review): only the alias marker is checked; absence of "Canonical" is not asserted
+ }
+}
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DummyParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DummyParser.java?rev=1085003&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DummyParser.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DummyParser.java Thu Mar 24 15:35:24 2011
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A Dummy Parser for use with unit tests. It adds preset metadata entries and emits an optional fixed text to the handler.
+ */
+public class DummyParser implements Parser {
+ private Set<MediaType> types; // returned unchanged by getSupportedTypes
+ private Map<String,String> metadata; // entries copied into the caller's Metadata on every parse
+ private String xmlText; // text sent to the handler as characters; may be null, in which case none is sent
+
+ public DummyParser(Set<MediaType> types, Map<String, String> metadata, // arguments are stored by reference, no copies are made
+ String xmlText) {
+ this.types = types;
+ this.metadata = metadata;
+ this.xmlText = xmlText;
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) { // context is ignored; the constructor-supplied set is always returned
+ return types;
+ }
+
+ public void parse(InputStream stream, ContentHandler handler, // stream and context are never used by this implementation
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+ for (Entry<String,String> m : this.metadata.entrySet()) { // this.metadata is the preset map; the parameter shadows the field
+ metadata.add(m.getKey(), m.getValue());
+ }
+
+ handler.startDocument(); // only document start/end and optional characters are emitted, no element events
+ if (xmlText != null) {
+ handler.characters(xmlText.toCharArray(), 0, xmlText.length());
+ }
+ handler.endDocument();
+ }
+
+ public void parse(InputStream stream, ContentHandler handler, // context-less overload: delegates with a fresh ParseContext
+ Metadata metadata) throws IOException, SAXException, TikaException {
+ parse(stream, handler, metadata, new ParseContext());
+ }
+}