You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/10/10 20:56:18 UTC
svn commit: r1006336 - in
/tika/trunk/tika-core/src/main/java/org/apache/tika: config/TikaConfig.java
parser/AutoDetectParser.java parser/CompositeParser.java
parser/DefaultParser.java parser/ParserDecorator.java
Author: jukka
Date: Sun Oct 10 18:56:18 2010
New Revision: 1006336
URL: http://svn.apache.org/viewvc?rev=1006336&view=rev
Log:
TIKA-527: Allow overriding the mime<-->parser mapping through config
Add an explicit DefaultParser class that implements the service provider loading mechanism that was previously included in TikaConfig. This allows a tika-config.xml file to easily express a configuration that loads all default parsers and then explicitly overrides the parsers of selected media types.
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java (with props)
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1006336&r1=1006335&r2=1006336&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Sun Oct 10 18:56:18 2010
@@ -20,9 +20,11 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
-import java.util.HashMap;
-import java.util.Iterator;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
import java.util.Map;
+import java.util.Set;
import javax.imageio.spi.ServiceRegistry;
import javax.xml.parsers.DocumentBuilder;
@@ -35,8 +37,10 @@ import org.apache.tika.mime.MediaTypeReg
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.MimeTypesFactory;
-import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
@@ -48,11 +52,19 @@ import org.xml.sax.SAXException;
*/
public class TikaConfig {
- private final Map<MediaType, Parser> parsers =
- new HashMap<MediaType, Parser>();
+ private final CompositeParser parser;
private final MimeTypes mimeTypes;
+ private TikaConfig(CompositeParser parser, MimeTypes mimeTypes) {
+ this.parser = parser;
+ this.mimeTypes = mimeTypes;
+ }
+
+ private TikaConfig(CompositeParser parser) {
+ this(parser, MimeTypes.getDefaultMimeTypes());
+ }
+
public TikaConfig(String file)
throws TikaException, IOException, SAXException {
this(new File(file));
@@ -103,6 +115,7 @@ public class TikaConfig {
mimeTypes = MimeTypes.getDefaultMimeTypes();
}
+ List<Parser> parsers = new ArrayList<Parser>();
NodeList nodes = element.getElementsByTagName("parser");
for (int i = 0; i < nodes.getLength(); i++) {
Element node = (Element) nodes.item(i);
@@ -119,22 +132,21 @@ public class TikaConfig {
NodeList mimes = node.getElementsByTagName("mime");
if (mimes.getLength() > 0) {
+ Set<MediaType> types = new HashSet<MediaType>();
for (int j = 0; j < mimes.getLength(); j++) {
String mime = getText(mimes.item(j));
MediaType type = MediaType.parse(mime);
if (type != null) {
- parsers.put(type, parser);
+ types.add(type);
} else {
throw new TikaException(
"Invalid media type name: " + mime);
}
}
- } else {
- ParseContext context = new ParseContext();
- for (MediaType type : parser.getSupportedTypes(context)) {
- parsers.put(type, parser);
- }
+ parser = ParserDecorator.withTypes(parser, types);
}
+
+ parsers.add(parser);
} catch (ClassNotFoundException e) {
throw new TikaException(
"Configured parser class not found: " + name, e);
@@ -146,6 +158,8 @@ public class TikaConfig {
"Unable to instantiate a parser class: " + name, e);
}
}
+ this.parser =
+ new CompositeParser(mimeTypes.getMediaTypeRegistry(), parsers);
}
/**
@@ -162,19 +176,7 @@ public class TikaConfig {
*/
public TikaConfig(ClassLoader loader)
throws MimeTypeException, IOException {
- if (loader != null) {
- ParseContext context = new ParseContext();
- Iterator<Parser> iterator =
- ServiceRegistry.lookupProviders(Parser.class, loader);
- while (iterator.hasNext()) {
- Parser parser = iterator.next();
- for (MediaType type : parser.getSupportedTypes(context)) {
- parsers.put(type, parser);
- }
- }
- }
-
- mimeTypes = MimeTypes.getDefaultMimeTypes();
+ this(new DefaultParser(loader));
}
/**
@@ -187,27 +189,7 @@ public class TikaConfig {
* @throws IOException if the built-in media type rules can not be read
*/
public TikaConfig() throws MimeTypeException, IOException {
- this(getContextClassLoader());
- }
-
- /**
- * Returns the context class loader of the current thread. If such
- * a class loader is not available, then the loader of this class or
- * finally the system class loader is returned.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-441">TIKA-441</a>
- * @return context class loader, or <code>null</code> if no loader
- * is available
- */
- private static ClassLoader getContextClassLoader() {
- ClassLoader loader = Thread.currentThread().getContextClassLoader();
- if (loader == null) {
- loader = TikaConfig.class.getClassLoader();
- }
- if (loader == null) {
- loader = ClassLoader.getSystemClassLoader();
- }
- return loader;
+ this(new DefaultParser());
}
/**
@@ -235,20 +217,28 @@ public class TikaConfig {
}
/**
- * Returns the parser instance configured for the given MIME type.
- * Returns <code>null</code> if the given MIME type is unknown.
- *
- * @param mimeType MIME type
- * @return configured Parser instance, or <code>null</code>
+ * @deprecated Use the {@link #getParser()} method instead
*/
public Parser getParser(MediaType mimeType) {
- return parsers.get(mimeType);
+ return parser.getParsers().get(mimeType);
}
+ /**
+ * Returns the configured parser instance.
+ *
+ * @return configured parser
+ */
+ public Parser getParser() {
+ return parser;
+ }
+
+ /**
+ * @deprecated Use the {@link #getParser()} method instead
+ */
public Map<MediaType, Parser> getParsers() {
- return parsers;
+ return parser.getParsers();
}
-
+
public MimeTypes getMimeRepository(){
return mimeTypes;
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=1006336&r1=1006335&r2=1006336&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java Sun Oct 10 18:56:18 2010
@@ -29,6 +29,7 @@ import org.apache.tika.io.CountingInputS
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.sax.SecureContentHandler;
import org.xml.sax.ContentHandler;
@@ -36,9 +37,7 @@ import org.xml.sax.SAXException;
public class AutoDetectParser extends CompositeParser {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = 6110455808615143122L;
/**
@@ -72,26 +71,20 @@ public class AutoDetectParser extends Co
public AutoDetectParser(Parser...parsers) {
this(MimeTypes.getDefaultMimeTypes(), parsers);
}
-
+
public AutoDetectParser(Detector detector, Parser...parsers) {
+ super(MediaTypeRegistry.getDefaultRegistry(), parsers);
setDetector(detector);
- setMediaTypeRegistry(MimeTypes.getDefaultMimeTypes().getMediaTypeRegistry());
-
- Map<MediaType, Parser> map = new HashMap<MediaType, Parser>();
- for (Parser parser : parsers) {
- ParseContext context = new ParseContext();
- for (MediaType type : parser.getSupportedTypes(context)) {
- map.put(type, parser);
- }
- }
-
- setParsers(map);
}
-
+
public AutoDetectParser(TikaConfig config) {
- setConfig(config);
+ super(config.getMediaTypeRegistry(), config.getParser());
+ setDetector(config.getMimeRepository());
}
+ /**
+ * @deprecated This method will be removed in Tika 1.0
+ */
public void setConfig(TikaConfig config) {
setParsers(config.getParsers());
setDetector(config.getMimeRepository());
@@ -111,9 +104,7 @@ public class AutoDetectParser extends Co
/**
* Sets the type detector used by this parser to auto-detect the type
- * of a document. Note that calling the {@link #setConfig(TikaConfig)}
- * method will override the type detector setting with the type settings
- * included in the given configuration.
+ * of a document.
*
* @param detector type detector
* @since Apache Tika 0.4
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=1006336&r1=1006335&r2=1006336&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java Sun Oct 10 18:56:18 2010
@@ -18,7 +18,11 @@ package org.apache.tika.parser;
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -39,26 +43,47 @@ import org.xml.sax.SAXException;
*/
public class CompositeParser implements Parser {
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = 5613173903360405824L;
+ /** Serial version UID */
+ private static final long serialVersionUID = 2192845797749627824L;
/**
* Media type registry.
*/
- private MediaTypeRegistry registry = new MediaTypeRegistry();
+ private MediaTypeRegistry registry;
/**
- * Set of component parsers, keyed by the supported media types.
+ * List of component parsers.
*/
- private Map<MediaType, Parser> parsers = new HashMap<MediaType, Parser>();
+ private List<Parser> parsers;
/**
* The fallback parser, used when no better parser is available.
*/
private Parser fallback = new EmptyParser();
+ public CompositeParser(MediaTypeRegistry registry, List<Parser> parsers) {
+ this.parsers = parsers;
+ this.registry = registry;
+ }
+
+ public CompositeParser(MediaTypeRegistry registry, Parser... parsers) {
+ this(registry, Arrays.asList(parsers));
+ }
+
+ public CompositeParser() {
+ this(new MediaTypeRegistry());
+ }
+
+ public Map<MediaType, Parser> getParsers(ParseContext context) {
+ Map<MediaType, Parser> map = new HashMap<MediaType, Parser>();
+ for (Parser parser : parsers) {
+ for (MediaType type : parser.getSupportedTypes(context)) {
+ map.put(type, parser);
+ }
+ }
+ return map;
+ }
+
/**
* Returns the media type registry used to infer type relationships.
*
@@ -85,7 +110,7 @@ public class CompositeParser implements
* @return component parsers, keyed by media type
*/
public Map<MediaType, Parser> getParsers() {
- return parsers;
+ return getParsers(new ParseContext());
}
/**
@@ -94,7 +119,11 @@ public class CompositeParser implements
* @param parsers component parsers, keyed by media type
*/
public void setParsers(Map<MediaType, Parser> parsers) {
- this.parsers = parsers;
+ this.parsers = new ArrayList<Parser>(parsers.size());
+ for (Map.Entry<MediaType, Parser> entry : parsers.entrySet()) {
+ this.parsers.add(ParserDecorator.withTypes(
+ entry.getValue(), Collections.singleton(entry.getKey())));
+ }
}
/**
@@ -129,9 +158,14 @@ public class CompositeParser implements
* @return matching parser
*/
protected Parser getParser(Metadata metadata) {
+ return getParser(metadata, new ParseContext());
+ }
+
+ protected Parser getParser(Metadata metadata, ParseContext context) {
+ Map<MediaType, Parser> map = getParsers(context);
MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
while (type != null) {
- Parser parser = parsers.get(type);
+ Parser parser = map.get(type);
if (parser != null) {
return parser;
}
@@ -141,7 +175,7 @@ public class CompositeParser implements
}
public Set<MediaType> getSupportedTypes(ParseContext context) {
- return parsers.keySet();
+ return getParsers(context).keySet();
}
/**
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java?rev=1006336&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java Sun Oct 10 18:56:18 2010
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import javax.imageio.spi.ServiceRegistry;
+
+import org.apache.tika.mime.MediaTypeRegistry;
+
+/**
+ * A composite parser based on all the {@link Parser} implementations
+ * available through the {@link ServiceRegistry service provider mechanism}.
+ *
+ * @since Apache Tika 0.8
+ */
+public class DefaultParser extends CompositeParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = 3612324825403757520L;
+
+ /**
+ * Returns the context class loader of the current thread. If such
+ * a class loader is not available, then the loader of this class or
+ * finally the system class loader is returned.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-441">TIKA-441</a>
+ * @return context class loader, or <code>null</code> if no loader
+ * is available
+ */
+ private static ClassLoader getContextClassLoader() {
+ ClassLoader loader = Thread.currentThread().getContextClassLoader();
+ if (loader == null) {
+ loader = DefaultParser.class.getClassLoader();
+ }
+ if (loader == null) {
+ loader = ClassLoader.getSystemClassLoader();
+ }
+ return loader;
+ }
+
+ /**
+ * Returns all the parsers available through the given class loader.
+ *
+ * @param loader class loader
+ * @return available parsers
+ */
+ private static List<Parser> loadParsers(ClassLoader loader) {
+ List<Parser> parsers = new ArrayList<Parser>();
+ if (loader != null) {
+ Iterator<Parser> iterator =
+ ServiceRegistry.lookupProviders(Parser.class, loader);
+ while (iterator.hasNext()) {
+ parsers.add(iterator.next());
+ }
+ }
+ return parsers;
+ }
+
+ public DefaultParser(ClassLoader loader) {
+ super(new MediaTypeRegistry(), loadParsers(loader));
+ }
+
+ public DefaultParser() {
+ this(getContextClassLoader());
+ }
+
+}
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=1006336&r1=1006335&r2=1006336&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java Sun Oct 10 18:56:18 2010
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -34,6 +34,28 @@ import org.xml.sax.SAXException;
*/
public class ParserDecorator implements Parser {
+ /** Serial version UID */
+ private static final long serialVersionUID = -3861669115439125268L;
+
+ /**
+ * Decorates the given parser so that it always claims to support
+ * parsing of the given media types.
+ *
+ * @param parser the parser to be decorated
+ * @param types supported media types
+ * @return the decorated parser
+ */
+ public static final Parser withTypes(
+ Parser parser, final Set<MediaType> types) {
+ return new ParserDecorator(parser) {
+ private static final long serialVersionUID = -7345051519565330731L;
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return types;
+ }
+ };
+ }
+
/**
* The decorated parser instance.
*/