You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/10/10 20:56:18 UTC

svn commit: r1006336 - in /tika/trunk/tika-core/src/main/java/org/apache/tika: config/TikaConfig.java parser/AutoDetectParser.java parser/CompositeParser.java parser/DefaultParser.java parser/ParserDecorator.java

Author: jukka
Date: Sun Oct 10 18:56:18 2010
New Revision: 1006336

URL: http://svn.apache.org/viewvc?rev=1006336&view=rev
Log:
TIKA-527: Allow overriding the mime<-->parser mappings through config

Add an explicit DefaultParser class that implements the service provider loading mechanism that was previously included in TikaConfig. This allows a tika-config.xml file to easily express a configuration that loads all default parsers and then explicitly overrides the parsers of selected media types.

Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java   (with props)
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1006336&r1=1006335&r2=1006336&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Sun Oct 10 18:56:18 2010
@@ -20,9 +20,11 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
-import java.util.HashMap;
-import java.util.Iterator;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
 import javax.imageio.spi.ServiceRegistry;
 import javax.xml.parsers.DocumentBuilder;
@@ -35,8 +37,10 @@ import org.apache.tika.mime.MediaTypeReg
 import org.apache.tika.mime.MimeTypeException;
 import org.apache.tika.mime.MimeTypes;
 import org.apache.tika.mime.MimeTypesFactory;
-import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.DefaultParser;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Node;
@@ -48,11 +52,19 @@ import org.xml.sax.SAXException;
  */
 public class TikaConfig {
 
-    private final Map<MediaType, Parser> parsers =
-        new HashMap<MediaType, Parser>();
+    private final CompositeParser parser;
     
     private final MimeTypes mimeTypes;
 
+    private TikaConfig(CompositeParser parser, MimeTypes mimeTypes) {
+        this.parser = parser;
+        this.mimeTypes = mimeTypes;
+    }
+
+    private TikaConfig(CompositeParser parser) {
+        this(parser, MimeTypes.getDefaultMimeTypes());
+    }
+
     public TikaConfig(String file)
             throws TikaException, IOException, SAXException {
         this(new File(file));
@@ -103,6 +115,7 @@ public class TikaConfig {
             mimeTypes = MimeTypes.getDefaultMimeTypes();
         }
 
+        List<Parser> parsers = new ArrayList<Parser>();
         NodeList nodes = element.getElementsByTagName("parser");
         for (int i = 0; i < nodes.getLength(); i++) {
             Element node = (Element) nodes.item(i);
@@ -119,22 +132,21 @@ public class TikaConfig {
 
                 NodeList mimes = node.getElementsByTagName("mime");
                 if (mimes.getLength() > 0) {
+                    Set<MediaType> types = new HashSet<MediaType>();
                     for (int j = 0; j < mimes.getLength(); j++) {
                         String mime = getText(mimes.item(j));
                         MediaType type = MediaType.parse(mime);
                         if (type != null) {
-                            parsers.put(type, parser);
+                            types.add(type);
                         } else {
                             throw new TikaException(
                                     "Invalid media type name: " + mime);
                         }
                     }
-                } else {
-                    ParseContext context = new ParseContext();
-                    for (MediaType type : parser.getSupportedTypes(context)) {
-                        parsers.put(type, parser);
-                    }
+                    parser = ParserDecorator.withTypes(parser, types);
                 }
+
+                parsers.add(parser);
             } catch (ClassNotFoundException e) {
                 throw new TikaException(
                         "Configured parser class not found: " + name, e);
@@ -146,6 +158,8 @@ public class TikaConfig {
                         "Unable to instantiate a parser class: " + name, e);
             }
         }
+        this.parser =
+            new CompositeParser(mimeTypes.getMediaTypeRegistry(), parsers);
     }
 
     /**
@@ -162,19 +176,7 @@ public class TikaConfig {
      */
     public TikaConfig(ClassLoader loader)
             throws MimeTypeException, IOException {
-        if (loader != null) {
-            ParseContext context = new ParseContext();
-            Iterator<Parser> iterator =
-                ServiceRegistry.lookupProviders(Parser.class, loader);
-            while (iterator.hasNext()) {
-                Parser parser = iterator.next();
-                for (MediaType type : parser.getSupportedTypes(context)) {
-                    parsers.put(type, parser);
-                }
-            }
-        }
-        
-        mimeTypes = MimeTypes.getDefaultMimeTypes();
+        this(new DefaultParser(loader));
     }
 
     /**
@@ -187,27 +189,7 @@ public class TikaConfig {
      * @throws IOException  if the built-in media type rules can not be read
      */
     public TikaConfig() throws MimeTypeException, IOException {
-        this(getContextClassLoader());
-    }
-
-    /**
-     * Returns the context class loader of the current thread. If such
-     * a class loader is not available, then the loader of this class or
-     * finally the system class loader is returned.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-441">TIKA-441</a>
-     * @return context class loader, or <code>null</code> if no loader
-     *         is available
-     */
-    private static ClassLoader getContextClassLoader() {
-        ClassLoader loader = Thread.currentThread().getContextClassLoader();
-        if (loader == null) {
-            loader = TikaConfig.class.getClassLoader();
-        }
-        if (loader == null) {
-            loader = ClassLoader.getSystemClassLoader();
-        }
-        return loader;
+        this(new DefaultParser());
     }
 
     /**
@@ -235,20 +217,28 @@ public class TikaConfig {
     }
 
     /**
-     * Returns the parser instance configured for the given MIME type.
-     * Returns <code>null</code> if the given MIME type is unknown.
-     *
-     * @param mimeType MIME type
-     * @return configured Parser instance, or <code>null</code>
+     * @deprecated Use the {@link #getParser()} method instead
      */
     public Parser getParser(MediaType mimeType) {
-        return parsers.get(mimeType);
+        return parser.getParsers().get(mimeType);
     }
 
+    /**
+     * Returns the configured parser instance.
+     *
+     * @return configured parser
+     */
+    public Parser getParser() {
+        return parser;
+    }
+
+    /**
+     * @deprecated Use the {@link #getParser()} method instead
+     */
     public Map<MediaType, Parser> getParsers() {
-        return parsers;
+        return parser.getParsers();
     }
-    
+
     public MimeTypes getMimeRepository(){
         return mimeTypes;
     }

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=1006336&r1=1006335&r2=1006336&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java Sun Oct 10 18:56:18 2010
@@ -29,6 +29,7 @@ import org.apache.tika.io.CountingInputS
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
 import org.apache.tika.mime.MimeTypes;
 import org.apache.tika.sax.SecureContentHandler;
 import org.xml.sax.ContentHandler;
@@ -36,9 +37,7 @@ import org.xml.sax.SAXException;
 
 public class AutoDetectParser extends CompositeParser {
 
-    /**
-     * Serial version UID
-     */
+    /** Serial version UID */
     private static final long serialVersionUID = 6110455808615143122L;
 
     /**
@@ -72,26 +71,20 @@ public class AutoDetectParser extends Co
     public AutoDetectParser(Parser...parsers) {
         this(MimeTypes.getDefaultMimeTypes(), parsers);
     }
-    
+
     public AutoDetectParser(Detector detector, Parser...parsers) {
+        super(MediaTypeRegistry.getDefaultRegistry(), parsers);
         setDetector(detector);
-        setMediaTypeRegistry(MimeTypes.getDefaultMimeTypes().getMediaTypeRegistry());
-        
-        Map<MediaType, Parser> map = new HashMap<MediaType, Parser>();
-        for (Parser parser : parsers) {
-            ParseContext context = new ParseContext();
-            for (MediaType type : parser.getSupportedTypes(context)) {
-                map.put(type, parser);
-            }
-        }
-        
-        setParsers(map);
     }
-    
+
     public AutoDetectParser(TikaConfig config) {
-        setConfig(config);
+        super(config.getMediaTypeRegistry(), config.getParser());
+        setDetector(config.getMimeRepository());
     }
 
+    /**
+     * @deprecated This method will be removed in Tika 1.0
+     */
     public void setConfig(TikaConfig config) {
         setParsers(config.getParsers());
         setDetector(config.getMimeRepository());
@@ -111,9 +104,7 @@ public class AutoDetectParser extends Co
 
     /**
      * Sets the type detector used by this parser to auto-detect the type
-     * of a document. Note that calling the {@link #setConfig(TikaConfig)}
-     * method will override the type detector setting with the type settings
-     * included in the given configuration.
+     * of a document.
      *
      * @param detector type detector
      * @since Apache Tika 0.4

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=1006336&r1=1006335&r2=1006336&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java Sun Oct 10 18:56:18 2010
@@ -18,7 +18,11 @@ package org.apache.tika.parser;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
@@ -39,26 +43,47 @@ import org.xml.sax.SAXException;
  */
 public class CompositeParser implements Parser {
 
-    /**
-     * Serial version UID
-     */
-    private static final long serialVersionUID = 5613173903360405824L;
+    /** Serial version UID */
+    private static final long serialVersionUID = 2192845797749627824L;
 
     /**
      * Media type registry.
      */
-    private MediaTypeRegistry registry = new MediaTypeRegistry();
+    private MediaTypeRegistry registry;
 
     /**
-     * Set of component parsers, keyed by the supported media types.
+     * List of component parsers.
      */
-    private Map<MediaType, Parser> parsers = new HashMap<MediaType, Parser>();
+    private List<Parser> parsers;
 
     /**
      * The fallback parser, used when no better parser is available.
      */
     private Parser fallback = new EmptyParser();
 
+    public CompositeParser(MediaTypeRegistry registry, List<Parser> parsers) {
+        this.parsers = parsers;
+        this.registry = registry;
+    }
+
+    public CompositeParser(MediaTypeRegistry registry, Parser... parsers) {
+        this(registry, Arrays.asList(parsers));
+    }
+
+    public CompositeParser() {
+        this(new MediaTypeRegistry());
+    }
+
+    public Map<MediaType, Parser> getParsers(ParseContext context) {
+        Map<MediaType, Parser> map = new HashMap<MediaType, Parser>();
+        for (Parser parser : parsers) {
+            for (MediaType type : parser.getSupportedTypes(context)) {
+                map.put(type, parser);
+            }
+        }
+        return map;
+    }
+
     /**
      * Returns the media type registry used to infer type relationships.
      *
@@ -85,7 +110,7 @@ public class CompositeParser implements 
      * @return component parsers, keyed by media type
      */
     public Map<MediaType, Parser> getParsers() {
-        return parsers;
+        return getParsers(new ParseContext());
     }
 
     /**
@@ -94,7 +119,11 @@ public class CompositeParser implements 
      * @param parsers component parsers, keyed by media type
      */
     public void setParsers(Map<MediaType, Parser> parsers) {
-        this.parsers = parsers;
+        this.parsers = new ArrayList<Parser>(parsers.size());
+        for (Map.Entry<MediaType, Parser> entry : parsers.entrySet()) {
+            this.parsers.add(ParserDecorator.withTypes(
+                    entry.getValue(), Collections.singleton(entry.getKey())));
+        }
     }
 
     /**
@@ -129,9 +158,14 @@ public class CompositeParser implements 
      * @return matching parser
      */
     protected Parser getParser(Metadata metadata) {
+        return getParser(metadata, new ParseContext());
+    }
+
+    protected Parser getParser(Metadata metadata, ParseContext context) {
+        Map<MediaType, Parser> map = getParsers(context);
         MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
         while (type != null) {
-            Parser parser = parsers.get(type);
+            Parser parser = map.get(type);
             if (parser != null) {
                 return parser;
             }
@@ -141,7 +175,7 @@ public class CompositeParser implements 
     }
 
     public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return parsers.keySet();
+        return getParsers(context).keySet();
     }
 
     /**

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java?rev=1006336&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java Sun Oct 10 18:56:18 2010
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import javax.imageio.spi.ServiceRegistry;
+
+import org.apache.tika.mime.MediaTypeRegistry;
+
+/**
+ * A composite parser based on all the {@link Parser} implementations
+ * available through the {@link ServiceRegistry service provider mechanism}.
+ *
+ * @since Apache Tika 0.8
+ */
+public class DefaultParser extends CompositeParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 3612324825403757520L;
+
+    /**
+     * Returns the context class loader of the current thread. If such
+     * a class loader is not available, then the loader of this class or
+     * finally the system class loader is returned.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-441">TIKA-441</a>
+     * @return context class loader, or <code>null</code> if no loader
+     *         is available
+     */
+    private static ClassLoader getContextClassLoader() {
+        ClassLoader loader = Thread.currentThread().getContextClassLoader();
+        if (loader == null) {
+            loader = DefaultParser.class.getClassLoader();
+        }
+        if (loader == null) {
+            loader = ClassLoader.getSystemClassLoader();
+        }
+        return loader;
+    }
+
+    /**
+     * Returns all the parsers available through the given class loader.
+     *
+     * @param loader class loader 
+     * @return available parsers
+     */
+    private static List<Parser> loadParsers(ClassLoader loader) {
+        List<Parser> parsers = new ArrayList<Parser>();
+        if (loader != null) {
+            Iterator<Parser> iterator =
+                ServiceRegistry.lookupProviders(Parser.class, loader);
+            while (iterator.hasNext()) {
+                parsers.add(iterator.next());
+            }
+        }
+        return parsers;
+    }
+
+    public DefaultParser(ClassLoader loader) {
+        super(new MediaTypeRegistry(), loadParsers(loader));
+    }
+
+    public DefaultParser() {
+        this(getContextClassLoader());
+    }
+
+}

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=1006336&r1=1006335&r2=1006336&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java Sun Oct 10 18:56:18 2010
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -34,6 +34,28 @@ import org.xml.sax.SAXException;
  */
 public class ParserDecorator implements Parser {
 
+    /** Serial version UID */
+    private static final long serialVersionUID = -3861669115439125268L;
+
+    /**
+     * Decorates the given parser so that it always claims to support
+     * parsing of the given media types.
+     *
+     * @param parser the parser to be decorated
+     * @param types supported media types
+     * @return the decorated parser
+     */
+    public static final Parser withTypes(
+            Parser parser, final Set<MediaType> types) {
+        return new ParserDecorator(parser) {
+            private static final long serialVersionUID = -7345051519565330731L;
+            @Override
+            public Set<MediaType> getSupportedTypes(ParseContext context) {
+                return types;
+            }
+        };
+    }
+
     /**
      * The decorated parser instance.
      */