You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/04/06 18:19:17 UTC

svn commit: r1089516 - in /tika/trunk/tika-core/src/main/java/org/apache/tika: config/ parser/ parser/external/

Author: nick
Date: Wed Apr  6 16:19:17 2011
New Revision: 1089516

URL: http://svn.apache.org/viewvc?rev=1089516&view=rev
Log:
TIKA-634 - Initial work on supporting more flexible ExternalParser loading (via XML, part done), and external parser metadata extraction

Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ExternalParser.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java?rev=1089516&r1=1089515&r2=1089516&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java Wed Apr  6 16:19:17 2011
@@ -93,6 +93,23 @@ public class ServiceLoader {
     public ServiceLoader() {
         this(getContextClassLoader());
     }
+    
+    /**
+     * Returns all the available service resources matching the given
+     *  pattern, such as all instances of tika-mimetypes.xml on the
+     *  classpath. Empty if there's no class loader, or listing fails.
+     */
+    public Enumeration<URL> findServiceResources(String filePattern) {
+       List<URL> none = Collections.emptyList();
+       if (loader == null) {
+          return Collections.enumeration( none ); // nothing to search without a class loader
+       }
+       try {
+          return loader.getResources(filePattern);
+       } catch (IOException ignore) {
+          return Collections.enumeration( none ); // couldn't list the service resource files
+       }
+    }
 
     /**
      * Returns all the available service providers of the given type.
@@ -107,18 +124,14 @@ public class ServiceLoader {
         if (loader != null) {
             Set<String> names = new HashSet<String>();
 
-            try {
-                String name = service.getName();
-                Enumeration<URL> resources = loader.getResources("META-INF/services/" + name);
-                for (URL resource : Collections.list(resources)) {
-                    try {
-                        names.addAll(getServiceClassNames(resource));
-                    } catch (IOException e) {
-                        handler.handleLoadError(name, e);
-                    }
+            String serviceName = service.getName();
+            Enumeration<URL> resources = findServiceResources("META-INF/services/" + serviceName);
+            for (URL resource : Collections.list(resources)) {
+                try {
+                    names.addAll(getServiceClassNames(resource));
+                } catch (IOException e) {
+                    handler.handleLoadError(serviceName, e);
                 }
-            } catch (IOException ignore) {
-                // We couldn't get the list of service resource files
             }
 
             for (String name : names) {

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ExternalParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ExternalParser.java?rev=1089516&r1=1089515&r2=1089516&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ExternalParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ExternalParser.java Wed Apr  6 16:19:17 2011
@@ -35,9 +35,12 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 /**
- * Parser that uses an external program (like catdoc or pdf2txt) to extract
- * text content from a given document.
+ * Very basic parser that uses an external program (like catdoc or pdf2txt) 
+ *  to extract text content from a given document.
+ * 
+ * @deprecated Use the more advanced {@link org.apache.tika.parser.external.ExternalParser} instead
  */
+@Deprecated
 public class ExternalParser extends AbstractParser {
 
     /**

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java?rev=1089516&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java Wed Apr  6 16:19:17 2011
@@ -0,0 +1,298 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.external;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.Reader;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.NullOutputStream;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser that uses an external program (like catdoc or pdf2txt) to extract
+ *  text content and metadata from a given document.
+ */
+public class ExternalParser extends AbstractParser {
+    private static final long serialVersionUID = -1079128990650687037L;
+    
+    /**
+     * The token, which if present in the Command string, will
+     *  be replaced with the input filename. 
+     * Alternately, the input data can be streamed over STDIN.
+     */
+    public static final String INPUT_FILE_TOKEN = "${INPUT}";
+    /**
+     * The token, which if present in the Command string, will
+     *  be replaced with the output filename. 
+     * Alternately, the output data can be collected on STDOUT.
+     */
+    public static final String OUTPUT_FILE_TOKEN = "${OUTPUT}";
+
+    /**
+     * Media types supported by the external program.
+     */
+    private Set<MediaType> supportedTypes = Collections.emptySet();
+    
+    /**
+     * Regular Expressions to run over STDOUT to
+     *  extract Metadata.
+     */
+    private Map<Pattern,String> metadataPatterns = null;
+
+    /**
+     * The external command to invoke.
+     * @see Runtime#exec(String[])
+     */
+    private String[] command = new String[] { "cat" };
+    
+    
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return getSupportedTypes();
+    }
+
+    public Set<MediaType> getSupportedTypes() {
+        return supportedTypes;
+    }
+
+    public void setSupportedTypes(Set<MediaType> supportedTypes) {
+        this.supportedTypes =
+            Collections.unmodifiableSet(new HashSet<MediaType>(supportedTypes));
+    }
+
+
+    public String[] getCommand() {
+        return command;
+    }
+
+    /**
+     * Sets the command to be run. This can include either of
+     *  {@link #INPUT_FILE_TOKEN} or {@link #OUTPUT_FILE_TOKEN}
+     *  if the command needs filenames.
+     * @see Runtime#exec(String[])
+     */
+    public void setCommand(String... command) {
+        this.command = command;
+    }
+    
+    
+    public Map<Pattern,String> getMetadataExtractionPatterns() {
+       return metadataPatterns;
+    }
+    
+    /**
+     * Sets the map of regular expression patterns and Metadata
+     *  keys. Any matching patterns will have the matching
+     *  metadata entries set.
+     * Set this to null to disable Metadata extraction.
+     */
+    public void setMetadataExtractionPatterns(Map<Pattern,String> patterns) {
+       this.metadataPatterns = patterns;
+    }
+    
+
+    /**
+     * Executes the configured external command and passes the given document
+     *  stream as a simple XHTML document to the given SAX content handler.
+     * Metadata is only extracted if {@link #setMetadataExtractionPatterns(Map)}
+     *  has been called to set patterns.
+     */
+    public void parse(
+            final InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        
+        boolean inputToStdIn = true;
+        boolean outputFromStdOut = true;
+        boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty());
+        
+        TikaInputStream tikaStream = TikaInputStream.get(stream);
+        
+        // Build our command
+        String[] cmd = new String[command.length];
+        System.arraycopy(command, 0, cmd, 0, command.length);
+        for(int i=0; i<cmd.length; i++) {
+           if(cmd[i].indexOf(INPUT_FILE_TOKEN) != -1) {
+              // Strings are immutable, so the substituted value has to be stored back
+              cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, tikaStream.getFile().toString());
+              inputToStdIn = false;
+           }
+           if(cmd[i].indexOf(OUTPUT_FILE_TOKEN) != -1) {
+              // TODO
+           }
+        }
+
+        // Execute
+        Process process;
+        if(cmd.length == 1) {
+           process = Runtime.getRuntime().exec( cmd[0] );
+        } else {
+           process = Runtime.getRuntime().exec( cmd );
+        }
+        
+        try {
+            if(inputToStdIn) {
+               sendInput(process, stream);
+            } else {
+               process.getOutputStream().close();
+            }
+
+            InputStream out = process.getInputStream();
+            InputStream err = process.getErrorStream();
+            
+            if(hasPatterns) {
+               extractMetadata(err, metadata);
+               if(outputFromStdOut) {
+                  extractOutput(out, xhtml);
+               } else {
+                  extractMetadata(out, metadata);
+               }
+            } else {
+               ignoreStream(err);
+               if(outputFromStdOut) {
+                  extractOutput(out, xhtml);
+               } else {
+                  ignoreStream(out);
+               }
+            }
+        } finally {
+            try {
+                process.waitFor();
+            } catch (InterruptedException ignore) {
+                // Keep the interrupt visible to our caller rather than losing it
+                Thread.currentThread().interrupt();
+            }
+        }
+        
+        // Grab the output if we haven't already
+        // TODO
+    }
+
+    /**
+     * Reads the given stream on the calling thread (no new thread is
+     * started) and writes its text to the given XHTML content handler
+     * as a single paragraph. The stream is closed once fully processed.
+     *
+     * @param stream stream to read, such as the process's standard output
+     * @param xhtml XHTML content handler
+     * @throws SAXException if the XHTML SAX events could not be handled
+     * @throws IOException if an input error occurred
+     */
+    private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
+            throws SAXException, IOException {
+        Reader reader = new InputStreamReader(stream);
+        try {
+            xhtml.startDocument();
+            xhtml.startElement("p");
+            char[] buffer = new char[1024];
+            for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+                xhtml.characters(buffer, 0, n);
+            }
+            xhtml.endElement("p");
+            xhtml.endDocument();
+        } finally {
+            reader.close();
+        }
+    }
+
+    /**
+     * Starts a thread that sends the contents of the given input stream
+     * to the standard input stream of the given process. Potential
+     * exceptions are ignored, and the standard input stream is closed
+     * once fully processed. Note that the given input stream is <em>not</em>
+     * closed by this method.
+     *
+     * @param process process whose standard input receives the data
+     * @param stream input stream to copy from; left open
+     */
+    private void sendInput(final Process process, final InputStream stream) {
+        new Thread() {
+            public void run() {
+                OutputStream stdin = process.getOutputStream(); // the process's standard input
+                try {
+                    IOUtils.copy(stream, stdin);
+                } catch (IOException e) { // deliberately ignored, as per the method contract
+                } finally {
+                    IOUtils.closeQuietly(stdin);
+                }
+            }
+        }.start();
+    }
+
+    /**
+     * Starts a thread that reads and discards the contents of the
+     * given stream (a standard stream of the process). Potential exceptions
+     * are ignored, and the stream is closed once fully processed.
+     *
+     * @param stream stream to drain and then close
+     */
+    private void ignoreStream(final InputStream stream) {
+        new Thread() {
+            public void run() {
+                try {
+                    IOUtils.copy(stream, new NullOutputStream());
+                } catch (IOException e) { // deliberately ignored, as per the method contract
+                } finally {
+                    IOUtils.closeQuietly(stream);
+                }
+            }
+        }.start();
+    }
+    
+    private void extractMetadata(final InputStream stream, final Metadata metadata) {
+       new Thread() { // runs in the background; nothing visible here waits for it to finish
+          public void run() {
+             BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
+             try {
+                String line;
+                while ( (line = reader.readLine()) != null ) {
+                   for(Pattern p : metadataPatterns.keySet()) { // reads the field at match time, not a snapshot
+                      Matcher m = p.matcher(line);
+                      if(m.find()) {
+                         metadata.add( metadataPatterns.get(p), m.group(1) ); // p must define a capturing group
+                      }
+                   }
+                }
+             } catch (IOException e) { // read failures are silently dropped
+             } finally {
+                IOUtils.closeQuietly(reader);
+                IOUtils.closeQuietly(stream);
+            }
+          }
+       }.start();
+    }
+}

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java?rev=1089516&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java Wed Apr  6 16:19:17 2011
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.external;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeTypeException;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+/**
+ * Builds up ExternalParser instances based on XML file(s)
+ *  which define what to run, for what, and how to process
+ *  any output metadata.
+ * Typically used to configure up a series of external programs 
+ *  (like catdoc or pdf2txt) to extract text content from documents.
+ *  
+ * <pre>
+ *  TODO XML DTD Here
+ * </pre>
+ */
+public final class ExternalParsersConfigReader implements ExternalParsersConfigReaderMetKeys {
+   
+   public static List<ExternalParser> read(InputStream stream) throws TikaException, IOException {
+      try {
+          DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+          DocumentBuilder builder = factory.newDocumentBuilder();
+          Document document = builder.parse(new InputSource(stream));
+          return read(document);
+      } catch (ParserConfigurationException e) {
+          throw new TikaException("Unable to create an XML parser", e);
+      } catch (SAXException e) {
+          throw new TikaException("Invalid parser configuration", e);
+      }
+   }
+   
+   public static List<ExternalParser> read(Document document) throws TikaException, IOException {
+      return read(document.getDocumentElement());
+   }
+   
+   /**
+    * Builds an ExternalParser for each parser definition directly under the
+    *  given root element. A null or differently named root is rejected.
+    */
+   public static List<ExternalParser> read(Element element) throws TikaException, IOException {
+      if (element == null || !element.getTagName().equals(EXTERNAL_PARSERS_TAG)) {
+         throw new MimeTypeException(
+               "Not a <" + EXTERNAL_PARSERS_TAG + "/> configuration document: "
+               + (element == null ? "no root element" : element.getTagName()));
+      }
+      
+      List<ExternalParser> parsers = new ArrayList<ExternalParser>();
+      NodeList nodes = element.getChildNodes();
+      for (int i = 0; i < nodes.getLength(); i++) {
+         Node node = nodes.item(i);
+         if (node.getNodeType() == Node.ELEMENT_NODE
+               && ((Element) node).getTagName().equals(PARSER_TAG)) {
+            ExternalParser p = readParser((Element) node);
+            if(p != null) {
+               parsers.add( p );
+            }
+         }
+      }
+      return parsers;
+   }
+   
+   /**
+    * Builds and Returns an ExternalParser, or null if a check
+    *  command was given that didn't match.
+    */
+   private static ExternalParser readParser(Element parserDef) throws TikaException {
+      ExternalParser parser = new ExternalParser();
+
+      NodeList children = parserDef.getChildNodes();
+      for(int i=0; i<children.getLength(); i++) {
+         Node node = children.item(i);
+         if (node.getNodeType() == Node.ELEMENT_NODE) {
+            Element child = (Element) node;
+            if (child.getTagName().equals(CHECK_TAG)) {
+               // TODO
+            }
+            else if (child.getTagName().equals(COMMAND_TAG)) {
+               // An empty element has no first child, so getFirstChild() would be
+               //  null here; read the element's text content instead
+               String commandText = child.getTextContent();
+               if (commandText.trim().length() == 0) {
+                  throw new TikaException("Empty <" + COMMAND_TAG + "/> in external parser definition");
+               }
+               parser.setCommand( commandText );
+            }
+            else if (child.getTagName().equals(MIMETYPES_TAG)) {
+               parser.setSupportedTypes( readMimeTypes(child) );
+            }
+            else if (child.getTagName().equals(METADATA_TAG)) {
+               parser.setMetadataExtractionPatterns( readMetadataPatterns(child) );
+            }
+         }
+      }
+      
+      return parser;
+   }
+   
+   private static Set<MediaType> readMimeTypes(Element mimeTypes) {
+      Set<MediaType> types = new HashSet<MediaType>(); // TODO the element isn't read yet, so always empty
+      return types;
+   }
+   
+   private static Map<Pattern,String> readMetadataPatterns(Element metadataDef) {
+      Map<Pattern, String> metadata = new HashMap<Pattern, String>(); // regular expression -> metadata key
+      
+      NodeList children = metadataDef.getChildNodes();
+      for(int i=0; i<children.getLength(); i++) {
+         Node node = children.item(i);
+         if (node.getNodeType() == Node.ELEMENT_NODE) {
+            Element child = (Element) node;
+            if (child.getTagName().equals(METADATA_MATCH_TAG)) {
+               String metadataKey = child.getAttribute(METADATA_KEY_ATTR); // NOTE(review): "" if the attribute is absent
+               Pattern pattern = Pattern.compile( child.getFirstChild().getNodeValue() ); // NOTE(review): NPE on an empty element
+               metadata.put(pattern, metadataKey);
+            }
+         }
+      }
+      
+      return metadata;
+   }
+}

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java?rev=1089516&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java Wed Apr  6 16:19:17 2011
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.external;
+
+/**
+ * Met Keys used by the {@link ExternalParsersConfigReader}.
+ */
+public interface ExternalParsersConfigReaderMetKeys {
+
+    String EXTERNAL_PARSERS_TAG = "external-parsers";
+
+    String PARSER_TAG = "parser";
+
+    String COMMAND_TAG = "command";
+    
+    String CHECK_TAG = "check";
+    
+    String MIMETYPES_TAG = "mime-types";
+    
+    String MIMETYPE_TAG = "mime-type";
+    
+    String METADATA_TAG = "metadata";
+    
+    String METADATA_MATCH_TAG = "match";
+    
+    String METADATA_KEY_ATTR = "key";
+}

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java?rev=1089516&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java Wed Apr  6 16:19:17 2011
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.external;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.List;
+
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+
+/**
+ * Creates instances of ExternalParser based on XML 
+ *  configuration files.
+ *  
+ * @see ExternalParsersConfigReader
+ */
+public class ExternalParsersFactory {
+   
+   public static List<ExternalParser> create() throws IOException, TikaException {
+      return create(new ServiceLoader());
+   }
+   
+   public static List<ExternalParser> create(ServiceLoader loader) 
+           throws IOException, TikaException {
+      return create("tika-external-parsers.xml", loader);
+   }
+   
+   public static List<ExternalParser> create(String filename, ServiceLoader loader) 
+           throws IOException, TikaException {
+      String filepath = ExternalParsersFactory.class.getPackage().getName().replace('.', '/') +
+                     "/" + filename;
+      Enumeration<URL> files = loader.findServiceResources(filepath);
+      ArrayList<URL> list = Collections.list(files);
+      URL[] urls = list.toArray(new URL[list.size()]);
+      return create(urls);
+   }
+   
+   public static List<ExternalParser> create(URL... urls) throws IOException, TikaException {
+      List<ExternalParser> parsers = new ArrayList<ExternalParser>();
+      for(URL url : urls) {
+         InputStream stream = url.openStream();
+         try {
+            parsers.addAll(
+                  ExternalParsersConfigReader.read(stream)
+            );
+         } finally {
+            stream.close();
+         }
+      }
+      return parsers;
+   }
+   
+   public static void attachExternalParsers(TikaConfig config) throws IOException, TikaException {
+      attachExternalParsers( create(), config );
+   }
+   
+   public static void attachExternalParsers(List<ExternalParser> parsers, TikaConfig config) {
+      // TODO
+   }
+}