You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/04/06 18:19:17 UTC
svn commit: r1089516 - in
/tika/trunk/tika-core/src/main/java/org/apache/tika: config/ parser/
parser/external/
Author: nick
Date: Wed Apr 6 16:19:17 2011
New Revision: 1089516
URL: http://svn.apache.org/viewvc?rev=1089516&view=rev
Log:
TIKA-634 - Initial work on supporting more flexible ExternalParser loading (via XML, part done), and external parser metadata extraction
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ExternalParser.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java?rev=1089516&r1=1089515&r2=1089516&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java Wed Apr 6 16:19:17 2011
@@ -93,6 +93,23 @@ public class ServiceLoader {
public ServiceLoader() {
this(getContextClassLoader());
}
+
+    /**
+     * Returns all the available service resources matching the
+     * given pattern, such as all instances of tika-mimetypes.xml
+     * on the classpath, or all org.apache.tika.parser.Parser
+     * service files.
+     *
+     * @param filePattern name of the resource to look up via the class loader
+     * @return the matching resource URLs; an empty enumeration if no class
+     *         loader is available or the lookup failed, never <code>null</code>
+     */
+    public Enumeration<URL> findServiceResources(String filePattern) {
+        // The loader can be null (the provider loading code checks for
+        // that as well), so it must not be dereferenced blindly here
+        if (loader != null) {
+            try {
+                return loader.getResources(filePattern);
+            } catch (IOException ignore) {
+                // We couldn't get the list of service resource files
+            }
+        }
+        List<URL> empty = Collections.emptyList();
+        return Collections.enumeration(empty);
+    }
/**
* Returns all the available service providers of the given type.
@@ -107,18 +124,14 @@ public class ServiceLoader {
if (loader != null) {
Set<String> names = new HashSet<String>();
- try {
- String name = service.getName();
- Enumeration<URL> resources = loader.getResources("META-INF/services/" + name);
- for (URL resource : Collections.list(resources)) {
- try {
- names.addAll(getServiceClassNames(resource));
- } catch (IOException e) {
- handler.handleLoadError(name, e);
- }
+ String serviceName = service.getName();
+ Enumeration<URL> resources = findServiceResources("META-INF/services/" + serviceName);
+ for (URL resource : Collections.list(resources)) {
+ try {
+ names.addAll(getServiceClassNames(resource));
+ } catch (IOException e) {
+ handler.handleLoadError(serviceName, e);
}
- } catch (IOException ignore) {
- // We couldn't get the list of service resource files
}
for (String name : names) {
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ExternalParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ExternalParser.java?rev=1089516&r1=1089515&r2=1089516&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ExternalParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ExternalParser.java Wed Apr 6 16:19:17 2011
@@ -35,9 +35,12 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
- * Parser that uses an external program (like catdoc or pdf2txt) to extract
- * text content from a given document.
+ * Very basic parser that uses an external program (like catdoc or pdf2txt)
+ * to extract text content from a given document.
+ *
+ * @deprecated Use the more advanced {@link org.apache.tika.parser.external.ExternalParser} instead
*/
+@Deprecated
public class ExternalParser extends AbstractParser {
/**
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java?rev=1089516&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java Wed Apr 6 16:19:17 2011
@@ -0,0 +1,298 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.external;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.Reader;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.NullOutputStream;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser that uses an external program (like catdoc or pdf2txt) to extract
+ * text content and metadata from a given document.
+ */
+public class ExternalParser extends AbstractParser {
+ private static final long serialVersionUID = -1079128990650687037L;
+
+ /**
+ * The token, which if present in the Command string, will
+ * be replaced with the input filename.
+ * Alternately, the input data can be streamed over STDIN.
+ */
+ public static final String INPUT_FILE_TOKEN = "${INPUT}";
+ /**
+ * The token, which if present in the Command string, will
+ * be replaced with the output filename.
+ * Alternately, the output data can be collected on STDOUT.
+ * Note that the parse method only detects this token so far;
+ * the replacement itself is still a TODO there.
+ */
+ public static final String OUTPUT_FILE_TOKEN = "${OUTPUT}";
+
+ /**
+ * Media types supported by the external program.
+ * Starts out as an empty set.
+ */
+ private Set<MediaType> supportedTypes = Collections.emptySet();
+
+ /**
+ * Regular Expressions to run over the output of the
+ * external program to extract Metadata. The parse method
+ * currently applies them to STDERR, as STDOUT is consumed
+ * as the text content. A null or empty map means that no
+ * Metadata is extracted.
+ */
+ private Map<Pattern,String> metadataPatterns = null;
+
+ /**
+ * The external command to invoke. Defaults to running "cat".
+ * @see Runtime#exec(String[])
+ */
+ private String[] command = new String[] { "cat" };
+
+
+
+    /**
+     * Returns the media types handled by the external program.
+     * The parse context is not consulted.
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        final Set<MediaType> types = getSupportedTypes();
+        return types;
+    }
+
+    /**
+     * Returns the media types handled by the external program.
+     */
+    public Set<MediaType> getSupportedTypes() {
+        return this.supportedTypes;
+    }
+
+    /**
+     * Replaces the supported media types with an unmodifiable
+     * copy of the given set.
+     */
+    public void setSupportedTypes(Set<MediaType> supportedTypes) {
+        Set<MediaType> copy = new HashSet<MediaType>(supportedTypes);
+        this.supportedTypes = Collections.unmodifiableSet(copy);
+    }
+
+
+    /**
+     * Returns the command that will be run, as previously set
+     * through {@link #setCommand(String...)}.
+     */
+    public String[] getCommand() {
+        return this.command;
+    }
+
+    /**
+     * Defines the command to run. Either of {@link #INPUT_FILE_TOKEN}
+     * or {@link #OUTPUT_FILE_TOKEN} may be part of it, for commands
+     * which need to be given filenames.
+     * @see Runtime#exec(String[])
+     */
+    public void setCommand(String... command) {
+        this.command = command;
+    }
+
+
+    /**
+     * Returns the map of regular expression patterns to Metadata
+     * keys, which is null unless one has been set.
+     */
+    public Map<Pattern,String> getMetadataExtractionPatterns() {
+        return this.metadataPatterns;
+    }
+
+    /**
+     * Defines which regular expression patterns populate which
+     * Metadata keys. Whenever a pattern matches, the Metadata
+     * entry for its key is set.
+     * Pass in null to turn Metadata extraction off.
+     */
+    public void setMetadataExtractionPatterns(Map<Pattern,String> patterns) {
+        this.metadataPatterns = patterns;
+    }
+
+
+    /**
+     * Executes the configured external command and passes the given document
+     * stream as a simple XHTML document to the given SAX content handler.
+     * Metadata is only extracted if {@link #setMetadataExtractionPatterns(Map)}
+     * has been called to set patterns.
+     * <p>
+     * If the command contains {@link #INPUT_FILE_TOKEN}, the token is replaced
+     * with the path of a file holding the document and the STDIN of the
+     * process is closed straight away; otherwise the document is streamed to
+     * STDIN. {@link #OUTPUT_FILE_TOKEN} is recognised but not handled yet.
+     *
+     * @param stream the document to hand to the external program
+     * @param handler receives the XHTML SAX events for the extracted text
+     * @param metadata receives any values matched by the metadata patterns
+     * @param context the parse context (not used by this method)
+     * @throws IOException if the process can't be started or its output read
+     * @throws SAXException if the XHTML SAX events could not be handled
+     * @throws TikaException declared by the parser contract
+     */
+    public void parse(
+            final InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        XHTMLContentHandler xhtml =
+            new XHTMLContentHandler(handler, metadata);
+
+        boolean inputToStdIn = true;
+        boolean outputFromStdOut = true;
+        boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty());
+
+        TikaInputStream tikaStream = TikaInputStream.get(stream);
+
+        // Build our command, working on a copy so that the configured
+        // command keeps its tokens for the next document
+        String[] cmd = new String[command.length];
+        System.arraycopy(command, 0, cmd, 0, command.length);
+        for (int i = 0; i < cmd.length; i++) {
+            if (cmd[i].indexOf(INPUT_FILE_TOKEN) != -1) {
+                // Strings are immutable, so the outcome of replace()
+                // has to be stored back for the token to be substituted
+                cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, tikaStream.getFile().toString());
+                inputToStdIn = false;
+            }
+            if (cmd[i].indexOf(OUTPUT_FILE_TOKEN) != -1) {
+                // TODO
+            }
+        }
+
+        // Execute. A single entry is passed as a whole command line,
+        // which lets Runtime split it into the program and arguments
+        Process process;
+        if (cmd.length == 1) {
+            process = Runtime.getRuntime().exec( cmd[0] );
+        } else {
+            process = Runtime.getRuntime().exec( cmd );
+        }
+
+        try {
+            if (inputToStdIn) {
+                sendInput(process, stream);
+            } else {
+                // The program reads from the file, so it gets no STDIN
+                process.getOutputStream().close();
+            }
+
+            InputStream out = process.getInputStream();
+            InputStream err = process.getErrorStream();
+
+            if (hasPatterns) {
+                extractMetadata(err, metadata);
+
+                if (outputFromStdOut) {
+                    extractOutput(out, xhtml);
+                } else {
+                    extractMetadata(out, metadata);
+                }
+            } else {
+                ignoreStream(err);
+
+                if (outputFromStdOut) {
+                    extractOutput(out, xhtml);
+                } else {
+                    ignoreStream(out);
+                }
+            }
+        } finally {
+            try {
+                process.waitFor();
+            } catch (InterruptedException ignore) {
+                // Don't fail the parse, but keep the interrupt visible
+                // to our caller rather than silently clearing it
+                Thread.currentThread().interrupt();
+            }
+        }
+
+        // Grab the output if we haven't already
+        // TODO
+    }
+
+    /**
+     * Copies the characters read from the given stream, decoded with the
+     * platform default charset, into a single paragraph of an XHTML
+     * document sent to the given content handler. The stream is read
+     * on the calling thread, and closed once fully processed.
+     *
+     * @param stream stream to read the text from
+     * @param xhtml XHTML content handler
+     * @throws SAXException if the XHTML SAX events could not be handled
+     * @throws IOException if an input error occurred
+     */
+    private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
+            throws SAXException, IOException {
+        Reader reader = new InputStreamReader(stream);
+        try {
+            char[] chunk = new char[1024];
+            xhtml.startDocument();
+            xhtml.startElement("p");
+            int count;
+            while ((count = reader.read(chunk)) != -1) {
+                xhtml.characters(chunk, 0, count);
+            }
+            xhtml.endElement("p");
+            xhtml.endDocument();
+        } finally {
+            reader.close();
+        }
+    }
+
+    /**
+     * Spawns a thread which copies the given input stream into the
+     * standard input of the given process. Any IOException raised while
+     * copying is discarded, and the standard input of the process is
+     * always closed afterwards. The given input stream itself is
+     * <em>not</em> closed here.
+     *
+     * @param process process
+     * @param stream input stream
+     */
+    private void sendInput(final Process process, final InputStream stream) {
+        Runnable copier = new Runnable() {
+            public void run() {
+                OutputStream processInput = process.getOutputStream();
+                try {
+                    IOUtils.copy(stream, processInput);
+                } catch (IOException e) {
+                } finally {
+                    IOUtils.closeQuietly(processInput);
+                }
+            }
+        };
+        new Thread(copier).start();
+    }
+
+    /**
+     * Spawns a thread which drains the given stream, throwing away
+     * everything read from it. Any IOException raised while reading
+     * is discarded, and the stream is always closed afterwards.
+     *
+     * @param stream the stream to read and discard
+     */
+    private void ignoreStream(final InputStream stream) {
+        Runnable drainer = new Runnable() {
+            public void run() {
+                try {
+                    IOUtils.copy(stream, new NullOutputStream());
+                } catch (IOException e) {
+                } finally {
+                    IOUtils.closeQuietly(stream);
+                }
+            }
+        };
+        new Thread(drainer).start();
+    }
+
+    /**
+     * Spawns a thread which reads the given stream line by line, and
+     * tries every metadata pattern against each line. For a pattern
+     * that is found, the first capturing group is added to the given
+     * Metadata under the key the pattern is mapped to. Any IOException
+     * is discarded, and the stream is always closed afterwards.
+     *
+     * @param stream the stream to look for metadata in
+     * @param metadata where matched values are added
+     */
+    private void extractMetadata(final InputStream stream, final Metadata metadata) {
+        Runnable matcher = new Runnable() {
+            public void run() {
+                BufferedReader lines = new BufferedReader(new InputStreamReader(stream));
+                try {
+                    for (String line = lines.readLine(); line != null; line = lines.readLine()) {
+                        for (Map.Entry<Pattern,String> entry : metadataPatterns.entrySet()) {
+                            Matcher m = entry.getKey().matcher(line);
+                            if (m.find()) {
+                                metadata.add( entry.getValue(), m.group(1) );
+                            }
+                        }
+                    }
+                } catch (IOException e) {
+                } finally {
+                    IOUtils.closeQuietly(lines);
+                    IOUtils.closeQuietly(stream);
+                }
+            }
+        };
+        new Thread(matcher).start();
+    }
+}
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java?rev=1089516&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java Wed Apr 6 16:19:17 2011
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.external;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeTypeException;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+/**
+ * Builds up ExternalParser instances based on XML file(s)
+ * which define what to run, for what, and how to process
+ * any output metadata.
+ * Typically used to configure a series of external programs
+ * (like catdoc or pdf2txt) to extract text content from documents.
+ *
+ * <pre>
+ * TODO XML DTD Here
+ * </pre>
+ */
+public final class ExternalParsersConfigReader implements ExternalParsersConfigReaderMetKeys {
+
+    /**
+     * Parses the given stream as an XML document, and builds the
+     * external parsers which it defines.
+     *
+     * @param stream the XML configuration to read
+     * @return the parsers defined by the configuration
+     * @throws TikaException if no XML parser could be created, or
+     *         the configuration isn't valid
+     * @throws IOException if the stream could not be read
+     */
+    public static List<ExternalParser> read(InputStream stream) throws TikaException, IOException {
+        final Document document;
+        try {
+            DocumentBuilder builder =
+                DocumentBuilderFactory.newInstance().newDocumentBuilder();
+            document = builder.parse(new InputSource(stream));
+        } catch (ParserConfigurationException e) {
+            throw new TikaException("Unable to create an XML parser", e);
+        } catch (SAXException e) {
+            throw new TikaException("Invalid parser configuration", e);
+        }
+        return read(document);
+    }
+
+ public static List<ExternalParser> read(Document document) throws TikaException, IOException {
+ return read(document.getDocumentElement());
+ }
+
+    /**
+     * Builds the external parsers defined by the parser elements
+     * found directly below the given root element. Child nodes
+     * which aren't parser elements are skipped.
+     *
+     * @param element the root element of the configuration
+     * @return the parsers which were defined, possibly none
+     * @throws TikaException if the element is null, or isn't the
+     *         expected root element
+     * @throws IOException declared for symmetry with the other read methods
+     */
+    public static List<ExternalParser> read(Element element) throws TikaException, IOException {
+        // Check the null case up front, so that a missing root element
+        // gets the proper exception rather than a NullPointerException
+        // from building the error message
+        if (element == null || !element.getTagName().equals(EXTERNAL_PARSERS_TAG)) {
+            String found = (element == null) ? "null" : element.getTagName();
+            throw new MimeTypeException(
+                    "Not a <" + EXTERNAL_PARSERS_TAG + "/> configuration document: "
+                    + found);
+        }
+
+        List<ExternalParser> parsers = new ArrayList<ExternalParser>();
+        NodeList nodes = element.getChildNodes();
+        for (int i = 0; i < nodes.getLength(); i++) {
+            Node node = nodes.item(i);
+            if (node.getNodeType() == Node.ELEMENT_NODE) {
+                Element child = (Element) node;
+                if (child.getTagName().equals(PARSER_TAG)) {
+                    ExternalParser p = readParser(child);
+                    if (p != null) {
+                        parsers.add( p );
+                    }
+                }
+            }
+        }
+
+        return parsers;
+    }
+
+    /**
+     * Builds and Returns an ExternalParser from the child elements
+     * of the given parser definition. Handling of the check element
+     * is still to be done, so currently a parser is always returned.
+     *
+     * @param parserDef the parser element to read
+     * @return the configured parser
+     * @throws TikaException if the command element has no text
+     */
+    private static ExternalParser readParser(Element parserDef) throws TikaException {
+        ExternalParser parser = new ExternalParser();
+
+        NodeList children = parserDef.getChildNodes();
+        for (int i = 0; i < children.getLength(); i++) {
+            Node node = children.item(i);
+            if (node.getNodeType() == Node.ELEMENT_NODE) {
+                Element child = (Element) node;
+                if (child.getTagName().equals(CHECK_TAG)) {
+                    // TODO
+                }
+                else if (child.getTagName().equals(COMMAND_TAG)) {
+                    // An empty element has no child nodes at all, so
+                    // report that properly instead of hitting a
+                    // NullPointerException
+                    Node text = child.getFirstChild();
+                    if (text == null || text.getNodeValue() == null) {
+                        throw new TikaException(
+                                "No command given in <" + COMMAND_TAG + "/>");
+                    }
+                    parser.setCommand(
+                            text.getNodeValue()
+                    );
+                }
+                else if (child.getTagName().equals(MIMETYPES_TAG)) {
+                    parser.setSupportedTypes(
+                            readMimeTypes(child)
+                    );
+                }
+                else if (child.getTagName().equals(METADATA_TAG)) {
+                    parser.setMetadataExtractionPatterns(
+                            readMetadataPatterns(child)
+                    );
+                }
+            }
+        }
+
+        return parser;
+    }
+
+    /**
+     * Reads the media types listed in the mime type elements found
+     * directly below the given element. Entries which have no text,
+     * or whose text can't be parsed as a media type, are skipped.
+     *
+     * @param mimeTypes the element holding the mime type elements
+     * @return the media types which were listed, possibly none
+     */
+    private static Set<MediaType> readMimeTypes(Element mimeTypes) {
+        Set<MediaType> types = new HashSet<MediaType>();
+
+        NodeList children = mimeTypes.getChildNodes();
+        for (int i = 0; i < children.getLength(); i++) {
+            Node node = children.item(i);
+            if (node.getNodeType() == Node.ELEMENT_NODE) {
+                Element child = (Element) node;
+                if (child.getTagName().equals(MIMETYPE_TAG)) {
+                    Node text = child.getFirstChild();
+                    if (text != null && text.getNodeValue() != null) {
+                        // parse returns null for text which isn't a valid type
+                        MediaType type = MediaType.parse( text.getNodeValue() );
+                        if (type != null) {
+                            types.add( type );
+                        }
+                    }
+                }
+            }
+        }
+
+        return types;
+    }
+
+    /**
+     * Reads the match elements found directly below the given element,
+     * compiling the text of each as a regular expression, and mapping
+     * it to the metadata key given by the key attribute of the element.
+     * Match elements without any text are skipped.
+     *
+     * @param metadataDef the element holding the match elements
+     * @return the patterns and their metadata keys, possibly none
+     */
+    private static Map<Pattern,String> readMetadataPatterns(Element metadataDef) {
+        Map<Pattern, String> metadata = new HashMap<Pattern, String>();
+
+        NodeList children = metadataDef.getChildNodes();
+        for (int i = 0; i < children.getLength(); i++) {
+            Node node = children.item(i);
+            if (node.getNodeType() == Node.ELEMENT_NODE) {
+                Element child = (Element) node;
+                if (child.getTagName().equals(METADATA_MATCH_TAG)) {
+                    // An empty element has no child nodes at all, so
+                    // there is no pattern which could be compiled
+                    Node text = child.getFirstChild();
+                    if (text == null || text.getNodeValue() == null) {
+                        continue;
+                    }
+                    String metadataKey = child.getAttribute(METADATA_KEY_ATTR);
+                    Pattern pattern = Pattern.compile( text.getNodeValue() );
+                    metadata.put(pattern, metadataKey);
+                }
+            }
+        }
+
+        return metadata;
+    }
+}
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java?rev=1089516&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java Wed Apr 6 16:19:17 2011
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.external;
+
+/**
+ * Met Keys used by the {@link ExternalParsersConfigReader}, which
+ * are the names of the XML elements and attributes making up an
+ * external parsers configuration document.
+ */
+public interface ExternalParsersConfigReaderMetKeys {
+
+ /** Name of the root element of the configuration document. */
+ String EXTERNAL_PARSERS_TAG = "external-parsers";
+
+ /** Name of the element, below the root, which defines one parser. */
+ String PARSER_TAG = "parser";
+
+ /** Name of the element, below a parser, holding the command to run. */
+ String COMMAND_TAG = "command";
+
+ /** Name of the check element of a parser; the reader doesn't act on it yet. */
+ String CHECK_TAG = "check";
+
+ /** Name of the element, below a parser, holding the supported mime types. */
+ String MIMETYPES_TAG = "mime-types";
+
+ /** Name of the element for a single mime type; presumably nested in mime-types. */
+ String MIMETYPE_TAG = "mime-type";
+
+ /** Name of the element, below a parser, holding the metadata patterns. */
+ String METADATA_TAG = "metadata";
+
+ /** Name of the element, below metadata, whose text is one regular expression. */
+ String METADATA_MATCH_TAG = "match";
+
+ /** Name of the attribute of a match element giving the metadata key to set. */
+ String METADATA_KEY_ATTR = "key";
+}
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java?rev=1089516&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java Wed Apr 6 16:19:17 2011
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.external;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.List;
+
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+
+/**
+ * Builds ExternalParser instances from the XML configuration
+ * files which define them.
+ *
+ * @see ExternalParsersConfigReader
+ */
+public class ExternalParsersFactory {
+
+    /**
+     * Creates the parsers defined in the default configuration
+     * file, located using a newly created ServiceLoader.
+     */
+    public static List<ExternalParser> create() throws IOException, TikaException {
+        ServiceLoader loader = new ServiceLoader();
+        return create(loader);
+    }
+
+    /**
+     * Creates the parsers defined in the default configuration
+     * file, located using the given ServiceLoader.
+     */
+    public static List<ExternalParser> create(ServiceLoader loader)
+            throws IOException, TikaException {
+        return create("tika-external-parsers.xml", loader);
+    }
+
+    /**
+     * Creates the parsers defined in every resource with the given
+     * filename which the given ServiceLoader finds within the
+     * package of this class.
+     */
+    public static List<ExternalParser> create(String filename, ServiceLoader loader)
+            throws IOException, TikaException {
+        String packagePath =
+            ExternalParsersFactory.class.getPackage().getName().replace('.', '/');
+        Enumeration<URL> files = loader.findServiceResources(packagePath + "/" + filename);
+        List<URL> found = Collections.list(files);
+        return create(found.toArray(new URL[found.size()]));
+    }
+
+    /**
+     * Creates the parsers defined in the configuration files at
+     * each of the given URLs, in the order the URLs were given.
+     * Each file is closed again once it has been read.
+     */
+    public static List<ExternalParser> create(URL... urls) throws IOException, TikaException {
+        List<ExternalParser> all = new ArrayList<ExternalParser>();
+        for (int i = 0; i < urls.length; i++) {
+            InputStream config = urls[i].openStream();
+            try {
+                List<ExternalParser> defined = ExternalParsersConfigReader.read(config);
+                all.addAll(defined);
+            } finally {
+                config.close();
+            }
+        }
+        return all;
+    }
+
+    /**
+     * Creates the parsers from the default configuration file,
+     * and hands them over to be attached to the given config.
+     */
+    public static void attachExternalParsers(TikaConfig config) throws IOException, TikaException {
+        List<ExternalParser> parsers = create();
+        attachExternalParsers( parsers, config );
+    }
+
+    /**
+     * Will attach the given parsers to the given config, but
+     * doesn't do anything yet.
+     */
+    public static void attachExternalParsers(List<ExternalParser> parsers, TikaConfig config) {
+        // TODO
+    }
+}