You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/04/06 19:39:33 UTC
svn commit: r1089543 - in
/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external:
CompositeExternalParser.java ExternalParser.java
ExternalParsersConfigReader.java ExternalParsersConfigReaderMetKeys.java
Author: nick
Date: Wed Apr 6 17:39:32 2011
New Revision: 1089543
URL: http://svn.apache.org/viewvc?rev=1089543&view=rev
Log:
TIKA-634 - Add support for checking if the external command is there, for collecting the output from a file, and a wrapper CompositeParser that loads all available External Parsers
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java?rev=1089543&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java Wed Apr 6 17:39:32 2011
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.external;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.Parser;
+
+/**
+ * A Composite Parser that wraps up all the available External Parsers,
+ * and provides an easy way to access them.
+ * <p>
+ * The wrapped parsers are whatever {@link ExternalParsersFactory#create()}
+ * returns at the time this object is constructed.
+ */
+public class CompositeExternalParser extends CompositeParser {
+ private static final long serialVersionUID = 6962436916649024024L;
+
+ /**
+ * Creates the composite parser using a freshly created
+ * {@link MediaTypeRegistry}.
+ *
+ * @throws IOException declared by the delegated-to constructor
+ * @throws TikaException declared by the delegated-to constructor
+ */
+ public CompositeExternalParser() throws IOException, TikaException {
+ this(new MediaTypeRegistry());
+ }
+
+ /**
+ * Creates the composite parser over the given registry, handing the
+ * parent class the parsers built by {@link ExternalParsersFactory#create()}.
+ *
+ * @param registry the media type registry passed on to {@link CompositeParser}
+ * @throws IOException presumably raised while the factory loads the
+ * external parser definitions - confirm against ExternalParsersFactory
+ * @throws TikaException presumably raised by the factory as well - confirm
+ */
+ @SuppressWarnings("unchecked")
+ public CompositeExternalParser(MediaTypeRegistry registry) throws IOException, TikaException {
+ super(
+ registry,
+ // Two-step cast: the factory's list is first viewed as a
+ // List<? extends Parser>, then (unchecked) as the List<Parser>
+ // handed to the parent constructor - hence the suppression above
+ (List<Parser>)(List<? extends Parser>)ExternalParsersFactory.create()
+ );
+ }
+}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java?rev=1089543&r1=1089542&r2=1089543&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java Wed Apr 6 17:39:32 2011
@@ -17,6 +17,8 @@
package org.apache.tika.parser.external;
import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
@@ -32,6 +34,7 @@ import java.util.regex.Pattern;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
import org.apache.tika.io.NullOutputStream;
+import org.apache.tika.io.TemporaryFiles;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -78,6 +81,7 @@ public class ExternalParser extends Abst
*/
private String[] command = new String[] { "cat" };
+ private TemporaryFiles tmp = new TemporaryFiles();
public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -142,17 +146,19 @@ public class ExternalParser extends Abst
boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty());
TikaInputStream tikaStream = TikaInputStream.get(stream);
+ File output = null;
// Build our command
String[] cmd = new String[command.length];
System.arraycopy(command, 0, cmd, 0, command.length);
for(int i=0; i<cmd.length; i++) {
if(cmd[i].indexOf(INPUT_FILE_TOKEN) != -1) {
- cmd[i].replace(INPUT_FILE_TOKEN, tikaStream.getFile().toString());
+ cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, tikaStream.getFile().toString());
inputToStdIn = false;
}
if(cmd[i].indexOf(OUTPUT_FILE_TOKEN) != -1) {
- // TODO
+ output = tmp.createTemporaryFile();
+ outputFromStdOut = false;
}
}
@@ -199,7 +205,11 @@ public class ExternalParser extends Abst
}
// Grab the output if we haven't already
- // TODO
+ if(!outputFromStdOut) {
+ FileInputStream out = new FileInputStream(output);
+ extractOutput(out, xhtml);
+ tmp.dispose();
+ }
}
/**
@@ -295,4 +305,42 @@ public class ExternalParser extends Abst
}
}.start();
}
+
+ /**
+  * Tests whether an external command is usable by actually running it.
+  * A typical check is something like "myapp --version", which tells
+  * us whether "myapp" is installed and can be found on the path.
+  * <p>
+  * This is a convenience form which wraps the single command string
+  * in a one element array and hands over to
+  * {@link #check(String[], int...)}.
+  *
+  * @param checkCmd The check command to run
+  * @param errorValue The exit values which are to be treated as errors
+  * @return the result of {@link #check(String[], int...)} for that command
+  */
+ public static boolean check(String checkCmd, int... errorValue) {
+     String[] singleCommand = { checkCmd };
+     return check(singleCommand, errorValue);
+ }
+ /**
+  * Checks to see if the command can be run, by launching it, waiting
+  * for it to finish and inspecting its exit value.
+  * <p>
+  * A one element array is passed to {@link Runtime#exec(String)}, which
+  * splits the string on whitespace into the program and its arguments.
+  * Longer arrays are passed to {@link Runtime#exec(String[])} as they are.
+  *
+  * @param checkCmd The check command to run, e.g. { "myapp", "--version" }
+  * @param errorValue The exit values which mean the command isn't usable.
+  *        If none are given, 127 is used (the status shells conventionally
+  *        report for "command not found").
+  * @return false if there is no command to run, if the command could not
+  *         be started, if the wait for it was interrupted, or if it
+  *         exited with one of the error values; true otherwise
+  */
+ public static boolean check(String[] checkCmd, int... errorValue) {
+     if(checkCmd == null || checkCmd.length == 0) {
+         // Nothing to run, so it can't be reported as available
+         return false;
+     }
+     if(errorValue == null || errorValue.length == 0) {
+         errorValue = new int[] { 127 };
+     }
+
+     Process process = null;
+     try {
+         if(checkCmd.length == 1) {
+             process = Runtime.getRuntime().exec(checkCmd[0]);
+         } else {
+             process = Runtime.getRuntime().exec(checkCmd);
+         }
+         int result = process.waitFor();
+
+         for(int err : errorValue) {
+             if(result == err) return false;
+         }
+         return true;
+     } catch(IOException e) {
+         // Couldn't start it - the command isn't there, or is broken
+         return false;
+     } catch (InterruptedException ie) {
+         // We never saw an exit value, so we can't vouch for the command.
+         // Restore the flag so our caller can still see the interruption
+         Thread.currentThread().interrupt();
+         return false;
+     } finally {
+         if(process != null) {
+             // Don't leave the child behind if we stopped waiting early;
+             //  this is harmless once it has already exited
+             process.destroy();
+         }
+     }
+ }
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java?rev=1089543&r1=1089542&r2=1089543&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java Wed Apr 6 17:39:32 2011
@@ -24,6 +24,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.StringTokenizer;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
@@ -109,12 +110,13 @@ public final class ExternalParsersConfig
if (node.getNodeType() == Node.ELEMENT_NODE) {
Element child = (Element) node;
if (child.getTagName().equals(CHECK_TAG)) {
- // TODO
+ boolean present = readCheckTagAndCheck(child);
+ if(! present) {
+ return null;
+ }
}
else if (child.getTagName().equals(COMMAND_TAG)) {
- parser.setCommand(
- child.getFirstChild().getNodeValue()
- );
+ parser.setCommand( getString(child) );
}
else if (child.getTagName().equals(MIMETYPES_TAG)) {
parser.setSupportedTypes(
@@ -134,6 +136,18 @@ public final class ExternalParsersConfig
+ /**
+  * Collects the media types listed as MIMETYPE_TAG child elements
+  *  of the given element.
+  * Entries whose text can't be parsed as a media type are skipped, so
+  *  the returned set never contains null.
+  *
+  * @param mimeTypes the element holding the MIMETYPE_TAG children
+  * @return the parsed media types, empty if there were none
+  */
private static Set<MediaType> readMimeTypes(Element mimeTypes) {
Set<MediaType> types = new HashSet<MediaType>();
+
+ NodeList children = mimeTypes.getChildNodes();
+ for(int i=0; i<children.getLength(); i++) {
+     Node node = children.item(i);
+     if (node.getNodeType() != Node.ELEMENT_NODE) {
+         // Whitespace, comments etc between the entries
+         continue;
+     }
+
+     Element child = (Element) node;
+     if (child.getTagName().equals(MIMETYPE_TAG)) {
+         // MediaType.parse gives back null for text it can't parse,
+         //  and we don't want that ending up in the supported types
+         MediaType type = MediaType.parse( getString(child) );
+         if (type != null) {
+             types.add(type);
+         }
+     }
+ }
+
return types;
}
@@ -147,7 +161,7 @@ public final class ExternalParsersConfig
Element child = (Element) node;
if (child.getTagName().equals(METADATA_MATCH_TAG)) {
String metadataKey = child.getAttribute(METADATA_KEY_ATTR);
- Pattern pattern = Pattern.compile( child.getFirstChild().getNodeValue() );
+ Pattern pattern = Pattern.compile( getString(child) );
metadata.put(pattern, metadataKey);
}
}
@@ -155,4 +169,56 @@ public final class ExternalParsersConfig
return metadata;
}
+
+ /**
+  * Reads a check definition, and runs the check that it describes.
+  * <p>
+  * The COMMAND_TAG child supplies the command to run, and the
+  *  ERROR_CODES_TAG child a whitespace separated list of the exit
+  *  values that mean the command isn't usable. Entries in that list
+  *  which aren't integers are ignored.
+  *
+  * @param checkDef the check element to read
+  * @return the result of {@link ExternalParser#check(String, int...)},
+  *         or true if no (non-blank) check command was given
+  */
+ private static boolean readCheckTagAndCheck(Element checkDef) {
+     String command = null;
+     List<Integer> errorVals = new ArrayList<Integer>();
+
+     NodeList children = checkDef.getChildNodes();
+     for(int i=0; i<children.getLength(); i++) {
+         Node node = children.item(i);
+         if (node.getNodeType() != Node.ELEMENT_NODE) {
+             continue;
+         }
+
+         Element child = (Element) node;
+         if (child.getTagName().equals(COMMAND_TAG)) {
+             command = getString(child);
+         } else if (child.getTagName().equals(ERROR_CODES_TAG)) {
+             StringTokenizer st = new StringTokenizer( getString(child) );
+             while(st.hasMoreTokens()) {
+                 try {
+                     errorVals.add(Integer.parseInt(st.nextToken()));
+                 } catch(NumberFormatException ignored) {
+                     // Deliberately lenient - skip anything that isn't
+                     //  a number, and keep the codes we can read
+                 }
+             }
+         }
+     }
+
+     if(command == null || command.trim().length() == 0) {
+         // No check command, so assume it's there
+         // (A blank one is treated the same way, as Runtime.exec would
+         //  reject it with an unchecked exception rather than run it)
+         return true;
+     }
+
+     int[] errVals = new int[errorVals.size()];
+     for(int i=0; i<errVals.length; i++) {
+         errVals[i] = errorVals.get(i);
+     }
+     return ExternalParser.check(command, errVals);
+ }
+
+ /**
+  * Returns the text held directly inside the element, that is the
+  *  values of its text and CDATA section children joined together in
+  *  document order. Text inside nested child elements isn't included.
+  *
+  * @param element the element to read the text of
+  * @return the joined text, or an empty string if there is none
+  */
+ private static String getString(Element element) {
+     StringBuilder s = new StringBuilder();
+
+     NodeList children = element.getChildNodes();
+     for(int i=0; i<children.getLength(); i++) {
+         Node node = children.item(i);
+         short type = node.getNodeType();
+         // CDATA sections are reported as their own node type unless the
+         //  XML parser was set to coalesce them, so accept both kinds
+         if (type == Node.TEXT_NODE || type == Node.CDATA_SECTION_NODE) {
+             s.append( node.getNodeValue() );
+         }
+     }
+
+     return s.toString();
+ }
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java?rev=1089543&r1=1089542&r2=1089543&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java Wed Apr 6 17:39:32 2011
@@ -29,6 +29,8 @@ public interface ExternalParsersConfigRe
String CHECK_TAG = "check";
+ String ERROR_CODES_TAG = "error-codes";
+
String MIMETYPES_TAG = "mime-types";
String MIMETYPE_TAG = "mime-type";