You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/04/06 19:39:33 UTC

svn commit: r1089543 - in /tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external: CompositeExternalParser.java ExternalParser.java ExternalParsersConfigReader.java ExternalParsersConfigReaderMetKeys.java

Author: nick
Date: Wed Apr  6 17:39:32 2011
New Revision: 1089543

URL: http://svn.apache.org/viewvc?rev=1089543&view=rev
Log:
TIKA-634 - Add support for checking if the external command is there, for collecting the output from a file, and a wrapper CompositeParser that loads all available External Parsers

Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java?rev=1089543&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java Wed Apr  6 17:39:32 2011
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.external;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.Parser;
+
+/**
+ * A {@link CompositeParser} that bundles together every External Parser
+ *  returned by {@link ExternalParsersFactory#create()}, so that callers
+ *  can use all of them through a single parser instance. The factory is
+ *  consulted each time one of the constructors runs.
+ */
+public class CompositeExternalParser extends CompositeParser {
+   private static final long serialVersionUID = 6962436916649024024L;
+   public CompositeExternalParser() throws IOException, TikaException {
+      this(new MediaTypeRegistry());
+   }
+
+   public CompositeExternalParser(MediaTypeRegistry registry)  throws IOException, TikaException {
+      super(registry, loadExternalParsers());
+   }
+   /** Fetches the parsers from the factory, cast to a list of Parser. */
+   @SuppressWarnings("unchecked")
+   private static List<Parser> loadExternalParsers() throws IOException, TikaException {
+      return (List<Parser>)(List<? extends Parser>)ExternalParsersFactory.create();
+   }
+}

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java?rev=1089543&r1=1089542&r2=1089543&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java Wed Apr  6 17:39:32 2011
@@ -17,6 +17,8 @@
 package org.apache.tika.parser.external;
 
 import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
@@ -32,6 +34,7 @@ import java.util.regex.Pattern;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.IOUtils;
 import org.apache.tika.io.NullOutputStream;
+import org.apache.tika.io.TemporaryFiles;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -78,6 +81,7 @@ public class ExternalParser extends Abst
      */
     private String[] command = new String[] { "cat" };
     
+    private TemporaryFiles tmp = new TemporaryFiles();
     
 
     public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -142,17 +146,19 @@ public class ExternalParser extends Abst
         boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty());
         
         TikaInputStream tikaStream = TikaInputStream.get(stream);
+        File output = null;
         
         // Build our command
         String[] cmd = new String[command.length];
         System.arraycopy(command, 0, cmd, 0, command.length);
         for(int i=0; i<cmd.length; i++) {
            if(cmd[i].indexOf(INPUT_FILE_TOKEN) != -1) {
-              cmd[i].replace(INPUT_FILE_TOKEN, tikaStream.getFile().toString());
+              cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, tikaStream.getFile().toString());
               inputToStdIn = false;
            }
            if(cmd[i].indexOf(OUTPUT_FILE_TOKEN) != -1) {
-              // TODO
+              output = tmp.createTemporaryFile();
+              outputFromStdOut = false;
            }
         }
 
@@ -199,7 +205,11 @@ public class ExternalParser extends Abst
         }
         
         // Grab the output if we haven't already
-        // TODO
+        if(!outputFromStdOut) {
+           FileInputStream out = new FileInputStream(output);
+           extractOutput(out, xhtml);
+           tmp.dispose();
+        }
     }
 
     /**
@@ -295,4 +305,42 @@ public class ExternalParser extends Abst
           }
        }.start();
     }
+    
+    /**
+     * Checks to see if the command can be run. Typically used with
+     *  something like "myapp --version" to check to see if "myapp"
+     *  is installed and on the path.
+     *
+     * @param checkCmd The check command to run, as a single command line
+     * @param errorValue Exit codes meaning "not available" (default: 127)
+     * @return false if the command could not be run or exited with one
+     *  of the error values, true otherwise
+     */
+    public static boolean check(String checkCmd, int... errorValue) {
+       return check(new String[] {checkCmd}, errorValue);
+    }
+    /** As above, but a multi-element array is passed to exec as-is. */
+    public static boolean check(String[] checkCmd, int... errorValue) {
+       if(errorValue == null || errorValue.length == 0) {
+          errorValue = new int[] { 127 };
+       }
+       try {
+          // A single string is handed to exec(String), which tokenizes it
+          Process process = (checkCmd.length == 1)
+                ? Runtime.getRuntime().exec(checkCmd[0])
+                : Runtime.getRuntime().exec(checkCmd);
+          int result = process.waitFor();
+          for(int err : errorValue) {
+             if(result == err) return false;
+          }
+          return true;
+       } catch(IOException e) {
+          // Some problem, command isn't there or is broken
+          return false;
+       } catch (InterruptedException ie) {
+          // Give up, but keep the interrupt visible to our caller
+          Thread.currentThread().interrupt();
+          return false;
+       }
+    }
 }

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java?rev=1089543&r1=1089542&r2=1089543&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java Wed Apr  6 17:39:32 2011
@@ -24,6 +24,7 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.StringTokenizer;
 import java.util.regex.Pattern;
 
 import javax.xml.parsers.DocumentBuilder;
@@ -109,12 +110,13 @@ public final class ExternalParsersConfig
          if (node.getNodeType() == Node.ELEMENT_NODE) {
             Element child = (Element) node;
             if (child.getTagName().equals(CHECK_TAG)) {
-               // TODO
+               boolean present = readCheckTagAndCheck(child);
+               if(! present) {
+                  return null;
+               }
             }
             else if (child.getTagName().equals(COMMAND_TAG)) {
-               parser.setCommand(
-                     child.getFirstChild().getNodeValue()
-               );
+               parser.setCommand( getString(child) );
             }
             else if (child.getTagName().equals(MIMETYPES_TAG)) {
                parser.setSupportedTypes(
@@ -134,6 +136,18 @@ public final class ExternalParsersConfig
    
   private static Set<MediaType> readMimeTypes(Element mimeTypes) {
      Set<MediaType> types = new HashSet<MediaType>();
+      // Each direct MIMETYPE_TAG child element contributes one media type
+      NodeList children = mimeTypes.getChildNodes();
+      for(int i=0; i<children.getLength(); i++) {
+         Node node = children.item(i);
+         if (node.getNodeType() == Node.ELEMENT_NODE) {
+            Element child = (Element) node;
+            if (child.getTagName().equals(MIMETYPE_TAG)) {
+               types.add( MediaType.parse( getString(child) ) );
+            }
+         }
+      }
+      // NOTE(review): parse() result is added unchecked - confirm never null
       return types;
    }
    
@@ -147,7 +161,7 @@ public final class ExternalParsersConfig
             Element child = (Element) node;
             if (child.getTagName().equals(METADATA_MATCH_TAG)) {
                String metadataKey = child.getAttribute(METADATA_KEY_ATTR);
-               Pattern pattern = Pattern.compile( child.getFirstChild().getNodeValue() );
+               Pattern pattern = Pattern.compile( getString(child) );
                metadata.put(pattern, metadataKey);
             }
          }
@@ -155,4 +169,56 @@ public final class ExternalParsersConfig
       
       return metadata;
    }
+   
+   /**
+    * Runs the command of a check tag (if it has one), returning false
+    *  only if ExternalParser.check rejects that command.
+    */
+   private static boolean readCheckTagAndCheck(Element checkDef) {
+      String command = null;
+      List<Integer> errorVals = new ArrayList<Integer>();
+      NodeList children = checkDef.getChildNodes();
+      for(int i=0; i<children.getLength(); i++) {
+         Node node = children.item(i);
+         if (node.getNodeType() != Node.ELEMENT_NODE) {
+            continue;
+         }
+         Element child = (Element) node;
+         if (child.getTagName().equals(COMMAND_TAG)) {
+            command = getString(child);
+         } else if (child.getTagName().equals(ERROR_CODES_TAG)) {
+            StringTokenizer st = new StringTokenizer(getString(child));
+            while(st.hasMoreTokens()) {
+               try {
+                  errorVals.add(Integer.parseInt(st.nextToken()));
+               } catch(NumberFormatException e) {
+                  // Best effort: skip anything that isn't a number
+               }
+            }
+         }
+      }
+      // A blank command can't be executed, so treat it like a missing one
+      if(command == null || command.trim().length() == 0) {
+         return true;
+      }
+      int[] errVals = new int[errorVals.size()];
+      for(int i=0; i<errVals.length; i++) {
+         errVals[i] = errorVals.get(i);
+      }
+      return ExternalParser.check(command, errVals);
+   }
+   
+   /** Joins the direct text and CDATA children of the element. */
+   private static String getString(Element element) {
+      StringBuilder s = new StringBuilder();
+      NodeList children = element.getChildNodes();
+      for(int i=0; i<children.getLength(); i++) {
+         short type = children.item(i).getNodeType();
+         // CDATA is only a TEXT_NODE if the parser is coalescing
+         if (type == Node.TEXT_NODE || type == Node.CDATA_SECTION_NODE) {
+            s.append( children.item(i).getNodeValue() );
+         }
+      }
+      return s.toString();
+   }
 }

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java?rev=1089543&r1=1089542&r2=1089543&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java Wed Apr  6 17:39:32 2011
@@ -29,6 +29,8 @@ public interface ExternalParsersConfigRe
     
     String CHECK_TAG = "check";
     
+    String ERROR_CODES_TAG = "error-codes";
+    
     String MIMETYPES_TAG = "mime-types";
     
     String MIMETYPE_TAG = "mime-type";