You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/01/26 00:26:26 UTC

svn commit: r615395 - in /incubator/tika/trunk: CHANGES.txt pom.xml src/main/assembly/bin.xml src/main/java/org/apache/tika/cli/ src/main/java/org/apache/tika/cli/TikaCLI.java src/main/shell/ src/main/shell/tika.bat src/main/shell/tika.sh

Author: jukka
Date: Fri Jan 25 15:26:24 2008
New Revision: 615395

URL: http://svn.apache.org/viewvc?rev=615395&view=rev
Log:
TIKA-96: Tika CLI
    - Added the o.a.tika.cli.TikaCLI command line class
    - Initial features:
      + four output formats (xml, html, text, metadata)
      + three input sources (files, URLs, standard input)
      + two logging levels (info and debug)
      + usage message
      + GUI mode
    - Added simple Unix and DOS start scripts
    - Added required packaging and manifest settings

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/cli/
    incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java
    incubator/tika/trunk/src/main/shell/
    incubator/tika/trunk/src/main/shell/tika.bat
    incubator/tika/trunk/src/main/shell/tika.sh
Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/pom.xml
    incubator/tika/trunk/src/main/assembly/bin.xml

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=615395&r1=615394&r2=615395&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Fri Jan 25 15:26:24 2008
@@ -15,6 +15,8 @@
 
 6. TIKA-97  - Tika GUI (Jukka Zitting)
 
+7. TIKA-96  - Tika CLI (Jukka Zitting)
+
 
 Release 0.1-incubating - 12/27/2007
 

Modified: incubator/tika/trunk/pom.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/pom.xml?rev=615395&r1=615394&r2=615395&view=diff
==============================================================================
--- incubator/tika/trunk/pom.xml (original)
+++ incubator/tika/trunk/pom.xml Fri Jan 25 15:26:24 2008
@@ -266,6 +266,10 @@
               <Implementation-Vendor>${project.organization.name}</Implementation-Vendor>
               <Implementation-Vendor-Id>org.apache</Implementation-Vendor-Id>
             </manifestEntries>
+            <manifest>
+              <addClasspath>true</addClasspath>
+              <mainClass>org.apache.tika.cli.TikaCLI</mainClass>
+            </manifest>
           </archive>
         </configuration>
       </plugin>

Modified: incubator/tika/trunk/src/main/assembly/bin.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/assembly/bin.xml?rev=615395&r1=615394&r2=615395&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/assembly/bin.xml (original)
+++ incubator/tika/trunk/src/main/assembly/bin.xml Fri Jan 25 15:26:24 2008
@@ -48,4 +48,20 @@
             <outputDirectory>apidocs</outputDirectory>
         </fileSet>
     </fileSets>
+    <files>
+        <file>
+            <source>src/main/shell/tika.sh</source>
+            <outputDirectory>bin</outputDirectory>
+            <destName>tika</destName>
+            <fileMode>0755</fileMode>
+            <lineEnding>unix</lineEnding>
+            <filtered>true</filtered>
+        </file>
+        <file>
+            <source>src/main/shell/tika.bat</source>
+            <outputDirectory>bin</outputDirectory>
+            <lineEnding>dos</lineEnding>
+            <filtered>true</filtered>
+        </file>
+    </files>
 </assembly>

Added: incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=615395&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java Fri Jan 25 15:26:24 2008
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.cli;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintStream;
+import java.io.Writer;
+import java.net.URL;
+import java.util.Arrays;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.log4j.BasicConfigurator;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.apache.log4j.SimpleLayout;
+import org.apache.log4j.WriterAppender;
+import org.apache.tika.gui.TikaGUI;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Simple command line interface for Apache Tika.
+ */
+public class TikaCLI {
+
+    public static void main(String[] args) throws Exception {
+        BasicConfigurator.configure(
+                new WriterAppender(new SimpleLayout(), System.err));
+        Logger.getRootLogger().setLevel(Level.INFO);
+
+        TikaCLI cli = new TikaCLI();
+        for (int i = 0; i < args.length; i++) {
+            cli.process(args[i]);
+        }
+        if (args.length == 0) {
+            cli.process("-");
+        }
+    }
+
+    private Parser parser;
+
+    private Metadata metadata;
+
+    private ContentHandler handler;
+
+    public TikaCLI() throws TransformerConfigurationException {
+        parser = new AutoDetectParser();
+        handler = getXmlContentHandler();
+    }
+
+    public void process(String arg) throws Exception {
+        if (arg.equals("-?") || arg.equals("--help")) {
+            usage();
+        } else if (arg.equals("-v") || arg.equals("--verbose")) {
+            Logger.getRootLogger().setLevel(Level.DEBUG);
+        } else if (arg.equals("-g") || arg.equals("--gui")) {
+            TikaGUI.main(new String[0]);
+        } else if (arg.equals("-x") || arg.equals("--xml")) {
+            handler = getXmlContentHandler();
+        } else if (arg.equals("-h") || arg.equals("--html")) {
+            handler = getHtmlContentHandler();
+        } else if (arg.equals("-t") || arg.equals("--text")) {
+            handler = getTextContentHandler();
+        } else if (arg.equals("-m") || arg.equals("--metadata")) {
+            handler = getMetadataContentHandler();
+        } else {
+            metadata = new Metadata();
+            if (arg.equals("-")) {
+                parser.parse(System.in, handler, metadata);
+            } else {
+                InputStream input;
+                File file = new File(arg);
+                if (file.isFile()) {
+                    metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());
+                    input = new FileInputStream(file);
+                } else {
+                    URL url = new URL(arg);
+                    String path = url.getPath();
+                    int slash = path.lastIndexOf('/');
+                    String name = path.substring(slash + 1);
+                    if (name.length() > 0) {
+                        metadata.set(Metadata.RESOURCE_NAME_KEY, name);
+                    }
+                    input = url.openStream();
+                }
+                try {
+                    parser.parse(input, handler, metadata);
+                } finally {
+                    input.close();
+                }
+            }
+        }
+    }
+
+    private void usage() {
+        PrintStream out = System.out;
+        out.println("usage: tika [option] file");
+        out.println();
+        out.println("Options:");
+        out.println("    -? or --help       Print this usage message");
+        out.println("    -v or --verbose    Print debug level messages");
+        out.println("    -g or --gui        Start the Apache Tika GUI");
+        out.println("    -x or --xml        Output XHTML content (default)");
+        out.println("    -h or --html       Output HTML content");
+        out.println("    -t or --text       Output plain text content");
+        out.println("    -m or --metadata   Output only metadata");
+        out.println();
+        out.println("Description:");
+        out.println("    Apache Tika will parse the file(s) specified on the");
+        out.println("    command line and output the extracted text content");
+        out.println("    or metadata to standard output.");
+        out.println();
+        out.println("    Instead of a file name you can also specify the URL");
+        out.println("    of a document to be parsed.");
+        out.println();
+        out.println("    Use \"-\" as the file name to parse the standard");
+        out.println("    input stream.");
+        out.println();
+        out.println("    Use the \"--gui\" (or \"-g\") option to start");
+        out.println("    the Apache Tika GUI. You can drag and drop files");
+        out.println("    from a normal file explorer to the GUI window to");
+        out.println("    extract text content and metadata from the files.");
+    }
+
+    private ContentHandler getXmlContentHandler()
+            throws TransformerConfigurationException {
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+            SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+        handler.setResult(new StreamResult(System.out));
+        return handler;
+    }
+
+    private ContentHandler getHtmlContentHandler()
+            throws TransformerConfigurationException {
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+        SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+        handler.setResult(new StreamResult(System.out));
+        return handler;
+    }
+
+    private ContentHandler getTextContentHandler() {
+        final Writer writer = new OutputStreamWriter(System.out);
+        XPathParser parser =
+            new XPathParser("xhtml", XHTMLContentHandler.XHTML);
+        return new MatchingContentHandler(
+                new WriteOutContentHandler(writer),
+                parser.parse("/xhtml:html/xhtml:body//text()")) {
+            public void endDocument() throws SAXException {
+                super.endDocument();
+                try { writer.flush(); } catch (IOException e) {}
+            }
+        };
+    }
+
+    private ContentHandler getMetadataContentHandler() {
+        return new DefaultHandler() {
+            public void endDocument() {
+                String[] names = metadata.names();
+                Arrays.sort(names);
+                for (String name : names) {
+                    System.out.println(name + ": " + metadata.get(name));
+                }
+            }
+        };
+    }
+
+}

Added: incubator/tika/trunk/src/main/shell/tika.bat
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/shell/tika.bat?rev=615395&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/shell/tika.bat (added)
+++ incubator/tika/trunk/src/main/shell/tika.bat Fri Jan 25 15:26:24 2008
@@ -0,0 +1,20 @@
+@echo off
+
+REM  Licensed to the Apache Software Foundation (ASF) under one or more
+REM  contributor license agreements.  See the NOTICE file distributed with
+REM  this work for additional information regarding copyright ownership.
+REM  The ASF licenses this file to You under the Apache License, Version 2.0
+REM  (the "License"); you may not use this file except in compliance with
+REM  the License.  You may obtain a copy of the License at
+REM 
+REM      http://www.apache.org/licenses/LICENSE-2.0
+REM 
+REM  Unless required by applicable law or agreed to in writing, software
+REM  distributed under the License is distributed on an "AS IS" BASIS,
+REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+REM  See the License for the specific language governing permissions and
+REM  limitations under the License.
+
+REM  Start the Tika command line interface with all the given arguments.
+REM  The jar file is looked up relative to the current directory.
+java -jar lib/${project.build.finalName}.jar %*

Added: incubator/tika/trunk/src/main/shell/tika.sh
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/shell/tika.sh?rev=615395&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/shell/tika.sh (added)
+++ incubator/tika/trunk/src/main/shell/tika.sh Fri Jan 25 15:26:24 2008
@@ -0,0 +1,20 @@
+#! /bin/sh
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Start the Tika command line interface. The arguments are passed on
+# one by one, so that file names containing spaces are kept intact.
+java -jar lib/${project.build.finalName}.jar "$@"