You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/01/26 00:26:26 UTC
svn commit: r615395 - in /incubator/tika/trunk: CHANGES.txt pom.xml
src/main/assembly/bin.xml src/main/java/org/apache/tika/cli/
src/main/java/org/apache/tika/cli/TikaCLI.java src/main/shell/
src/main/shell/tika.bat src/main/shell/tika.sh
Author: jukka
Date: Fri Jan 25 15:26:24 2008
New Revision: 615395
URL: http://svn.apache.org/viewvc?rev=615395&view=rev
Log:
TIKA-96: Tika CLI
- Added the o.a.tika.cli.TikaCLI command line class
- Initial features:
+ four output formats (xml, html, text, metadata)
+ three input sources (files, URLs, standard input)
+ two logging levels (info and debug)
+ usage message
+ GUI mode
- Added simple Unix and DOS start scripts
- Added required packaging and manifest settings
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/cli/
incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java
incubator/tika/trunk/src/main/shell/
incubator/tika/trunk/src/main/shell/tika.bat
incubator/tika/trunk/src/main/shell/tika.sh
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/pom.xml
incubator/tika/trunk/src/main/assembly/bin.xml
Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=615395&r1=615394&r2=615395&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Fri Jan 25 15:26:24 2008
@@ -15,6 +15,8 @@
6. TIKA-97 - Tika GUI (Jukka Zitting)
+7. TIKA-96 - Tika CLI (Jukka Zitting)
+
Release 0.1-incubating - 12/27/2007
Modified: incubator/tika/trunk/pom.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/pom.xml?rev=615395&r1=615394&r2=615395&view=diff
==============================================================================
--- incubator/tika/trunk/pom.xml (original)
+++ incubator/tika/trunk/pom.xml Fri Jan 25 15:26:24 2008
@@ -266,6 +266,10 @@
<Implementation-Vendor>${project.organization.name}</Implementation-Vendor>
<Implementation-Vendor-Id>org.apache</Implementation-Vendor-Id>
</manifestEntries>
+ <manifest>
+ <addClasspath>true</addClasspath>
+ <mainClass>org.apache.tika.cli.TikaCLI</mainClass>
+ </manifest>
</archive>
</configuration>
</plugin>
Modified: incubator/tika/trunk/src/main/assembly/bin.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/assembly/bin.xml?rev=615395&r1=615394&r2=615395&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/assembly/bin.xml (original)
+++ incubator/tika/trunk/src/main/assembly/bin.xml Fri Jan 25 15:26:24 2008
@@ -48,4 +48,20 @@
<outputDirectory>apidocs</outputDirectory>
</fileSet>
</fileSets>
+ <!-- Start scripts that are copied into the "bin" directory of the assembly -->
+ <files>
+ <!-- Unix start script: installed as "tika" (no extension), executable, Unix line endings -->
+ <file>
+ <source>src/main/shell/tika.sh</source>
+ <outputDirectory>bin</outputDirectory>
+ <destName>tika</destName>
+ <fileMode>0755</fileMode>
+ <lineEnding>unix</lineEnding>
+ <!-- Filtering substitutes the ${...} build property used inside the script -->
+ <filtered>true</filtered>
+ </file>
+ <!-- DOS start script: keeps its "tika.bat" name, DOS line endings -->
+ <file>
+ <source>src/main/shell/tika.bat</source>
+ <outputDirectory>bin</outputDirectory>
+ <lineEnding>dos</lineEnding>
+ <!-- Filtering substitutes the ${...} build property used inside the script -->
+ <filtered>true</filtered>
+ </file>
+ </files>
</assembly>
Added: incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=615395&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java Fri Jan 25 15:26:24 2008
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.cli;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintStream;
+import java.io.Writer;
+import java.net.URL;
+import java.util.Arrays;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.log4j.BasicConfigurator;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.apache.log4j.SimpleLayout;
+import org.apache.log4j.WriterAppender;
+import org.apache.tika.gui.TikaGUI;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Simple command line interface for Apache Tika.
+ * <p>
+ * Arguments are handled strictly in the order given: an output format
+ * option only affects the documents that are named after it. When no
+ * arguments are given at all, the standard input stream is parsed and
+ * output as XHTML.
+ */
+public class TikaCLI {
+
+    /**
+     * Command line entry point. Log output is directed to standard error
+     * so that it does not mix with the extracted content that is written
+     * to standard output.
+     *
+     * @param args options, file names, URLs, or "-" for standard input
+     * @throws Exception if a document can not be opened or parsed
+     */
+    public static void main(String[] args) throws Exception {
+        BasicConfigurator.configure(
+                new WriterAppender(new SimpleLayout(), System.err));
+        Logger.getRootLogger().setLevel(Level.INFO);
+
+        TikaCLI cli = new TikaCLI();
+        for (String arg : args) {
+            cli.process(arg);
+        }
+        if (args.length == 0) {
+            cli.process("-");
+        }
+    }
+
+    /** Parser used for all documents. */
+    private final Parser parser;
+
+    /**
+     * Metadata of the document that is currently being processed. Read by
+     * the metadata output handler when the document ends.
+     */
+    private Metadata metadata;
+
+    /** Handler that receives the parse events of the next document. */
+    private ContentHandler handler;
+
+    /**
+     * Creates a command line processor that outputs XHTML by default.
+     *
+     * @throws TransformerConfigurationException if the XML serializer
+     *         can not be created
+     */
+    public TikaCLI() throws TransformerConfigurationException {
+        parser = new AutoDetectParser();
+        handler = getXmlContentHandler();
+    }
+
+    /**
+     * Processes a single command line argument. Recognized options change
+     * the state of this instance (or print usage / start the GUI); any
+     * other argument is parsed as a document with the current handler.
+     *
+     * @param arg option, "-" for standard input, file name, or URL
+     * @throws Exception if the document can not be opened or parsed
+     */
+    public void process(String arg) throws Exception {
+        if (arg.equals("-?") || arg.equals("--help")) {
+            usage();
+        } else if (arg.equals("-v") || arg.equals("--verbose")) {
+            Logger.getRootLogger().setLevel(Level.DEBUG);
+        } else if (arg.equals("-g") || arg.equals("--gui")) {
+            TikaGUI.main(new String[0]);
+        } else if (arg.equals("-x") || arg.equals("--xml")) {
+            handler = getXmlContentHandler();
+        } else if (arg.equals("-h") || arg.equals("--html")) {
+            handler = getHtmlContentHandler();
+        } else if (arg.equals("-t") || arg.equals("--text")) {
+            handler = getTextContentHandler();
+        } else if (arg.equals("-m") || arg.equals("--metadata")) {
+            handler = getMetadataContentHandler();
+        } else {
+            // Fresh metadata per document; the metadata handler reads it
+            metadata = new Metadata();
+            if (arg.equals("-")) {
+                // Standard input is not closed here
+                parser.parse(System.in, handler, metadata);
+            } else {
+                InputStream input = open(arg);
+                try {
+                    parser.parse(input, handler, metadata);
+                } finally {
+                    input.close();
+                }
+            }
+        }
+    }
+
+    /**
+     * Opens the document named by the given argument and records its
+     * resource name in the current metadata. An existing regular file
+     * takes precedence; anything else is interpreted as a URL.
+     *
+     * @param arg file name or URL
+     * @return stream of the document; the caller must close it
+     * @throws IOException if the argument is not a valid URL or the
+     *         document can not be opened
+     */
+    private InputStream open(String arg) throws IOException {
+        File file = new File(arg);
+        if (file.isFile()) {
+            metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());
+            return new FileInputStream(file);
+        }
+
+        URL url = new URL(arg);
+        // The part of the URL path after the last slash is used as the
+        // resource name; no name is set when that part is empty
+        String path = url.getPath();
+        String name = path.substring(path.lastIndexOf('/') + 1);
+        if (name.length() > 0) {
+            metadata.set(Metadata.RESOURCE_NAME_KEY, name);
+        }
+        return url.openStream();
+    }
+
+    /**
+     * Prints the usage message to standard output.
+     */
+    private void usage() {
+        PrintStream out = System.out;
+        out.println("usage: tika [option] file");
+        out.println();
+        out.println("Options:");
+        out.println(" -? or --help Print this usage message");
+        out.println(" -v or --verbose Print debug level messages");
+        out.println(" -g or --gui Start the Apache Tika GUI");
+        out.println(" -x or --xml Output XHTML content (default)");
+        out.println(" -h or --html Output HTML content");
+        out.println(" -t or --text Output plain text content");
+        out.println(" -m or --metadata Output only metadata");
+        out.println();
+        out.println("Description:");
+        out.println(" Apache Tika will parse the file(s) specified on the");
+        out.println(" command line and output the extracted text content");
+        out.println(" or metadata to standard output.");
+        out.println();
+        out.println(" Instead of a file name you can also specify the URL");
+        out.println(" of a document to be parsed.");
+        out.println();
+        out.println(" Use \"-\" as the file name to parse the standard");
+        out.println(" input stream.");
+        out.println();
+        out.println(" Use the \"--gui\" (or \"-g\") option to start");
+        out.println(" the Apache Tika GUI. You can drag and drop files");
+        out.println(" from a normal file explorer to the GUI window to");
+        out.println(" extract text content and metadata from the files.");
+    }
+
+    /**
+     * Returns a handler that serializes the parse events as indented XML
+     * to standard output.
+     *
+     * @throws TransformerConfigurationException if the serializer
+     *         can not be created
+     */
+    private ContentHandler getXmlContentHandler()
+            throws TransformerConfigurationException {
+        return getTransformerHandler("xml");
+    }
+
+    /**
+     * Returns a handler that serializes the parse events as indented HTML
+     * to standard output.
+     *
+     * @throws TransformerConfigurationException if the serializer
+     *         can not be created
+     */
+    private ContentHandler getHtmlContentHandler()
+            throws TransformerConfigurationException {
+        return getTransformerHandler("html");
+    }
+
+    /**
+     * Creates an indenting serializer that writes to standard output
+     * using the given output method.
+     *
+     * @param method value of the {@link OutputKeys#METHOD} property,
+     *               e.g. "xml" or "html"
+     * @return serializing handler
+     * @throws TransformerConfigurationException if the serializer
+     *         can not be created
+     */
+    private TransformerHandler getTransformerHandler(String method)
+            throws TransformerConfigurationException {
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+            SAXTransformerFactory.newInstance();
+        TransformerHandler serializer = factory.newTransformerHandler();
+        serializer.getTransformer().setOutputProperty(
+                OutputKeys.METHOD, method);
+        serializer.getTransformer().setOutputProperty(
+                OutputKeys.INDENT, "yes");
+        serializer.setResult(new StreamResult(System.out));
+        return serializer;
+    }
+
+    /**
+     * Returns a handler that writes the text nodes found inside the XHTML
+     * body element to standard output.
+     */
+    private ContentHandler getTextContentHandler() {
+        final Writer writer = new OutputStreamWriter(System.out);
+        XPathParser xpath =
+            new XPathParser("xhtml", XHTMLContentHandler.XHTML);
+        return new MatchingContentHandler(
+                new WriteOutContentHandler(writer),
+                xpath.parse("/xhtml:html/xhtml:body//text()")) {
+            @Override
+            public void endDocument() throws SAXException {
+                super.endDocument();
+                try {
+                    // The writer buffers; flush at the end of each document
+                    writer.flush();
+                } catch (IOException e) {
+                    // Report lost output instead of silently ignoring it
+                    throw new SAXException(
+                            "Failed to flush text output", e);
+                }
+            }
+        };
+    }
+
+    /**
+     * Returns a handler that ignores the document content and, at the end
+     * of each document, prints the current metadata to standard output
+     * as "name: value" lines sorted by name.
+     */
+    private ContentHandler getMetadataContentHandler() {
+        return new DefaultHandler() {
+            @Override
+            public void endDocument() {
+                String[] names = metadata.names();
+                Arrays.sort(names);
+                for (String name : names) {
+                    System.out.println(name + ": " + metadata.get(name));
+                }
+            }
+        };
+    }
+
+}
Added: incubator/tika/trunk/src/main/shell/tika.bat
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/shell/tika.bat?rev=615395&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/shell/tika.bat (added)
+++ incubator/tika/trunk/src/main/shell/tika.bat Fri Jan 25 15:26:24 2008
@@ -0,0 +1,18 @@
+@echo off
+
+REM Licensed to the Apache Software Foundation (ASF) under one or more
+REM contributor license agreements. See the NOTICE file distributed with
+REM this work for additional information regarding copyright ownership.
+REM The ASF licenses this file to You under the Apache License, Version 2.0
+REM (the "License"); you may not use this file except in compliance with
+REM the License. You may obtain a copy of the License at
+REM
+REM http://www.apache.org/licenses/LICENSE-2.0
+REM
+REM Unless required by applicable law or agreed to in writing, software
+REM distributed under the License is distributed on an "AS IS" BASIS,
+REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+REM See the License for the specific language governing permissions and
+REM limitations under the License.
+
+REM %* forwards every argument; %1 .. %9 would drop the tenth and later ones.
+java -jar lib/${project.build.finalName}.jar %*
Added: incubator/tika/trunk/src/main/shell/tika.sh
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/shell/tika.sh?rev=615395&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/shell/tika.sh (added)
+++ incubator/tika/trunk/src/main/shell/tika.sh Fri Jan 25 15:26:24 2008
@@ -0,0 +1,18 @@
+#! /bin/sh
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# "$@" passes each argument through unchanged; an unquoted $* would split
+# file names that contain whitespace into several arguments.
+java -jar lib/${project.build.finalName}.jar "$@"