You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/04/13 17:55:24 UTC
svn commit: r1091833 - in /tika/trunk:
tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java
Author: jukka
Date: Wed Apr 13 15:55:24 2011
New Revision: 1091833
URL: http://svn.apache.org/viewvc?rev=1091833&view=rev
Log:
TIKA-593: Tika network server
Add a basic network client parser and initial CLI support for it. Work in progress.
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1091833&r1=1091832&r2=1091833&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Wed Apr 13 15:55:24 2011
@@ -29,6 +29,7 @@ import java.io.Writer;
import java.lang.reflect.Field;
import java.net.ServerSocket;
import java.net.Socket;
+import java.net.URI;
import java.net.URL;
import java.util.Arrays;
import java.util.Comparator;
@@ -66,6 +67,7 @@ import org.apache.tika.mime.MediaTypeReg
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.NetworkParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
@@ -75,7 +77,6 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-
/**
* Simple command line interface for Apache Tika.
*/
@@ -216,7 +217,7 @@ public class TikaCLI {
private Detector detector;
- private AutoDetectParser parser;
+ private Parser parser;
private Metadata metadata;
@@ -229,7 +230,7 @@ public class TikaCLI {
private boolean pipeMode = true;
- private boolean portMode = false;
+ private boolean serverMode = false;
private boolean fork = false;
@@ -287,13 +288,20 @@ public class TikaCLI {
} else if (arg.equals("-z") || arg.equals("--extract")) {
type = NO_OUTPUT;
context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor());
- } else if (arg.equals("-p") || arg.equals("--port")) {
- portMode = true;
+ } else if (arg.equals("-p") || arg.equals("--port")
+ || arg.equals("-s") || arg.equals("--server")) {
+ serverMode = true;
pipeMode = false;
+ } else if (arg.startsWith("-c")) {
+ URI uri = new URI(arg.substring("-c".length()));
+ parser = new NetworkParser(uri);
+ } else if (arg.startsWith("--client=")) {
+ URI uri = new URI(arg.substring("--client=".length()));
+ parser = new NetworkParser(uri);
} else {
pipeMode = false;
metadata = new Metadata();
- if (portMode) {
+ if (serverMode) {
new TikaServer(Integer.parseInt(arg)).start();
} else if (arg.equals("-")) {
InputStream stream =
@@ -467,6 +475,7 @@ public class TikaCLI {
* Prints all the known media types, aliases and matching parser classes.
*/
private void displaySupportedTypes() {
+ AutoDetectParser parser = new AutoDetectParser();
MediaTypeRegistry registry = parser.getMediaTypeRegistry();
Map<MediaType, Parser> parsers = parser.getParsers();
@@ -479,9 +488,9 @@ public class TikaCLI {
if (supertype != null) {
System.out.println(" supertype: " + supertype);
}
- Parser parser = parsers.get(type);
- if (parser != null) {
- System.out.println(" parser: " + parser.getClass().getName());
+ Parser p = parsers.get(type);
+ if (p != null) {
+ System.out.println(" parser: " + p.getClass().getName());
}
}
}
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java?rev=1091833&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java Wed Apr 13 15:55:24 2011
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.FilterOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.Socket;
+import java.net.URI;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TaggedInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.TaggedContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Parser that sends the document to a network endpoint and parses the XML
+ * response returned by that endpoint.
+ * <p>
+ * If the scheme of the endpoint URI is <code>telnet</code>, the document is
+ * streamed over a plain socket to the host and port of the URI. For any
+ * other scheme the document is written to a {@link URLConnection} opened
+ * from the URI. In both cases the response is fed to a SAX parser whose
+ * events are forwarded to the given content handler, and the
+ * <code>name</code>/<code>content</code> pairs of XHTML <code>meta</code>
+ * elements in the response are added to the given metadata.
+ */
+public class NetworkParser extends AbstractParser {
+
+    /** Address of the network endpoint that does the actual parsing. */
+    private final URI uri;
+
+    /** Media types reported by {@link #getSupportedTypes(ParseContext)}. */
+    private final Set<MediaType> supportedTypes;
+
+    /**
+     * Creates a parser that delegates to the given endpoint and reports
+     * the given set of supported media types.
+     *
+     * @param uri address of the network endpoint
+     * @param supportedTypes media types this parser claims to support
+     */
+    public NetworkParser(URI uri, Set<MediaType> supportedTypes) {
+        this.uri = uri;
+        this.supportedTypes = supportedTypes;
+    }
+
+    /**
+     * Creates a parser that delegates to the given endpoint and reports
+     * {@link MediaType#OCTET_STREAM} as its only supported type.
+     *
+     * @param uri address of the network endpoint
+     */
+    public NetworkParser(URI uri) {
+        this(uri, Collections.singleton(MediaType.OCTET_STREAM));
+    }
+
+    /**
+     * Returns the set of media types given at construction time.
+     *
+     * @param context parse context (not used)
+     * @return supported media types
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return supportedTypes;
+    }
+
+    /**
+     * Sends the given document to the network endpoint and parses the
+     * response.
+     *
+     * @param stream document to be sent to the endpoint
+     * @param handler receiver of the SAX events of the response
+     * @param metadata receiver of the metadata found in the response
+     * @param context parse context, used to obtain the SAX parser
+     * @throws IOException if the connection can not be established or
+     *                     the document stream can not be read
+     * @throws SAXException if the given content handler fails
+     * @throws TikaException if the response can not be read or parsed,
+     *                       or if sending the document fails
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        if ("telnet".equals(uri.getScheme())) {
+            parseOverSocket(stream, handler, metadata, context);
+        } else {
+            parseOverUrlConnection(stream, handler, metadata, context);
+        }
+    }
+
+    /**
+     * Streams the document over a plain socket while concurrently parsing
+     * the response coming back over the same socket.
+     */
+    private void parseOverSocket(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        final Socket socket = new Socket(uri.getHost(), uri.getPort());
+        try {
+            // Closing the upload side only shuts down the output half of
+            // the socket, so the response can still be read afterwards.
+            OutputStream output =
+                new FilterOutputStream(socket.getOutputStream()) {
+                    @Override
+                    public void close() throws IOException {
+                        socket.shutdownOutput();
+                    }
+                };
+            new ParsingTask(stream, output).parse(
+                    socket.getInputStream(), handler, metadata, context);
+        } finally {
+            socket.close();
+        }
+    }
+
+    /**
+     * Writes the document to a {@link URLConnection} and then parses the
+     * response of that connection.
+     */
+    private void parseOverUrlConnection(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        URL url = uri.toURL();
+        URLConnection connection = url.openConnection();
+        connection.setDoOutput(true);
+        connection.connect();
+
+        // The request body must be written and closed before the response
+        // is opened: the JDK HTTP handler sends the request as soon as
+        // getInputStream() is called and refuses getOutputStream() after
+        // that. The upload is therefore run synchronously here instead of
+        // in a background thread.
+        ParsingTask task =
+            new ParsingTask(stream, connection.getOutputStream());
+        task.run();
+        task.throwIfFailed();
+
+        InputStream input = connection.getInputStream();
+        try {
+            // NOTE(review): the close shield presumably keeps the SAX
+            // parser from closing the response stream; it is closed below.
+            task.parseResponse(
+                    new CloseShieldInputStream(input),
+                    handler, metadata, context);
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
+     * Copies the document to the endpoint and parses the response. The
+     * copy is done by {@link #run()}, either in a background thread
+     * started by {@link #parse} or directly by the caller.
+     */
+    private static class ParsingTask implements Runnable {
+
+        /** Document stream, tagged so its read errors can be recognized. */
+        private final TaggedInputStream input;
+
+        /** Stream to which the document is copied. */
+        private final OutputStream output;
+
+        /** Failure of the copy, possibly set by the background thread. */
+        private volatile Exception exception = null;
+
+        public ParsingTask(InputStream input, OutputStream output) {
+            this.input = new TaggedInputStream(input);
+            this.output = output;
+        }
+
+        /**
+         * Starts copying the document in a background thread and parses
+         * the given response stream in the calling thread.
+         *
+         * @param stream response stream of the endpoint
+         * @param handler receiver of the SAX events of the response
+         * @param metadata receiver of the metadata found in the response
+         * @param context parse context, used to obtain the SAX parser
+         * @throws IOException if reading the document stream failed
+         * @throws SAXException if the given content handler failed
+         * @throws TikaException if the response could not be read or
+         *                       parsed, the copy failed for another reason,
+         *                       or the calling thread was interrupted
+         */
+        public void parse(
+                InputStream stream, ContentHandler handler,
+                Metadata metadata, ParseContext context)
+                throws IOException, SAXException, TikaException {
+            Thread thread = new Thread(this, "Tika network parser");
+            thread.start();
+
+            try {
+                parseResponse(stream, handler, metadata, context);
+            } finally {
+                try {
+                    // Give the copy up to one second to finish
+                    thread.join(1000);
+                } catch (InterruptedException e) {
+                    // Keep the interrupt visible to the caller's thread
+                    Thread.currentThread().interrupt();
+                    throw new TikaException("Network parser interrupted", e);
+                }
+
+                throwIfFailed();
+            }
+        }
+
+        /**
+         * Parses the response stream as XML, forwarding the SAX events to
+         * the given handler and collecting metadata from meta elements.
+         */
+        private void parseResponse(
+                InputStream stream, ContentHandler handler,
+                Metadata metadata, ParseContext context)
+                throws SAXException, TikaException {
+            TaggedContentHandler tagged = new TaggedContentHandler(handler);
+            try {
+                context.getSAXParser().parse(
+                        stream, new TeeContentHandler(
+                                tagged, new MetaHandler(metadata)));
+            } catch (SAXException e) {
+                // Failures of the client's own handler are passed on as-is
+                tagged.throwIfCauseOf(e);
+                throw new TikaException(
+                        "Invalid network parser output", e);
+            } catch (IOException e) {
+                throw new TikaException(
+                        "Unable to read network parser output", e);
+            }
+        }
+
+        /**
+         * Reports a failure recorded by {@link #run()}, if any. Errors
+         * from reading the document stream are rethrown as they are, all
+         * other failures are wrapped in a {@link TikaException}.
+         */
+        private void throwIfFailed() throws IOException, TikaException {
+            if (exception != null) {
+                input.throwIfCauseOf(exception);
+                throw new TikaException(
+                        "Unexpected network parser error", exception);
+            }
+        }
+
+        //----------------------------------------------------------<Runnable>
+
+        /**
+         * Copies the document to the output stream and closes the output.
+         * Any failure is recorded instead of being thrown.
+         */
+        public void run() {
+            try {
+                try {
+                    IOUtils.copy(input, output);
+                } finally {
+                    output.close();
+                }
+            } catch (Exception e) {
+                exception = e;
+            }
+        }
+
+    }
+
+    /**
+     * SAX handler that adds the <code>name</code>/<code>content</code>
+     * attribute pairs of XHTML <code>meta</code> elements to a metadata
+     * object.
+     */
+    private static class MetaHandler extends DefaultHandler {
+
+        private final Metadata metadata;
+
+        public MetaHandler(Metadata metadata) {
+            this.metadata = metadata;
+        }
+
+        @Override
+        public void startElement(
+                String uri, String localName, String qName,
+                Attributes attributes) throws SAXException {
+            if ("http://www.w3.org/1999/xhtml".equals(uri)
+                    && "meta".equals(localName)) {
+                String name = attributes.getValue("", "name");
+                String content = attributes.getValue("", "content");
+                // Ignore meta elements that lack either attribute
+                if (name != null && content != null) {
+                    metadata.add(name, content);
+                }
+            }
+        }
+
+    }
+
+}