You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/04/13 17:55:24 UTC

svn commit: r1091833 - in /tika/trunk: tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java

Author: jukka
Date: Wed Apr 13 15:55:24 2011
New Revision: 1091833

URL: http://svn.apache.org/viewvc?rev=1091833&view=rev
Log:
TIKA-593: Tika network server

Add a basic network client parser and initial CLI support for it. Work in progress.

Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java
Modified:
    tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1091833&r1=1091832&r2=1091833&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Wed Apr 13 15:55:24 2011
@@ -29,6 +29,7 @@ import java.io.Writer;
 import java.lang.reflect.Field;
 import java.net.ServerSocket;
 import java.net.Socket;
+import java.net.URI;
 import java.net.URL;
 import java.util.Arrays;
 import java.util.Comparator;
@@ -66,6 +67,7 @@ import org.apache.tika.mime.MediaTypeReg
 import org.apache.tika.mime.MimeTypeException;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.NetworkParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
@@ -75,7 +77,6 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
-
 /**
  * Simple command line interface for Apache Tika.
  */
@@ -216,7 +217,7 @@ public class TikaCLI {
     
     private Detector detector;
 
-    private AutoDetectParser parser;
+    private Parser parser;
 
     private Metadata metadata;
 
@@ -229,7 +230,7 @@ public class TikaCLI {
 
     private boolean pipeMode = true;
 
-    private boolean portMode = false;
+    private boolean serverMode = false;
 
     private boolean fork = false;
 
@@ -287,13 +288,20 @@ public class TikaCLI {
         } else if (arg.equals("-z") || arg.equals("--extract")) {
             type = NO_OUTPUT;
             context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor());
-        } else if (arg.equals("-p") || arg.equals("--port")) {
-            portMode = true;
+        } else if (arg.equals("-p") || arg.equals("--port")
+                || arg.equals("-s") || arg.equals("--server")) {
+            serverMode = true;
             pipeMode = false;
+        } else if (arg.startsWith("-c")) {
+            URI uri = new URI(arg.substring("-c".length()));
+            parser = new NetworkParser(uri);
+        } else if (arg.startsWith("--client=")) {
+            URI uri = new URI(arg.substring("--client=".length()));
+            parser = new NetworkParser(uri);
         } else {
             pipeMode = false;
             metadata = new Metadata();
-            if (portMode) {
+            if (serverMode) {
                 new TikaServer(Integer.parseInt(arg)).start();
             } else if (arg.equals("-")) {
                 InputStream stream =
@@ -467,6 +475,7 @@ public class TikaCLI {
      * Prints all the known media types, aliases and matching parser classes.
      */
     private void displaySupportedTypes() {
+        AutoDetectParser parser = new AutoDetectParser();
         MediaTypeRegistry registry = parser.getMediaTypeRegistry();
         Map<MediaType, Parser> parsers = parser.getParsers();
 
@@ -479,9 +488,9 @@ public class TikaCLI {
             if (supertype != null) {
                 System.out.println("  supertype: " + supertype);
             }
-            Parser parser = parsers.get(type);
-            if (parser != null) {
-                System.out.println("  parser:    " + parser.getClass().getName());
+            Parser p = parsers.get(type);
+            if (p != null) {
+                System.out.println("  parser:    " + p.getClass().getName());
             }
         }
     }

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java?rev=1091833&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java Wed Apr 13 15:55:24 2011
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.FilterOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.Socket;
+import java.net.URI;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TaggedInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.TaggedContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class NetworkParser extends AbstractParser {
+
+    private final URI uri;
+
+    private final Set<MediaType> supportedTypes;
+
+    public NetworkParser(URI uri, Set<MediaType> supportedTypes) {
+        this.uri = uri;
+        this.supportedTypes = supportedTypes;
+    }
+
+    public NetworkParser(URI uri) {
+        this(uri, Collections.singleton(MediaType.OCTET_STREAM));
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return supportedTypes;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        if ("telnet".equals(uri.getScheme())) {
+            final Socket socket = new Socket(uri.getHost(), uri.getPort());
+            try {
+                new ParsingTask(stream, new FilterOutputStream(socket.getOutputStream()) {
+                    @Override
+                    public void close() throws IOException {
+                        socket.shutdownOutput();
+                    }
+                }).parse(
+                        socket.getInputStream(), handler, metadata, context);
+            } finally {
+                socket.close();
+            }
+        } else {
+            URL url = uri.toURL();
+            URLConnection connection = url.openConnection();
+            connection.setDoOutput(true);
+            connection.connect();
+            InputStream input = connection.getInputStream();
+            try {
+                new ParsingTask(stream, connection.getOutputStream()).parse(
+                        new CloseShieldInputStream(input),
+                        handler, metadata, context);
+            } finally {
+                input.close();
+            }
+        }
+
+    }
+
+    private static class ParsingTask implements Runnable {
+
+        private final TaggedInputStream input;
+
+        private final OutputStream output;
+
+        private volatile Exception exception = null;
+
+        public ParsingTask(InputStream input, OutputStream output) {
+            this.input = new TaggedInputStream(input);
+            this.output = output;
+        }
+
+        public void parse(
+                InputStream stream, ContentHandler handler,
+                Metadata metadata, ParseContext context)
+                throws IOException, SAXException, TikaException {
+            Thread thread = new Thread(this, "Tika network parser");
+            thread.start();
+
+            TaggedContentHandler tagged = new TaggedContentHandler(handler);
+            try {
+                context.getSAXParser().parse(
+                        stream, new TeeContentHandler(
+                                tagged, new MetaHandler(metadata)));
+            } catch (SAXException e) {
+                tagged.throwIfCauseOf(e);
+                throw new TikaException(
+                        "Invalid network parser output", e);
+            } catch (IOException e) {
+                throw new TikaException(
+                        "Unable to read network parser output", e);
+            } finally {
+                try {
+                    thread.join(1000);
+                } catch (InterruptedException e) {
+                    throw new TikaException("Network parser interrupted", e);
+                }
+
+                if (exception != null) {
+                    input.throwIfCauseOf(exception);
+                    throw new TikaException(
+                            "Unexpected network parser error", exception);
+                }
+            }
+        }
+
+        //----------------------------------------------------------<Runnable>
+
+        public void run() {
+            try {
+                try {
+                    IOUtils.copy(input, output);
+                } finally {
+                    output.close();
+                }
+            } catch (Exception e) {
+                exception = e;
+            }
+        }
+
+    }
+
+    private static class MetaHandler extends DefaultHandler {
+
+        private final Metadata metadata;
+
+        public MetaHandler(Metadata metadata) {
+            this.metadata = metadata;
+        }
+
+        @Override
+        public void startElement(
+                String uri, String localName, String qName,
+                Attributes attributes) throws SAXException {
+            if ("http://www.w3.org/1999/xhtml".equals(uri)
+                    && "meta".equals(localName)) {
+                String name = attributes.getValue("", "name");
+                String content = attributes.getValue("", "content");
+                if (name != null && content != null) {
+                    metadata.add(name, content);
+                }
+            }
+        }
+
+    }
+
+}