You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/05/17 02:08:57 UTC

svn commit: r944945 - in /tika/trunk/tika-core/src/main/java/org/apache/tika/parser: EmptyParser.java OutOfProcessClient.java OutOfProcessParser.java OutOfProcessSerializer.java OutOfProcessServer.java

Author: jukka
Date: Mon May 17 00:08:57 2010
New Revision: 944945

URL: http://svn.apache.org/viewvc?rev=944945&view=rev
Log:
TIKA-416: Out-of-process text extraction

Initial tooling for running Tika code in an external Java process without having to set the classpath. Work in progress.

Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessClient.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessParser.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessSerializer.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessServer.java
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java?rev=944945&r1=944944&r2=944945&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java Mon May 17 00:08:57 2010
@@ -18,6 +18,7 @@ package org.apache.tika.parser;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.Serializable;
 import java.util.Collections;
 import java.util.Set;
 
@@ -33,7 +34,12 @@ import org.xml.sax.SAXException;
  * attempting to parse the given document stream. Useful as a sentinel parser
  * for unknown document types.
  */
-public class EmptyParser implements Parser {
+public class EmptyParser implements Parser, Serializable {
+
+    /**
+     * Serial version UID.
+     */
+    private static final long serialVersionUID = -4218649699095732123L;
 
     /**
      * Singleton instance of this class.

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessClient.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessClient.java?rev=944945&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessClient.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessClient.java Mon May 17 00:08:57 2010
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.URL;
+import java.util.Enumeration;
+
+import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.io.IOUtils;
+
+class OutOfProcessClient {
+
+    private final ClassLoader loader;
+
+    private final File directory;
+
+    private final Process process;
+
+    private final DataOutputStream output;
+
+    private final DataInputStream input;
+
+    private final InputStream error;
+
+    public OutOfProcessClient(ClassLoader loader) throws IOException {
+        this.loader = loader;
+
+        this.directory = File.createTempFile("apache-tika-", "-oop");
+        directory.delete();
+        directory.mkdir();
+
+        boolean ok = false;
+        try {
+            copyClassToDirectory(OutOfProcessServer.class);
+            copyClassToDirectory(OutOfProcessSerializer.class);
+
+            ProcessBuilder builder = new ProcessBuilder();
+            builder.directory(directory);
+            builder.command("java", OutOfProcessServer.class.getName());
+            this.process = builder.start();
+            this.output = new DataOutputStream(process.getOutputStream());
+            this.input = new DataInputStream(process.getInputStream());
+            this.error = process.getErrorStream();
+
+            ok = true;
+        } finally {
+            if (!ok) {
+                delete(directory);
+            }
+        }
+    }
+
+    private void copyClassToDirectory(Class<?> klass)
+            throws FileNotFoundException, IOException {
+        String path = klass.getName().replace('.', '/') + ".class";
+        InputStream input = loader.getResourceAsStream(path);
+        try {
+            File file = new File(directory, path);
+            file.getParentFile().mkdirs();
+            OutputStream output = new FileOutputStream(file);
+            try {
+                IOUtils.copy(input, output);
+            } finally {
+                output.close();
+            }
+        } finally {
+            input.close();
+        }
+    }
+
+    public synchronized Object echo(Object message) throws IOException {
+        consumeErrors();
+        output.write(OutOfProcessServer.ECHO);
+        OutOfProcessSerializer.serialize(output, message);
+        output.flush();
+
+        readResponseType();
+        try {
+            return OutOfProcessSerializer.deserialize(input, loader).toString();
+        } catch (ClassNotFoundException e) {
+            throw new IOExceptionWithCause("Unable to read echo response", e);
+        }
+    }
+
+
+    public synchronized void close() {
+        try {
+            output.close();
+            input.close();
+            error.close();
+        } catch (IOException ignore) {
+        }
+        process.destroy();
+        delete(directory);
+    }
+
+    private byte readResponseType() throws IOException {
+        while (true) {
+            consumeErrors();
+            int type = input.read();
+            if (type == -1) {
+                throw new IOException("Unexpected end of stream encountered");
+            } else if (type == OutOfProcessServer.FIND_RESOURCE) {
+                findResource(input.readUTF());
+            } else if (type == OutOfProcessServer.FIND_RESOURCES) {
+                findResources(input.readUTF());
+            } else {
+                return (byte) type;
+            }
+        }
+    }
+
+    private void findResource(String name) throws IOException {
+        InputStream stream = loader.getResourceAsStream(name);
+        if (stream != null) {
+            output.writeBoolean(true);
+            writeAndCloseStream(stream);
+        } else {
+            output.writeBoolean(false);
+        }
+        output.flush();
+    }
+
+    private void findResources(String name) throws IOException {
+        Enumeration<URL> resources = loader.getResources(name);
+        while (resources.hasMoreElements()) {
+            output.writeBoolean(true);
+            writeAndCloseStream(resources.nextElement().openStream());
+        }
+        output.writeBoolean(false);
+        output.flush();
+    }
+
+    private void writeAndCloseStream(InputStream stream) throws IOException {
+        try {
+            byte[] buffer = new byte[0xffff];
+            int n;
+            while ((n = stream.read(buffer)) != -1) {
+                output.writeShort(n);
+                output.write(buffer, 0, n);
+            }
+            output.writeShort(0);
+        } finally {
+            stream.close();
+        }
+    }
+
+    private void consumeErrors() throws IOException {
+        int n;
+        while ((n = error.available()) > 0) {
+            byte[] b = new byte[n];
+            n = error.read(b);
+            if (n > 0) {
+                System.err.write(b, 0, n);
+            }
+        }
+    }
+
+    private void delete(File file) {
+        File[] children = file.listFiles();
+        if (children != null) {
+            for (File child : children) {
+                delete(child);
+            }
+        }
+        file.delete();
+    }
+
+}

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessParser.java?rev=944945&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessParser.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessParser.java Mon May 17 00:08:57 2010
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.Queue;
+
+/**
+ * Entry point for out-of-process processing. Keeps a small pool of
+ * {@link OutOfProcessClient} instances so that the child processes
+ * behind them can be reused.
+ */
+public class OutOfProcessParser {
+
+    /** Class loader handed to every client created by this instance. */
+    private final ClassLoader loader;
+
+    /** Idle clients available for reuse. Guarded by <code>this</code>. */
+    private final Queue<OutOfProcessClient> pool =
+        new LinkedList<OutOfProcessClient>();
+
+    /** Maximum number of idle clients kept in the pool. */
+    private int poolSize = 5;
+
+    /**
+     * Sends an echo request to a child process and prints the response.
+     *
+     * @param args ignored
+     * @throws Exception if the request fails
+     */
+    public static void main(String[] args) throws Exception {
+        OutOfProcessParser parser = new OutOfProcessParser(
+                Thread.currentThread().getContextClassLoader());
+        try {
+            OutOfProcessClient client = parser.acquireClient();
+            try {
+                System.out.println(client.echo(EmptyParser.INSTANCE));
+            } finally {
+                // Hand the client back even if the request failed, so that
+                // close() below shuts down its process and directory
+                parser.releaseClient(client);
+            }
+        } finally {
+            parser.close();
+        }
+    }
+
+    /**
+     * Creates an instance whose clients use the given class loader.
+     *
+     * @param loader class loader passed to the created clients
+     */
+    public OutOfProcessParser(ClassLoader loader) {
+        this.loader = loader;
+    }
+
+    /**
+     * Closes all pooled clients and empties the pool.
+     */
+    public synchronized void close() {
+        for (OutOfProcessClient client : pool) {
+            client.close();
+        }
+        pool.clear();
+    }
+
+    /**
+     * Returns an idle client from the pool, or a newly created one if the
+     * pool is empty.
+     *
+     * @return client for exclusive use by the caller
+     * @throws IOException if a new client can not be created
+     */
+    private OutOfProcessClient acquireClient() throws IOException {
+        // The pool is shared with releaseClient() and close(), so it must
+        // be accessed under the same lock
+        synchronized (this) {
+            OutOfProcessClient client = pool.poll();
+            if (client != null) {
+                return client;
+            }
+        }
+        // Start the new process outside the lock to avoid blocking others
+        return new OutOfProcessClient(loader);
+    }
+
+    /**
+     * Returns the client to the pool, or closes it if the pool is full.
+     *
+     * @param client client that is no longer in use
+     */
+    private synchronized void releaseClient(OutOfProcessClient client) {
+        if (pool.size() < poolSize) {
+            pool.offer(client);
+        } else {
+            client.close();
+        }
+    }
+
+}

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessSerializer.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessSerializer.java?rev=944945&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessSerializer.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessSerializer.java Mon May 17 00:08:57 2010
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.ObjectStreamClass;
+
+/**
+ * Serialization helper for the out-of-process channel. Objects are sent
+ * as a four byte length followed by their Java serialization form. As an
+ * object input stream, this class resolves classes through an explicitly
+ * given class loader.
+ */
+class OutOfProcessSerializer extends ObjectInputStream {
+
+    /** Class loader used to resolve the classes of deserialized objects. */
+    private final ClassLoader loader;
+
+    /**
+     * Creates an object input stream that resolves classes through the
+     * given class loader.
+     *
+     * @param input stream of serialized data
+     * @param loader class loader used to resolve classes
+     * @throws IOException if the stream header can not be read
+     */
+    public OutOfProcessSerializer(InputStream input, ClassLoader loader)
+            throws IOException {
+        super(input);
+        this.loader = loader;
+    }
+
+    /**
+     * Resolves the described class through the configured class loader,
+     * without initializing it. If the class loader does not know the
+     * name, the default resolution is used as a fallback.
+     *
+     * @param desc class descriptor read from the stream
+     * @return the resolved class
+     * @throws IOException if the default resolution fails with an I/O error
+     * @throws ClassNotFoundException if the class can not be resolved
+     */
+    @Override
+    protected Class<?> resolveClass(ObjectStreamClass desc)
+            throws IOException, ClassNotFoundException {
+        try {
+            return Class.forName(desc.getName(), false, loader);
+        } catch (ClassNotFoundException e) {
+            // Class.forName() can not look up primitive types such as
+            // "int"; the default implementation knows how to map those
+            return super.resolveClass(desc);
+        }
+    }
+
+    /**
+     * Writes the serialized form of the object to the stream, prefixed
+     * with its length in bytes. The stream is not flushed.
+     *
+     * @param output stream to write to
+     * @param object object to serialize
+     * @throws IOException if the object can not be serialized or written
+     */
+    static void serialize(DataOutputStream output, Object object)
+            throws IOException {
+        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+
+        ObjectOutputStream serializer = new ObjectOutputStream(buffer);
+        serializer.writeObject(object);
+        serializer.close();
+
+        byte[] data = buffer.toByteArray();
+        output.writeInt(data.length);
+        output.write(data);
+    }
+
+    /**
+     * Reads a length-prefixed serialized object from the stream.
+     *
+     * @param input stream to read from
+     * @param loader class loader used to resolve classes
+     * @return the deserialized object
+     * @throws IOException if the data can not be read or the length
+     *                     prefix is negative
+     * @throws ClassNotFoundException if a class can not be resolved
+     */
+    static Object deserialize(DataInputStream input, ClassLoader loader)
+            throws IOException, ClassNotFoundException {
+        int n = input.readInt();
+        if (n < 0) {
+            // A negative length means that the message stream is corrupt
+            throw new IOException("Invalid serialized object length: " + n);
+        }
+        byte[] data = new byte[n];
+        input.readFully(data);
+
+        ObjectInputStream deserializer =
+            new OutOfProcessSerializer(new ByteArrayInputStream(data), loader);
+        return deserializer.readObject();
+    }
+
+}

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessServer.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessServer.java?rev=944945&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessServer.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessServer.java Mon May 17 00:08:57 2010
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.List;
+
+/**
+ * Server end of the out-of-process channel, run as the main class of the
+ * child process. It reads requests from one stream and writes responses
+ * to another. It is also a class loader that asks the other end of the
+ * channel for the classes and resources it needs.
+ */
+class OutOfProcessServer extends ClassLoader {
+
+    /** Response type written when a request could not be processed. */
+    public static final byte ERROR = -1;
+
+    /** Message type constant; not used by the code in this class. */
+    public static final byte REPLY = 0;
+
+    /** Type of an echo request, also used for its response. */
+    public static final byte ECHO = 1;
+
+    /** Request for the data of a single named resource. */
+    public static final byte FIND_RESOURCE = 2;
+
+    /** Request for the data of all resources with a given name. */
+    public static final byte FIND_RESOURCES = 3;
+
+    /**
+     * Starts a server on the standard input and output streams of this
+     * process.
+     *
+     * @param args ignored
+     * @throws Exception if the server fails
+     */
+    public static void main(String[] args) throws Exception {
+        OutOfProcessServer server =
+            new OutOfProcessServer(System.in, System.out);
+        Thread.currentThread().setContextClassLoader(server);
+
+        // Redirect standard input and output streams to prevent
+        // stray code from interfering with the message stream
+        System.setIn(new ByteArrayInputStream(new byte[0]));
+        System.setOut(System.err);
+
+        server.run();
+    }
+
+    /** Stream of incoming messages. */
+    private final DataInputStream input;
+
+    /** Stream of outgoing messages. */
+    private final DataOutputStream output;
+
+    /** Number of resource files created so far, used to name the next one. */
+    private int count = 0;
+
+    /**
+     * Creates a server that communicates over the given streams.
+     *
+     * @param input stream of incoming messages
+     * @param output stream of outgoing messages
+     * @throws IOException declared for callers; not thrown by this code
+     */
+    public OutOfProcessServer(InputStream input, OutputStream output)
+            throws IOException {
+        this.input = new DataInputStream(input);
+        this.output = new DataOutputStream(output);
+    }
+
+    /**
+     * Processes requests until the end of the input stream. An echo
+     * request is answered with the string <code>"echo: "</code> followed
+     * by the string form of the received object, or with an error message
+     * carrying the exception if a class of the received object can not be
+     * found. Other message types are ignored.
+     *
+     * @throws IOException if the communication fails
+     */
+    public void run() throws IOException {
+        int b;
+        while ((b = input.read()) != -1) {
+            if (b == ECHO) {
+                try {
+                    Object message =
+                        OutOfProcessSerializer.deserialize(input, this);
+                    output.write(ECHO);
+                    OutOfProcessSerializer.serialize(output, "echo: " + message);
+                } catch (ClassNotFoundException e) {
+                    output.write(ERROR);
+                    OutOfProcessSerializer.serialize(output, e);
+                }
+                output.flush();
+            }
+        }
+    }
+
+    /**
+     * Requests the named resource from the other end of the channel and
+     * stores the received data in a local file.
+     *
+     * @param name resource name
+     * @return URL of the local copy, or <code>null</code> if the resource
+     *         was not found or the communication failed
+     */
+    @Override
+    protected synchronized URL findResource(String name) {
+        try {
+            // Send a request to load the resource data
+            output.write(FIND_RESOURCE);
+            output.writeUTF(name);
+            output.flush();
+
+            // Receive the response
+            if (input.readBoolean()) {
+                return readStreamToFile().toURI().toURL();
+            } else {
+                return null;
+            }
+        } catch (IOException e) {
+            return null;
+        }
+    }
+
+    /**
+     * Requests all resources with the given name from the other end of
+     * the channel and stores each of them in a local file.
+     *
+     * @param name resource name
+     * @return URLs of the local copies, possibly none
+     * @throws IOException if the communication fails
+     */
+    @Override
+    protected synchronized Enumeration<URL> findResources(String name)
+            throws IOException {
+        // Send a request to load the resources. This must use the
+        // FIND_RESOURCES type: the response to FIND_RESOURCE does not
+        // have the terminating boolean that the loop below waits for.
+        output.write(FIND_RESOURCES);
+        output.writeUTF(name);
+        output.flush();
+
+        // Receive the response
+        List<URL> resources = new ArrayList<URL>();
+        while (input.readBoolean()) {
+            resources.add(readStreamToFile().toURI().toURL());
+        }
+        return Collections.enumeration(resources);
+    }
+
+    /**
+     * Requests the bytecode of the named class from the other end of the
+     * channel and defines the class from the received data.
+     *
+     * @param name binary name of the class
+     * @return the defined class
+     * @throws ClassNotFoundException if the class is not available or
+     *                                the communication fails
+     */
+    @Override
+    protected synchronized Class<?> findClass(String name)
+            throws ClassNotFoundException {
+        try {
+            // Send a request to load the class data
+            output.write(FIND_RESOURCE);
+            output.writeUTF(name.replace('.', '/') + ".class");
+            output.flush();
+
+            // Receive the response
+            if (input.readBoolean()) {
+                byte[] data = readStreamToMemory();
+                return defineClass(name, data, 0, data.length);
+            } else {
+                // findClass() must never return null for a missing class
+                throw new ClassNotFoundException(
+                        "Unable to find class " + name);
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+            throw new ClassNotFoundException("Unable load class " + name, e);
+        }
+    }
+
+    /**
+     * Reads a chunked stream from the input into memory.
+     *
+     * @return the received data
+     * @throws IOException if the data can not be read
+     */
+    private byte[] readStreamToMemory() throws IOException {
+        ByteArrayOutputStream stream = new ByteArrayOutputStream();
+        copyChunks(stream);
+        return stream.toByteArray();
+    }
+
+    /**
+     * Reads a chunked stream from the input into a new, sequentially
+     * numbered file in the current working directory.
+     *
+     * @return the created file
+     * @throws IOException if the data can not be read or written
+     */
+    private File readStreamToFile() throws IOException {
+        File file = new File("resource-" + count++ + ".bin");
+
+        OutputStream stream = new FileOutputStream(file);
+        try {
+            copyChunks(stream);
+        } finally {
+            stream.close();
+        }
+
+        return file;
+    }
+
+    /**
+     * Copies a chunked stream from the input to the given sink. Each
+     * chunk is prefixed with its length as an unsigned 16 bit value and
+     * a zero length marks the end of the stream.
+     *
+     * @param sink stream that receives the data; not closed
+     * @throws IOException if the data can not be read or written
+     */
+    private void copyChunks(OutputStream sink) throws IOException {
+        byte[] buffer = new byte[0xffff];
+        int n;
+        while ((n = input.readUnsignedShort()) > 0) {
+            input.readFully(buffer, 0, n);
+            sink.write(buffer, 0, n);
+        }
+    }
+
+}