You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/05/17 02:08:57 UTC
svn commit: r944945 - in
/tika/trunk/tika-core/src/main/java/org/apache/tika/parser:
EmptyParser.java OutOfProcessClient.java OutOfProcessParser.java
OutOfProcessSerializer.java OutOfProcessServer.java
Author: jukka
Date: Mon May 17 00:08:57 2010
New Revision: 944945
URL: http://svn.apache.org/viewvc?rev=944945&view=rev
Log:
TIKA-416: Out-of-process text extraction
Initial tooling for running Tika code in an external Java process without having to set the classpath. Work in progress.
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessClient.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessParser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessSerializer.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessServer.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java?rev=944945&r1=944944&r2=944945&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java Mon May 17 00:08:57 2010
@@ -18,6 +18,7 @@ package org.apache.tika.parser;
import java.io.IOException;
import java.io.InputStream;
+import java.io.Serializable;
import java.util.Collections;
import java.util.Set;
@@ -33,7 +34,12 @@ import org.xml.sax.SAXException;
* attempting to parse the given document stream. Useful as a sentinel parser
* for unknown document types.
*/
-public class EmptyParser implements Parser {
+public class EmptyParser implements Parser, Serializable {
+
+ /**
+ * Serial version UID.
+ */
+ private static final long serialVersionUID = -4218649699095732123L;
/**
* Singleton instance of this class.
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessClient.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessClient.java?rev=944945&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessClient.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessClient.java Mon May 17 00:08:57 2010
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.URL;
+import java.util.Enumeration;
+
+import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.io.IOUtils;
+
+class OutOfProcessClient {
+
+ private final ClassLoader loader;
+
+ private final File directory;
+
+ private final Process process;
+
+ private final DataOutputStream output;
+
+ private final DataInputStream input;
+
+ private final InputStream error;
+
+ public OutOfProcessClient(ClassLoader loader) throws IOException {
+ this.loader = loader;
+
+ this.directory = File.createTempFile("apache-tika-", "-oop");
+ directory.delete();
+ directory.mkdir();
+
+ boolean ok = false;
+ try {
+ copyClassToDirectory(OutOfProcessServer.class);
+ copyClassToDirectory(OutOfProcessSerializer.class);
+
+ ProcessBuilder builder = new ProcessBuilder();
+ builder.directory(directory);
+ builder.command("java", OutOfProcessServer.class.getName());
+ this.process = builder.start();
+ this.output = new DataOutputStream(process.getOutputStream());
+ this.input = new DataInputStream(process.getInputStream());
+ this.error = process.getErrorStream();
+
+ ok = true;
+ } finally {
+ if (!ok) {
+ delete(directory);
+ }
+ }
+ }
+
+ private void copyClassToDirectory(Class<?> klass)
+ throws FileNotFoundException, IOException {
+ String path = klass.getName().replace('.', '/') + ".class";
+ InputStream input = loader.getResourceAsStream(path);
+ try {
+ File file = new File(directory, path);
+ file.getParentFile().mkdirs();
+ OutputStream output = new FileOutputStream(file);
+ try {
+ IOUtils.copy(input, output);
+ } finally {
+ output.close();
+ }
+ } finally {
+ input.close();
+ }
+ }
+
+ public synchronized Object echo(Object message) throws IOException {
+ consumeErrors();
+ output.write(OutOfProcessServer.ECHO);
+ OutOfProcessSerializer.serialize(output, message);
+ output.flush();
+
+ readResponseType();
+ try {
+ return OutOfProcessSerializer.deserialize(input, loader).toString();
+ } catch (ClassNotFoundException e) {
+ throw new IOExceptionWithCause("Unable to read echo response", e);
+ }
+ }
+
+
+ public synchronized void close() {
+ try {
+ output.close();
+ input.close();
+ error.close();
+ } catch (IOException ignore) {
+ }
+ process.destroy();
+ delete(directory);
+ }
+
+ private byte readResponseType() throws IOException {
+ while (true) {
+ consumeErrors();
+ int type = input.read();
+ if (type == -1) {
+ throw new IOException("Unexpected end of stream encountered");
+ } else if (type == OutOfProcessServer.FIND_RESOURCE) {
+ findResource(input.readUTF());
+ } else if (type == OutOfProcessServer.FIND_RESOURCES) {
+ findResources(input.readUTF());
+ } else {
+ return (byte) type;
+ }
+ }
+ }
+
+ private void findResource(String name) throws IOException {
+ InputStream stream = loader.getResourceAsStream(name);
+ if (stream != null) {
+ output.writeBoolean(true);
+ writeAndCloseStream(stream);
+ } else {
+ output.writeBoolean(false);
+ }
+ output.flush();
+ }
+
+ private void findResources(String name) throws IOException {
+ Enumeration<URL> resources = loader.getResources(name);
+ while (resources.hasMoreElements()) {
+ output.writeBoolean(true);
+ writeAndCloseStream(resources.nextElement().openStream());
+ }
+ output.writeBoolean(false);
+ output.flush();
+ }
+
+ private void writeAndCloseStream(InputStream stream) throws IOException {
+ try {
+ byte[] buffer = new byte[0xffff];
+ int n;
+ while ((n = stream.read(buffer)) != -1) {
+ output.writeShort(n);
+ output.write(buffer, 0, n);
+ }
+ output.writeShort(0);
+ } finally {
+ stream.close();
+ }
+ }
+
+ private void consumeErrors() throws IOException {
+ int n;
+ while ((n = error.available()) > 0) {
+ byte[] b = new byte[n];
+ n = error.read(b);
+ if (n > 0) {
+ System.err.write(b, 0, n);
+ }
+ }
+ }
+
+ private void delete(File file) {
+ File[] children = file.listFiles();
+ if (children != null) {
+ for (File child : children) {
+ delete(child);
+ }
+ }
+ file.delete();
+ }
+
+}
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessParser.java?rev=944945&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessParser.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessParser.java Mon May 17 00:08:57 2010
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.Queue;
+
+/**
+ * Entry point of the out-of-process tooling. Keeps a small pool of
+ * {@link OutOfProcessClient} instances so that they can be reused
+ * instead of being created anew for each request.
+ */
+public class OutOfProcessParser {
+
+    /** Class loader passed on to the clients created by this instance. */
+    private final ClassLoader loader;
+
+    /** Idle clients; guarded by the monitor of this object. */
+    private final Queue<OutOfProcessClient> pool =
+        new LinkedList<OutOfProcessClient>();
+
+    /** Maximum number of idle clients kept in the pool. */
+    private int poolSize = 5;
+
+    /**
+     * Sends {@link EmptyParser#INSTANCE} as an echo message through a
+     * client and prints the response to standard output.
+     */
+    public static void main(String[] args) throws Exception {
+        OutOfProcessParser parser = new OutOfProcessParser(
+                Thread.currentThread().getContextClassLoader());
+        try {
+            OutOfProcessClient client = parser.acquireClient();
+            try {
+                System.out.println(client.echo(EmptyParser.INSTANCE));
+            } finally {
+                // Hand the client back even if the echo fails, so that
+                // close() below can clean it up
+                parser.releaseClient(client);
+            }
+        } finally {
+            parser.close();
+        }
+    }
+
+    /**
+     * @param loader class loader given to the clients of this instance
+     */
+    public OutOfProcessParser(ClassLoader loader) {
+        this.loader = loader;
+    }
+
+    /**
+     * Closes all the idle clients in the pool and empties the pool.
+     */
+    public synchronized void close() {
+        for (OutOfProcessClient client : pool) {
+            client.close();
+        }
+        pool.clear();
+    }
+
+    /**
+     * Returns an idle client from the pool, or a new client if the pool
+     * is empty.
+     *
+     * @throws IOException if a new client can not be created
+     */
+    private OutOfProcessClient acquireClient() throws IOException {
+        OutOfProcessClient client;
+        // The pool is a plain LinkedList, so access it only under the same
+        // lock that releaseClient() and close() use
+        synchronized (this) {
+            client = pool.poll();
+        }
+        if (client == null) {
+            // Created outside the lock so other threads are not blocked
+            // while the new client is being set up
+            client = new OutOfProcessClient(loader);
+        }
+        return client;
+    }
+
+    /**
+     * Puts the given client back to the pool, or closes it if the pool
+     * already holds the maximum number of idle clients.
+     */
+    private synchronized void releaseClient(OutOfProcessClient client) {
+        if (pool.size() < poolSize) {
+            pool.offer(client);
+        } else {
+            client.close();
+        }
+    }
+
+}
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessSerializer.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessSerializer.java?rev=944945&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessSerializer.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessSerializer.java Mon May 17 00:08:57 2010
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.ObjectStreamClass;
+
+/**
+ * Object input stream that resolves classes through a given class loader,
+ * together with static helpers for writing and reading length-prefixed
+ * serialized objects.
+ */
+class OutOfProcessSerializer extends ObjectInputStream {
+
+    /** Class loader used to resolve the classes of deserialized objects. */
+    private final ClassLoader loader;
+
+    /**
+     * @param input stream that contains the serialized object data
+     * @param loader class loader used to resolve classes
+     * @throws IOException if the serialization stream header can not be read
+     */
+    public OutOfProcessSerializer(InputStream input, ClassLoader loader)
+            throws IOException {
+        super(input);
+        this.loader = loader;
+    }
+
+    /**
+     * Resolves the described class through the class loader of this
+     * stream. Names that the class loader does not know are passed on to
+     * the default implementation, which is able to resolve the primitive
+     * type names that {@link Class#forName(String, boolean, ClassLoader)}
+     * does not accept.
+     */
+    @Override
+    protected Class<?> resolveClass(ObjectStreamClass desc)
+            throws IOException, ClassNotFoundException {
+        try {
+            return Class.forName(desc.getName(), false, loader);
+        } catch (ClassNotFoundException e) {
+            return super.resolveClass(desc);
+        }
+    }
+
+    /**
+     * Serializes the given object and writes the result to the given
+     * stream as a 32-bit length followed by that many bytes of data.
+     *
+     * @param output stream to write to
+     * @param object object to serialize
+     * @throws IOException if the object can not be serialized or written
+     */
+    static void serialize(DataOutputStream output, Object object)
+            throws IOException {
+        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+
+        ObjectOutputStream serializer = new ObjectOutputStream(buffer);
+        serializer.writeObject(object);
+        serializer.close();
+
+        byte[] data = buffer.toByteArray();
+        output.writeInt(data.length);
+        output.write(data);
+    }
+
+    /**
+     * Reads an object written by {@link #serialize(DataOutputStream, Object)}.
+     *
+     * @param input stream to read from
+     * @param loader class loader used to resolve classes
+     * @return the deserialized object
+     * @throws IOException if the length prefix is negative or the data
+     *                     can not be read
+     * @throws ClassNotFoundException if a class can not be resolved
+     */
+    static Object deserialize(DataInputStream input, ClassLoader loader)
+            throws IOException, ClassNotFoundException {
+        int n = input.readInt();
+        if (n < 0) {
+            // A corrupt prefix would otherwise surface as an unchecked
+            // NegativeArraySizeException from the allocation below
+            throw new IOException("Invalid serialized object length: " + n);
+        }
+        byte[] data = new byte[n];
+        input.readFully(data);
+
+        ObjectInputStream deserializer =
+            new OutOfProcessSerializer(new ByteArrayInputStream(data), loader);
+        try {
+            return deserializer.readObject();
+        } finally {
+            deserializer.close();
+        }
+    }
+
+}
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessServer.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessServer.java?rev=944945&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessServer.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/OutOfProcessServer.java Mon May 17 00:08:57 2010
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.List;
+
+class OutOfProcessServer extends ClassLoader {
+
+ public static final byte ERROR = -1;
+
+ public static final byte REPLY = 0;
+
+ public static final byte ECHO = 1;
+
+ public static final byte FIND_RESOURCE = 2;
+
+ public static final byte FIND_RESOURCES = 3;
+
+ public static void main(String[] args) throws Exception {
+ OutOfProcessServer server =
+ new OutOfProcessServer(System.in, System.out);
+ Thread.currentThread().setContextClassLoader(server);
+
+ // Redirect standard input and output streams to prevent
+ // stray code from interfering with the message stream
+ System.setIn(new ByteArrayInputStream(new byte[0]));
+ System.setOut(System.err);
+
+ server.run();
+ }
+
+ private final DataInputStream input;
+
+ private final DataOutputStream output;
+
+ private int count = 0;
+
+ public OutOfProcessServer(InputStream input, OutputStream output)
+ throws IOException {
+ this.input = new DataInputStream(input);
+ this.output = new DataOutputStream(output);
+ }
+
+ public void run() throws IOException {
+ int b;
+ while ((b = input.read()) != -1) {
+ if (b == ECHO) {
+ try {
+ Object message =
+ OutOfProcessSerializer.deserialize(input, this);
+ output.write(ECHO);
+ OutOfProcessSerializer.serialize(output, "echo: " + message);
+ } catch (ClassNotFoundException e) {
+ output.write(ERROR);
+ OutOfProcessSerializer.serialize(output, e);
+ }
+ output.flush();
+ }
+ }
+ }
+
+ @Override
+ protected synchronized URL findResource(String name) {
+ try {
+ // Send a request to load the resource data
+ output.write(FIND_RESOURCE);
+ output.writeUTF(name);
+ output.flush();
+
+ // Receive the response
+ if (input.readBoolean()) {
+ return readStreamToFile().toURI().toURL();
+ } else {
+ return null;
+ }
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
+ @Override
+ protected synchronized Enumeration<URL> findResources(String name)
+ throws IOException {
+ // Send a request to load the resources
+ output.write(FIND_RESOURCE);
+ output.writeUTF(name);
+ output.flush();
+
+ // Receive the response
+ List<URL> resources = new ArrayList<URL>();
+ while (input.readBoolean()) {
+ resources.add(readStreamToFile().toURI().toURL());
+ }
+ return Collections.enumeration(resources);
+ }
+
+ @Override
+ protected synchronized Class<?> findClass(String name)
+ throws ClassNotFoundException {
+ try {
+ // Send a request to load the class data
+ output.write(FIND_RESOURCE);
+ output.writeUTF(name.replace('.', '/') + ".class");
+ output.flush();
+
+ // Receive the response
+ if (input.readBoolean()) {
+ byte[] data = readStreamToMemory();
+ return defineClass(name, data, 0, data.length);
+ } else {
+ return null;
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ throw new ClassNotFoundException("Unable load class " + name, e);
+ }
+ }
+
+ private byte[] readStreamToMemory() throws IOException {
+ ByteArrayOutputStream stream = new ByteArrayOutputStream();
+ byte[] buffer = new byte[0xffff];
+ int n;
+ while ((n = input.readUnsignedShort()) > 0) {
+ input.readFully(buffer, 0, n);
+ stream.write(buffer, 0, n);
+ }
+ return stream.toByteArray();
+ }
+
+ private File readStreamToFile() throws IOException {
+ File file = new File("resource-" + count++ + ".bin");
+
+ OutputStream stream = new FileOutputStream(file);
+ try {
+ byte[] buffer = new byte[0xffff];
+ int n;
+ while ((n = input.readUnsignedShort()) > 0) {
+ input.readFully(buffer, 0, n);
+ stream.write(buffer, 0, n);
+ }
+ } finally {
+ stream.close();
+ }
+
+ return file;
+ }
+
+}