You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/01/21 12:30:55 UTC
svn commit: r1061757 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/fork/ test/java/org/apache/tika/fork/
Author: jukka
Date: Fri Jan 21 11:30:55 2011
New Revision: 1061757
URL: http://svn.apache.org/viewvc?rev=1061757&view=rev
Log:
TIKA-416: Out-of-process text extraction
Improved configurability of ForkParser.
Avoid access problems in the server by accessing methods through interfaces.
Add a simple test case.
Added:
tika/trunk/tika-core/src/test/java/org/apache/tika/fork/
tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java
tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java?rev=1061757&r1=1061756&r2=1061757&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java Fri Jan 21 11:30:55 2011
@@ -25,6 +25,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
import java.util.jar.JarEntry;
import java.util.jar.JarOutputStream;
@@ -35,10 +36,6 @@ import org.xml.sax.ContentHandler;
class ForkClient {
- private final String java = "java"; // TODO: Make configurable
-
- private final String mx = "-Xmx32m"; // TODO: Make configurable
-
private final List<ForkResource> resources = new ArrayList<ForkResource>();
private final File jar;
@@ -51,13 +48,18 @@ class ForkClient {
private final InputStream error;
- public ForkClient(ClassLoader loader, Object object) throws IOException {
+ public ForkClient(ClassLoader loader, Object object, String java)
+ throws IOException {
boolean ok = false;
try {
this.jar = createBootstrapJar();
ProcessBuilder builder = new ProcessBuilder();
- builder.command(java, mx, "-jar", jar.getPath());
+ List<String> command = new ArrayList<String>();
+ command.addAll(Arrays.asList(java.split("\\s+")));
+ command.add("-jar");
+ command.add(jar.getPath());
+ builder.command(command);
this.process = builder.start();
this.output = new DataOutputStream(process.getOutputStream());
@@ -148,6 +150,7 @@ class ForkClient {
consumeErrorStream();
int type = input.read();
if (type == -1) {
+ consumeErrorStream();
throw new IOException(
"Lost connection to a forked server process");
} else if (type == ForkServer.RESOURCE) {
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java?rev=1061757&r1=1061756&r2=1061757&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java Fri Jan 21 11:30:55 2011
@@ -16,7 +16,6 @@
*/
package org.apache.tika.fork;
-import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedList;
@@ -29,42 +28,79 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
public class ForkParser implements Parser {
+ /** Serial version UID */
+ private static final long serialVersionUID = -4962742892274663950L;
+
private final ClassLoader loader;
private final Parser parser;
- private final Queue<ForkClient> pool =
- new LinkedList<ForkClient>();
+ /** Java command line */
+ private String java = "java -Xmx32m";
+ /** Process pool size */
private int poolSize = 5;
- public static void main(String[] args) throws Exception {
- ForkParser parser = new ForkParser(
- Thread.currentThread().getContextClassLoader(),
- new AutoDetectParser());
- try {
- InputStream stream =
- new ByteArrayInputStream("Hello, World!".getBytes());
- ParseContext context = new ParseContext();
- parser.parse(
- stream, new WriteOutContentHandler(System.out),
- new Metadata(), context);
- } finally {
- parser.close();
- }
- }
+ private final Queue<ForkClient> pool =
+ new LinkedList<ForkClient>();
public ForkParser(ClassLoader loader, Parser parser) {
this.loader = loader;
this.parser = parser;
}
+ public ForkParser(ClassLoader loader) {
+ this(loader, new AutoDetectParser());
+ }
+
+ public ForkParser() {
+ this(ForkParser.class.getClassLoader());
+ }
+
+ /**
+ * Returns the size of the process pool.
+ *
+ * @return process pool size
+ */
+ public int getPoolSize() {
+ return poolSize;
+ }
+
+ /**
+ * Sets the size of the process pool.
+ *
+ * @param poolSize process pool size
+ */
+ public void setPoolSize(int poolSize) {
+ this.poolSize = poolSize;
+ }
+
+ /**
+ * Returns the command used to start the forked server process.
+ *
+ * @return java command line
+ */
+ public String getJavaCommand() {
+ return java;
+ }
+
+ /**
+ * Sets the command used to start the forked server process.
+ * The given command line is split on whitespace and the arguments
+ * "-jar" and "/path/to/bootstrap.jar" are appended to it when starting
+ * the process. The default setting is "java -Xmx32m".
+ *
+ * @param java java command line
+ */
+ public void setJavaCommand(String java) {
+ this.java = java;
+ }
+
public Set<MediaType> getSupportedTypes(ParseContext context) {
return parser.getSupportedTypes(context);
}
@@ -99,7 +135,7 @@ public class ForkParser implements Parse
throws IOException {
ForkClient client = pool.poll();
if (client == null) {
- client = new ForkClient(loader, parser);
+ client = new ForkClient(loader, parser, java);
}
return client;
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java?rev=1061757&r1=1061756&r2=1061757&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java Fri Jan 21 11:30:55 2011
@@ -125,13 +125,16 @@ class ForkServer implements Runnable, Ch
} catch (Throwable t) {
t.printStackTrace();
}
+ System.err.flush();
}
private Method getMethod(Object object, String name) {
Class<?> klass = object.getClass();
- for (Method method : klass.getMethods()) {
- if (name.equals(method.getName())) {
- return method;
+ for (Class<?> iface : klass.getInterfaces()) {
+ for (Method method : iface.getMethods()) {
+ if (name.equals(method.getName())) {
+ return method;
+ }
}
}
return null;
Added: tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java?rev=1061757&view=auto
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java (added)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java Fri Jan 21 11:30:55 2011
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fork;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+public class ForkParserTest extends TestCase {
+
+ public void testHelloWorld() throws Exception {
+ ForkParser parser = new ForkParser(
+ ForkParserTest.class.getClassLoader(),
+ new ForkTestParser());
+ try {
+ ContentHandler output = new BodyContentHandler();
+ InputStream stream = new ByteArrayInputStream(new byte[0]);
+ ParseContext context = new ParseContext();
+ parser.parse(stream, output, new Metadata(), context);
+ assertEquals("Hello, World!", output.toString().trim());
+ } finally {
+ parser.close();
+ }
+ }
+
+
+}
Added: tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java?rev=1061757&view=auto
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java (added)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java Fri Jan 21 11:30:55 2011
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fork;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+class ForkTestParser implements Parser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -5492269783593452319L;
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return Collections.singleton(MediaType.TEXT_PLAIN);
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ char[] ch = "Hello, World!".toCharArray();
+ xhtml.characters(ch, 0, ch.length);
+ xhtml.endDocument();
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
+ parse(stream, handler, metadata, new ParseContext());
+ }
+
+}
\ No newline at end of file