You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2011/04/12 16:53:36 UTC

svn commit: r1091439 - /jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java

Author: jukka
Date: Tue Apr 12 14:53:36 2011
New Revision: 1091439

URL: http://svn.apache.org/viewvc?rev=1091439&view=rev
Log:
JCR-2864: Use out-of-process text extraction

Add a forkJavaCommand configuration option that enables forked parsing.

Modified:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java?rev=1091439&r1=1091438&r2=1091439&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java Tue Apr 12 14:53:36 2011
@@ -61,11 +61,16 @@ import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.TermQuery;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.fork.ForkParser;
+import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Element;
+import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 import javax.jcr.RepositoryException;
@@ -216,6 +221,12 @@ public class SearchIndex extends Abstrac
     private String tikaConfigPath = null;
 
     /**
+     * Java command handed to the Tika {@link ForkParser} to fork external
+     * parser processes; <code>null</code> (the default) extracts in-process.
+     */
+    private String forkJavaCommand = null;
+
+    /**
      * The Tika parser for extracting text content from binary properties.
      * Initialized by the {@link #getParser()} method during first access.
      */
@@ -895,6 +906,26 @@ public class SearchIndex extends Abstrac
     }
 
     /**
+     * Returns the java command used to fork external parser processes,
+     * or <code>null</code> (the default) for in-process text extraction.
+     *
+     * @return fork java command, or <code>null</code> if none is configured
+     */
+    public String getForkJavaCommand() {
+        return forkJavaCommand;
+    }
+
+    /**
+     * Sets the java command used to fork external parser processes.
+     * The value is stored as given, without validation.
+     *
+     * @param command java command, or <code>null</code> to extract in-process
+     */
+    public void setForkJavaCommand(String command) {
+        this.forkJavaCommand = command;
+    }
+
+    /**
      * Returns the parser used for extracting text content
      * from binary properties for full text indexing.
      *
@@ -932,7 +963,16 @@ public class SearchIndex extends Abstrac
                 config = TikaConfig.getDefaultConfig();
             }
 
-            parser = new AutoDetectParser(config);
+            if (forkJavaCommand == null) {
+                parser = new AutoDetectParser(config);
+            } else {
+                ForkParser forkParser = new ForkParser(
+                        SearchIndex.class.getClassLoader(),
+                        new AutoDetectParser(config));
+                forkParser.setJavaCommand(forkJavaCommand);
+                forkParser.setPoolSize(extractorPoolSize);
+                parser = forkParser;
+            }
         }
         return parser;
     }