You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by mr...@apache.org on 2007/01/18 15:42:42 UTC

svn commit: r497449 - /jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorJob.java

Author: mreutegg
Date: Thu Jan 18 06:42:41 2007
New Revision: 497449

URL: http://svn.apache.org/viewvc?view=rev&rev=497449
Log:
JCR-390: Move text extraction into a background thread
- minimize memory usage of text extractors

Modified:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorJob.java

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorJob.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorJob.java?view=diff&rev=497449&r1=497448&r2=497449
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorJob.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorJob.java Thu Jan 18 06:42:41 2007
@@ -19,12 +19,20 @@
 import EDU.oswego.cs.dl.util.concurrent.FutureResult;
 import EDU.oswego.cs.dl.util.concurrent.Callable;
 import org.apache.jackrabbit.extractor.TextExtractor;
+import org.apache.jackrabbit.util.LazyFileInputStream;
 import org.slf4j.LoggerFactory;
 import org.slf4j.Logger;
 
 import java.io.InputStream;
 import java.io.Reader;
 import java.io.IOException;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.io.BufferedWriter;
+import java.io.InputStreamReader;
+import java.io.StringReader;
 import java.lang.reflect.InvocationTargetException;
 
 /**
@@ -34,6 +42,11 @@
 public class TextExtractorJob extends FutureResult implements Runnable {
 
     /**
+     * UTF-8 encoding.
+     */
+    private static final String ENCODING_UTF8 = "UTF-8";
+
+    /**
      * The logger instance for this class.
      */
     private static final Logger log = LoggerFactory.getLogger(TextExtractorJob.class);
@@ -49,6 +62,11 @@
     private final String type;
 
     /**
+     * Set to <code>true</code> if this job timed out. Declared volatile because
+     * the flag is set by the timeout handling and read by the extraction command.
+     */
+    private transient volatile boolean timedOut = false;
+
+    /**
      * <code>true</code> if this extractor job has been flagged as discarded.
      */
     private transient boolean discarded = false;
@@ -71,9 +89,14 @@
         this.cmd = setter(new Callable() {
             public Object call() throws Exception {
                 Reader r = extractor.extractText(stream, type, encoding);
-                if (discarded && r != null) {
-                    r.close();
-                    r = null;
+                if (r != null) {
+                    if (discarded) {
+                        r.close();
+                        r = null;
+                    } else if (timedOut) {
+                        // spool a temp file to save memory
+                        r = getSwappedOutReader(r);
+                    }
                 }
                 return r;
             }
@@ -100,6 +123,7 @@
             if (timeout > 0) {
                 log.info("Text extraction for {} timed out (>{}ms).",
                         type, new Long(timeout));
+                timedOut = true;
             }
         } catch (InvocationTargetException e) {
             // extraction failed
@@ -144,5 +168,82 @@
     public void run() {
         // forward to command
         cmd.run();
+    }
+
+    //----------------------------< internal >----------------------------------
+
+    /**
+     * Spools the text available from <code>r</code> into a UTF-8 encoded temp
+     * file and returns a <code>Reader</code> on that file. The temp file is
+     * deleted when the returned reader is closed.
+     * <p>
+     * If the temp file cannot be created or opened for writing, <code>r</code>
+     * is returned untouched. If spooling fails after that point, <code>r</code>
+     * is closed, the temp file is removed and an empty reader is returned.
+     * Every failure is logged as a warning.
+     *
+     * @param r the reader to swap out into a temp file.
+     * @return a reader to the temp file, <code>r</code> itself, or an empty
+     *         reader, as described above.
+     */
+    private Reader getSwappedOutReader(Reader r) {
+        final File temp;
+        try {
+            temp = File.createTempFile("extractor", null);
+        } catch (IOException e) {
+            // unable to create temp file, return reader as is
+            log.warn("Unable to create temp file for extracted text", e);
+            return r;
+        }
+        FileOutputStream fileOut = null;
+        Writer out;
+        try {
+            fileOut = new FileOutputStream(temp);
+            out = new BufferedWriter(
+                    new OutputStreamWriter(fileOut, ENCODING_UTF8));
+        } catch (IOException e) {
+            // should never happen actually
+            log.warn("Unable to open temp file for extracted text", e);
+            if (fileOut != null) {
+                // the stream was opened but could not be wrapped: do not leak it
+                try {
+                    fileOut.close();
+                } catch (IOException e1) {
+                    // ignore
+                }
+            }
+            if (!temp.delete()) {
+                temp.deleteOnExit();
+            }
+            return r;
+        }
+
+        // spool into temp file
+        char[] buffer = new char[1024];
+        int len;
+        InputStream in = null;
+        try {
+            try {
+                while ((len = r.read(buffer)) >= 0) {
+                    out.write(buffer, 0, len);
+                }
+                out.close();
+            } finally {
+                // the source reader is no longer needed, whether or not
+                // spooling succeeded
+                r.close();
+            }
+            in = new LazyFileInputStream(temp);
+
+            return new InputStreamReader(in, ENCODING_UTF8) {
+                public void close() throws IOException {
+                    try {
+                        super.close();
+                    } finally {
+                        // delete file, even if closing the stream failed
+                        if (!temp.delete()) {
+                            temp.deleteOnExit();
+                        }
+                    }
+                }
+            };
+        } catch (IOException e) {
+            log.warn("Unable to spool extracted text into temp file", e);
+
+            // do some clean up
+            try {
+                out.close();
+            } catch (IOException e1) {
+                // ignore
+            }
+
+            if (in != null) {
+                try {
+                    in.close();
+                } catch (IOException e1) {
+                    // ignore
+                }
+            }
+
+            if (!temp.delete()) {
+                temp.deleteOnExit();
+            }
+            // use empty string reader as fallback
+            return new StringReader("");
+        }
     }
 }