You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ch...@apache.org on 2015/05/13 07:15:14 UTC

svn commit: r1679144 - in /jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene: LuceneIndexEditor.java LuceneIndexEditorContext.java

Author: chetanm
Date: Wed May 13 05:15:13 2015
New Revision: 1679144

URL: http://svn.apache.org/r1679144
Log:
OAK-2865 - Log stats around time spent in extracting text from binaries

Modified:
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java?rev=1679144&r1=1679143&r2=1679144&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java Wed May 13 05:15:13 2015
@@ -28,7 +28,6 @@ import static org.apache.jackrabbit.oak.
 import static org.apache.jackrabbit.oak.plugins.index.lucene.util.ConfigUtil.getPrimaryTypeName;
 
 import java.io.IOException;
-import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@@ -39,6 +38,7 @@ import com.google.common.collect.Iterabl
 import com.google.common.collect.Lists;
 
 import com.google.common.collect.Sets;
+import com.google.common.io.CountingInputStream;
 import org.apache.jackrabbit.JcrConstants;
 import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.CommitFailedException;
@@ -800,11 +800,14 @@ public class LuceneIndexEditor implement
 
     private String parseStringValue(Blob v, Metadata metadata, String path) {
         WriteOutContentHandler handler = new WriteOutContentHandler(context.getDefinition().getMaxExtractLength());
+        long start = System.currentTimeMillis();
+        long size = 0;
         try {
-            InputStream stream = v.getNewStream();
+            CountingInputStream stream = new CountingInputStream(v.getNewStream());
             try {
                 context.getParser().parse(stream, handler, metadata, new ParseContext());
             } finally {
+                size = stream.getCount();
                 stream.close();
             }
         } catch (LinkageError e) {
@@ -824,7 +827,9 @@ public class LuceneIndexEditor implement
                 return "TextExtractionError";
             }
         }
-        return handler.toString();
+        String result = handler.toString();
+        context.recordTextExtractionStats(System.currentTimeMillis() - start, size);
+        return result;
     }
 
 }

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java?rev=1679144&r1=1679143&r2=1679144&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java Wed May 13 05:15:13 2015
@@ -16,6 +16,7 @@
  */
 package org.apache.jackrabbit.oak.plugins.index.lucene;
 
+import static org.apache.jackrabbit.oak.commons.IOUtils.humanReadableByteCount;
 import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.INDEX_DATA_CHILD_NAME;
 import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PERSISTENCE_PATH;
 import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.VERSION;
@@ -27,6 +28,7 @@ import java.io.InputStream;
 import java.net.URL;
 import java.util.Calendar;
 import java.util.Set;
+import java.util.concurrent.TimeUnit;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.jackrabbit.oak.api.CommitFailedException;
@@ -120,6 +122,8 @@ public class LuceneIndexEditorContext {
 
     private Directory directory;
 
+    private final TextExtractionStats textExtractionStats = new TextExtractionStats();
+
     /**
      * The media types supported by the parser used.
      */
@@ -180,6 +184,8 @@ public class LuceneIndexEditorContext {
             status.setProperty("lastUpdated", ISO8601.format(Calendar.getInstance()), Type.DATE);
             status.setProperty("indexedNodes",indexedNodes);
             PERF_LOGGER.end(start, -1, "Closed IndexWriter for directory {}", definition);
+
+            textExtractionStats.log(reindex);
         }
     }
 
@@ -249,6 +255,10 @@ public class LuceneIndexEditorContext {
         return definition;
     }
 
+    public void recordTextExtractionStats(long timeInMillis, long size) {
+        textExtractionStats.addStats(timeInMillis, size);
+    }
+
     private static Parser initializeTikaParser(IndexDefinition definition) {
         ClassLoader current = Thread.currentThread().getContextClassLoader();
         try {
@@ -295,4 +305,50 @@ public class LuceneIndexEditorContext {
         }
         return TikaConfig.getDefaultConfig();
     }
+
+    static class TextExtractionStats {
+        /**
+         * Log stats only if time spent is more than 2 min
+         */
+        private static final long LOGGING_THRESHOLD = TimeUnit.MINUTES.toMillis(2);
+        private int count;
+        private long totalSize;
+        private long totalTime;
+
+        public void addStats(long timeInMillis, long size) {
+            count++;
+            totalSize += size;
+            totalTime += timeInMillis;
+        }
+
+        public void log(boolean reindex) {
+            if (log.isDebugEnabled()) {
+                log.debug("Text extraction stats {}", this);
+            } else if (anyParsingDone() && (reindex || isTakingLotsOfTime())) {
+                log.info("Text extraction stats {}", this);
+            }
+        }
+
+        private boolean isTakingLotsOfTime() {
+            return totalTime > LOGGING_THRESHOLD;
+        }
+
+        private boolean anyParsingDone() {
+            return count > 0;
+        }
+
+        @Override
+        public String toString() {
+            return String.format(" %d (%s, %s)", count,
+                    timeInWords(totalTime), humanReadableByteCount(totalSize));
+        }
+
+        private static String timeInWords(long millis) {
+            return String.format("%d min, %d sec",
+                    TimeUnit.MILLISECONDS.toMinutes(millis),
+                    TimeUnit.MILLISECONDS.toSeconds(millis) -
+                            TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(millis))
+            );
+        }
+    }
 }