You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ch...@apache.org on 2015/05/13 07:15:14 UTC
svn commit: r1679144 - in
/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene:
LuceneIndexEditor.java LuceneIndexEditorContext.java
Author: chetanm
Date: Wed May 13 05:15:13 2015
New Revision: 1679144
URL: http://svn.apache.org/r1679144
Log:
OAK-2865 - Log stats around time spent in extracting text from binaries
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java
Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java?rev=1679144&r1=1679143&r2=1679144&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java Wed May 13 05:15:13 2015
@@ -28,7 +28,6 @@ import static org.apache.jackrabbit.oak.
import static org.apache.jackrabbit.oak.plugins.index.lucene.util.ConfigUtil.getPrimaryTypeName;
import java.io.IOException;
-import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
@@ -39,6 +38,7 @@ import com.google.common.collect.Iterabl
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
+import com.google.common.io.CountingInputStream;
import org.apache.jackrabbit.JcrConstants;
import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.api.CommitFailedException;
@@ -800,11 +800,14 @@ public class LuceneIndexEditor implement
private String parseStringValue(Blob v, Metadata metadata, String path) {
WriteOutContentHandler handler = new WriteOutContentHandler(context.getDefinition().getMaxExtractLength());
+ long start = System.currentTimeMillis();
+ long size = 0;
try {
- InputStream stream = v.getNewStream();
+ CountingInputStream stream = new CountingInputStream(v.getNewStream());
try {
context.getParser().parse(stream, handler, metadata, new ParseContext());
} finally {
+ size = stream.getCount();
stream.close();
}
} catch (LinkageError e) {
@@ -824,7 +827,9 @@ public class LuceneIndexEditor implement
return "TextExtractionError";
}
}
- return handler.toString();
+ String result = handler.toString();
+ context.recordTextExtractionStats(System.currentTimeMillis() - start, size);
+ return result;
}
}
Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java?rev=1679144&r1=1679143&r2=1679144&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java Wed May 13 05:15:13 2015
@@ -16,6 +16,7 @@
*/
package org.apache.jackrabbit.oak.plugins.index.lucene;
+import static org.apache.jackrabbit.oak.commons.IOUtils.humanReadableByteCount;
import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.INDEX_DATA_CHILD_NAME;
import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PERSISTENCE_PATH;
import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.VERSION;
@@ -27,6 +28,7 @@ import java.io.InputStream;
import java.net.URL;
import java.util.Calendar;
import java.util.Set;
+import java.util.concurrent.TimeUnit;
import org.apache.commons.io.IOUtils;
import org.apache.jackrabbit.oak.api.CommitFailedException;
@@ -120,6 +122,8 @@ public class LuceneIndexEditorContext {
private Directory directory;
+ private final TextExtractionStats textExtractionStats = new TextExtractionStats();
+
/**
* The media types supported by the parser used.
*/
@@ -180,6 +184,8 @@ public class LuceneIndexEditorContext {
status.setProperty("lastUpdated", ISO8601.format(Calendar.getInstance()), Type.DATE);
status.setProperty("indexedNodes",indexedNodes);
PERF_LOGGER.end(start, -1, "Closed IndexWriter for directory {}", definition);
+
+ textExtractionStats.log(reindex);
}
}
@@ -249,6 +255,10 @@ public class LuceneIndexEditorContext {
return definition;
}
+ public void recordTextExtractionStats(long timeInMillis, long size) {
+ textExtractionStats.addStats(timeInMillis, size);
+ }
+
private static Parser initializeTikaParser(IndexDefinition definition) {
ClassLoader current = Thread.currentThread().getContextClassLoader();
try {
@@ -295,4 +305,50 @@ public class LuceneIndexEditorContext {
}
return TikaConfig.getDefaultConfig();
}
+
+ static class TextExtractionStats {
+ /**
+ * Log stats only if time spent is more than 2 min
+ */
+ private static final long LOGGING_THRESHOLD = TimeUnit.MINUTES.toMillis(2);
+ private int count;
+ private long totalSize;
+ private long totalTime;
+
+ public void addStats(long timeInMillis, long size) {
+ count++;
+ totalSize += size;
+ totalTime += timeInMillis;
+ }
+
+ public void log(boolean reindex) {
+ if (log.isDebugEnabled()) {
+ log.debug("Text extraction stats {}", this);
+ } else if (anyParsingDone() && (reindex || isTakingLotsOfTime())) {
+ log.info("Text extraction stats {}", this);
+ }
+ }
+
+ private boolean isTakingLotsOfTime() {
+ return totalTime > LOGGING_THRESHOLD;
+ }
+
+ private boolean anyParsingDone() {
+ return count > 0;
+ }
+
+ @Override
+ public String toString() {
+ return String.format(" %d (%s, %s)", count,
+ timeInWords(totalTime), humanReadableByteCount(totalSize));
+ }
+
+ private static String timeInWords(long millis) {
+ return String.format("%d min, %d sec",
+ TimeUnit.MILLISECONDS.toMinutes(millis),
+ TimeUnit.MILLISECONDS.toSeconds(millis) -
+ TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(millis))
+ );
+ }
+ }
}