You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2015/02/10 23:54:23 UTC
svn commit: r1658831 - in /lucene/dev/trunk/lucene: CHANGES.txt
core/src/java/org/apache/lucene/index/CheckIndex.java
Author: mikemccand
Date: Tue Feb 10 22:54:22 2015
New Revision: 1658831
URL: http://svn.apache.org/r1658831
Log:
LUCENE-6233: speed up CheckIndex when the index has term vectors
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1658831&r1=1658830&r2=1658831&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Feb 10 22:54:22 2015
@@ -68,6 +68,9 @@ Optimizations
* LUCENE-6218: Don't decode frequencies or match all positions when scoring
is not needed. (Robert Muir)
+* LUCENE-6233 Speed up CheckIndex when the index has term vectors
+ (Robert Muir, Mike McCandless)
+
API Changes
* LUCENE-6204, LUCENE-6208: Simplify CompoundFormat: remove files()
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java?rev=1658831&r1=1658830&r2=1658831&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java Tue Feb 10 22:54:22 2015
@@ -448,6 +448,7 @@ public class CheckIndex implements Close
* time to run. */
public Status checkIndex(List<String> onlySegments) throws IOException {
ensureOpen();
+ long startNS = System.nanoTime();
NumberFormat nf = NumberFormat.getInstance(Locale.ROOT);
SegmentInfos sis = null;
Status result = new Status();
@@ -625,17 +626,20 @@ public class CheckIndex implements Close
segInfoStat.hasDeletions = true;
segInfoStat.deletionsGen = info.getDelGen();
}
+
+ long startOpenReaderNS = System.nanoTime();
if (infoStream != null)
infoStream.print(" test: open reader.........");
reader = new SegmentReader(info, IOContext.DEFAULT);
- msg(infoStream, "OK");
+ msg(infoStream, String.format(Locale.ROOT, "OK [took %.3f sec]", nsToSec(System.nanoTime()-startOpenReaderNS)));
segInfoStat.openReaderPassed = true;
+ long startIntegrityNS = System.nanoTime();
if (infoStream != null)
infoStream.print(" test: check integrity.....");
reader.checkIntegrity();
- msg(infoStream, "OK");
+ msg(infoStream, String.format(Locale.ROOT, "OK [took %.3f sec]", nsToSec(System.nanoTime()-startIntegrityNS)));
if (reader.maxDoc() != info.info.getDocCount()) {
throw new RuntimeException("SegmentReader.maxDoc() " + reader.maxDoc() + " != SegmentInfos.docCount " + info.info.getDocCount());
@@ -743,6 +747,8 @@ public class CheckIndex implements Close
msg(infoStream, "No problems were detected with this index.\n");
}
+ msg(infoStream, String.format(Locale.ROOT, "Took %.3f sec total.", nsToSec(System.nanoTime()-startNS)));
+
return result;
}
@@ -751,6 +757,7 @@ public class CheckIndex implements Close
* @lucene.experimental
*/
public static Status.LiveDocStatus testLiveDocs(LeafReader reader, PrintStream infoStream, boolean failFast) throws IOException {
+ long startNS = System.nanoTime();
final Status.LiveDocStatus status = new Status.LiveDocStatus();
try {
@@ -774,7 +781,7 @@ public class CheckIndex implements Close
}
status.numDeleted = reader.numDeletedDocs();
- msg(infoStream, "OK [" + (status.numDeleted) + " deleted docs]");
+ msg(infoStream, String.format(Locale.ROOT, "OK [%d deleted docs] [took %.3f sec]", status.numDeleted, nsToSec(System.nanoTime()-startNS)));
} else {
Bits liveDocs = reader.getLiveDocs();
if (liveDocs != null) {
@@ -785,7 +792,7 @@ public class CheckIndex implements Close
}
}
}
- msg(infoStream, "OK");
+ msg(infoStream, String.format(Locale.ROOT, "OK [took %.3f sec]", (nsToSec(System.nanoTime()-startNS))));
}
} catch (Throwable e) {
@@ -807,6 +814,7 @@ public class CheckIndex implements Close
* @lucene.experimental
*/
public static Status.FieldInfoStatus testFieldInfos(LeafReader reader, PrintStream infoStream, boolean failFast) throws IOException {
+ long startNS = System.nanoTime();
final Status.FieldInfoStatus status = new Status.FieldInfoStatus();
try {
@@ -818,7 +826,7 @@ public class CheckIndex implements Close
for (FieldInfo f : fieldInfos) {
f.checkConsistency();
}
- msg(infoStream, "OK [" + fieldInfos.size() + " fields]");
+ msg(infoStream, String.format(Locale.ROOT, "OK [%d fields] [took %.3f sec]", fieldInfos.size(), nsToSec(System.nanoTime()-startNS)));
status.totFields = fieldInfos.size();
} catch (Throwable e) {
if (failFast) {
@@ -839,6 +847,7 @@ public class CheckIndex implements Close
* @lucene.experimental
*/
public static Status.FieldNormStatus testFieldNorms(LeafReader reader, PrintStream infoStream, boolean failFast) throws IOException {
+ long startNS = System.nanoTime();
final Status.FieldNormStatus status = new Status.FieldNormStatus();
try {
@@ -857,7 +866,7 @@ public class CheckIndex implements Close
}
}
- msg(infoStream, "OK [" + status.totFields + " fields]");
+ msg(infoStream, String.format(Locale.ROOT, "OK [%d fields] [took %.3f sec]", status.totFields, nsToSec(System.nanoTime()-startNS)));
} catch (Throwable e) {
if (failFast) {
IOUtils.reThrow(e);
@@ -878,6 +887,12 @@ public class CheckIndex implements Close
*/
private static Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, boolean doPrint, boolean isVectors, PrintStream infoStream, boolean verbose) throws IOException {
// TODO: we should probably return our own stats thing...?!
+ long startNS;
+ if (doPrint) {
+ startNS = System.nanoTime();
+ } else {
+ startNS = 0;
+ }
final Status.TermIndexStatus status = new Status.TermIndexStatus();
int computedFieldCount = 0;
@@ -920,27 +935,33 @@ public class CheckIndex implements Close
final boolean hasPayloads = terms.hasPayloads();
final boolean hasOffsets = terms.hasOffsets();
- BytesRef bb = terms.getMin();
+ BytesRef maxTerm;
BytesRef minTerm;
- if (bb != null) {
- assert bb.isValid();
- minTerm = BytesRef.deepCopyOf(bb);
- } else {
+ if (isVectors) {
+ // Term vectors impls can be very slow for getMax
+ maxTerm = null;
minTerm = null;
- }
-
- BytesRef maxTerm;
- bb = terms.getMax();
- if (bb != null) {
- assert bb.isValid();
- maxTerm = BytesRef.deepCopyOf(bb);
- if (minTerm == null) {
- throw new RuntimeException("field \"" + field + "\" has null minTerm but non-null maxTerm");
- }
} else {
- maxTerm = null;
- if (minTerm != null) {
- throw new RuntimeException("field \"" + field + "\" has non-null minTerm but null maxTerm");
+ BytesRef bb = terms.getMin();
+ if (bb != null) {
+ assert bb.isValid();
+ minTerm = BytesRef.deepCopyOf(bb);
+ } else {
+ minTerm = null;
+ }
+
+ bb = terms.getMax();
+ if (bb != null) {
+ assert bb.isValid();
+ maxTerm = BytesRef.deepCopyOf(bb);
+ if (minTerm == null) {
+ throw new RuntimeException("field \"" + field + "\" has null minTerm but non-null maxTerm");
+ }
+ } else {
+ maxTerm = null;
+ if (minTerm != null) {
+ throw new RuntimeException("field \"" + field + "\" has non-null minTerm but null maxTerm");
+ }
}
}
@@ -975,7 +996,7 @@ public class CheckIndex implements Close
}
final TermsEnum termsEnum = terms.iterator(null);
-
+
boolean hasOrd = true;
final long termCountStart = status.delTermCount + status.termCount;
@@ -1005,19 +1026,21 @@ public class CheckIndex implements Close
}
lastTerm.copyBytes(term);
}
+
+ if (isVectors == false) {
+ if (minTerm == null) {
+ // We checked this above:
+ assert maxTerm == null;
+ throw new RuntimeException("field=\"" + field + "\": invalid term: term=" + term + ", minTerm=" + minTerm);
+ }
- if (minTerm == null) {
- // We checked this above:
- assert maxTerm == null;
- throw new RuntimeException("field=\"" + field + "\": invalid term: term=" + term + ", minTerm=" + minTerm);
- }
-
- if (term.compareTo(minTerm) < 0) {
- throw new RuntimeException("field=\"" + field + "\": invalid term: term=" + term + ", minTerm=" + minTerm);
- }
+ if (term.compareTo(minTerm) < 0) {
+ throw new RuntimeException("field=\"" + field + "\": invalid term: term=" + term + ", minTerm=" + minTerm);
+ }
- if (term.compareTo(maxTerm) > 0) {
- throw new RuntimeException("field=\"" + field + "\": invalid term: term=" + term + ", maxTerm=" + maxTerm);
+ if (term.compareTo(maxTerm) > 0) {
+ throw new RuntimeException("field=\"" + field + "\": invalid term: term=" + term + ", maxTerm=" + maxTerm);
+ }
}
final int docFreq = termsEnum.docFreq();
@@ -1243,6 +1266,11 @@ public class CheckIndex implements Close
throw new RuntimeException("term " + term + ": advance(docID=" + skipDocID + "), then .next() returned docID=" + nextDocID + " vs prev docID=" + docID);
}
}
+
+ if (isVectors) {
+ // Only 1 doc in the postings for term vectors, so we only test 1 advance:
+ break;
+ }
}
} else {
for(int idx=0;idx<7;idx++) {
@@ -1263,6 +1291,10 @@ public class CheckIndex implements Close
throw new RuntimeException("term " + term + ": advance(docID=" + skipDocID + "), then .next() returned docID=" + nextDocID + " vs prev docID=" + docID);
}
}
+ if (isVectors) {
+ // Only 1 doc in the postings for term vectors, so we only test 1 advance:
+ break;
+ }
}
}
}
@@ -1408,7 +1440,8 @@ public class CheckIndex implements Close
}
if (doPrint) {
- msg(infoStream, "OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]");
+ msg(infoStream, String.format(Locale.ROOT, "OK [%d terms; %d terms/docs pairs; %d tokens] [took %.3f sec]",
+ status.termCount, status.totFreq, status.totPos, nsToSec(System.nanoTime()-startNS)));
}
if (verbose && status.blockTreeStats != null && infoStream != null && status.termCount > 0) {
@@ -1476,6 +1509,7 @@ public class CheckIndex implements Close
* @lucene.experimental
*/
public static Status.StoredFieldStatus testStoredFields(LeafReader reader, PrintStream infoStream, boolean failFast) throws IOException {
+ long startNS = System.nanoTime();
final Status.StoredFieldStatus status = new Status.StoredFieldStatus();
try {
@@ -1500,8 +1534,10 @@ public class CheckIndex implements Close
throw new RuntimeException("docCount=" + status.docCount + " but saw " + status.docCount + " undeleted docs");
}
- msg(infoStream, "OK [" + status.totFields + " total field count; avg " +
- NumberFormat.getInstance(Locale.ROOT).format((((float) status.totFields)/status.docCount)) + " fields per doc]");
+ msg(infoStream, String.format(Locale.ROOT, "OK [%d total field count; avg %.1f fields per doc] [took %.3f sec]",
+ status.totFields,
+ (((float) status.totFields)/status.docCount),
+ nsToSec(System.nanoTime() - startNS)));
} catch (Throwable e) {
if (failFast) {
IOUtils.reThrow(e);
@@ -1523,6 +1559,7 @@ public class CheckIndex implements Close
public static Status.DocValuesStatus testDocValues(LeafReader reader,
PrintStream infoStream,
boolean failFast) throws IOException {
+ long startNS = System.nanoTime();
final Status.DocValuesStatus status = new Status.DocValuesStatus();
try {
if (infoStream != null) {
@@ -1543,12 +1580,15 @@ public class CheckIndex implements Close
}
}
- msg(infoStream, "OK [" + status.totalValueFields + " docvalues fields; "
- + status.totalBinaryFields + " BINARY; "
- + status.totalNumericFields + " NUMERIC; "
- + status.totalSortedFields + " SORTED; "
- + status.totalSortedNumericFields + " SORTED_NUMERIC; "
- + status.totalSortedSetFields + " SORTED_SET]");
+ msg(infoStream, String.format(Locale.ROOT,
+ "OK [%d docvalues fields; %d BINARY; %d NUMERIC; %d SORTED; %d SORTED_NUMERIC; %d SORTED_SET] [took %.3f sec]",
+ status.totalValueFields,
+ status.totalBinaryFields,
+ status.totalNumericFields,
+ status.totalSortedFields,
+ status.totalSortedNumericFields,
+ status.totalSortedSetFields,
+ nsToSec(System.nanoTime()-startNS)));
} catch (Throwable e) {
if (failFast) {
IOUtils.reThrow(e);
@@ -1797,6 +1837,7 @@ public class CheckIndex implements Close
* @lucene.experimental
*/
public static Status.TermVectorStatus testTermVectors(LeafReader reader, PrintStream infoStream, boolean verbose, boolean crossCheckTermVectors, boolean failFast) throws IOException {
+ long startNS = System.nanoTime();
final Status.TermVectorStatus status = new Status.TermVectorStatus();
final FieldInfos fieldInfos = reader.getFieldInfos();
final Bits onlyDocIsDeleted = new FixedBitSet(1);
@@ -1839,8 +1880,11 @@ public class CheckIndex implements Close
// First run with no deletions:
checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose);
- // Again, with the one doc deleted:
- checkFields(tfv, onlyDocIsDeleted, 1, fieldInfos, false, true, infoStream, verbose);
+ if (j == 0) {
+ // Also test with the 1 doc deleted; we only do this for first doc because this really is just looking for a [slightly] buggy
+ // TermVectors impl that fails to respect the incoming live docs:
+ checkFields(tfv, onlyDocIsDeleted, 1, fieldInfos, false, true, infoStream, verbose);
+ }
// Only agg stats if the doc is live:
final boolean doStats = liveDocs == null || liveDocs.get(j);
@@ -2005,8 +2049,8 @@ public class CheckIndex implements Close
}
}
float vectorAvg = status.docCount == 0 ? 0 : status.totVectors / (float)status.docCount;
- msg(infoStream, "OK [" + status.totVectors + " total vector count; avg " +
- NumberFormat.getInstance(Locale.ROOT).format(vectorAvg) + " term/freq vector fields per doc]");
+ msg(infoStream, String.format(Locale.ROOT, "OK [%d total term vector count; avg %.1f term/freq vector fields per doc] [took %.3f sec]",
+ status.totVectors, vectorAvg, nsToSec(System.nanoTime() - startNS)));
} catch (Throwable e) {
if (failFast) {
IOUtils.reThrow(e);
@@ -2221,4 +2265,8 @@ public class CheckIndex implements Close
}
}
}
+
+ private static double nsToSec(long ns) {
+ return ns/1000000000.0;
+ }
}