Posted to java-commits@lucene.apache.org by mi...@apache.org on 2007/12/20 14:07:48 UTC
svn commit: r605912 - in /lucene/java/trunk/src: java/org/apache/lucene/index/ test/org/apache/lucene/index/
Author: mikemccand
Date: Thu Dec 20 05:07:29 2007
New Revision: 605912
URL: http://svn.apache.org/viewvc?rev=605912&view=rev
Log:
LUCENE-1094: don't corrupt fdt file on hitting an exception partway through indexing a document
Modified:
lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
lucene/java/trunk/src/java/org/apache/lucene/index/FieldsReader.java
lucene/java/trunk/src/java/org/apache/lucene/index/FieldsWriter.java
lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java
lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=605912&r1=605911&r2=605912&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java Thu Dec 20 05:07:29 2007
@@ -562,13 +562,14 @@
// If we hit an exception while appending to the
// stored fields or term vectors files, we have to
- // abort because it means those files are possibly
- // inconsistent.
+ // abort all documents since we last flushed because
+ // it means those files are possibly inconsistent.
abortOnExc = true;
// Append stored fields to the real FieldsWriter:
- fieldsWriter.flushDocument(fdtLocal);
+ fieldsWriter.flushDocument(numStoredFields, fdtLocal);
fdtLocal.reset();
+ numStoredFields = 0;
// Append term vectors to the real outputs:
if (tvx != null) {
@@ -589,7 +590,6 @@
tvfLocal.reset();
}
}
- abortOnExc = false;
// Append norms for the fields we saw:
for(int i=0;i<numFieldData;i++) {
@@ -603,6 +603,7 @@
bn.add(norm);
}
}
+ abortOnExc = false;
if (bufferIsFull && !flushPending) {
flushPending = true;
@@ -621,6 +622,9 @@
numVectorFields = 0;
maxTermHit = 0;
+ assert 0 == fdtLocal.length();
+ assert 0 == tvfLocal.length();
+
List docFields = doc.getFields();
final int numDocFields = docFields.size();
boolean docHasVectors = false;
@@ -636,7 +640,6 @@
FieldInfo fi = fieldInfos.add(field.name(), field.isIndexed(), field.isTermVectorStored(),
field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(),
field.getOmitNorms(), false);
- numStoredFields += field.isStored() ? 1:0;
if (fi.isIndexed && !fi.omitNorms) {
// Maybe grow our buffered norms
if (norms.length <= fi.number) {
@@ -968,7 +971,7 @@
final int numFields = numFieldData;
- fdtLocal.writeVInt(numStoredFields);
+ assert 0 == fdtLocal.length();
if (tvx != null)
// If we are writing vectors then we must visit
@@ -1269,19 +1272,48 @@
if (field.isIndexed())
invertField(field, analyzer, maxFieldLength);
- if (field.isStored())
- localFieldsWriter.writeField(fieldInfo, field);
+ if (field.isStored()) {
+ numStoredFields++;
+ boolean success = false;
+ try {
+ localFieldsWriter.writeField(fieldInfo, field);
+ success = true;
+ } finally {
+ // If we hit an exception inside
+ // localFieldsWriter.writeField, the
+ // contents of fdtLocal can be corrupt, so
+ // we must discard all stored fields for
+ // this document:
+ if (!success) {
+ numStoredFields = 0;
+ fdtLocal.reset();
+ }
+ }
+ }
docFieldsFinal[j] = null;
}
} finally {
if (postingsVectorsUpto > 0) {
// Add term vectors for this field
- writeVectors(fieldInfo);
- if (postingsVectorsUpto > maxPostingsVectors)
- maxPostingsVectors = postingsVectorsUpto;
- postingsVectorsUpto = 0;
- vectorsPool.reset();
+ boolean success = false;
+ try {
+ writeVectors(fieldInfo);
+ success = true;
+ } finally {
+ if (!success) {
+ // If we hit an exception inside
+ // writeVectors, the contents of tvfLocal
+ // can be corrupt, so we must discard all
+ // term vectors for this document:
+ numVectorFields = 0;
+ tvfLocal.reset();
+ }
+ if (postingsVectorsUpto > maxPostingsVectors)
+ maxPostingsVectors = postingsVectorsUpto;
+ postingsVectorsUpto = 0;
+ vectorsPool.reset();
+ }
}
}
}
@@ -1449,7 +1481,8 @@
// If we hit an exception below, it's possible the
// posting list or term vectors data will be
// partially written and thus inconsistent if
- // flushed, so we have to abort:
+ // flushed, so we have to abort all documents
+ // since the last flush:
abortOnExc = true;
if (p != null) { // term seen since last flush
@@ -2243,12 +2276,12 @@
boolean success = false;
int maxTermHit;
try {
- // This call is not synchronized and does all the work
try {
+ // This call is not synchronized and does all the work
state.processDocument(analyzer);
} finally {
maxTermHit = state.maxTermHit;
- // This call synchronized but fast
+ // This call is synchronized but fast
finishDocument(state);
}
success = true;
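
The DocumentsWriter hunks above all apply one buffer-and-reset pattern: per-document stored fields (and term vectors) are written to an in-memory buffer first, and if any write throws, the buffer and its field count are discarded together, so a partially written document can never reach the real .fdt file. A minimal standalone sketch of that pattern, assuming hypothetical names (PerDocBuffer, storeField, FieldsSink are illustrative, not the actual DocumentsWriter API):

  import java.io.ByteArrayOutputStream;
  import java.io.IOException;

  // Hypothetical sketch of the buffer-and-reset pattern used above;
  // these names are illustrative, not Lucene's real API.
  class PerDocBuffer {
    private final ByteArrayOutputStream fdtLocal = new ByteArrayOutputStream();
    private int numStoredFields;

    void storeField(byte[] encodedField) throws IOException {
      numStoredFields++;
      boolean success = false;
      try {
        fdtLocal.write(encodedField);
        success = true;
      } finally {
        if (!success) {
          // A partial write leaves the buffer corrupt, so discard all
          // stored fields for this document, not just the failed one:
          numStoredFields = 0;
          fdtLocal.reset();
        }
      }
    }

    // Only a fully buffered document is ever handed to the real writer.
    void flushTo(FieldsSink sink) throws IOException {
      sink.flushDocument(numStoredFields, fdtLocal.toByteArray());
      numStoredFields = 0;
      fdtLocal.reset();
    }

    interface FieldsSink {
      void flushDocument(int numStoredFields, byte[] buffer) throws IOException;
    }
  }
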
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/FieldsReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FieldsReader.java?rev=605912&r1=605911&r2=605912&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FieldsReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FieldsReader.java Thu Dec 20 05:07:29 2007
@@ -155,6 +155,8 @@
FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name);
byte bits = fieldsStream.readByte();
+ assert bits <= FieldsWriter.FIELD_IS_COMPRESSED + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY;
+
boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
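
For context on the new assert: each stored field in the .fdt begins with a one-byte bitmask of per-field flags, so any bit set beyond the known FIELD_IS_* flags means the reader is decoding garbage, which is exactly what a corrupted partial document would produce. A condensed sketch of the check (the flag values here are assumptions for illustration, standing in for FieldsWriter's constants):

  import java.io.DataInput;
  import java.io.IOException;

  // Condensed sketch; the flag values are assumptions for illustration,
  // standing in for FieldsWriter's FIELD_IS_* constants.
  class FieldBitsCheck {
    static final byte FIELD_IS_TOKENIZED  = 0x1;
    static final byte FIELD_IS_BINARY     = 0x2;
    static final byte FIELD_IS_COMPRESSED = 0x4;

    static byte readFieldBits(DataInput fieldsStream) throws IOException {
      byte bits = fieldsStream.readByte();
      // Any higher bit set means the fields stream is out of sync / corrupt:
      assert bits <= (FIELD_IS_COMPRESSED | FIELD_IS_TOKENIZED | FIELD_IS_BINARY);
      return bits;
    }
  }
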
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/FieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FieldsWriter.java?rev=605912&r1=605911&r2=605912&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FieldsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FieldsWriter.java Thu Dec 20 05:07:29 2007
@@ -60,8 +60,9 @@
// and adds a new entry for this document into the index
// stream. This assumes the buffer was already written
// in the correct fields format.
- void flushDocument(RAMOutputStream buffer) throws IOException {
+ void flushDocument(int numStoredFields, RAMOutputStream buffer) throws IOException {
indexStream.writeLong(fieldsStream.getFilePointer());
+ fieldsStream.writeVInt(numStoredFields);
buffer.writeTo(fieldsStream);
}
@@ -141,6 +142,7 @@
position += lengths[i];
}
fieldsStream.copyBytes(stream, position-start);
+ assert fieldsStream.getFilePointer() == position;
}
final void addDocument(Document doc) throws IOException {
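
The key format change in this file: the per-document stored-field count is no longer pre-written into the buffered bytes; flushDocument now writes it directly into the fields stream at flush time. Condensed from the hunk above, with comments added (indexStream and fieldsStream are FieldsWriter's output streams for the .fdx and .fdt files):

  // Condensed from the hunk above. The .fdx index stream records where
  // this document starts in the .fdt fields stream, then the count and
  // the already-buffered field bytes are written together. Because the
  // count only hits disk here, an aborted document never reaches the fdt.
  void flushDocument(int numStoredFields, RAMOutputStream buffer) throws IOException {
    indexStream.writeLong(fieldsStream.getFilePointer());
    fieldsStream.writeVInt(numStoredFields);
    buffer.writeTo(fieldsStream);
  }
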
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java?rev=605912&r1=605911&r2=605912&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java Thu Dec 20 05:07:29 2007
@@ -244,15 +244,21 @@
// array will be non-null at position i:
SegmentReader[] matchingSegmentReaders = new SegmentReader[readers.size()];
+ // If this reader is a SegmentReader, and all of its
+ // field name -> number mappings match the "merged"
+ // FieldInfos, then we can do a bulk copy of the
+ // stored fields:
for (int i = 0; i < readers.size(); i++) {
IndexReader reader = (IndexReader) readers.elementAt(i);
- boolean same = reader.getFieldNames(IndexReader.FieldOption.ALL).size() == fieldInfos.size() && reader instanceof SegmentReader;
- if (same) {
+ if (reader instanceof SegmentReader) {
SegmentReader segmentReader = (SegmentReader) reader;
- for (int j = 0; same && j < fieldInfos.size(); j++)
- same = fieldInfos.fieldName(j).equals(segmentReader.getFieldInfos().fieldName(j));
- if (same)
+ boolean same = true;
+ FieldInfos segmentFieldInfos = segmentReader.getFieldInfos();
+ for (int j = 0; same && j < segmentFieldInfos.size(); j++)
+ same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j));
+ if (same) {
matchingSegmentReaders[i] = segmentReader;
+ }
}
}
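
The merging change above replaces the old "same field count" shortcut with a name-by-name comparison: a segment's stored fields can be bulk-copied only if every field number in the segment maps to the same field name in the merged FieldInfos. The equivalent check, pulled out as a standalone method (canBulkCopyFields is a hypothetical helper name, not part of SegmentMerger):

  // Hypothetical helper equivalent to the loop above: bulk copying raw
  // stored-field bytes is only safe when field numbers mean the same
  // thing in the segment being merged and in the merged index.
  static boolean canBulkCopyFields(FieldInfos merged, FieldInfos segment) {
    for (int j = 0; j < segment.size(); j++)
      if (!merged.fieldName(j).equals(segment.fieldName(j)))
        return false;
    return true;
  }
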
Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=605912&r1=605911&r2=605912&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java Thu Dec 20 05:07:29 2007
@@ -26,6 +26,7 @@
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -1808,7 +1809,7 @@
if (doFail) {
StackTraceElement[] trace = new Exception().getStackTrace();
for (int i = 0; i < trace.length; i++) {
- if ("appendPostings".equals(trace[i].getMethodName()) && count++ == 30) {
+ if ("org.apache.lucene.index.DocumentsWriter".equals(trace[i].getClassName()) && "appendPostings".equals(trace[i].getMethodName()) && count++ == 30) {
doFail = false;
throw new IOException("now failing during flush");
}
@@ -1845,5 +1846,139 @@
writer.close();
IndexReader reader = IndexReader.open(dir);
assertEquals(198, reader.docFreq(new Term("content", "aa")));
+ reader.close();
+ }
+
+ private class CrashingFilter extends TokenFilter {
+ String fieldName;
+ int count;
+
+ public CrashingFilter(String fieldName, TokenStream input) {
+ super(input);
+ this.fieldName = fieldName;
+ }
+
+ public Token next(Token result) throws IOException {
+ if (this.fieldName.equals("crash") && count++ >= 4)
+ throw new IOException("I'm experiencing problems");
+ return input.next(result);
+ }
+ }
+
+ public void testDocumentsWriterExceptions() throws IOException {
+ Analyzer analyzer = new Analyzer() {
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ return new CrashingFilter(fieldName, new WhitespaceTokenizer(reader));
+ }
+ };
+
+ for(int i=0;i<2;i++) {
+ MockRAMDirectory dir = new MockRAMDirectory();
+ IndexWriter writer = new IndexWriter(dir, analyzer);
+ //writer.setInfoStream(System.out);
+ Document doc = new Document();
+ doc.add(new Field("contents", "here are some contents", Field.Store.YES,
+ Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+ writer.addDocument(doc);
+ writer.addDocument(doc);
+ doc.add(new Field("crash", "this should crash after 4 terms", Field.Store.YES,
+ Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+ doc.add(new Field("other", "this will not get indexed", Field.Store.YES,
+ Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+ try {
+ writer.addDocument(doc);
+ fail("did not hit expected exception");
+ } catch (IOException ioe) {
+ }
+
+ if (0 == i) {
+ doc = new Document();
+ doc.add(new Field("contents", "here are some contents", Field.Store.YES,
+ Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+ writer.addDocument(doc);
+ writer.addDocument(doc);
+ }
+ writer.close();
+
+ IndexReader reader = IndexReader.open(dir);
+ int expected = 3+(1-i)*2;
+ assertEquals(expected, reader.docFreq(new Term("contents", "here")));
+ assertEquals(expected, reader.maxDoc());
+ for(int j=0;j<reader.maxDoc();j++) {
+ reader.document(j);
+ reader.getTermFreqVectors(j);
+ }
+ reader.close();
+
+ writer = new IndexWriter(dir, analyzer);
+ writer.setMaxBufferedDocs(10);
+ doc = new Document();
+ doc.add(new Field("contents", "here are some contents", Field.Store.YES,
+ Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+ for(int j=0;j<17;j++)
+ writer.addDocument(doc);
+ writer.optimize();
+ writer.close();
+
+ reader = IndexReader.open(dir);
+ expected = 20+(1-i)*2;
+ assertEquals(expected, reader.docFreq(new Term("contents", "here")));
+ assertEquals(expected, reader.maxDoc());
+ for(int j=0;j<reader.maxDoc();j++) {
+ reader.document(j);
+ reader.getTermFreqVectors(j);
+ }
+ reader.close();
+
+ dir.close();
+ }
+ }
+
+ public void testVariableSchema() throws IOException {
+ MockRAMDirectory dir = new MockRAMDirectory();
+ int delID = 0;
+ for(int i=0;i<20;i++) {
+ IndexWriter writer = new IndexWriter(dir, false, new WhitespaceAnalyzer());
+ writer.setMaxBufferedDocs(2);
+ writer.setMergeFactor(2);
+ writer.setUseCompoundFile(false);
+ Document doc = new Document();
+ String contents = "aa bb cc dd ee ff gg hh ii jj kk";
+
+ if (i == 7) {
+ // Add empty docs here
+ doc.add(new Field("content3", "", Field.Store.NO,
+ Field.Index.TOKENIZED));
+ } else {
+ Field.Store storeVal;
+ if (i%2 == 0) {
+ doc.add(new Field("content4", contents, Field.Store.YES,
+ Field.Index.TOKENIZED));
+ storeVal = Field.Store.YES;
+ } else
+ storeVal = Field.Store.NO;
+ doc.add(new Field("content1", contents, storeVal,
+ Field.Index.TOKENIZED));
+ doc.add(new Field("content3", "", Field.Store.YES,
+ Field.Index.TOKENIZED));
+ doc.add(new Field("content5", "", storeVal,
+ Field.Index.TOKENIZED));
+ }
+
+ for(int j=0;j<4;j++)
+ writer.addDocument(doc);
+
+ writer.close();
+ IndexReader reader = IndexReader.open(dir);
+ reader.deleteDocument(delID++);
+ reader.close();
+
+ if (0 == i % 4) {
+ writer = new IndexWriter(dir, false, new WhitespaceAnalyzer());
+ writer.setUseCompoundFile(false);
+ writer.optimize();
+ writer.close();
+ }
+ }
}
}