You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2007/12/03 11:09:11 UTC
svn commit: r600465 - in /lucene/java/trunk/src:
java/org/apache/lucene/index/DocumentsWriter.java
java/org/apache/lucene/index/IndexWriter.java
test/org/apache/lucene/index/TestIndexWriter.java
Author: mikemccand
Date: Mon Dec 3 02:09:10 2007
New Revision: 600465
URL: http://svn.apache.org/viewvc?rev=600465&view=rev
Log:
LUCENE-1072: make sure IndexWriter is still usable after hitting a too-long term
Modified:
lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=600465&r1=600464&r2=600465&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java Mon Dec 3 02:09:10 2007
@@ -354,7 +354,7 @@
state.tvfLocal.reset();
state.fdtLocal.reset();
}
-
+ docStoreSegment = null;
files = null;
} finally {
@@ -518,6 +518,7 @@
int numAllFieldData;
FieldData[] fieldDataHash; // Hash FieldData instances by field name
int fieldDataHashMask;
+ int maxTermHit; // Set to > 0 if this doc has a too-large term
boolean doFlushAfter;
@@ -608,6 +609,7 @@
numStoredFields = 0;
numFieldData = 0;
numVectorFields = 0;
+ maxTermHit = 0;
List docFields = doc.getFields();
final int numDocFields = docFields.size();
@@ -1483,17 +1485,23 @@
getPostings(postingsFreeList);
}
- // Pull next free Posting from free list
- p = postingsFreeList[--postingsFreeCount];
-
final int textLen1 = 1+tokenTextLen;
if (textLen1 + charPool.byteUpto > CHAR_BLOCK_SIZE) {
- if (textLen1 > CHAR_BLOCK_SIZE)
- throw new IllegalArgumentException("term length " + tokenTextLen + " exceeds max term length " + (CHAR_BLOCK_SIZE-1));
+ if (textLen1 > CHAR_BLOCK_SIZE) {
+ maxTermHit = tokenTextLen;
+ // Just skip this term; we will throw an
+ // exception after processing all accepted
+ // terms in the doc
+ return;
+ }
charPool.nextBuffer();
}
final char[] text = charPool.buffer;
final int textUpto = charPool.byteUpto;
+
+ // Pull next free Posting from free list
+ p = postingsFreeList[--postingsFreeCount];
+
p.textStart = textUpto + charPool.byteOffset;
charPool.byteUpto += textLen1;
@@ -2181,26 +2189,28 @@
/** Returns true if the caller (IndexWriter) should now
* flush. */
- boolean addDocument(Document doc, Analyzer analyzer)
+ int addDocument(Document doc, Analyzer analyzer)
throws CorruptIndexException, IOException {
return updateDocument(doc, analyzer, null);
}
- boolean updateDocument(Term t, Document doc, Analyzer analyzer)
+ int updateDocument(Term t, Document doc, Analyzer analyzer)
throws CorruptIndexException, IOException {
return updateDocument(doc, analyzer, t);
}
- boolean updateDocument(Document doc, Analyzer analyzer, Term delTerm)
+ int updateDocument(Document doc, Analyzer analyzer, Term delTerm)
throws CorruptIndexException, IOException {
// This call is synchronized but fast
final ThreadState state = getThreadState(doc, delTerm);
boolean success = false;
+ int maxTermHit;
try {
// This call is not synchronized and does all the work
state.processDocument(analyzer);
// This call synchronized but fast
+ maxTermHit = state.maxTermHit;
finishDocument(state);
success = true;
} finally {
@@ -2209,7 +2219,11 @@
abort();
}
}
- return state.doFlushAfter || timeToFlushDeletes();
+
+ int status = maxTermHit<<1;
+ if (state.doFlushAfter || timeToFlushDeletes())
+ status += 1;
+ return status;
}
synchronized int getNumBufferedDeleteTerms() {
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java?rev=600465&r1=600464&r2=600465&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java Mon Dec 3 02:09:10 2007
@@ -1426,10 +1426,10 @@
*/
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
ensureOpen();
- boolean doFlush = false;
+ int status = 0;
boolean success = false;
try {
- doFlush = docWriter.addDocument(doc, analyzer);
+ status = docWriter.addDocument(doc, analyzer);
success = true;
} finally {
if (!success) {
@@ -1446,8 +1446,9 @@
}
}
}
- if (doFlush)
+ if ((status & 1) != 0)
flush(true, false);
+ checkMaxTermLength(status);
}
/**
@@ -1511,10 +1512,10 @@
public void updateDocument(Term term, Document doc, Analyzer analyzer)
throws CorruptIndexException, IOException {
ensureOpen();
- boolean doFlush = false;
+ int status = 0;
boolean success = false;
try {
- doFlush = docWriter.updateDocument(term, doc, analyzer);
+ status = docWriter.updateDocument(term, doc, analyzer);
success = true;
} finally {
if (!success) {
@@ -1531,8 +1532,17 @@
}
}
}
- if (doFlush)
+ if ((status & 1) != 0)
flush(true, false);
+ checkMaxTermLength(status);
+ }
+
+ /** Throws IllegalArgumentException if the return status
+ * from DocumentsWriter.{add,update}Document indicates
+ * that a too-long term was encountered */
+ final private void checkMaxTermLength(int status) {
+ if (status > 1)
+ throw new IllegalArgumentException("at least one term (length " + (status>>1) + ") exceeds max term length " + (DocumentsWriter.CHAR_BLOCK_SIZE-1) + "; these terms were skipped");
}
// for test purpose
Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=600465&r1=600464&r2=600465&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java Mon Dec 3 02:09:10 2007
@@ -28,8 +28,6 @@
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.TermQuery;
@@ -221,12 +219,8 @@
methodName = "addIndexesNoOptimize(Directory[])";
}
- int cycleCount = 0;
-
while(!done) {
- cycleCount++;
-
// Make a new dir that will enforce disk usage:
MockRAMDirectory dir = new MockRAMDirectory(startDir);
writer = new IndexWriter(dir, autoCommit, new WhitespaceAnalyzer(), false);
@@ -524,7 +518,7 @@
String[] startFiles = dir.list();
SegmentInfos infos = new SegmentInfos();
infos.read(dir);
- IndexFileDeleter d = new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null);
+ new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null);
String[] endFiles = dir.list();
Arrays.sort(startFiles);
@@ -543,17 +537,44 @@
RAMDirectory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
- char[] chars = new char[16384];
+ char[] chars = new char[16383];
Arrays.fill(chars, 'x');
Document doc = new Document();
- String contents = "a b c " + new String(chars);
+ final String bigTerm = new String(chars);
+
+ // Max length term is 16383, so this content produces
+ // a too-long term:
+ String contents = "abc xyz x" + bigTerm;
doc.add(new Field("content", contents, Field.Store.NO, Field.Index.TOKENIZED));
try {
writer.addDocument(doc);
fail("did not hit expected exception");
} catch (IllegalArgumentException e) {
}
+
+ // Make sure we can add another normal document
+ doc = new Document();
+ doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.TOKENIZED));
+ writer.addDocument(doc);
writer.close();
+
+ IndexReader reader = IndexReader.open(dir);
+ // Make sure all terms < max size were indexed
+ assertEquals(2, reader.docFreq(new Term("content", "abc")));
+ assertEquals(1, reader.docFreq(new Term("content", "bbb")));
+ reader.close();
+
+ // Make sure we can add a document with exactly the
+ // maximum length term, and search on that term:
+ doc = new Document();
+ doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.TOKENIZED));
+ writer = new IndexWriter(dir, new StandardAnalyzer());
+ writer.addDocument(doc);
+ writer.close();
+ reader = IndexReader.open(dir);
+ assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
+ reader.close();
+
dir.close();
}
@@ -1342,7 +1363,6 @@
public void testDiverseDocs() throws IOException {
RAMDirectory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
- long t0 = System.currentTimeMillis();
writer.setRAMBufferSizeMB(0.5);
Random rand = new Random(31415);
for(int i=0;i<3;i++) {
@@ -1381,7 +1401,6 @@
}
writer.close();
- long t1 = System.currentTimeMillis();
IndexSearcher searcher = new IndexSearcher(dir);
Hits hits = searcher.search(new TermQuery(new Term("field", "aaa")));
assertEquals(300, hits.length());
@@ -1491,7 +1510,6 @@
addDoc(writer);
}
writer.close();
- IndexReader reader = IndexReader.open(dir);
Term searchTerm = new Term("content", "aaa");
IndexSearcher searcher = new IndexSearcher(dir);
Hits hits = searcher.search(new TermQuery(searchTerm));