You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ho...@apache.org on 2014/03/05 20:37:55 UTC
svn commit: r1574637 - in /lucene/dev/branches/branch_4x: ./ dev-tools/
lucene/ lucene/analysis/
lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/
lucene/analysis/icu/src/java/org/apache/lucene/collation/ lucene/backwards/
luce...
Author: hossman
Date: Wed Mar 5 19:37:53 2014
New Revision: 1574637
URL: http://svn.apache.org/r1574637
Log:
LUCENE-5472: IndexWriter.addDocument will now throw an IllegalArgumentException if a Term to be indexed exceeds IndexWriter.MAX_TERM_LENGTH (merge r1574595)
Added:
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestExceedMaxTermLength.java
- copied unchanged from r1574595, lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestExceedMaxTermLength.java
lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/update/TestExceedMaxTermLength.java
- copied unchanged from r1574595, lucene/dev/trunk/solr/core/src/test/org/apache/solr/update/TestExceedMaxTermLength.java
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/dev-tools/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/BUILD.txt (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/JRE_VERSION_MIGRATION.txt (props changed)
lucene/dev/branches/branch_4x/lucene/LICENSE.txt (props changed)
lucene/dev/branches/branch_4x/lucene/MIGRATE.txt (props changed)
lucene/dev/branches/branch_4x/lucene/NOTICE.txt (props changed)
lucene/dev/branches/branch_4x/lucene/README.txt (props changed)
lucene/dev/branches/branch_4x/lucene/SYSTEM_REQUIREMENTS.txt (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/ASCIITLD.jflex-macro (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/SUPPLEMENTARY.jflex-macro (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.java (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.jflex (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.java (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.jflex (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/package.html (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilterFactory.java (props changed)
lucene/dev/branches/branch_4x/lucene/backwards/ (props changed)
lucene/dev/branches/branch_4x/lucene/benchmark/ (props changed)
lucene/dev/branches/branch_4x/lucene/build.xml (props changed)
lucene/dev/branches/branch_4x/lucene/classification/ (props changed)
lucene/dev/branches/branch_4x/lucene/classification/build.xml (props changed)
lucene/dev/branches/branch_4x/lucene/classification/ivy.xml (props changed)
lucene/dev/branches/branch_4x/lucene/classification/src/ (props changed)
lucene/dev/branches/branch_4x/lucene/codecs/ (props changed)
lucene/dev/branches/branch_4x/lucene/common-build.xml (props changed)
lucene/dev/branches/branch_4x/lucene/core/ (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.cfs.zip (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.nocfs.zip (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.cfs.zip (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.nocfs.zip (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSort.java (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSortDocValues.java (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestTotalHitCountCollector.java (props changed)
lucene/dev/branches/branch_4x/lucene/demo/ (props changed)
lucene/dev/branches/branch_4x/lucene/expressions/ (props changed)
lucene/dev/branches/branch_4x/lucene/facet/ (props changed)
lucene/dev/branches/branch_4x/lucene/grouping/ (props changed)
lucene/dev/branches/branch_4x/lucene/highlighter/ (props changed)
lucene/dev/branches/branch_4x/lucene/ivy-settings.xml (props changed)
lucene/dev/branches/branch_4x/lucene/ivy-versions.properties (props changed)
lucene/dev/branches/branch_4x/lucene/join/ (props changed)
lucene/dev/branches/branch_4x/lucene/licenses/ (props changed)
lucene/dev/branches/branch_4x/lucene/memory/ (props changed)
lucene/dev/branches/branch_4x/lucene/misc/ (props changed)
lucene/dev/branches/branch_4x/lucene/module-build.xml (props changed)
lucene/dev/branches/branch_4x/lucene/queries/ (props changed)
lucene/dev/branches/branch_4x/lucene/queries/src/test/org/apache/lucene/queries/function/TestFunctionQuerySort.java (props changed)
lucene/dev/branches/branch_4x/lucene/queryparser/ (props changed)
lucene/dev/branches/branch_4x/lucene/replicator/ (props changed)
lucene/dev/branches/branch_4x/lucene/sandbox/ (props changed)
lucene/dev/branches/branch_4x/lucene/site/ (props changed)
lucene/dev/branches/branch_4x/lucene/spatial/ (props changed)
lucene/dev/branches/branch_4x/lucene/spatial/src/test-files/data/simple-bbox.txt (props changed)
lucene/dev/branches/branch_4x/lucene/spatial/src/test-files/simple-Queries-BBox.txt (props changed)
lucene/dev/branches/branch_4x/lucene/suggest/ (props changed)
lucene/dev/branches/branch_4x/lucene/test-framework/ (props changed)
lucene/dev/branches/branch_4x/lucene/tools/ (props changed)
lucene/dev/branches/branch_4x/solr/ (props changed)
lucene/dev/branches/branch_4x/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/solr/LICENSE.txt (props changed)
lucene/dev/branches/branch_4x/solr/NOTICE.txt (props changed)
lucene/dev/branches/branch_4x/solr/README.txt (props changed)
lucene/dev/branches/branch_4x/solr/SYSTEM_REQUIREMENTS.txt (props changed)
lucene/dev/branches/branch_4x/solr/build.xml (props changed)
lucene/dev/branches/branch_4x/solr/cloud-dev/ (props changed)
lucene/dev/branches/branch_4x/solr/common-build.xml (props changed)
lucene/dev/branches/branch_4x/solr/contrib/ (props changed)
lucene/dev/branches/branch_4x/solr/core/ (props changed)
lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/schema11.xml
lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/core/TestConfig.java (props changed)
lucene/dev/branches/branch_4x/solr/example/ (props changed)
lucene/dev/branches/branch_4x/solr/licenses/ (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpclient-LICENSE-ASL.txt (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpclient-NOTICE.txt (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpcore-LICENSE-ASL.txt (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpcore-NOTICE.txt (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpmime-LICENSE-ASL.txt (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpmime-NOTICE.txt (props changed)
lucene/dev/branches/branch_4x/solr/scripts/ (props changed)
lucene/dev/branches/branch_4x/solr/site/ (props changed)
lucene/dev/branches/branch_4x/solr/solrj/ (props changed)
lucene/dev/branches/branch_4x/solr/test-framework/ (props changed)
lucene/dev/branches/branch_4x/solr/webapp/ (props changed)
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1574637&r1=1574636&r2=1574637&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed Mar 5 19:37:53 2014
@@ -5,6 +5,13 @@ http://s.apache.org/luceneversions
======================= Lucene 4.8.0 =======================
+Changes in Runtime Behavior
+
+* LUCENE-5472: IndexWriter.addDocument will now throw an IllegalArgumentException
+ if a Term to be indexed exceeds IndexWriter.MAX_TERM_LENGTH. To recreate previous
+ behavior of silently ignoring these terms, use LengthFilter in your Analyzer.
+ (hossman, Mike McCandless, Varun Thacker)
+
New Features
* LUCENE-5454: Add SortedSetSortField to lucene/sandbox, to allow sorting
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java?rev=1574637&r1=1574636&r2=1574637&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java Wed Mar 5 19:37:53 2014
@@ -247,11 +247,6 @@ final class DocFieldProcessor extends Do
final DocFieldProcessorPerField perField = fields[i];
perField.consumer.processFields(perField.fields, perField.fieldCount);
}
-
- if (docState.maxTermPrefix != null && docState.infoStream.isEnabled("IW")) {
- docState.infoStream.message("IW", "WARNING: document contains at least one immense term (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'");
- docState.maxTermPrefix = null;
- }
}
private static final Comparator<DocFieldProcessorPerField> fieldsComp = new Comparator<DocFieldProcessorPerField>() {
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java?rev=1574637&r1=1574636&r2=1574637&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java Wed Mar 5 19:37:53 2014
@@ -187,7 +187,17 @@ final class DocInverterPerField extends
// when we come back around to the field...
fieldState.position += posIncrAttribute.getPositionIncrement();
fieldState.offset += offsetAttribute.endOffset();
- succeededInProcessingField = true;
+
+
+ if (docState.maxTermPrefix != null) {
+ final String msg = "Document contains at least one immense term in field=\"" + fieldInfo.name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'";
+ if (docState.infoStream.isEnabled("IW")) {
+ docState.infoStream.message("IW", "ERROR: " + msg);
+ }
+ docState.maxTermPrefix = null;
+ throw new IllegalArgumentException(msg);
+ }
+
/* if success was false above there is an exception coming through and we won't get here.*/
succeededInProcessingField = true;
} finally {
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java?rev=1574637&r1=1574636&r2=1574637&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java Wed Mar 5 19:37:53 2014
@@ -209,8 +209,9 @@ public class IndexWriter implements Clos
/**
* Absolute hard maximum length for a term, in bytes once
* encoded as UTF8. If a term arrives from the analyzer
- * longer than this length, it is skipped and a message is
- * printed to infoStream, if set (see {@link
+ * longer than this length, an
+ * <code>IllegalArgumentException</code> is thrown
+ * and a message is printed to infoStream, if set (see {@link
* IndexWriterConfig#setInfoStream(InfoStream)}).
*/
public final static int MAX_TERM_LENGTH = DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8;
@@ -1160,7 +1161,7 @@ public class IndexWriter implements Clos
* merge policy.
*
* <p>Note that each term in the document can be no longer
- * than 16383 characters, otherwise an
+ * than {@link #MAX_TERM_LENGTH} in bytes, otherwise an
* IllegalArgumentException will be thrown.</p>
*
* <p>Note that it's possible to create an invalid Unicode
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java?rev=1574637&r1=1574636&r2=1574637&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java Wed Mar 5 19:37:53 2014
@@ -184,12 +184,11 @@ final class TermsHashPerField extends In
try {
termID = bytesHash.add(termBytesRef, termAtt.fillBytesRef());
} catch (MaxBytesLengthExceededException e) {
- // Not enough room in current block
- // Just skip this term, to remain as robust as
- // possible during indexing. A TokenFilter
- // can be inserted into the analyzer chain if
- // other behavior is wanted (pruning the term
- // to a prefix, throwing an exception, etc).
+ // Term is too large; record this here (can't throw an
+ // exc because DocInverterPerField will then abort the
+ // entire segment) and then throw an exc later in
+ // DocInverterPerField.java. LengthFilter can always be
+ // used to prune the term before indexing:
if (docState.maxTermPrefix == null) {
final int saved = termBytesRef.length;
try {
Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=1574637&r1=1574636&r2=1574637&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java Wed Mar 5 19:37:53 2014
@@ -1694,32 +1694,32 @@ public class TestIndexWriter extends Luc
// This contents produces a too-long term:
String contents = "abc xyz x" + bigTerm + " another term";
doc.add(new TextField("content", contents, Field.Store.NO));
- w.addDocument(doc);
+ try {
+ w.addDocument(doc);
+ fail("should have hit exception");
+ } catch (IllegalArgumentException iae) {
+ // expected
+ }
// Make sure we can add another normal document
doc = new Document();
doc.add(new TextField("content", "abc bbb ccc", Field.Store.NO));
w.addDocument(doc);
+ // So we remove the deleted doc:
+ w.forceMerge(1);
+
IndexReader reader = w.getReader();
w.close();
// Make sure all terms < max size were indexed
- assertEquals(2, reader.docFreq(new Term("content", "abc")));
+ assertEquals(1, reader.docFreq(new Term("content", "abc")));
assertEquals(1, reader.docFreq(new Term("content", "bbb")));
- assertEquals(1, reader.docFreq(new Term("content", "term")));
- assertEquals(1, reader.docFreq(new Term("content", "another")));
-
- // Make sure position is still incremented when
- // massive term is skipped:
- DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader, null, "content", new BytesRef("another"));
- assertEquals(0, tps.nextDoc());
- assertEquals(1, tps.freq());
- assertEquals(3, tps.nextPosition());
+ assertEquals(0, reader.docFreq(new Term("content", "term")));
- // Make sure the doc that has the massive term is in
+ // Make sure the doc that has the massive term is NOT in
// the index:
- assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());
+ assertEquals("document with wicked long term is in the index!", 1, reader.numDocs());
reader.close();
dir.close();
Modified: lucene/dev/branches/branch_4x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/CHANGES.txt?rev=1574637&r1=1574636&r2=1574637&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/solr/CHANGES.txt Wed Mar 5 19:37:53 2014
@@ -30,7 +30,16 @@ Velocity 1.7 and Velocity Tools 2.0
Apache UIMA 2.3.1
Apache ZooKeeper 3.4.5
-
+Upgrading from Solr 4.7
+----------------------
+
+* In previous versions of Solr, Terms that exceeded Lucene's MAX_TERM_LENGTH were
+ silently ignored when indexing documents. Begining with Solr 4.8, a document
+ an error will be generated when attempting to index a document with a term
+ that is too large. If you wish to continue to have large terms ignored,
+ use "solr.LengthFilterFactory" in all of your Analyzers. See LUCENE-5472 for
+ more details.
+
Detailed Change List
----------------------
@@ -108,6 +117,11 @@ Other Changes
registration exists, wait a short time to see if it goes away.
(Mark Miller)
+* LUCENE-5472: IndexWriter.addDocument will now throw an IllegalArgumentException
+ if a Term to be indexed exceeds IndexWriter.MAX_TERM_LENGTH. To recreate previous
+ behavior of silently ignoring these terms, use LengthFilter in your Analyzer.
+ (hossman, Mike McCandless, Varun Thacker)
+
================== 4.7.0 ==================
Versions of Major Components
Modified: lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/schema11.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/schema11.xml?rev=1574637&r1=1574636&r2=1574637&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/schema11.xml (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/schema11.xml Wed Mar 5 19:37:53 2014
@@ -287,6 +287,16 @@ valued. -->
class="solr.ExternalFileField"/>
<fieldType name="text_no_analyzer" stored="false" indexed="true" class="solr.TextField" />
+
+ <fieldtype name="text_length" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StandardFilterFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.LengthFilterFactory" min="2" max="32768"/>
+ </analyzer>
+ </fieldtype>
+
</types>
@@ -324,6 +334,9 @@ valued. -->
<field name="_version_" type="long" indexed="true" stored="true" multiValued="false" />
+ <field name="cat" type="string" indexed="true" stored="true" multiValued="true"/>
+ <field name="cat_length" type="text_length" indexed="true" stored="true" multiValued="true"/>
+
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
will be used if the name matches any of the patterns.
RESTRICTION: the glob-like pattern in the name attribute must have