You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/11/10 10:53:48 UTC
lucene-solr:branch_6x: LUCENE-7538: throw IllegalArgumentException if
you attempt to store a too-massive text field
Repository: lucene-solr
Updated Branches:
refs/heads/branch_6x aa6a678a8 -> be47009ce
LUCENE-7538: throw IllegalArgumentException if you attempt to store a too-massive text field
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/be47009c
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/be47009c
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/be47009c
Branch: refs/heads/branch_6x
Commit: be47009ce765f75661f3eda4878b4bb14a9688a1
Parents: aa6a678
Author: Mike McCandless <mi...@apache.org>
Authored: Thu Nov 10 05:51:08 2016 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Thu Nov 10 05:51:37 2016 -0500
----------------------------------------------------------------------
lucene/CHANGES.txt | 4 +++
.../GrowableByteArrayDataOutput.java | 2 +-
.../lucene/index/DefaultIndexingChain.java | 4 +++
.../org/apache/lucene/index/IndexWriter.java | 8 ++++++
.../java/org/apache/lucene/util/BytesRef.java | 2 +-
.../org/apache/lucene/util/BytesRefBuilder.java | 4 +--
.../org/apache/lucene/util/UnicodeUtil.java | 5 ++++
.../TestGrowableByteArrayDataOutput.java | 4 +--
.../apache/lucene/index/TestIndexWriter.java | 30 ++++++++++++++++++++
.../org/apache/lucene/util/TestUnicodeUtil.java | 6 ++--
.../lucene/util/automaton/TestUTF32ToUTF8.java | 2 +-
11 files changed, 61 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 7936f58..f7f54c0 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -22,6 +22,10 @@ Improvements
* LUCENE-7544: UnifiedHighlighter: add extension points for handling custom queries.
(Michael Braun, David Smiley)
+* LUCENE-7538: Asking IndexWriter to store a too-massive text field
+ now throws IllegalArgumentException instead of a cryptic exception
+ that closes your IndexWriter (Steve Chen via Mike McCandless)
+
======================= Lucene 6.3.0 =======================
API Changes
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
index 67cfab6..ec551d1 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
@@ -64,7 +64,7 @@ public final class GrowableByteArrayDataOutput extends DataOutput {
@Override
public void writeString(String string) throws IOException {
- int maxLen = string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;
+ int maxLen = UnicodeUtil.maxUTF8Length(string.length());
if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING) {
// string is small enough that we don't need to save memory by falling back to double-pass approach
// this is just an optimized writeString() that re-uses scratchBytes.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
index e941911..d792111 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
@@ -430,6 +430,10 @@ final class DefaultIndexingChain extends DocConsumer {
fp = getOrAddField(fieldName, fieldType, false);
}
if (fieldType.stored()) {
+ String value = field.stringValue();
+ if (value != null && value.length() > IndexWriter.MAX_STORED_STRING_LENGTH) {
+ throw new IllegalArgumentException("stored field \"" + field.name() + "\" is too large (" + value.length() + " characters) to store");
+ }
try {
storedFieldsWriter.writeField(fp.fieldInfo, field);
} catch (Throwable th) {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
index 68f1d7a..5557dd1 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
@@ -62,6 +62,7 @@ import org.apache.lucene.store.MergeInfo;
import org.apache.lucene.store.RateLimitedIndexOutput;
import org.apache.lucene.store.TrackingDirectoryWrapper;
import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CloseableThreadLocal;
@@ -70,6 +71,7 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.ThreadInterruptedException;
+import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.Version;
/**
@@ -258,6 +260,12 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
* IndexWriterConfig#setInfoStream(InfoStream)}).
*/
public final static int MAX_TERM_LENGTH = DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8;
+
+ /**
+ * Maximum length, in Java chars (UTF-16 code units), of the string value of a stored field.
+ */
+ public final static int MAX_STORED_STRING_LENGTH = ArrayUtil.MAX_ARRAY_LENGTH / UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;
+
// when unrecoverable disaster strikes, we populate this with the reason that we had to close IndexWriter
volatile Throwable tragedy;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/java/org/apache/lucene/util/BytesRef.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRef.java b/lucene/core/src/java/org/apache/lucene/util/BytesRef.java
index c62d639..2fcf28a 100644
--- a/lucene/core/src/java/org/apache/lucene/util/BytesRef.java
+++ b/lucene/core/src/java/org/apache/lucene/util/BytesRef.java
@@ -84,7 +84,7 @@ public final class BytesRef implements Comparable<BytesRef>,Cloneable {
* unicode text, with no unpaired surrogates.
*/
public BytesRef(CharSequence text) {
- this(new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * text.length()]);
+ this(new byte[UnicodeUtil.maxUTF8Length(text.length())]);
length = UnicodeUtil.UTF16toUTF8(text, 0, text.length(), bytes);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java b/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java
index 2bfa2f2..08fda91 100644
--- a/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java
@@ -143,7 +143,7 @@ public class BytesRefBuilder {
* represent the provided text.
*/
public void copyChars(CharSequence text, int off, int len) {
- grow(len * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
+ grow(UnicodeUtil.maxUTF8Length(len));
ref.length = UnicodeUtil.UTF16toUTF8(text, off, len, ref.bytes);
}
@@ -152,7 +152,7 @@ public class BytesRefBuilder {
* represent the provided text.
*/
public void copyChars(char[] text, int off, int len) {
- grow(len * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
+ grow(UnicodeUtil.maxUTF8Length(len));
ref.length = UnicodeUtil.UTF16toUTF8(text, off, len, ref.bytes);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
index a21281f..20e6249 100644
--- a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
@@ -612,6 +612,11 @@ public final class UnicodeUtil {
}
return out_offset;
}
+
+ /** Returns the maximum number of UTF-8 bytes required to encode UTF-16 text of the given length in chars (e.g., a Java char[] or String) */
+ public static int maxUTF8Length(int utf16Length) {
+ return Math.multiplyExact(utf16Length, MAX_UTF8_BYTES_PER_CHAR);
+ }
/**
* Utility method for {@link #UTF8toUTF16(byte[], int, int, char[])}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java b/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java
index 3820733..37a7e4c 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java
@@ -37,7 +37,7 @@ public class TestGrowableByteArrayDataOutput extends LuceneTestCase {
// create a small string such that the single pass approach is used
int length = TestUtil.nextInt(random(), 1, minSizeForDoublePass - 1);
String unicode = TestUtil.randomFixedByteLengthUnicodeString(random(), length);
- byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+ byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(unicode.length())];
int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
@@ -61,7 +61,7 @@ public class TestGrowableByteArrayDataOutput extends LuceneTestCase {
int num = atLeast(100);
for (int i = 0; i < num; i++) {
String unicode = TestUtil.randomRealisticUnicodeString(random(), minSizeForDoublePass, 10 * minSizeForDoublePass);
- byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+ byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(unicode.length())];
int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
index 7a47d97..1b639f6 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
@@ -97,6 +97,7 @@ import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.junit.Ignore;
import org.junit.Test;
public class TestIndexWriter extends LuceneTestCase {
@@ -2766,5 +2767,34 @@ public class TestIndexWriter extends LuceneTestCase {
dir.close();
}
+ @Ignore("requires running tests with biggish heap")
+ public void testMassiveField() throws Exception {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
+ final IndexWriter w = new IndexWriter(dir, iwc);
+
+ StringBuilder b = new StringBuilder();
+ while (b.length() <= IndexWriter.MAX_STORED_STRING_LENGTH) {
+ b.append("x ");
+ }
+
+ final Document doc = new Document();
+ //doc.add(new TextField("big", b.toString(), Field.Store.YES));
+ doc.add(new StoredField("big", b.toString()));
+ Exception e = expectThrows(IllegalArgumentException.class, () -> {w.addDocument(doc);});
+ assertEquals("stored field \"big\" is too large (" + b.length() + " characters) to store", e.getMessage());
+
+ // make sure writer is still usable:
+ Document doc2 = new Document();
+ doc2.add(new StringField("id", "foo", Field.Store.YES));
+ w.addDocument(doc2);
+
+ DirectoryReader r = DirectoryReader.open(w);
+ assertEquals(1, r.numDocs());
+ r.close();
+ w.close();
+ dir.close();
+ }
+
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
index 7bbd606..15251ad 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
@@ -111,7 +111,7 @@ public class TestUnicodeUtil extends LuceneTestCase {
int num = atLeast(50000);
for (int i = 0; i < num; i++) {
final String s = TestUtil.randomUnicodeString(random());
- final byte[] utf8 = new byte[s.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+ final byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(s.length())];
final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);
assertEquals(s.codePointCount(0, s.length()),
UnicodeUtil.codePointCount(new BytesRef(utf8, 0, utf8Len)));
@@ -137,7 +137,7 @@ public class TestUnicodeUtil extends LuceneTestCase {
int num = atLeast(50000);
for (int i = 0; i < num; i++) {
final String s = TestUtil.randomUnicodeString(random());
- final byte[] utf8 = new byte[s.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+ final byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(s.length())];
final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);
utf32 = ArrayUtil.grow(utf32, utf8Len);
final int utf32Len = UnicodeUtil.UTF8toUTF32(new BytesRef(utf8, 0, utf8Len), utf32);
@@ -208,7 +208,7 @@ public class TestUnicodeUtil extends LuceneTestCase {
int num = atLeast(5000);
for (int i = 0; i < num; i++) {
String unicode = TestUtil.randomUnicodeString(random());
- byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+ byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(unicode.length())];
int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
assertEquals(len, UnicodeUtil.calcUTF16toUTF8Length(unicode, 0, unicode.length()));
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java
index 6434c1c..1c1d1d4 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java
@@ -41,7 +41,7 @@ public class TestUTF32ToUTF8 extends LuceneTestCase {
private boolean matches(ByteRunAutomaton a, int code) {
char[] chars = Character.toChars(code);
- byte[] b = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * chars.length];
+ byte[] b = new byte[UnicodeUtil.maxUTF8Length(chars.length)];
final int len = UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, b);
return a.run(b, 0, len);
}