You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/11/10 10:53:48 UTC

lucene-solr:branch_6x: LUCENE-7538: throw IllegalArgumentException if you attempt to store a too-massive text field

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_6x aa6a678a8 -> be47009ce


LUCENE-7538: throw IllegalArgumentException if you attempt to store a too-massive text field


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/be47009c
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/be47009c
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/be47009c

Branch: refs/heads/branch_6x
Commit: be47009ce765f75661f3eda4878b4bb14a9688a1
Parents: aa6a678
Author: Mike McCandless <mi...@apache.org>
Authored: Thu Nov 10 05:51:08 2016 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Thu Nov 10 05:51:37 2016 -0500

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  4 +++
 .../GrowableByteArrayDataOutput.java            |  2 +-
 .../lucene/index/DefaultIndexingChain.java      |  4 +++
 .../org/apache/lucene/index/IndexWriter.java    |  8 ++++++
 .../java/org/apache/lucene/util/BytesRef.java   |  2 +-
 .../org/apache/lucene/util/BytesRefBuilder.java |  4 +--
 .../org/apache/lucene/util/UnicodeUtil.java     |  5 ++++
 .../TestGrowableByteArrayDataOutput.java        |  4 +--
 .../apache/lucene/index/TestIndexWriter.java    | 30 ++++++++++++++++++++
 .../org/apache/lucene/util/TestUnicodeUtil.java |  6 ++--
 .../lucene/util/automaton/TestUTF32ToUTF8.java  |  2 +-
 11 files changed, 61 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 7936f58..f7f54c0 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -22,6 +22,10 @@ Improvements
 * LUCENE-7544: UnifiedHighlighter: add extension points for handling custom queries.
   (Michael Braun, David Smiley)
 
+* LUCENE-7538: Asking IndexWriter to store a too-massive text field
+  now throws IllegalArgumentException instead of a cryptic exception
+  that closes your IndexWriter (Steve Chen via Mike McCandless)
+
 ======================= Lucene 6.3.0 =======================
 
 API Changes

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
index 67cfab6..ec551d1 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
@@ -64,7 +64,7 @@ public final class GrowableByteArrayDataOutput extends DataOutput {
 
   @Override
   public void writeString(String string) throws IOException {
-    int maxLen = string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;
+    int maxLen = UnicodeUtil.maxUTF8Length(string.length());
     if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING)  {
       // string is small enough that we don't need to save memory by falling back to double-pass approach
       // this is just an optimized writeString() that re-uses scratchBytes.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
index e941911..d792111 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
@@ -430,6 +430,10 @@ final class DefaultIndexingChain extends DocConsumer {
         fp = getOrAddField(fieldName, fieldType, false);
       }
       if (fieldType.stored()) {
+        String value = field.stringValue();
+        if (value != null && value.length() > IndexWriter.MAX_STORED_STRING_LENGTH) {
+          throw new IllegalArgumentException("stored field \"" + field.name() + "\" is too large (" + value.length() + " characters) to store");
+        }
         try {
           storedFieldsWriter.writeField(fp.fieldInfo, field);
         } catch (Throwable th) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
index 68f1d7a..5557dd1 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
@@ -62,6 +62,7 @@ import org.apache.lucene.store.MergeInfo;
 import org.apache.lucene.store.RateLimitedIndexOutput;
 import org.apache.lucene.store.TrackingDirectoryWrapper;
 import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CloseableThreadLocal;
@@ -70,6 +71,7 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.InfoStream;
 import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.ThreadInterruptedException;
+import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.Version;
 
 /**
@@ -258,6 +260,12 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
    * IndexWriterConfig#setInfoStream(InfoStream)}).
    */
   public final static int MAX_TERM_LENGTH = DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8;
+
+  /**
+   * Maximum length string for a stored field.
+   */
+  public final static int MAX_STORED_STRING_LENGTH = ArrayUtil.MAX_ARRAY_LENGTH / UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;
+    
   // when unrecoverable disaster strikes, we populate this with the reason that we had to close IndexWriter
   volatile Throwable tragedy;
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/java/org/apache/lucene/util/BytesRef.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRef.java b/lucene/core/src/java/org/apache/lucene/util/BytesRef.java
index c62d639..2fcf28a 100644
--- a/lucene/core/src/java/org/apache/lucene/util/BytesRef.java
+++ b/lucene/core/src/java/org/apache/lucene/util/BytesRef.java
@@ -84,7 +84,7 @@ public final class BytesRef implements Comparable<BytesRef>,Cloneable {
    * unicode text, with no unpaired surrogates.
    */
   public BytesRef(CharSequence text) {
-    this(new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * text.length()]);
+    this(new byte[UnicodeUtil.maxUTF8Length(text.length())]);
     length = UnicodeUtil.UTF16toUTF8(text, 0, text.length(), bytes);
   }
   

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java b/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java
index 2bfa2f2..08fda91 100644
--- a/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java
@@ -143,7 +143,7 @@ public class BytesRefBuilder {
    * represent the provided text.
    */
   public void copyChars(CharSequence text, int off, int len) {
-    grow(len * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
+    grow(UnicodeUtil.maxUTF8Length(len));
     ref.length = UnicodeUtil.UTF16toUTF8(text, off, len, ref.bytes);
   }
 
@@ -152,7 +152,7 @@ public class BytesRefBuilder {
    * represent the provided text.
    */
   public void copyChars(char[] text, int off, int len) {
-    grow(len * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
+    grow(UnicodeUtil.maxUTF8Length(len));
     ref.length = UnicodeUtil.UTF16toUTF8(text, off, len, ref.bytes);
   }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
index a21281f..20e6249 100644
--- a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
@@ -612,6 +612,11 @@ public final class UnicodeUtil {
     }
     return out_offset;
   }
+
+  /** Returns the maximum number of utf8 bytes required to encode a utf16 (e.g., java char[], String) */
+  public static int maxUTF8Length(int utf16Length) {
+    return Math.multiplyExact(utf16Length, MAX_UTF8_BYTES_PER_CHAR);
+  }
   
   /**
    * Utility method for {@link #UTF8toUTF16(byte[], int, int, char[])}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java b/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java
index 3820733..37a7e4c 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java
@@ -37,7 +37,7 @@ public class TestGrowableByteArrayDataOutput extends LuceneTestCase {
       // create a small string such that the single pass approach is used
       int length = TestUtil.nextInt(random(), 1, minSizeForDoublePass - 1);
       String unicode = TestUtil.randomFixedByteLengthUnicodeString(random(), length);
-      byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+      byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(unicode.length())];
       int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
 
       GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
@@ -61,7 +61,7 @@ public class TestGrowableByteArrayDataOutput extends LuceneTestCase {
     int num = atLeast(100);
     for (int i = 0; i < num; i++) {
       String unicode = TestUtil.randomRealisticUnicodeString(random(), minSizeForDoublePass, 10 * minSizeForDoublePass);
-      byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+      byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(unicode.length())];
       int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
 
       GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
index 7a47d97..1b639f6 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
@@ -97,6 +97,7 @@ import org.apache.lucene.util.ThreadInterruptedException;
 import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.junit.Ignore;
 import org.junit.Test;
 
 public class TestIndexWriter extends LuceneTestCase {
@@ -2766,5 +2767,34 @@ public class TestIndexWriter extends LuceneTestCase {
     dir.close();
   }
 
+  @Ignore("requires running tests with biggish heap")
+  public void testMassiveField() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
+    final IndexWriter w = new IndexWriter(dir, iwc);
+
+    StringBuilder b = new StringBuilder();
+    while (b.length() <= IndexWriter.MAX_STORED_STRING_LENGTH) {
+      b.append("x ");
+    }
+
+    final Document doc = new Document();
+    //doc.add(new TextField("big", b.toString(), Field.Store.YES));
+    doc.add(new StoredField("big", b.toString()));
+    Exception e = expectThrows(IllegalArgumentException.class, () -> {w.addDocument(doc);});
+    assertEquals("stored field \"big\" is too large (" + b.length() + " characters) to store", e.getMessage());
+
+    // make sure writer is still usable:
+    Document doc2 = new Document();
+    doc2.add(new StringField("id", "foo", Field.Store.YES));
+    w.addDocument(doc2);
+
+    DirectoryReader r = DirectoryReader.open(w);
+    assertEquals(1, r.numDocs());
+    r.close();
+    w.close();
+    dir.close();
+  }
+
 }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
index 7bbd606..15251ad 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
@@ -111,7 +111,7 @@ public class TestUnicodeUtil extends LuceneTestCase {
     int num = atLeast(50000);
     for (int i = 0; i < num; i++) {
       final String s = TestUtil.randomUnicodeString(random());
-      final byte[] utf8 = new byte[s.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+      final byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(s.length())];
       final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);
       assertEquals(s.codePointCount(0, s.length()),
                    UnicodeUtil.codePointCount(new BytesRef(utf8, 0, utf8Len)));
@@ -137,7 +137,7 @@ public class TestUnicodeUtil extends LuceneTestCase {
     int num = atLeast(50000);
     for (int i = 0; i < num; i++) {
       final String s = TestUtil.randomUnicodeString(random());
-      final byte[] utf8 = new byte[s.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+      final byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(s.length())];
       final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);
       utf32 = ArrayUtil.grow(utf32, utf8Len);
       final int utf32Len = UnicodeUtil.UTF8toUTF32(new BytesRef(utf8, 0, utf8Len), utf32);
@@ -208,7 +208,7 @@ public class TestUnicodeUtil extends LuceneTestCase {
     int num = atLeast(5000);
     for (int i = 0; i < num; i++) {
       String unicode = TestUtil.randomUnicodeString(random());
-      byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+      byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(unicode.length())];
       int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
       assertEquals(len, UnicodeUtil.calcUTF16toUTF8Length(unicode, 0, unicode.length()));
     }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be47009c/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java
index 6434c1c..1c1d1d4 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java
@@ -41,7 +41,7 @@ public class TestUTF32ToUTF8 extends LuceneTestCase {
 
   private boolean matches(ByteRunAutomaton a, int code) {
     char[] chars = Character.toChars(code);
-    byte[] b = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * chars.length];
+    byte[] b = new byte[UnicodeUtil.maxUTF8Length(chars.length)];
     final int len = UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, b);
     return a.run(b, 0, len);
   }