You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by gg...@apache.org on 2021/09/26 16:12:39 UTC

[commons-io] branch master updated: [IO-638] Infinite loop in CharSequenceInputStream.read for 4-byte characters with UTF-8 and 3-byte buffer.

This is an automated email from the ASF dual-hosted git repository.

ggregory pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-io.git


The following commit(s) were added to refs/heads/master by this push:
     new c7817ac  [IO-638] Infinite loop in CharSequenceInputStream.read for 4-byte characters with UTF-8 and 3-byte buffer.
c7817ac is described below

commit c7817acf1e125d459af7f3a4aa161fac8454bced
Author: Gary Gregory <ga...@gmail.com>
AuthorDate: Sun Sep 26 12:12:36 2021 -0400

    [IO-638] Infinite loop in CharSequenceInputStream.read for 4-byte
    characters with UTF-8 and 3-byte buffer.
    
    [IO-716] ReaderInputStream enter infinite loop for too small buffer
    sizes.
    
    Use CharsetEncoder#maxBytesPerChar() * 2 the min.
---
 src/changes/changes.xml                            |   3 +
 .../commons/io/input/CharSequenceInputStream.java  |  17 ++-
 .../apache/commons/io/input/ReaderInputStream.java | 129 ++++++++++-----------
 .../io/input/CharSequenceInputStreamTest.java      |   7 +-
 .../commons/io/input/ReaderInputStreamTest.java    |  10 +-
 5 files changed, 85 insertions(+), 81 deletions(-)

diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index da2dee0..f2122fc 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -98,6 +98,9 @@ The <action> type attribute can be add,update,fix,remove.
       <action issue="IO-716" dev="ggregory" type="fix" due-to="Marcono1234, Gary Gregory">
         ReaderInputStream enter infinite loop for too small buffer sizes.
       </action>
+      <action issue="IO-638" dev="ggregory" type="fix" due-to=", Gary Gregory">
+        Infinite loop in CharSequenceInputStream.read for 4-byte characters with UTF-8 and 3-byte buffer.
+      </action>
       <!-- ADD -->
       <action dev="ggregory" type="add" due-to="Gary Gregory">
         Add BrokenReader.INSTANCE.
diff --git a/src/main/java/org/apache/commons/io/input/CharSequenceInputStream.java b/src/main/java/org/apache/commons/io/input/CharSequenceInputStream.java
index ddce28c..da45fa1 100644
--- a/src/main/java/org/apache/commons/io/input/CharSequenceInputStream.java
+++ b/src/main/java/org/apache/commons/io/input/CharSequenceInputStream.java
@@ -45,7 +45,7 @@ public class CharSequenceInputStream extends InputStream {
 
     private static final int NO_MARK = -1;
 
-    private final CharsetEncoder encoder;
+    private final CharsetEncoder charsetEncoder;
     private final CharBuffer cbuf;
     private final ByteBuffer bbuf;
 
@@ -73,16 +73,13 @@ public class CharSequenceInputStream extends InputStream {
      * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character
      */
     public CharSequenceInputStream(final CharSequence cs, final Charset charset, final int bufferSize) {
-        this.encoder = charset.newEncoder()
+        // @formatter:off
+        this.charsetEncoder = charset.newEncoder()
             .onMalformedInput(CodingErrorAction.REPLACE)
             .onUnmappableCharacter(CodingErrorAction.REPLACE);
+        // @formatter:on
         // Ensure that buffer is long enough to hold a complete character
-        final float maxBytesPerChar = encoder.maxBytesPerChar();
-        if (bufferSize < maxBytesPerChar) {
-            throw new IllegalArgumentException("Buffer size " + bufferSize + " is less than maxBytesPerChar " +
-                    maxBytesPerChar);
-        }
-        this.bbuf = ByteBuffer.allocate(bufferSize);
+        this.bbuf = ByteBuffer.allocate(ReaderInputStream.checkMinBufferSize(charsetEncoder, bufferSize));
         this.bbuf.flip();
         this.cbuf = CharBuffer.wrap(cs);
         this.mark_cbuf = NO_MARK;
@@ -141,7 +138,7 @@ public class CharSequenceInputStream extends InputStream {
      */
     private void fillBuffer() throws CharacterCodingException {
         this.bbuf.compact();
-        final CoderResult result = this.encoder.encode(this.cbuf, this.bbuf, true);
+        final CoderResult result = this.charsetEncoder.encode(this.cbuf, this.bbuf, true);
         if (result.isError()) {
             result.throwException();
         }
@@ -232,7 +229,7 @@ public class CharSequenceInputStream extends InputStream {
         if (this.mark_cbuf != NO_MARK) {
             // if cbuf is at 0, we have not started reading anything, so skip re-encoding
             if (this.cbuf.position() != 0) {
-                this.encoder.reset();
+                this.charsetEncoder.reset();
                 this.cbuf.rewind();
                 this.bbuf.rewind();
                 this.bbuf.limit(0); // rewind does not clear the buffer
diff --git a/src/main/java/org/apache/commons/io/input/ReaderInputStream.java b/src/main/java/org/apache/commons/io/input/ReaderInputStream.java
index de20bcb..4a52715 100644
--- a/src/main/java/org/apache/commons/io/input/ReaderInputStream.java
+++ b/src/main/java/org/apache/commons/io/input/ReaderInputStream.java
@@ -30,51 +30,46 @@ import java.nio.charset.CodingErrorAction;
 import java.util.Objects;
 
 /**
- * {@link InputStream} implementation that reads a character stream from a {@link Reader}
- * and transforms it to a byte stream using a specified charset encoding. The stream
- * is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset
- * encodings supported by the JRE are handled correctly. In particular for charsets such as
- * UTF-16, the implementation ensures that one and only one byte order marker
- * is produced.
+ * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte
+ * stream using a specified charset encoding. The stream is transformed using a {@link CharsetEncoder} object,
+ * guaranteeing that all charset encodings supported by the JRE are handled correctly. In particular for charsets such
+ * as UTF-16, the implementation ensures that one and only one byte order marker is produced.
  * <p>
- * Since in general it is not possible to predict the number of characters to be read from the
- * {@link Reader} to satisfy a read request on the {@link ReaderInputStream}, all reads from
- * the {@link Reader} are buffered. There is therefore no well defined correlation
- * between the current position of the {@link Reader} and that of the {@link ReaderInputStream}.
- * This also implies that in general there is no need to wrap the underlying {@link Reader}
+ * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy
+ * a read request on the {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore
+ * no well defined correlation between the current position of the {@link Reader} and that of the
+ * {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader}
  * in a {@link java.io.BufferedReader}.
  * </p>
  * <p>
- * {@link ReaderInputStream} implements the inverse transformation of {@link java.io.InputStreamReader};
- * in the following example, reading from {@code in2} would return the same byte
- * sequence as reading from {@code in} (provided that the initial byte sequence is legal
- * with respect to the charset encoding):
+ * {@link ReaderInputStream} implements the inverse transformation of {@link java.io.InputStreamReader}; in the
+ * following example, reading from {@code in2} would return the same byte sequence as reading from {@code in} (provided
+ * that the initial byte sequence is legal with respect to the charset encoding):
  * </p>
+ * 
  * <pre>
  * InputStream inputStream = ...
  * Charset cs = ...
  * InputStreamReader reader = new InputStreamReader(inputStream, cs);
- * ReaderInputStream in2 = new ReaderInputStream(reader, cs);</pre>
+ * ReaderInputStream in2 = new ReaderInputStream(reader, cs);
+ * </pre>
  * <p>
- * {@link ReaderInputStream} implements the same transformation as {@link java.io.OutputStreamWriter},
- * except that the control flow is reversed: both classes transform a character stream
- * into a byte stream, but {@link java.io.OutputStreamWriter} pushes data to the underlying stream,
- * while {@link ReaderInputStream} pulls it from the underlying stream.
+ * {@link ReaderInputStream} implements the same transformation as {@link java.io.OutputStreamWriter}, except that the
+ * control flow is reversed: both classes transform a character stream into a byte stream, but
+ * {@link java.io.OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream} pulls it
+ * from the underlying stream.
  * </p>
  * <p>
- * Note that while there are use cases where there is no alternative to using
- * this class, very often the need to use this class is an indication of a flaw
- * in the design of the code. This class is typically used in situations where an existing
- * API only accepts an {@link InputStream}, but where the most natural way to produce the data
- * is as a character stream, i.e. by providing a {@link Reader} instance. An example of a situation
- * where this problem may appear is when implementing the {@code javax.activation.DataSource}
- * interface from the Java Activation Framework.
+ * Note that while there are use cases where there is no alternative to using this class, very often the need to use
+ * this class is an indication of a flaw in the design of the code. This class is typically used in situations where an
+ * existing API only accepts an {@link InputStream}, but where the most natural way to produce the data is as a
+ * character stream, i.e. by providing a {@link Reader} instance. An example of a situation where this problem may
+ * appear is when implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
  * </p>
  * <p>
- * Given the fact that the {@link Reader} class doesn't provide any way to predict whether the next
- * read operation will block or not, it is not possible to provide a meaningful
- * implementation of the {@link InputStream#available()} method. A call to this method
- * will always return 0. Also, this class doesn't support {@link InputStream#mark(int)}.
+ * Given the fact that the {@link Reader} class doesn't provide any way to predict whether the next read operation will
+ * block or not, it is not possible to provide a meaningful implementation of the {@link InputStream#available()}
+ * method. A call to this method will always return 0. Also, this class doesn't support {@link InputStream#mark(int)}.
  * </p>
  * <p>
  * Instances of {@link ReaderInputStream} are not thread safe.
@@ -86,35 +81,41 @@ import java.util.Objects;
 public class ReaderInputStream extends InputStream {
     private static final int DEFAULT_BUFFER_SIZE = 1024;
 
-    private static int checkBufferSize(int bufferSize) {
-        if (bufferSize < 2) {
-            throw new IllegalArgumentException("Buffer size < 2");
+    static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
+        final float minRequired = minBufferSize(charsetEncoder);
+        if (bufferSize < minRequired) {
+            throw new IllegalArgumentException(
+                String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired, charsetEncoder.charset().displayName()));
         }
         return bufferSize;
     }
+
+    static float minBufferSize(final CharsetEncoder charsetEncoder) {
+        return charsetEncoder.maxBytesPerChar() * 2;
+    }
+
     private final Reader reader;
 
-    private final CharsetEncoder encoder;
+    private final CharsetEncoder charsetEncoder;
 
     /**
-     * CharBuffer used as input for the decoder. It should be reasonably
-     * large as we read data from the underlying Reader into this buffer.
+     * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader
+     * into this buffer.
      */
     private final CharBuffer encoderIn;
-
     /**
-     * ByteBuffer used as output for the decoder. This buffer can be small
-     * as it is only used to transfer data from the decoder to the
-     * buffer provided by the caller.
+     * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the
+     * decoder to the buffer provided by the caller.
      */
     private final ByteBuffer encoderOut;
+
     private CoderResult lastCoderResult;
 
     private boolean endOfInput;
 
     /**
-     * Constructs a new {@link ReaderInputStream} that uses the default character encoding
-     * with a default input buffer size of {@value #DEFAULT_BUFFER_SIZE} characters.
+     * Constructs a new {@link ReaderInputStream} that uses the default character encoding with a default input buffer size
+     * of {@value #DEFAULT_BUFFER_SIZE} characters.
      *
      * @param reader the target {@link Reader}
      * @deprecated 2.5 use {@link #ReaderInputStream(Reader, Charset)} instead
@@ -125,8 +126,8 @@ public class ReaderInputStream extends InputStream {
     }
 
     /**
-     * Constructs a new {@link ReaderInputStream} with a default input buffer size of
-     * {@value #DEFAULT_BUFFER_SIZE} characters.
+     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value #DEFAULT_BUFFER_SIZE}
+     * characters.
      *
      * @param reader the target {@link Reader}
      * @param charset the charset encoding
@@ -148,7 +149,7 @@ public class ReaderInputStream extends InputStream {
              charset.newEncoder()
                     .onMalformedInput(CodingErrorAction.REPLACE)
                     .onUnmappableCharacter(CodingErrorAction.REPLACE),
-             checkBufferSize(bufferSize));
+             bufferSize);
         // @formatter:on
     }
 
@@ -156,33 +157,33 @@ public class ReaderInputStream extends InputStream {
      * Constructs a new {@link ReaderInputStream}.
      *
      * @param reader the target {@link Reader}
-     * @param encoder the charset encoder
+     * @param charsetEncoder the charset encoder
      * @since 2.1
      */
-    public ReaderInputStream(final Reader reader, final CharsetEncoder encoder) {
-        this(reader, encoder, DEFAULT_BUFFER_SIZE);
+    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
+        this(reader, charsetEncoder, DEFAULT_BUFFER_SIZE);
     }
 
     /**
      * Constructs a new {@link ReaderInputStream}.
      *
      * @param reader the target {@link Reader}
-     * @param encoder the charset encoder
+     * @param charsetEncoder the charset encoder
      * @param bufferSize the size of the input buffer in number of characters
      * @since 2.1
      */
-    public ReaderInputStream(final Reader reader, final CharsetEncoder encoder, final int bufferSize) {
+    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
         this.reader = reader;
-        this.encoder = encoder;
-        this.encoderIn = CharBuffer.allocate(bufferSize);
+        this.charsetEncoder = charsetEncoder;
+        this.encoderIn = CharBuffer.allocate(checkMinBufferSize(charsetEncoder, bufferSize));
         this.encoderIn.flip();
         this.encoderOut = ByteBuffer.allocate(128);
         this.encoderOut.flip();
     }
 
     /**
-     * Constructs a new {@link ReaderInputStream} with a default input buffer size of
-     * {@value #DEFAULT_BUFFER_SIZE} characters.
+     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value #DEFAULT_BUFFER_SIZE}
+     * characters.
      *
      * @param reader the target {@link Reader}
      * @param charsetName the name of the charset encoding
@@ -203,8 +204,8 @@ public class ReaderInputStream extends InputStream {
     }
 
     /**
-     * Close the stream. This method will cause the underlying {@link Reader}
-     * to be closed.
+     * Close the stream. This method will cause the underlying {@link Reader} to be closed.
+     * 
      * @throws IOException if an I/O error occurs.
      */
     @Override
@@ -215,8 +216,7 @@ public class ReaderInputStream extends InputStream {
     /**
      * Fills the internal char buffer from the reader.
      *
-     * @throws IOException
-     *             If an I/O error occurs
+     * @throws IOException If an I/O error occurs
      */
     private void fillBuffer() throws IOException {
         if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) {
@@ -234,7 +234,7 @@ public class ReaderInputStream extends InputStream {
             encoderIn.flip();
         }
         encoderOut.compact();
-        lastCoderResult = encoder.encode(encoderIn, encoderOut, endOfInput);
+        lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
         if (lastCoderResult.isError()) {
             lastCoderResult.throwException();
         }
@@ -244,8 +244,7 @@ public class ReaderInputStream extends InputStream {
     /**
      * Read a single byte.
      *
-     * @return either the byte read or {@code -1} if the end of the stream
-     *         has been reached
+     * @return either the byte read or {@code -1} if the end of the stream has been reached
      * @throws IOException if an I/O error occurs.
      */
     @Override
@@ -265,8 +264,7 @@ public class ReaderInputStream extends InputStream {
      * Read the specified number of bytes into an array.
      *
      * @param b the byte array to read into
-     * @return the number of bytes read or {@code -1}
-     *         if the end of the stream has been reached
+     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
      * @throws IOException if an I/O error occurs.
      */
     @Override
@@ -280,8 +278,7 @@ public class ReaderInputStream extends InputStream {
      * @param array the byte array to read into
      * @param off the offset to start reading bytes into
      * @param len the number of bytes to read
-     * @return the number of bytes read or {@code -1}
-     *         if the end of the stream has been reached
+     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
      * @throws IOException if an I/O error occurs.
      */
     @Override
diff --git a/src/test/java/org/apache/commons/io/input/CharSequenceInputStreamTest.java b/src/test/java/org/apache/commons/io/input/CharSequenceInputStreamTest.java
index 3dfaba0..13dee59 100644
--- a/src/test/java/org/apache/commons/io/input/CharSequenceInputStreamTest.java
+++ b/src/test/java/org/apache/commons/io/input/CharSequenceInputStreamTest.java
@@ -25,6 +25,7 @@ import static org.junit.jupiter.api.Assertions.fail;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.util.Random;
 import java.util.Set;
 
@@ -264,12 +265,14 @@ private boolean isAvailabilityTestableForCharset(final String csName) {
 
     @Test
     public void testIO_356_Loop_UTF16() throws Exception {
-        testIO_356_Loop("UTF-16", 4);
+        final Charset charset = StandardCharsets.UTF_16;
+        testIO_356_Loop(charset.displayName(), (int) ReaderInputStream.minBufferSize(charset.newEncoder()));
     }
 
     @Test
     public void testIO_356_Loop_UTF8() throws Exception {
-        testIO_356_Loop("UTF-8", 4);
+        final Charset charset = StandardCharsets.UTF_8;
+        testIO_356_Loop(charset.displayName(), (int) ReaderInputStream.minBufferSize(charset.newEncoder()));
     }
 
     @Test
diff --git a/src/test/java/org/apache/commons/io/input/ReaderInputStreamTest.java b/src/test/java/org/apache/commons/io/input/ReaderInputStreamTest.java
index 5c985da..adabf8b 100644
--- a/src/test/java/org/apache/commons/io/input/ReaderInputStreamTest.java
+++ b/src/test/java/org/apache/commons/io/input/ReaderInputStreamTest.java
@@ -52,13 +52,16 @@ public class ReaderInputStreamTest {
 
     @Test
     public void testBufferTooSmall() throws IOException {
+        assertThrows(IllegalArgumentException.class, () -> new ReaderInputStream(new StringReader("\uD800"), StandardCharsets.UTF_8, -1));
+        assertThrows(IllegalArgumentException.class, () -> new ReaderInputStream(new StringReader("\uD800"), StandardCharsets.UTF_8, 0));
         assertThrows(IllegalArgumentException.class, () -> new ReaderInputStream(new StringReader("\uD800"), StandardCharsets.UTF_8, 1));
     }
     
     @Test
     @Timeout(value = 500, unit = TimeUnit.MILLISECONDS)
     public void testBufferSmallest() throws IOException {
-        try (InputStream in = new ReaderInputStream(new StringReader("\uD800"), StandardCharsets.UTF_8, 2)) {
+        final Charset charset = StandardCharsets.UTF_8;
+        try (InputStream in = new ReaderInputStream(new StringReader("\uD800"), charset, (int) ReaderInputStream.minBufferSize(charset.newEncoder()))) {
             in.read();
         }
     }
@@ -85,8 +88,9 @@ public class ReaderInputStreamTest {
     @Test
     @Timeout(value = 500, unit = TimeUnit.MILLISECONDS)
     public void testCodingErrorAction() throws IOException {
-        CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder().onMalformedInput(CodingErrorAction.REPORT);
-        try (InputStream in = new ReaderInputStream(new StringReader("\uD800aa"), encoder, 2)) {
+        final Charset charset = StandardCharsets.UTF_8;
+        CharsetEncoder encoder = charset.newEncoder().onMalformedInput(CodingErrorAction.REPORT);
+        try (InputStream in = new ReaderInputStream(new StringReader("\uD800aa"), encoder, (int) ReaderInputStream.minBufferSize(charset.newEncoder()))) {
             assertThrows(CharacterCodingException.class, in::read);
         }
     }