You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@tomcat.apache.org by ma...@apache.org on 2022/04/26 17:02:29 UTC
[tomcat] branch main updated: Remove the custom UTF-8 decoder.

This is an automated email from the ASF dual-hosted git repository.

markt pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tomcat.git


The following commit(s) were added to refs/heads/main by this push:
     new a9ca10266b Remove the custom UTF-8 decoder.
a9ca10266b is described below

commit a9ca10266bd5cf2919f59b1916ee786c01e2a033
Author: Mark Thomas <ma...@apache.org>
AuthorDate: Tue Apr 26 18:01:51 2022 +0100

    Remove the custom UTF-8 decoder.
    
    The issues this was introduced to work around were fixed in early Java 8
    releases. Now the minimum Java version is 11, we can be sure we don't
    need the custom decoder.
    
    Retain the test that checks the JVM provided decoder to catch any
    regressions.
---
 java/org/apache/tomcat/util/buf/B2CConverter.java |  10 +-
 java/org/apache/tomcat/util/buf/Utf8Decoder.java  | 299 ----------------------
 java/org/apache/tomcat/websocket/WsFrameBase.java |   6 +-
 test/org/apache/tomcat/util/buf/TestUtf8.java     |   9 -
 webapps/docs/changelog.xml                        |   6 +
 5 files changed, 10 insertions(+), 320 deletions(-)

diff --git a/java/org/apache/tomcat/util/buf/B2CConverter.java b/java/org/apache/tomcat/util/buf/B2CConverter.java
index 532c209ec9..c7fd4b67b7 100644
--- a/java/org/apache/tomcat/util/buf/B2CConverter.java
+++ b/java/org/apache/tomcat/util/buf/B2CConverter.java
@@ -24,7 +24,6 @@ import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CoderResult;
 import java.nio.charset.CodingErrorAction;
-import java.nio.charset.StandardCharsets;
 import java.util.Locale;
 
 import org.apache.tomcat.util.res.StringManager;
@@ -90,14 +89,7 @@ public class B2CConverter {
         } else {
             action = CodingErrorAction.REPORT;
         }
-        // Special case. Use the Apache Harmony based UTF-8 decoder because it
-        // - a) rejects invalid sequences that the JVM decoder does not
-        // - b) fails faster for some invalid sequences
-        if (charset.equals(StandardCharsets.UTF_8)) {
-            decoder = new Utf8Decoder();
-        } else {
-            decoder = charset.newDecoder();
-        }
+        decoder = charset.newDecoder();
         decoder.onMalformedInput(action);
         decoder.onUnmappableCharacter(action);
     }
diff --git a/java/org/apache/tomcat/util/buf/Utf8Decoder.java b/java/org/apache/tomcat/util/buf/Utf8Decoder.java
deleted file mode 100644
index 932e88c764..0000000000
--- a/java/org/apache/tomcat/util/buf/Utf8Decoder.java
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tomcat.util.buf;
-
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CoderResult;
-import java.nio.charset.StandardCharsets;
-
-/**
- * Decodes bytes to UTF-8. Extracted from Apache Harmony and modified to reject
- * code points from U+D800 to U+DFFF as per RFC3629. The standard Java decoder
- * does not reject these. It has also been modified to reject code points
- * greater than U+10FFFF which the standard Java decoder rejects but the harmony
- * one does not.
- */
-public class Utf8Decoder extends CharsetDecoder {
-
-    // The next table contains information about UTF-8 charset and
-    // correspondence of 1st byte to the length of sequence
-    // For information please visit http://www.ietf.org/rfc/rfc3629.txt
-    //
-    // Please note, o means 0, actually.
-    // -------------------------------------------------------------------
-    // 0 1 2 3 Value
-    // -------------------------------------------------------------------
-    // oxxxxxxx                            00000000 00000000 0xxxxxxx
-    // 11oyyyyy 1oxxxxxx                   00000000 00000yyy yyxxxxxx
-    // 111ozzzz 1oyyyyyy 1oxxxxxx          00000000 zzzzyyyy yyxxxxxx
-    // 1111ouuu 1ouuzzzz 1oyyyyyy 1oxxxxxx 000uuuuu zzzzyyyy yyxxxxxx
-    private static final int remainingBytes[] = {
-            // 1owwwwww
-            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-            // 11oyyyyy
-            -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-            // 111ozzzz
-            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-            // 1111ouuu
-            3, 3, 3, 3, 3, -1, -1, -1,
-            // > 11110111
-            -1, -1, -1, -1, -1, -1, -1, -1};
-    private static final int remainingNumbers[] = {0, // 0 1 2 3
-            4224, // (01o00000b << 6)+(1o000000b)
-            401536, // (011o0000b << 12)+(1o000000b << 6)+(1o000000b)
-            29892736 // (0111o000b << 18)+(1o000000b << 12)+(1o000000b <<
-                     // 6)+(1o000000b)
-    };
-    private static final int lowerEncodingLimit[] = {-1, 0x80, 0x800, 0x10000};
-
-
-    public Utf8Decoder() {
-        super(StandardCharsets.UTF_8, 1.0f, 1.0f);
-    }
-
-
-    @Override
-    protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
-        if (in.hasArray() && out.hasArray()) {
-            return decodeHasArray(in, out);
-        }
-        return decodeNotHasArray(in, out);
-    }
-
-
-    private CoderResult decodeNotHasArray(ByteBuffer in, CharBuffer out) {
-        int outRemaining = out.remaining();
-        int pos = in.position();
-        int limit = in.limit();
-        try {
-            while (pos < limit) {
-                if (outRemaining == 0) {
-                    return CoderResult.OVERFLOW;
-                }
-                int jchar = in.get();
-                if (jchar < 0) {
-                    jchar = jchar & 0x7F;
-                    int tail = remainingBytes[jchar];
-                    if (tail == -1) {
-                        return CoderResult.malformedForLength(1);
-                    }
-                    if (limit - pos < 1 + tail) {
-                        // No early test for invalid sequences here as peeking
-                        // at the next byte is harder
-                        return CoderResult.UNDERFLOW;
-                    }
-                    int nextByte;
-                    for (int i = 0; i < tail; i++) {
-                        nextByte = in.get() & 0xFF;
-                        if ((nextByte & 0xC0) != 0x80) {
-                            return CoderResult.malformedForLength(1 + i);
-                        }
-                        jchar = (jchar << 6) + nextByte;
-                    }
-                    jchar -= remainingNumbers[tail];
-                    if (jchar < lowerEncodingLimit[tail]) {
-                        // Should have been encoded in a fewer octets
-                        return CoderResult.malformedForLength(1);
-                    }
-                    pos += tail;
-                }
-                // Apache Tomcat added test
-                if (jchar >= 0xD800 && jchar <= 0xDFFF) {
-                    return CoderResult.unmappableForLength(3);
-                }
-                // Apache Tomcat added test
-                if (jchar > 0x10FFFF) {
-                    return CoderResult.unmappableForLength(4);
-                }
-                if (jchar <= 0xffff) {
-                    out.put((char) jchar);
-                    outRemaining--;
-                } else {
-                    if (outRemaining < 2) {
-                        return CoderResult.OVERFLOW;
-                    }
-                    out.put((char) ((jchar >> 0xA) + 0xD7C0));
-                    out.put((char) ((jchar & 0x3FF) + 0xDC00));
-                    outRemaining -= 2;
-                }
-                pos++;
-            }
-            return CoderResult.UNDERFLOW;
-        } finally {
-            in.position(pos);
-        }
-    }
-
-
-    private CoderResult decodeHasArray(ByteBuffer in, CharBuffer out) {
-        int outRemaining = out.remaining();
-        int pos = in.position();
-        int limit = in.limit();
-        final byte[] bArr = in.array();
-        final char[] cArr = out.array();
-        final int inIndexLimit = limit + in.arrayOffset();
-        int inIndex = pos + in.arrayOffset();
-        int outIndex = out.position() + out.arrayOffset();
-        // if someone would change the limit in process,
-        // they would face consequences
-        for (; inIndex < inIndexLimit && outRemaining > 0; inIndex++) {
-            int jchar = bArr[inIndex];
-            if (jchar < 0) {
-                jchar = jchar & 0x7F;
-                // If first byte is invalid, tail will be set to -1
-                int tail = remainingBytes[jchar];
-                if (tail == -1) {
-                    in.position(inIndex - in.arrayOffset());
-                    out.position(outIndex - out.arrayOffset());
-                    return CoderResult.malformedForLength(1);
-                }
-                // Additional checks to detect invalid sequences ASAP
-                // Checks derived from Unicode 6.2, Chapter 3, Table 3-7
-                // Check 2nd byte
-                int tailAvailable = inIndexLimit - inIndex - 1;
-                if (tailAvailable > 0) {
-                    // First byte C2..DF, second byte 80..BF
-                    if (jchar > 0x41 && jchar < 0x60 &&
-                            (bArr[inIndex + 1] & 0xC0) != 0x80) {
-                        in.position(inIndex - in.arrayOffset());
-                        out.position(outIndex - out.arrayOffset());
-                        return CoderResult.malformedForLength(1);
-                    }
-                    // First byte E0, second byte A0..BF
-                    if (jchar == 0x60 && (bArr[inIndex + 1] & 0xE0) != 0xA0) {
-                        in.position(inIndex - in.arrayOffset());
-                        out.position(outIndex - out.arrayOffset());
-                        return CoderResult.malformedForLength(1);
-                    }
-                    // First byte E1..EC, second byte 80..BF
-                    if (jchar > 0x60 && jchar < 0x6D &&
-                            (bArr[inIndex + 1] & 0xC0) != 0x80) {
-                        in.position(inIndex - in.arrayOffset());
-                        out.position(outIndex - out.arrayOffset());
-                        return CoderResult.malformedForLength(1);
-                    }
-                    // First byte ED, second byte 80..9F
-                    if (jchar == 0x6D && (bArr[inIndex + 1] & 0xE0) != 0x80) {
-                        in.position(inIndex - in.arrayOffset());
-                        out.position(outIndex - out.arrayOffset());
-                        return CoderResult.malformedForLength(1);
-                    }
-                    // First byte EE..EF, second byte 80..BF
-                    if (jchar > 0x6D && jchar < 0x70 &&
-                            (bArr[inIndex + 1] & 0xC0) != 0x80) {
-                        in.position(inIndex - in.arrayOffset());
-                        out.position(outIndex - out.arrayOffset());
-                        return CoderResult.malformedForLength(1);
-                    }
-                    // First byte F0, second byte 90..BF
-                    if (jchar == 0x70 &&
-                            ((bArr[inIndex + 1] & 0xFF) < 0x90 ||
-                            (bArr[inIndex + 1] & 0xFF) > 0xBF)) {
-                        in.position(inIndex - in.arrayOffset());
-                        out.position(outIndex - out.arrayOffset());
-                        return CoderResult.malformedForLength(1);
-                    }
-                    // First byte F1..F3, second byte 80..BF
-                    if (jchar > 0x70 && jchar < 0x74 &&
-                            (bArr[inIndex + 1] & 0xC0) != 0x80) {
-                        in.position(inIndex - in.arrayOffset());
-                        out.position(outIndex - out.arrayOffset());
-                        return CoderResult.malformedForLength(1);
-                    }
-                    // First byte F4, second byte 80..8F
-                    if (jchar == 0x74 &&
-                            (bArr[inIndex + 1] & 0xF0) != 0x80) {
-                        in.position(inIndex - in.arrayOffset());
-                        out.position(outIndex - out.arrayOffset());
-                        return CoderResult.malformedForLength(1);
-                    }
-                }
-                // Check third byte if present and expected
-                if (tailAvailable > 1 && tail > 1) {
-                    if ((bArr[inIndex + 2] & 0xC0) != 0x80) {
-                        in.position(inIndex - in.arrayOffset());
-                        out.position(outIndex - out.arrayOffset());
-                        return CoderResult.malformedForLength(2);
-                    }
-                }
-                // Check fourth byte if present and expected
-                if (tailAvailable > 2 && tail > 2) {
-                    if ((bArr[inIndex + 3] & 0xC0) != 0x80) {
-                        in.position(inIndex - in.arrayOffset());
-                        out.position(outIndex - out.arrayOffset());
-                        return CoderResult.malformedForLength(3);
-                    }
-                }
-                if (tailAvailable < tail) {
-                    break;
-                }
-                for (int i = 0; i < tail; i++) {
-                    int nextByte = bArr[inIndex + i + 1] & 0xFF;
-                    if ((nextByte & 0xC0) != 0x80) {
-                        in.position(inIndex - in.arrayOffset());
-                        out.position(outIndex - out.arrayOffset());
-                        return CoderResult.malformedForLength(1 + i);
-                    }
-                    jchar = (jchar << 6) + nextByte;
-                }
-                jchar -= remainingNumbers[tail];
-                if (jchar < lowerEncodingLimit[tail]) {
-                    // Should have been encoded in fewer octets
-                    in.position(inIndex - in.arrayOffset());
-                    out.position(outIndex - out.arrayOffset());
-                    return CoderResult.malformedForLength(1);
-                }
-                inIndex += tail;
-            }
-            // Apache Tomcat added test
-            if (jchar >= 0xD800 && jchar <= 0xDFFF) {
-                return CoderResult.unmappableForLength(3);
-            }
-            // Apache Tomcat added test
-            if (jchar > 0x10FFFF) {
-                return CoderResult.unmappableForLength(4);
-            }
-            if (jchar <= 0xffff) {
-                cArr[outIndex++] = (char) jchar;
-                outRemaining--;
-            } else {
-                if (outRemaining < 2) {
-                    // Encoded with 4 bytes. inIndex currently points
-                    // to the final byte. Move it back to first byte.
-                    inIndex -= 3;
-                    in.position(inIndex - in.arrayOffset());
-                    out.position(outIndex - out.arrayOffset());
-                    return CoderResult.OVERFLOW;
-                }
-                cArr[outIndex++] = (char) ((jchar >> 0xA) + 0xD7C0);
-                cArr[outIndex++] = (char) ((jchar & 0x3FF) + 0xDC00);
-                outRemaining -= 2;
-            }
-        }
-        in.position(inIndex - in.arrayOffset());
-        out.position(outIndex - out.arrayOffset());
-        return (outRemaining == 0 && inIndex < inIndexLimit) ?
-                CoderResult.OVERFLOW :
-                CoderResult.UNDERFLOW;
-    }
-}
diff --git a/java/org/apache/tomcat/websocket/WsFrameBase.java b/java/org/apache/tomcat/websocket/WsFrameBase.java
index fd8c4ebab0..10dfb7913d 100644
--- a/java/org/apache/tomcat/websocket/WsFrameBase.java
+++ b/java/org/apache/tomcat/websocket/WsFrameBase.java
@@ -22,6 +22,7 @@ import java.nio.CharBuffer;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CoderResult;
 import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
 import java.util.List;
 import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
 
@@ -33,7 +34,6 @@ import jakarta.websocket.PongMessage;
 
 import org.apache.juli.logging.Log;
 import org.apache.tomcat.util.ExceptionUtils;
-import org.apache.tomcat.util.buf.Utf8Decoder;
 import org.apache.tomcat.util.res.StringManager;
 
 /**
@@ -57,10 +57,10 @@ public abstract class WsFrameBase {
     private final CharBuffer controlBufferText = CharBuffer.allocate(125);
 
     // Attributes of the current message
-    private final CharsetDecoder utf8DecoderControl = new Utf8Decoder().
+    private final CharsetDecoder utf8DecoderControl = StandardCharsets.UTF_8.newDecoder().
             onMalformedInput(CodingErrorAction.REPORT).
             onUnmappableCharacter(CodingErrorAction.REPORT);
-    private final CharsetDecoder utf8DecoderMessage = new Utf8Decoder().
+    private final CharsetDecoder utf8DecoderMessage = StandardCharsets.UTF_8.newDecoder().
             onMalformedInput(CodingErrorAction.REPORT).
             onUnmappableCharacter(CodingErrorAction.REPORT);
     private boolean continuationExpected = false;
diff --git a/test/org/apache/tomcat/util/buf/TestUtf8.java b/test/org/apache/tomcat/util/buf/TestUtf8.java
index 71e8ef30c1..3bc55a64e1 100644
--- a/test/org/apache/tomcat/util/buf/TestUtf8.java
+++ b/test/org/apache/tomcat/util/buf/TestUtf8.java
@@ -316,15 +316,6 @@ public class TestUtf8 {
         TEST_CASES = Collections.unmodifiableList(testCases);
     }
 
-    @Test
-    public void testHarmonyDecoder() {
-        CharsetDecoder decoder = new Utf8Decoder();
-        for (Utf8TestCase testCase : TEST_CASES) {
-            doTest(decoder, testCase);
-        }
-    }
-
-
     @Test
     public void testJvmDecoder() {
         CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
diff --git a/webapps/docs/changelog.xml b/webapps/docs/changelog.xml
index 85ea9b3301..01949d0425 100644
--- a/webapps/docs/changelog.xml
+++ b/webapps/docs/changelog.xml
@@ -137,6 +137,12 @@
         of the OS that uses kernel 5.10 or later. Thanks to Christopher Gual for
         the research into this issue. (markt)
       </fix>
+      <scode>
+        Remove the custom UTF-8 decoder that was introduced to work around various
+        UTF-8 decoding bugs in Java. These issues were fixed in early Java 8
+        releases. Now the minimum Java version is 11, we can be sure that Tomcat
+        will not be running on a JRE where these issues are present. (markt)
+      </scode>
     </changelog>
   </subsection>
   <subsection name="Jasper">


---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@tomcat.apache.org
For additional commands, e-mail: dev-help@tomcat.apache.org