You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@tomcat.apache.org by ma...@apache.org on 2022/04/26 17:02:29 UTC
[tomcat] branch main updated: Remove the customer UTF-8 decoder.
This is an automated email from the ASF dual-hosted git repository.
markt pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tomcat.git
The following commit(s) were added to refs/heads/main by this push:
new a9ca10266b Remove the customer UTF-8 decoder.
a9ca10266b is described below
commit a9ca10266bd5cf2919f59b1916ee786c01e2a033
Author: Mark Thomas <ma...@apache.org>
AuthorDate: Tue Apr 26 18:01:51 2022 +0100
Remove the customer UTF-8 decoder.
The issues this was introduced to work around were fixed in early Java 8
releases. Now the minimum Java version is 11, we can be sure we don't
need the custom decoder.
Retain the test that checks the JVM provided decoder to catch any
regressions.
---
java/org/apache/tomcat/util/buf/B2CConverter.java | 10 +-
java/org/apache/tomcat/util/buf/Utf8Decoder.java | 299 ----------------------
java/org/apache/tomcat/websocket/WsFrameBase.java | 6 +-
test/org/apache/tomcat/util/buf/TestUtf8.java | 9 -
webapps/docs/changelog.xml | 6 +
5 files changed, 10 insertions(+), 320 deletions(-)
diff --git a/java/org/apache/tomcat/util/buf/B2CConverter.java b/java/org/apache/tomcat/util/buf/B2CConverter.java
index 532c209ec9..c7fd4b67b7 100644
--- a/java/org/apache/tomcat/util/buf/B2CConverter.java
+++ b/java/org/apache/tomcat/util/buf/B2CConverter.java
@@ -24,7 +24,6 @@ import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
-import java.nio.charset.StandardCharsets;
import java.util.Locale;
import org.apache.tomcat.util.res.StringManager;
@@ -90,14 +89,7 @@ public class B2CConverter {
} else {
action = CodingErrorAction.REPORT;
}
- // Special case. Use the Apache Harmony based UTF-8 decoder because it
- // - a) rejects invalid sequences that the JVM decoder does not
- // - b) fails faster for some invalid sequences
- if (charset.equals(StandardCharsets.UTF_8)) {
- decoder = new Utf8Decoder();
- } else {
- decoder = charset.newDecoder();
- }
+ decoder = charset.newDecoder();
decoder.onMalformedInput(action);
decoder.onUnmappableCharacter(action);
}
diff --git a/java/org/apache/tomcat/util/buf/Utf8Decoder.java b/java/org/apache/tomcat/util/buf/Utf8Decoder.java
deleted file mode 100644
index 932e88c764..0000000000
--- a/java/org/apache/tomcat/util/buf/Utf8Decoder.java
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tomcat.util.buf;
-
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CoderResult;
-import java.nio.charset.StandardCharsets;
-
-/**
- * Decodes bytes to UTF-8. Extracted from Apache Harmony and modified to reject
- * code points from U+D800 to U+DFFF as per RFC3629. The standard Java decoder
- * does not reject these. It has also been modified to reject code points
- * greater than U+10FFFF which the standard Java decoder rejects but the harmony
- * one does not.
- */
-public class Utf8Decoder extends CharsetDecoder {
-
- // The next table contains information about UTF-8 charset and
- // correspondence of 1st byte to the length of sequence
- // For information please visit http://www.ietf.org/rfc/rfc3629.txt
- //
- // Please note, o means 0, actually.
- // -------------------------------------------------------------------
- // 0 1 2 3 Value
- // -------------------------------------------------------------------
- // oxxxxxxx 00000000 00000000 0xxxxxxx
- // 11oyyyyy 1oxxxxxx 00000000 00000yyy yyxxxxxx
- // 111ozzzz 1oyyyyyy 1oxxxxxx 00000000 zzzzyyyy yyxxxxxx
- // 1111ouuu 1ouuzzzz 1oyyyyyy 1oxxxxxx 000uuuuu zzzzyyyy yyxxxxxx
- private static final int remainingBytes[] = {
- // 1owwwwww
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- // 11oyyyyy
- -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- // 111ozzzz
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- // 1111ouuu
- 3, 3, 3, 3, 3, -1, -1, -1,
- // > 11110111
- -1, -1, -1, -1, -1, -1, -1, -1};
- private static final int remainingNumbers[] = {0, // 0 1 2 3
- 4224, // (01o00000b << 6)+(1o000000b)
- 401536, // (011o0000b << 12)+(1o000000b << 6)+(1o000000b)
- 29892736 // (0111o000b << 18)+(1o000000b << 12)+(1o000000b <<
- // 6)+(1o000000b)
- };
- private static final int lowerEncodingLimit[] = {-1, 0x80, 0x800, 0x10000};
-
-
- public Utf8Decoder() {
- super(StandardCharsets.UTF_8, 1.0f, 1.0f);
- }
-
-
- @Override
- protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
- if (in.hasArray() && out.hasArray()) {
- return decodeHasArray(in, out);
- }
- return decodeNotHasArray(in, out);
- }
-
-
- private CoderResult decodeNotHasArray(ByteBuffer in, CharBuffer out) {
- int outRemaining = out.remaining();
- int pos = in.position();
- int limit = in.limit();
- try {
- while (pos < limit) {
- if (outRemaining == 0) {
- return CoderResult.OVERFLOW;
- }
- int jchar = in.get();
- if (jchar < 0) {
- jchar = jchar & 0x7F;
- int tail = remainingBytes[jchar];
- if (tail == -1) {
- return CoderResult.malformedForLength(1);
- }
- if (limit - pos < 1 + tail) {
- // No early test for invalid sequences here as peeking
- // at the next byte is harder
- return CoderResult.UNDERFLOW;
- }
- int nextByte;
- for (int i = 0; i < tail; i++) {
- nextByte = in.get() & 0xFF;
- if ((nextByte & 0xC0) != 0x80) {
- return CoderResult.malformedForLength(1 + i);
- }
- jchar = (jchar << 6) + nextByte;
- }
- jchar -= remainingNumbers[tail];
- if (jchar < lowerEncodingLimit[tail]) {
- // Should have been encoded in a fewer octets
- return CoderResult.malformedForLength(1);
- }
- pos += tail;
- }
- // Apache Tomcat added test
- if (jchar >= 0xD800 && jchar <= 0xDFFF) {
- return CoderResult.unmappableForLength(3);
- }
- // Apache Tomcat added test
- if (jchar > 0x10FFFF) {
- return CoderResult.unmappableForLength(4);
- }
- if (jchar <= 0xffff) {
- out.put((char) jchar);
- outRemaining--;
- } else {
- if (outRemaining < 2) {
- return CoderResult.OVERFLOW;
- }
- out.put((char) ((jchar >> 0xA) + 0xD7C0));
- out.put((char) ((jchar & 0x3FF) + 0xDC00));
- outRemaining -= 2;
- }
- pos++;
- }
- return CoderResult.UNDERFLOW;
- } finally {
- in.position(pos);
- }
- }
-
-
- private CoderResult decodeHasArray(ByteBuffer in, CharBuffer out) {
- int outRemaining = out.remaining();
- int pos = in.position();
- int limit = in.limit();
- final byte[] bArr = in.array();
- final char[] cArr = out.array();
- final int inIndexLimit = limit + in.arrayOffset();
- int inIndex = pos + in.arrayOffset();
- int outIndex = out.position() + out.arrayOffset();
- // if someone would change the limit in process,
- // they would face consequences
- for (; inIndex < inIndexLimit && outRemaining > 0; inIndex++) {
- int jchar = bArr[inIndex];
- if (jchar < 0) {
- jchar = jchar & 0x7F;
- // If first byte is invalid, tail will be set to -1
- int tail = remainingBytes[jchar];
- if (tail == -1) {
- in.position(inIndex - in.arrayOffset());
- out.position(outIndex - out.arrayOffset());
- return CoderResult.malformedForLength(1);
- }
- // Additional checks to detect invalid sequences ASAP
- // Checks derived from Unicode 6.2, Chapter 3, Table 3-7
- // Check 2nd byte
- int tailAvailable = inIndexLimit - inIndex - 1;
- if (tailAvailable > 0) {
- // First byte C2..DF, second byte 80..BF
- if (jchar > 0x41 && jchar < 0x60 &&
- (bArr[inIndex + 1] & 0xC0) != 0x80) {
- in.position(inIndex - in.arrayOffset());
- out.position(outIndex - out.arrayOffset());
- return CoderResult.malformedForLength(1);
- }
- // First byte E0, second byte A0..BF
- if (jchar == 0x60 && (bArr[inIndex + 1] & 0xE0) != 0xA0) {
- in.position(inIndex - in.arrayOffset());
- out.position(outIndex - out.arrayOffset());
- return CoderResult.malformedForLength(1);
- }
- // First byte E1..EC, second byte 80..BF
- if (jchar > 0x60 && jchar < 0x6D &&
- (bArr[inIndex + 1] & 0xC0) != 0x80) {
- in.position(inIndex - in.arrayOffset());
- out.position(outIndex - out.arrayOffset());
- return CoderResult.malformedForLength(1);
- }
- // First byte ED, second byte 80..9F
- if (jchar == 0x6D && (bArr[inIndex + 1] & 0xE0) != 0x80) {
- in.position(inIndex - in.arrayOffset());
- out.position(outIndex - out.arrayOffset());
- return CoderResult.malformedForLength(1);
- }
- // First byte EE..EF, second byte 80..BF
- if (jchar > 0x6D && jchar < 0x70 &&
- (bArr[inIndex + 1] & 0xC0) != 0x80) {
- in.position(inIndex - in.arrayOffset());
- out.position(outIndex - out.arrayOffset());
- return CoderResult.malformedForLength(1);
- }
- // First byte F0, second byte 90..BF
- if (jchar == 0x70 &&
- ((bArr[inIndex + 1] & 0xFF) < 0x90 ||
- (bArr[inIndex + 1] & 0xFF) > 0xBF)) {
- in.position(inIndex - in.arrayOffset());
- out.position(outIndex - out.arrayOffset());
- return CoderResult.malformedForLength(1);
- }
- // First byte F1..F3, second byte 80..BF
- if (jchar > 0x70 && jchar < 0x74 &&
- (bArr[inIndex + 1] & 0xC0) != 0x80) {
- in.position(inIndex - in.arrayOffset());
- out.position(outIndex - out.arrayOffset());
- return CoderResult.malformedForLength(1);
- }
- // First byte F4, second byte 80..8F
- if (jchar == 0x74 &&
- (bArr[inIndex + 1] & 0xF0) != 0x80) {
- in.position(inIndex - in.arrayOffset());
- out.position(outIndex - out.arrayOffset());
- return CoderResult.malformedForLength(1);
- }
- }
- // Check third byte if present and expected
- if (tailAvailable > 1 && tail > 1) {
- if ((bArr[inIndex + 2] & 0xC0) != 0x80) {
- in.position(inIndex - in.arrayOffset());
- out.position(outIndex - out.arrayOffset());
- return CoderResult.malformedForLength(2);
- }
- }
- // Check fourth byte if present and expected
- if (tailAvailable > 2 && tail > 2) {
- if ((bArr[inIndex + 3] & 0xC0) != 0x80) {
- in.position(inIndex - in.arrayOffset());
- out.position(outIndex - out.arrayOffset());
- return CoderResult.malformedForLength(3);
- }
- }
- if (tailAvailable < tail) {
- break;
- }
- for (int i = 0; i < tail; i++) {
- int nextByte = bArr[inIndex + i + 1] & 0xFF;
- if ((nextByte & 0xC0) != 0x80) {
- in.position(inIndex - in.arrayOffset());
- out.position(outIndex - out.arrayOffset());
- return CoderResult.malformedForLength(1 + i);
- }
- jchar = (jchar << 6) + nextByte;
- }
- jchar -= remainingNumbers[tail];
- if (jchar < lowerEncodingLimit[tail]) {
- // Should have been encoded in fewer octets
- in.position(inIndex - in.arrayOffset());
- out.position(outIndex - out.arrayOffset());
- return CoderResult.malformedForLength(1);
- }
- inIndex += tail;
- }
- // Apache Tomcat added test
- if (jchar >= 0xD800 && jchar <= 0xDFFF) {
- return CoderResult.unmappableForLength(3);
- }
- // Apache Tomcat added test
- if (jchar > 0x10FFFF) {
- return CoderResult.unmappableForLength(4);
- }
- if (jchar <= 0xffff) {
- cArr[outIndex++] = (char) jchar;
- outRemaining--;
- } else {
- if (outRemaining < 2) {
- // Encoded with 4 bytes. inIndex currently points
- // to the final byte. Move it back to first byte.
- inIndex -= 3;
- in.position(inIndex - in.arrayOffset());
- out.position(outIndex - out.arrayOffset());
- return CoderResult.OVERFLOW;
- }
- cArr[outIndex++] = (char) ((jchar >> 0xA) + 0xD7C0);
- cArr[outIndex++] = (char) ((jchar & 0x3FF) + 0xDC00);
- outRemaining -= 2;
- }
- }
- in.position(inIndex - in.arrayOffset());
- out.position(outIndex - out.arrayOffset());
- return (outRemaining == 0 && inIndex < inIndexLimit) ?
- CoderResult.OVERFLOW :
- CoderResult.UNDERFLOW;
- }
-}
diff --git a/java/org/apache/tomcat/websocket/WsFrameBase.java b/java/org/apache/tomcat/websocket/WsFrameBase.java
index fd8c4ebab0..10dfb7913d 100644
--- a/java/org/apache/tomcat/websocket/WsFrameBase.java
+++ b/java/org/apache/tomcat/websocket/WsFrameBase.java
@@ -22,6 +22,7 @@ import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
@@ -33,7 +34,6 @@ import jakarta.websocket.PongMessage;
import org.apache.juli.logging.Log;
import org.apache.tomcat.util.ExceptionUtils;
-import org.apache.tomcat.util.buf.Utf8Decoder;
import org.apache.tomcat.util.res.StringManager;
/**
@@ -57,10 +57,10 @@ public abstract class WsFrameBase {
private final CharBuffer controlBufferText = CharBuffer.allocate(125);
// Attributes of the current message
- private final CharsetDecoder utf8DecoderControl = new Utf8Decoder().
+ private final CharsetDecoder utf8DecoderControl = StandardCharsets.UTF_8.newDecoder().
onMalformedInput(CodingErrorAction.REPORT).
onUnmappableCharacter(CodingErrorAction.REPORT);
- private final CharsetDecoder utf8DecoderMessage = new Utf8Decoder().
+ private final CharsetDecoder utf8DecoderMessage = StandardCharsets.UTF_8.newDecoder().
onMalformedInput(CodingErrorAction.REPORT).
onUnmappableCharacter(CodingErrorAction.REPORT);
private boolean continuationExpected = false;
diff --git a/test/org/apache/tomcat/util/buf/TestUtf8.java b/test/org/apache/tomcat/util/buf/TestUtf8.java
index 71e8ef30c1..3bc55a64e1 100644
--- a/test/org/apache/tomcat/util/buf/TestUtf8.java
+++ b/test/org/apache/tomcat/util/buf/TestUtf8.java
@@ -316,15 +316,6 @@ public class TestUtf8 {
TEST_CASES = Collections.unmodifiableList(testCases);
}
- @Test
- public void testHarmonyDecoder() {
- CharsetDecoder decoder = new Utf8Decoder();
- for (Utf8TestCase testCase : TEST_CASES) {
- doTest(decoder, testCase);
- }
- }
-
-
@Test
public void testJvmDecoder() {
CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
diff --git a/webapps/docs/changelog.xml b/webapps/docs/changelog.xml
index 85ea9b3301..01949d0425 100644
--- a/webapps/docs/changelog.xml
+++ b/webapps/docs/changelog.xml
@@ -137,6 +137,12 @@
of the OS that uses kernel 5.10 or later. Thanks to Christopher Gual for
the research into this issue. (markt)
</fix>
+ <scode>
+ Remove the custom UTF-decoder that was introduced to work around various
+ UTF-8 decoding bugs in Java. These issues were fixed in early Java 8
+ releases. Now the minimum Java version is 11, we can be sure that Tomcat
+ will not be running on a JRE where these issues are present. (markt)
+ </scode>
</changelog>
</subsection>
<subsection name="Jasper">
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@tomcat.apache.org
For additional commands, e-mail: dev-help@tomcat.apache.org