You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@tomcat.apache.org by ma...@apache.org on 2023/06/22 18:55:14 UTC

[tomcat] branch main updated: Add control of byte decoding errors to ByteChunk and StringCache

This is an automated email from the ASF dual-hosted git repository.

markt pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tomcat.git


The following commit(s) were added to refs/heads/main by this push:
     new 944951302e Add control of byte decoding errors to ByteChunk and StringCache
944951302e is described below

commit 944951302e2f478879411dbff353f5818ad44121
Author: Mark Thomas <ma...@apache.org>
AuthorDate: Wed Jun 14 12:25:21 2023 +0100

    Add control of byte decoding errors to ByteChunk and StringCache
---
 java/org/apache/tomcat/util/buf/ByteChunk.java     |  47 +++++++++-
 java/org/apache/tomcat/util/buf/StringCache.java   |  66 +++++++++++---
 test/org/apache/jasper/compiler/TestGenerator.java |   3 +-
 .../apache/tomcat/util/buf/TestStringCache.java    | 100 +++++++++++++++++++++
 4 files changed, 203 insertions(+), 13 deletions(-)

diff --git a/java/org/apache/tomcat/util/buf/ByteChunk.java b/java/org/apache/tomcat/util/buf/ByteChunk.java
index 101f9c0eaa..ff00f55774 100644
--- a/java/org/apache/tomcat/util/buf/ByteChunk.java
+++ b/java/org/apache/tomcat/util/buf/ByteChunk.java
@@ -21,7 +21,9 @@ import java.io.ObjectInputStream;
 import java.io.ObjectOutputStream;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
 import java.nio.charset.Charset;
+import java.nio.charset.CodingErrorAction;
 import java.nio.charset.StandardCharsets;
 
 /*
@@ -521,23 +523,64 @@ public final class ByteChunk extends AbstractChunk {
 
     @Override
     public String toString() {
+        try {
+            return toString(CodingErrorAction.REPLACE, CodingErrorAction.REPLACE);
+        } catch (CharacterCodingException e) {
+            // Unreachable code. Use of REPLACE above means the exception will never be thrown.
+            throw new IllegalStateException(e);
+        }
+    }
+
+
+    public String toString(CodingErrorAction malformedInputAction, CodingErrorAction unmappableCharacterAction)
+            throws CharacterCodingException {
         if (isNull()) {
             return null;
         } else if (end - start == 0) {
             return "";
         }
-        return StringCache.toString(this);
+        return StringCache.toString(this, malformedInputAction, unmappableCharacterAction);
     }
 
 
+    /**
+     * Converts the current content of the byte buffer to a String using the configured character set.
+     *
+     * @return The result of converting the bytes to a String
+     *
+     * @deprecated Unused. This method will be removed in Tomcat 11 onwards.
+     */
+    @Deprecated
     public String toStringInternal() {
+        try {
+            return toStringInternal(CodingErrorAction.REPLACE, CodingErrorAction.REPLACE);
+        } catch (CharacterCodingException e) {
+            // Unreachable code. Use of REPLACE above means the exception will never be thrown.
+            throw new IllegalStateException(e);
+        }
+    }
+
+
+    /**
+     * Converts the current content of the byte buffer to a String using the configured character set.
+     *
+     * @param malformedInputAction      Action to take if the input is malformed
+     * @param unmappableCharacterAction Action to take if a byte sequence can't be mapped to a character
+     *
+     * @return The result of converting the bytes to a String
+     *
+     * @throws CharacterCodingException If an error occurs during the conversion
+     */
+    public String toStringInternal(CodingErrorAction malformedInputAction, CodingErrorAction unmappableCharacterAction)
+            throws CharacterCodingException {
         if (charset == null) {
             charset = DEFAULT_CHARSET;
         }
         // new String(byte[], int, int, Charset) takes a defensive copy of the
         // entire byte array. This is expensive if only a small subset of the
         // bytes will be used. The code below is from Apache Harmony.
-        CharBuffer cb = charset.decode(ByteBuffer.wrap(buff, start, end - start));
+        CharBuffer cb = charset.newDecoder().onMalformedInput(malformedInputAction)
+                .onUnmappableCharacter(unmappableCharacterAction).decode(ByteBuffer.wrap(buff, start, end - start));
         return new String(cb.array(), cb.arrayOffset(), cb.length());
     }
 
diff --git a/java/org/apache/tomcat/util/buf/StringCache.java b/java/org/apache/tomcat/util/buf/StringCache.java
index d39de93cfa..5b82e44f74 100644
--- a/java/org/apache/tomcat/util/buf/StringCache.java
+++ b/java/org/apache/tomcat/util/buf/StringCache.java
@@ -16,10 +16,13 @@
  */
 package org.apache.tomcat.util.buf;
 
+import java.nio.charset.CharacterCodingException;
 import java.nio.charset.Charset;
+import java.nio.charset.CodingErrorAction;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Map.Entry;
+import java.util.Objects;
 import java.util.TreeMap;
 
 import org.apache.juli.logging.Log;
@@ -208,11 +211,22 @@ public class StringCache {
 
 
     public static String toString(ByteChunk bc) {
+        try {
+            return toString(bc, CodingErrorAction.REPLACE, CodingErrorAction.REPLACE);
+        } catch (CharacterCodingException e) {
+            // Unreachable code. Use of REPLACE above means the exception will never be thrown.
+            throw new IllegalStateException(e);
+        }
+    }
+
+
+    public static String toString(ByteChunk bc, CodingErrorAction malformedInputAction,
+            CodingErrorAction unmappableCharacterAction) throws CharacterCodingException {
 
         // If the cache is null, then either caching is disabled, or we're
         // still training
         if (bcCache == null) {
-            String value = bc.toStringInternal();
+            String value = bc.toStringInternal(malformedInputAction, unmappableCharacterAction);
             if (byteEnabled && (value.length() < maxStringSize)) {
                 // If training, everything is synced
                 synchronized (bcStats) {
@@ -285,6 +299,8 @@ public class StringCache {
                             System.arraycopy(bc.getBuffer(), start, entry.name, 0, end - start);
                             // Set encoding
                             entry.charset = bc.getCharset();
+                            entry.malformedInputAction = malformedInputAction;
+                            entry.unmappableCharacterAction = unmappableCharacterAction;
                             // Initialize occurrence count to one
                             count = new int[1];
                             count[0] = 1;
@@ -300,9 +316,9 @@ public class StringCache {
         } else {
             accessCount++;
             // Find the corresponding String
-            String result = find(bc);
+            String result = find(bc, malformedInputAction, unmappableCharacterAction);
             if (result == null) {
-                return bc.toStringInternal();
+                return bc.toStringInternal(malformedInputAction, unmappableCharacterAction);
             }
             // Note: We don't care about safety for the stats
             hitCount++;
@@ -462,10 +478,31 @@ public class StringCache {
      * @param name The name to find
      *
      * @return the corresponding value
+     *
+     * @deprecated Unused. Will be removed in Tomcat 11.
+     *             Use {@link #find(ByteChunk, CodingErrorAction, CodingErrorAction)}
      */
+    @Deprecated
     protected static final String find(ByteChunk name) {
+        return find(name, CodingErrorAction.REPLACE, CodingErrorAction.REPLACE);
+    }
+
+
+    /**
+     * Find an entry given its name in the cache and return the associated String.
+     *
+     * @param name                      The name to find
+     * @param malformedInputAction      Action to take if an malformed input is encountered
+     * @param unmappableCharacterAction Action to take if an unmappable character is encountered
+     *
+     * @return the corresponding value
+     */
+    protected static final String find(ByteChunk name, CodingErrorAction malformedInputAction,
+        CodingErrorAction unmappableCharacterAction) {
         int pos = findClosest(name, bcCache, bcCache.length);
-        if ((pos < 0) || (compare(name, bcCache[pos].name) != 0) || !(name.getCharset().equals(bcCache[pos].charset))) {
+        if ((pos < 0) || (compare(name, bcCache[pos].name) != 0) || !(name.getCharset().equals(bcCache[pos].charset)) ||
+                !malformedInputAction.equals(bcCache[pos].malformedInputAction) ||
+                !unmappableCharacterAction.equals(bcCache[pos].unmappableCharacterAction)) {
             return null;
         } else {
             return bcCache[pos].value;
@@ -631,11 +668,12 @@ public class StringCache {
 
     // -------------------------------------------------- ByteEntry Inner Class
 
-
     private static class ByteEntry {
 
         private byte[] name = null;
         private Charset charset = null;
+        private CodingErrorAction malformedInputAction = null;
+        private CodingErrorAction unmappableCharacterAction = null;
         private String value = null;
 
         @Override
@@ -645,17 +683,25 @@ public class StringCache {
 
         @Override
         public int hashCode() {
-            return value.hashCode();
+            return Objects.hash(malformedInputAction, unmappableCharacterAction, value);
         }
 
         @Override
         public boolean equals(Object obj) {
-            if (obj instanceof ByteEntry) {
-                return value.equals(((ByteEntry) obj).value);
+            if (this == obj) {
+                return true;
             }
-            return false;
+            if (obj == null) {
+                return false;
+            }
+            if (getClass() != obj.getClass()) {
+                return false;
+            }
+            ByteEntry other = (ByteEntry) obj;
+            return Objects.equals(malformedInputAction, other.malformedInputAction) &&
+                    Objects.equals(unmappableCharacterAction, other.unmappableCharacterAction) &&
+                    Objects.equals(value, other.value);
         }
-
     }
 
 
diff --git a/test/org/apache/jasper/compiler/TestGenerator.java b/test/org/apache/jasper/compiler/TestGenerator.java
index 8cfe73f54c..b854a6010b 100644
--- a/test/org/apache/jasper/compiler/TestGenerator.java
+++ b/test/org/apache/jasper/compiler/TestGenerator.java
@@ -22,6 +22,7 @@ import java.beans.PropertyDescriptor;
 import java.beans.PropertyEditorSupport;
 import java.io.File;
 import java.io.IOException;
+import java.nio.charset.CodingErrorAction;
 import java.util.Date;
 import java.util.Scanner;
 
@@ -211,7 +212,7 @@ public class TestGenerator extends TomcatBaseTest {
         ByteChunk bc = new ByteChunk();
         int rc = getUrl("http://localhost:" + getPort() + "/test/bug5nnnn/bug56529.jsp", bc, null);
         Assert.assertEquals(HttpServletResponse.SC_OK, rc);
-        String response = bc.toStringInternal();
+        String response = bc.toStringInternal(CodingErrorAction.REPORT, CodingErrorAction.REPORT);
         Assert.assertTrue(response, response.contains("[1:attribute1: '', attribute2: '']"));
         Assert.assertTrue(response, response.contains("[2:attribute1: '', attribute2: '']"));
     }
diff --git a/test/org/apache/tomcat/util/buf/TestStringCache.java b/test/org/apache/tomcat/util/buf/TestStringCache.java
new file mode 100644
index 0000000000..2345d2de76
--- /dev/null
+++ b/test/org/apache/tomcat/util/buf/TestStringCache.java
@@ -0,0 +1,100 @@
+/*
+ *  Licensed to the Apache Software Foundation (ASF) under one or more
+ *  contributor license agreements.  See the NOTICE file distributed with
+ *  this work for additional information regarding copyright ownership.
+ *  The ASF licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.tomcat.util.buf;
+
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestStringCache {
+
+    private static final byte[] BYTES_VALID = new byte[] { 65, 66, 67, 68};
+    private static final byte[] BYTES_INVALID = new byte[] {65, 66, -1, 67, 68};
+
+    private static final ByteChunk INPUT_VALID = new ByteChunk();
+    private static final ByteChunk INPUT_INVALID = new ByteChunk();
+
+    private static final CodingErrorAction[] actions =
+            new CodingErrorAction[] { CodingErrorAction.IGNORE, CodingErrorAction.REPLACE, CodingErrorAction.REPORT };
+
+    static {
+        INPUT_VALID.setBytes(BYTES_VALID, 0, BYTES_VALID.length);
+        INPUT_VALID.setCharset(StandardCharsets.UTF_8);
+        INPUT_INVALID.setBytes(BYTES_INVALID, 0, BYTES_INVALID.length);
+        INPUT_INVALID.setCharset(StandardCharsets.UTF_8);
+    }
+
+
+    @Test
+    public void testCodingErrorLookup() {
+
+        System.setProperty("tomcat.util.buf.StringCache.byte.enabled", "true");
+
+        Assert.assertTrue(StringCache.byteEnabled);
+        StringCache sc = new StringCache();
+        sc.reset();
+
+        for (int i = 0; i < StringCache.trainThreshold * 2; i++) {
+            for (CodingErrorAction malformedInputAction : actions) {
+                try {
+                    // UTF-8 doesn't have any unmappable characters
+                    INPUT_VALID.toString(malformedInputAction, CodingErrorAction.IGNORE);
+                    INPUT_INVALID.toString(malformedInputAction, CodingErrorAction.IGNORE);
+                } catch (CharacterCodingException e) {
+                    // Ignore
+                }
+            }
+        }
+
+        Assert.assertNotNull(StringCache.bcCache);
+
+        // Check the valid input is cached correctly
+        for (CodingErrorAction malformedInputAction : actions) {
+            try {
+                Assert.assertEquals("ABCD", INPUT_VALID.toString(malformedInputAction, CodingErrorAction.IGNORE));
+            } catch (CharacterCodingException e) {
+                // Should not happen for valid input
+                Assert.fail();
+            }
+        }
+
+        // Check the valid input is cached correctly
+        try {
+            Assert.assertEquals("ABCD", INPUT_INVALID.toString(CodingErrorAction.IGNORE, CodingErrorAction.IGNORE));
+        } catch (CharacterCodingException e) {
+            // Should not happen for invalid input with IGNORE
+            Assert.fail();
+        }
+        try {
+            Assert.assertEquals("AB\ufffdCD", INPUT_INVALID.toString(CodingErrorAction.REPLACE, CodingErrorAction.IGNORE));
+        } catch (CharacterCodingException e) {
+            // Should not happen for invalid input with REPLACE
+            Assert.fail();
+        }
+        try {
+            Assert.assertEquals("ABCD", INPUT_INVALID.toString(CodingErrorAction.REPORT, CodingErrorAction.IGNORE));
+            // Should throw exception
+            Assert.fail();
+        } catch (CharacterCodingException e) {
+            // Ignore
+        }
+
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@tomcat.apache.org
For additional commands, e-mail: dev-help@tomcat.apache.org


Re: [tomcat] branch main updated: Add control of byte decoding errors to ByteChunk and StringCache

Posted by Mark Thomas <ma...@apache.org>.
On 22/06/2023 19:55, markt@apache.org wrote:
> This is an automated email from the ASF dual-hosted git repository.
> 
> markt pushed a commit to branch main
> in repository https://gitbox.apache.org/repos/asf/tomcat.git
> 
> 
> The following commit(s) were added to refs/heads/main by this push:
>       new 944951302e Add control of byte decoding errors to ByteChunk and StringCache
> 944951302e is described below
> 
> commit 944951302e2f478879411dbff353f5818ad44121
> Author: Mark Thomas <ma...@apache.org>
> AuthorDate: Wed Jun 14 12:25:21 2023 +0100
> 
>      Add control of byte decoding errors to ByteChunk and StringCache

Hi all,

This is the first commit in a series that I am working on that will 
implement a change for Servlet 6.1 that allows getParameter() and 
friends to throw exceptions for invalid parameters. The default for 
Tomcat 11 will be to throw an exception.

The spec change also allows containers to provide container specific 
configuration to control this and what I am working on is a 
ParameterErrorHandlingConfiguration class that is similar to the cookie 
configuration that allows individual control for each of the different 
errors.

Because this change touches ByteChunk and StringCache which are 
fundamental low-level classes in Tomcat's request handling I wanted to 
give folks plenty of time to review these changes before I backport.

I plan to back-port the parameter changes as well, but with different 
defaults so exceptions are not thrown.

Thanks,

Mark

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@tomcat.apache.org
For additional commands, e-mail: dev-help@tomcat.apache.org


Re: [tomcat] branch main updated: Add control of byte decoding errors to ByteChunk and StringCache

Posted by Romain Manni-Bucau <rm...@gmail.com>.
Hi Rémy,

If it helps openjmh uses an annotation processor so you can use it with ant
build, the setup is just not documented  AFAIK but there is no reason it
does not work technically.
Then code usage is more or less close to junit, run is a main which will
include resources/files generated from the annotation processor, really
nothing crazy IMHO on that part (hard one is to use Blackhole properly and
tune the jvm for the bench) but happy to help if needed.

Romain Manni-Bucau
@rmannibucau <https://twitter.com/rmannibucau> |  Blog
<https://rmannibucau.metawerx.net/> | Old Blog
<http://rmannibucau.wordpress.com> | Github <https://github.com/rmannibucau> |
LinkedIn <https://www.linkedin.com/in/rmannibucau> | Book
<https://www.packtpub.com/application-development/java-ee-8-high-performance>


Le ven. 23 juin 2023 à 10:15, Rémy Maucherat <re...@apache.org> a écrit :

> On Fri, Jun 23, 2023 at 10:07 AM Mark Thomas <ma...@apache.org> wrote:
> >
> > On 23/06/2023 08:34, Rémy Maucherat wrote:
> > > On Thu, Jun 22, 2023 at 8:55 PM <ma...@apache.org> wrote:
> > >>
> > >> This is an automated email from the ASF dual-hosted git repository.
> > >>
> > >> markt pushed a commit to branch main
> > >> in repository https://gitbox.apache.org/repos/asf/tomcat.git
> > >>
> > >>
> > >> The following commit(s) were added to refs/heads/main by this push:
> > >>       new 944951302e Add control of byte decoding errors to ByteChunk
> and StringCache
> > >> 944951302e is described below
> > >>
> > >> commit 944951302e2f478879411dbff353f5818ad44121
> > >> Author: Mark Thomas <ma...@apache.org>
> > >> AuthorDate: Wed Jun 14 12:25:21 2023 +0100
> > >>
> > >>      Add control of byte decoding errors to ByteChunk and StringCache
> >
> > <snip/>
> >
> > >> +    /**
> > >> +     * Converts the current content of the byte buffer to a String
> using the configured character set.
> > >> +     *
> > >> +     * @param malformedInputAction      Action to take if the input
> is malformed
> > >> +     * @param unmappableCharacterAction Action to take if a byte
> sequence can't be mapped to a character
> > >> +     *
> > >> +     * @return The result of converting the bytes to a String
> > >> +     *
> > >> +     * @throws CharacterCodingException If an error occurs during
> the conversion
> > >> +     */
> > >> +    public String toStringInternal(CodingErrorAction
> malformedInputAction, CodingErrorAction unmappableCharacterAction)
> > >> +            throws CharacterCodingException {
> > >>           if (charset == null) {
> > >>               charset = DEFAULT_CHARSET;
> > >>           }
> > >>           // new String(byte[], int, int, Charset) takes a defensive
> copy of the
> > >>           // entire byte array. This is expensive if only a small
> subset of the
> > >>           // bytes will be used. The code below is from Apache
> Harmony.
> > >> -        CharBuffer cb = charset.decode(ByteBuffer.wrap(buff, start,
> end - start));
> > >> +        CharBuffer cb =
> charset.newDecoder().onMalformedInput(malformedInputAction)
> > >> +
> .onUnmappableCharacter(unmappableCharacterAction).decode(ByteBuffer.wrap(buff,
> start, end - start));
> > >
> > > Looking at the code, this is not equivalent, like charset.decode uses
> > > thread locals and so on. I will make a change so that charset.decode
> > > is used if is REPLACE REPLACE, if you don't mind. I'm pretty sure
> > > benching would show no performance difference though.
> >
> > Seems reasonable to me.
>
> BTW, for actual microbenchmarks, the OpenJDK people recommend using
> https://github.com/openjdk/jmh rather than whatever hack. Does anyone
> have any experience with it ?
> Since it is Maven based, microbenchmarks would need to be added in a
> separate mavenized location rather than the testsuite.
>
> Rémy
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: dev-unsubscribe@tomcat.apache.org
> For additional commands, e-mail: dev-help@tomcat.apache.org
>
>

Re: [tomcat] branch main updated: Add control of byte decoding errors to ByteChunk and StringCache

Posted by Rémy Maucherat <re...@apache.org>.
On Fri, Jun 23, 2023 at 10:07 AM Mark Thomas <ma...@apache.org> wrote:
>
> On 23/06/2023 08:34, Rémy Maucherat wrote:
> > On Thu, Jun 22, 2023 at 8:55 PM <ma...@apache.org> wrote:
> >>
> >> This is an automated email from the ASF dual-hosted git repository.
> >>
> >> markt pushed a commit to branch main
> >> in repository https://gitbox.apache.org/repos/asf/tomcat.git
> >>
> >>
> >> The following commit(s) were added to refs/heads/main by this push:
> >>       new 944951302e Add control of byte decoding errors to ByteChunk and StringCache
> >> 944951302e is described below
> >>
> >> commit 944951302e2f478879411dbff353f5818ad44121
> >> Author: Mark Thomas <ma...@apache.org>
> >> AuthorDate: Wed Jun 14 12:25:21 2023 +0100
> >>
> >>      Add control of byte decoding errors to ByteChunk and StringCache
>
> <snip/>
>
> >> +    /**
> >> +     * Converts the current content of the byte buffer to a String using the configured character set.
> >> +     *
> >> +     * @param malformedInputAction      Action to take if the input is malformed
> >> +     * @param unmappableCharacterAction Action to take if a byte sequence can't be mapped to a character
> >> +     *
> >> +     * @return The result of converting the bytes to a String
> >> +     *
> >> +     * @throws CharacterCodingException If an error occurs during the conversion
> >> +     */
> >> +    public String toStringInternal(CodingErrorAction malformedInputAction, CodingErrorAction unmappableCharacterAction)
> >> +            throws CharacterCodingException {
> >>           if (charset == null) {
> >>               charset = DEFAULT_CHARSET;
> >>           }
> >>           // new String(byte[], int, int, Charset) takes a defensive copy of the
> >>           // entire byte array. This is expensive if only a small subset of the
> >>           // bytes will be used. The code below is from Apache Harmony.
> >> -        CharBuffer cb = charset.decode(ByteBuffer.wrap(buff, start, end - start));
> >> +        CharBuffer cb = charset.newDecoder().onMalformedInput(malformedInputAction)
> >> +                .onUnmappableCharacter(unmappableCharacterAction).decode(ByteBuffer.wrap(buff, start, end - start));
> >
> > Looking at the code, this is not equivalent, like charset.decode uses
> > thread locals and so on. I will make a change so that charset.decode
> > is used if is REPLACE REPLACE, if you don't mind. I'm pretty sure
> > benching would show no performance difference though.
>
> Seems reasonable to me.

BTW, for actual microbenchmarks, the OpenJDK people recommend using
https://github.com/openjdk/jmh rather than whatever hack. Does anyone
have any experience with it ?
Since it is Maven based, microbenchmarks would need to be added in a
separate mavenized location rather than the testsuite.

Rémy

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@tomcat.apache.org
For additional commands, e-mail: dev-help@tomcat.apache.org


Re: [tomcat] branch main updated: Add control of byte decoding errors to ByteChunk and StringCache

Posted by Mark Thomas <ma...@apache.org>.
On 23/06/2023 08:34, Rémy Maucherat wrote:
> On Thu, Jun 22, 2023 at 8:55 PM <ma...@apache.org> wrote:
>>
>> This is an automated email from the ASF dual-hosted git repository.
>>
>> markt pushed a commit to branch main
>> in repository https://gitbox.apache.org/repos/asf/tomcat.git
>>
>>
>> The following commit(s) were added to refs/heads/main by this push:
>>       new 944951302e Add control of byte decoding errors to ByteChunk and StringCache
>> 944951302e is described below
>>
>> commit 944951302e2f478879411dbff353f5818ad44121
>> Author: Mark Thomas <ma...@apache.org>
>> AuthorDate: Wed Jun 14 12:25:21 2023 +0100
>>
>>      Add control of byte decoding errors to ByteChunk and StringCache

<snip/>

>> +    /**
>> +     * Converts the current content of the byte buffer to a String using the configured character set.
>> +     *
>> +     * @param malformedInputAction      Action to take if the input is malformed
>> +     * @param unmappableCharacterAction Action to take if a byte sequence can't be mapped to a character
>> +     *
>> +     * @return The result of converting the bytes to a String
>> +     *
>> +     * @throws CharacterCodingException If an error occurs during the conversion
>> +     */
>> +    public String toStringInternal(CodingErrorAction malformedInputAction, CodingErrorAction unmappableCharacterAction)
>> +            throws CharacterCodingException {
>>           if (charset == null) {
>>               charset = DEFAULT_CHARSET;
>>           }
>>           // new String(byte[], int, int, Charset) takes a defensive copy of the
>>           // entire byte array. This is expensive if only a small subset of the
>>           // bytes will be used. The code below is from Apache Harmony.
>> -        CharBuffer cb = charset.decode(ByteBuffer.wrap(buff, start, end - start));
>> +        CharBuffer cb = charset.newDecoder().onMalformedInput(malformedInputAction)
>> +                .onUnmappableCharacter(unmappableCharacterAction).decode(ByteBuffer.wrap(buff, start, end - start));
> 
> Looking at the code, this is not equivalent, like charset.decode uses
> thread locals and so on. I will make a change so that charset.decode
> is used if is REPLACE REPLACE, if you don't mind. I'm pretty sure
> benching would show no performance difference though.

Seems reasonable to me.

Mark

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@tomcat.apache.org
For additional commands, e-mail: dev-help@tomcat.apache.org


Re: [tomcat] branch main updated: Add control of byte decoding errors to ByteChunk and StringCache

Posted by Rémy Maucherat <re...@apache.org>.
On Thu, Jun 22, 2023 at 8:55 PM <ma...@apache.org> wrote:
>
> This is an automated email from the ASF dual-hosted git repository.
>
> markt pushed a commit to branch main
> in repository https://gitbox.apache.org/repos/asf/tomcat.git
>
>
> The following commit(s) were added to refs/heads/main by this push:
>      new 944951302e Add control of byte decoding errors to ByteChunk and StringCache
> 944951302e is described below
>
> commit 944951302e2f478879411dbff353f5818ad44121
> Author: Mark Thomas <ma...@apache.org>
> AuthorDate: Wed Jun 14 12:25:21 2023 +0100
>
>     Add control of byte decoding errors to ByteChunk and StringCache
> ---
>  java/org/apache/tomcat/util/buf/ByteChunk.java     |  47 +++++++++-
>  java/org/apache/tomcat/util/buf/StringCache.java   |  66 +++++++++++---
>  test/org/apache/jasper/compiler/TestGenerator.java |   3 +-
>  .../apache/tomcat/util/buf/TestStringCache.java    | 100 +++++++++++++++++++++
>  4 files changed, 203 insertions(+), 13 deletions(-)
>
> diff --git a/java/org/apache/tomcat/util/buf/ByteChunk.java b/java/org/apache/tomcat/util/buf/ByteChunk.java
> index 101f9c0eaa..ff00f55774 100644
> --- a/java/org/apache/tomcat/util/buf/ByteChunk.java
> +++ b/java/org/apache/tomcat/util/buf/ByteChunk.java
> @@ -21,7 +21,9 @@ import java.io.ObjectInputStream;
>  import java.io.ObjectOutputStream;
>  import java.nio.ByteBuffer;
>  import java.nio.CharBuffer;
> +import java.nio.charset.CharacterCodingException;
>  import java.nio.charset.Charset;
> +import java.nio.charset.CodingErrorAction;
>  import java.nio.charset.StandardCharsets;
>
>  /*
> @@ -521,23 +523,64 @@ public final class ByteChunk extends AbstractChunk {
>
>      @Override
>      public String toString() {
> +        try {
> +            return toString(CodingErrorAction.REPLACE, CodingErrorAction.REPLACE);
> +        } catch (CharacterCodingException e) {
> +            // Unreachable code. Use of REPLACE above means the exception will never be thrown.
> +            throw new IllegalStateException(e);
> +        }
> +    }
> +
> +
> +    public String toString(CodingErrorAction malformedInputAction, CodingErrorAction unmappableCharacterAction)
> +            throws CharacterCodingException {
>          if (isNull()) {
>              return null;
>          } else if (end - start == 0) {
>              return "";
>          }
> -        return StringCache.toString(this);
> +        return StringCache.toString(this, malformedInputAction, unmappableCharacterAction);
>      }
>
>
> +    /**
> +     * Converts the current content of the byte buffer to a String using the configured character set.
> +     *
> +     * @return The result of converting the bytes to a String
> +     *
> +     * @deprecated Unused. This method will be removed in Tomcat 11 onwards.
> +     */
> +    @Deprecated
>      public String toStringInternal() {
> +        try {
> +            return toStringInternal(CodingErrorAction.REPLACE, CodingErrorAction.REPLACE);
> +        } catch (CharacterCodingException e) {
> +            // Unreachable code. Use of REPLACE above means the exception will never be thrown.
> +            throw new IllegalStateException(e);
> +        }
> +    }
> +
> +
> +    /**
> +     * Converts the current content of the byte buffer to a String using the configured character set.
> +     *
> +     * @param malformedInputAction      Action to take if the input is malformed
> +     * @param unmappableCharacterAction Action to take if a byte sequence can't be mapped to a character
> +     *
> +     * @return The result of converting the bytes to a String
> +     *
> +     * @throws CharacterCodingException If an error occurs during the conversion
> +     */
> +    public String toStringInternal(CodingErrorAction malformedInputAction, CodingErrorAction unmappableCharacterAction)
> +            throws CharacterCodingException {
>          if (charset == null) {
>              charset = DEFAULT_CHARSET;
>          }
>          // new String(byte[], int, int, Charset) takes a defensive copy of the
>          // entire byte array. This is expensive if only a small subset of the
>          // bytes will be used. The code below is from Apache Harmony.
> -        CharBuffer cb = charset.decode(ByteBuffer.wrap(buff, start, end - start));
> +        CharBuffer cb = charset.newDecoder().onMalformedInput(malformedInputAction)
> +                .onUnmappableCharacter(unmappableCharacterAction).decode(ByteBuffer.wrap(buff, start, end - start));

Looking at the code, this is not equivalent, like charset.decode uses
thread locals and so on. I will make a change so that charset.decode
is used if is REPLACE REPLACE, if you don't mind. I'm pretty sure
benching would show no performance difference though.

Rémy

>          return new String(cb.array(), cb.arrayOffset(), cb.length());
>      }
>
> diff --git a/java/org/apache/tomcat/util/buf/StringCache.java b/java/org/apache/tomcat/util/buf/StringCache.java
> index d39de93cfa..5b82e44f74 100644
> --- a/java/org/apache/tomcat/util/buf/StringCache.java
> +++ b/java/org/apache/tomcat/util/buf/StringCache.java
> @@ -16,10 +16,13 @@
>   */
>  package org.apache.tomcat.util.buf;
>
> +import java.nio.charset.CharacterCodingException;
>  import java.nio.charset.Charset;
> +import java.nio.charset.CodingErrorAction;
>  import java.util.ArrayList;
>  import java.util.HashMap;
>  import java.util.Map.Entry;
> +import java.util.Objects;
>  import java.util.TreeMap;
>
>  import org.apache.juli.logging.Log;
> @@ -208,11 +211,22 @@ public class StringCache {
>
>
>      public static String toString(ByteChunk bc) {
> +        try {
> +            return toString(bc, CodingErrorAction.REPLACE, CodingErrorAction.REPLACE);
> +        } catch (CharacterCodingException e) {
> +            // Unreachable code. Use of REPLACE above means the exception will never be thrown.
> +            throw new IllegalStateException(e);
> +        }
> +    }
> +
> +
> +    public static String toString(ByteChunk bc, CodingErrorAction malformedInputAction,
> +            CodingErrorAction unmappableCharacterAction) throws CharacterCodingException {
>
>          // If the cache is null, then either caching is disabled, or we're
>          // still training
>          if (bcCache == null) {
> -            String value = bc.toStringInternal();
> +            String value = bc.toStringInternal(malformedInputAction, unmappableCharacterAction);
>              if (byteEnabled && (value.length() < maxStringSize)) {
>                  // If training, everything is synced
>                  synchronized (bcStats) {
> @@ -285,6 +299,8 @@ public class StringCache {
>                              System.arraycopy(bc.getBuffer(), start, entry.name, 0, end - start);
>                              // Set encoding
>                              entry.charset = bc.getCharset();
> +                            entry.malformedInputAction = malformedInputAction;
> +                            entry.unmappableCharacterAction = unmappableCharacterAction;
>                              // Initialize occurrence count to one
>                              count = new int[1];
>                              count[0] = 1;
> @@ -300,9 +316,9 @@ public class StringCache {
>          } else {
>              accessCount++;
>              // Find the corresponding String
> -            String result = find(bc);
> +            String result = find(bc, malformedInputAction, unmappableCharacterAction);
>              if (result == null) {
> -                return bc.toStringInternal();
> +                return bc.toStringInternal(malformedInputAction, unmappableCharacterAction);
>              }
>              // Note: We don't care about safety for the stats
>              hitCount++;
> @@ -462,10 +478,31 @@ public class StringCache {
>       * @param name The name to find
>       *
>       * @return the corresponding value
> +     *
> +     * @deprecated Unused. Will be removed in Tomcat 11.
> +     *             Use {@link #find(ByteChunk, CodingErrorAction, CodingErrorAction)}
>       */
> +    @Deprecated
>      protected static final String find(ByteChunk name) {
> +        return find(name, CodingErrorAction.REPLACE, CodingErrorAction.REPLACE);
> +    }
> +
> +
> +    /**
> +     * Find an entry given its name in the cache and return the associated String.
> +     *
> +     * @param name                      The name to find
> +     * @param malformedInputAction      Action to take if an malformed input is encountered
> +     * @param unmappableCharacterAction Action to take if an unmappable character is encountered
> +     *
> +     * @return the corresponding value
> +     */
> +    protected static final String find(ByteChunk name, CodingErrorAction malformedInputAction,
> +        CodingErrorAction unmappableCharacterAction) {
>          int pos = findClosest(name, bcCache, bcCache.length);
> -        if ((pos < 0) || (compare(name, bcCache[pos].name) != 0) || !(name.getCharset().equals(bcCache[pos].charset))) {
> +        if ((pos < 0) || (compare(name, bcCache[pos].name) != 0) || !(name.getCharset().equals(bcCache[pos].charset)) ||
> +                !malformedInputAction.equals(bcCache[pos].malformedInputAction) ||
> +                !unmappableCharacterAction.equals(bcCache[pos].unmappableCharacterAction)) {
>              return null;
>          } else {
>              return bcCache[pos].value;
> @@ -631,11 +668,12 @@ public class StringCache {
>
>      // -------------------------------------------------- ByteEntry Inner Class
>
> -
>      private static class ByteEntry {
>
>          private byte[] name = null;
>          private Charset charset = null;
> +        private CodingErrorAction malformedInputAction = null;
> +        private CodingErrorAction unmappableCharacterAction = null;
>          private String value = null;
>
>          @Override
> @@ -645,17 +683,25 @@ public class StringCache {
>
>          @Override
>          public int hashCode() {
> -            return value.hashCode();
> +            return Objects.hash(malformedInputAction, unmappableCharacterAction, value);
>          }
>
>          @Override
>          public boolean equals(Object obj) {
> -            if (obj instanceof ByteEntry) {
> -                return value.equals(((ByteEntry) obj).value);
> +            if (this == obj) {
> +                return true;
>              }
> -            return false;
> +            if (obj == null) {
> +                return false;
> +            }
> +            if (getClass() != obj.getClass()) {
> +                return false;
> +            }
> +            ByteEntry other = (ByteEntry) obj;
> +            return Objects.equals(malformedInputAction, other.malformedInputAction) &&
> +                    Objects.equals(unmappableCharacterAction, other.unmappableCharacterAction) &&
> +                    Objects.equals(value, other.value);
>          }
> -
>      }
>
>
> diff --git a/test/org/apache/jasper/compiler/TestGenerator.java b/test/org/apache/jasper/compiler/TestGenerator.java
> index 8cfe73f54c..b854a6010b 100644
> --- a/test/org/apache/jasper/compiler/TestGenerator.java
> +++ b/test/org/apache/jasper/compiler/TestGenerator.java
> @@ -22,6 +22,7 @@ import java.beans.PropertyDescriptor;
>  import java.beans.PropertyEditorSupport;
>  import java.io.File;
>  import java.io.IOException;
> +import java.nio.charset.CodingErrorAction;
>  import java.util.Date;
>  import java.util.Scanner;
>
> @@ -211,7 +212,7 @@ public class TestGenerator extends TomcatBaseTest {
>          ByteChunk bc = new ByteChunk();
>          int rc = getUrl("http://localhost:" + getPort() + "/test/bug5nnnn/bug56529.jsp", bc, null);
>          Assert.assertEquals(HttpServletResponse.SC_OK, rc);
> -        String response = bc.toStringInternal();
> +        String response = bc.toStringInternal(CodingErrorAction.REPORT, CodingErrorAction.REPORT);
>          Assert.assertTrue(response, response.contains("[1:attribute1: '', attribute2: '']"));
>          Assert.assertTrue(response, response.contains("[2:attribute1: '', attribute2: '']"));
>      }
> diff --git a/test/org/apache/tomcat/util/buf/TestStringCache.java b/test/org/apache/tomcat/util/buf/TestStringCache.java
> new file mode 100644
> index 0000000000..2345d2de76
> --- /dev/null
> +++ b/test/org/apache/tomcat/util/buf/TestStringCache.java
> @@ -0,0 +1,100 @@
> +/*
> + *  Licensed to the Apache Software Foundation (ASF) under one or more
> + *  contributor license agreements.  See the NOTICE file distributed with
> + *  this work for additional information regarding copyright ownership.
> + *  The ASF licenses this file to You under the Apache License, Version 2.0
> + *  (the "License"); you may not use this file except in compliance with
> + *  the License.  You may obtain a copy of the License at
> + *
> + *      http://www.apache.org/licenses/LICENSE-2.0
> + *
> + *  Unless required by applicable law or agreed to in writing, software
> + *  distributed under the License is distributed on an "AS IS" BASIS,
> + *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + *  See the License for the specific language governing permissions and
> + *  limitations under the License.
> + */
> +package org.apache.tomcat.util.buf;
> +
> +import java.nio.charset.CharacterCodingException;
> +import java.nio.charset.CodingErrorAction;
> +import java.nio.charset.StandardCharsets;
> +
> +import org.junit.Assert;
> +import org.junit.Test;
> +
> +public class TestStringCache {
> +
> +    private static final byte[] BYTES_VALID = new byte[] { 65, 66, 67, 68};
> +    private static final byte[] BYTES_INVALID = new byte[] {65, 66, -1, 67, 68};
> +
> +    private static final ByteChunk INPUT_VALID = new ByteChunk();
> +    private static final ByteChunk INPUT_INVALID = new ByteChunk();
> +
> +    private static final CodingErrorAction[] actions =
> +            new CodingErrorAction[] { CodingErrorAction.IGNORE, CodingErrorAction.REPLACE, CodingErrorAction.REPORT };
> +
> +    static {
> +        INPUT_VALID.setBytes(BYTES_VALID, 0, BYTES_VALID.length);
> +        INPUT_VALID.setCharset(StandardCharsets.UTF_8);
> +        INPUT_INVALID.setBytes(BYTES_INVALID, 0, BYTES_INVALID.length);
> +        INPUT_INVALID.setCharset(StandardCharsets.UTF_8);
> +    }
> +
> +
> +    @Test
> +    public void testCodingErrorLookup() {
> +
> +        System.setProperty("tomcat.util.buf.StringCache.byte.enabled", "true");
> +
> +        Assert.assertTrue(StringCache.byteEnabled);
> +        StringCache sc = new StringCache();
> +        sc.reset();
> +
> +        for (int i = 0; i < StringCache.trainThreshold * 2; i++) {
> +            for (CodingErrorAction malformedInputAction : actions) {
> +                try {
> +                    // UTF-8 doesn't have any unmappable characters
> +                    INPUT_VALID.toString(malformedInputAction, CodingErrorAction.IGNORE);
> +                    INPUT_INVALID.toString(malformedInputAction, CodingErrorAction.IGNORE);
> +                } catch (CharacterCodingException e) {
> +                    // Ignore
> +                }
> +            }
> +        }
> +
> +        Assert.assertNotNull(StringCache.bcCache);
> +
> +        // Check the valid input is cached correctly
> +        for (CodingErrorAction malformedInputAction : actions) {
> +            try {
> +                Assert.assertEquals("ABCD", INPUT_VALID.toString(malformedInputAction, CodingErrorAction.IGNORE));
> +            } catch (CharacterCodingException e) {
> +                // Should not happen for valid input
> +                Assert.fail();
> +            }
> +        }
> +
> +        // Check the valid input is cached correctly
> +        try {
> +            Assert.assertEquals("ABCD", INPUT_INVALID.toString(CodingErrorAction.IGNORE, CodingErrorAction.IGNORE));
> +        } catch (CharacterCodingException e) {
> +            // Should not happen for invalid input with IGNORE
> +            Assert.fail();
> +        }
> +        try {
> +            Assert.assertEquals("AB\ufffdCD", INPUT_INVALID.toString(CodingErrorAction.REPLACE, CodingErrorAction.IGNORE));
> +        } catch (CharacterCodingException e) {
> +            // Should not happen for invalid input with REPLACE
> +            Assert.fail();
> +        }
> +        try {
> +            Assert.assertEquals("ABCD", INPUT_INVALID.toString(CodingErrorAction.REPORT, CodingErrorAction.IGNORE));
> +            // Should throw exception
> +            Assert.fail();
> +        } catch (CharacterCodingException e) {
> +            // Ignore
> +        }
> +
> +    }
> +}
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: dev-unsubscribe@tomcat.apache.org
> For additional commands, e-mail: dev-help@tomcat.apache.org
>

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@tomcat.apache.org
For additional commands, e-mail: dev-help@tomcat.apache.org