You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by bo...@apache.org on 2017/07/05 15:34:25 UTC

[06/12] commons-compress git commit: Do better estimating of required buffer size for character encoding. If an unencodable character is found that requires output buffer expansion, scan buffer for all such characters, and attempt to expand buffer only once.

Do better estimating of required buffer size for character encoding. If an unencodable character is found that requires output buffer expansion, scan buffer for all such characters, and attempt to expand buffer only once.

Signed-off-by: Simon Spero <se...@gmail.com>

(cherry picked from commit aa30e21)
Signed-off-by: Simon Spero <se...@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/commons-compress/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-compress/commit/a67bdc01
Tree: http://git-wip-us.apache.org/repos/asf/commons-compress/tree/a67bdc01
Diff: http://git-wip-us.apache.org/repos/asf/commons-compress/diff/a67bdc01

Branch: refs/heads/master
Commit: a67bdc013c9fd965abaca375b9b47554a115f40e
Parents: db586ba
Author: Simon Spero <se...@gmail.com>
Authored: Sun Jun 18 18:55:38 2017 -0400
Committer: Stefan Bodewig <bo...@apache.org>
Committed: Wed Jul 5 16:30:00 2017 +0200

----------------------------------------------------------------------
 .../compress/archivers/zip/NioZipEncoding.java  | 109 ++++++++++++++++---
 .../archivers/zip/ZipEncodingHelper.java        |  10 ++
 .../compress/archivers/zip/ZipEncodingTest.java |   5 +-
 3 files changed, 104 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-compress/blob/a67bdc01/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java b/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java
index 6f0306b..fed597f 100644
--- a/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java
+++ b/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java
@@ -48,11 +48,11 @@ class NioZipEncoding implements ZipEncoding,HasCharset {
      * @param charset The NIO charset to wrap.
      */
     NioZipEncoding(final Charset charset) {
-        this.charset = charset;
+        this(charset, false);
     }
 
     NioZipEncoding(final Charset charset, boolean useReplacement) {
-        this(charset);
+        this.charset = charset;
         this.useReplacement = useReplacement;
 
     }
@@ -107,42 +107,115 @@ class NioZipEncoding implements ZipEncoding,HasCharset {
         final CharsetEncoder enc = newEncoder();
 
         final CharBuffer cb = CharBuffer.wrap(name);
-        int estimatedSize = (int) Math.ceil(name.length() * enc.averageBytesPerChar());
-        ByteBuffer out = ByteBuffer.allocate(estimatedSize);
+        CharBuffer tmp=null;
+        ByteBuffer out = ByteBuffer.allocate(estimateInitialBufferSize(enc, cb.remaining()));
 
         while (cb.remaining() > 0) {
-            final CoderResult res = enc.encode(cb, out,true);
+            final CoderResult res = enc.encode(cb, out, false);
 
             if (res.isUnmappable() || res.isMalformed()) {
 
                 // write the unmappable characters in utf-16
                 // pseudo-URL encoding style to ByteBuffer.
-                if (res.length() * 6 > out.remaining()) {
-                    out = ZipEncodingHelper.growBuffer(out, out.position()
-                                                       + res.length() * 6);
-                }
 
-                for (int i=0; i<res.length(); ++i) {
-                    ZipEncodingHelper.appendSurrogate(out,cb.get());
+                int spaceForSurrogate = estimateIncrementalEncodingSize(enc, (6 * res.length()));
+                if (spaceForSurrogate > out.remaining()) {
+                    // if the destination buffer isn't over sized, assume that the presence of one
+                    // unmappable character makes it likely that there will be more. Find all the
+                    // un-encoded characters and allocate space based on those estimates.
+                    int charCount = 0;
+                    for (int i = cb.position() ; i < cb.limit(); i++) {
+                        if (!enc.canEncode(cb.get(i))) {
+                            charCount+= 6;
+                        } else {
+                            charCount++;
+                        }
+                    }
+                    int totalExtraSpace = estimateIncrementalEncodingSize(enc, charCount);
+                    out = ZipEncodingHelper.growBufferBy(out, totalExtraSpace- out.remaining());
+                }
+                if(tmp == null) {
+                    tmp = CharBuffer.allocate(6);
+                }
+                for (int i = 0; i < res.length(); ++i) {
+                    out = encodeFully(enc, encodeSurrogate(tmp,cb.get()), out);
                 }
 
             } else if (res.isOverflow()) {
+                int increment = estimateIncrementalEncodingSize(enc, cb.remaining());
+                out = ZipEncodingHelper.growBufferBy(out, increment);
+            }
+        }
+        CoderResult coderResult = enc.encode(cb, out, true);
 
-                out = ZipEncodingHelper.growBuffer(out, 0);
+        if (!coderResult.isUnderflow()) {
+            throw new RuntimeException("unexpected coder result: " + coderResult);
+        }
 
-            } else if (res.isUnderflow()) {
+        out.limit(out.position());
+        out.rewind();
+        return out;
+    }
 
-                enc.flush(out);
+    private static ByteBuffer encodeFully(CharsetEncoder enc, CharBuffer cb, ByteBuffer out) {
+        while (cb.hasRemaining()) {
+            CoderResult result = enc.encode(cb, out, false);
+            if (result.isOverflow()) {
+                int increment = estimateIncrementalEncodingSize(enc, cb.remaining());
+                out = ZipEncodingHelper.growBufferBy(out, increment);
+            } else {
                 break;
-
             }
         }
-
-        out.limit(out.position());
-        out.rewind();
         return out;
     }
 
+    static char[] HEX_CHARS = new char[]{
+        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
+    };
+
+    private CharBuffer encodeSurrogate( CharBuffer cb,char c) {
+        cb.position(0).limit(6);
+        cb.put('%');
+        cb.put('U');
+
+        cb.put(HEX_CHARS[(c >> 12) & 0x0f]);
+        cb.put(HEX_CHARS[(c >> 8) & 0x0f]);
+        cb.put(HEX_CHARS[(c >> 4) & 0x0f]);
+        cb.put(HEX_CHARS[c & 0x0f]);
+        cb.flip();
+        return cb;
+    }
+
+    /**
+     * Estimate the initial encoded size (in bytes) for a character buffer.
+     * <p>
+     * The estimate assumes that one character uses the maximum length encoding,
+     * whilst the rest use an average size encoding. This accounts for any BOM for UTF-16, at
+     * the expense of a couple of extra bytes for UTF-8 encoded ASCII.
+     * </p>
+     *
+     * @param enc        encoder to use for estimates
+     * @param charChount number of characters in string
+     * @return estimated size in bytes.
+     */
+    private int estimateInitialBufferSize(CharsetEncoder enc, int charChount) {
+        float first = enc.maxBytesPerChar();
+        float rest = (charChount - 1) * enc.averageBytesPerChar();
+        return (int) Math.ceil(first + rest);
+    }
+
+    /**
+     * Estimate the size needed for remaining characters.
+     *
+     * @param enc       encoder to use for estimates
+     * @param charCount number of characters remaining
+     * @return estimated size in bytes.
+     */
+    private static int estimateIncrementalEncodingSize(CharsetEncoder enc, int charCount) {
+        return (int) Math.ceil(charCount * enc.averageBytesPerChar());
+    }
+
     /**
      * @see
      * ZipEncoding#decode(byte[])

http://git-wip-us.apache.org/repos/asf/commons-compress/blob/a67bdc01/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java b/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java
index 18ad103..f31d75c 100644
--- a/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java
+++ b/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java
@@ -136,4 +136,14 @@ public abstract class ZipEncodingHelper {
         }
         return false;
     }
+
+    static ByteBuffer growBufferBy(ByteBuffer buffer, int increment) {
+        buffer.limit(buffer.position());
+        buffer.rewind();
+
+        final ByteBuffer on = ByteBuffer.allocate(buffer.capacity() + increment);
+
+        on.put(buffer);
+        return on;
+    }
 }

http://git-wip-us.apache.org/repos/asf/commons-compress/blob/a67bdc01/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java b/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java
index f3e5127..ce0934f 100644
--- a/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java
+++ b/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java
@@ -180,8 +180,9 @@ public class ZipEncodingTest {
         assertFalse(enc.canEncode(UNENC_STRING));
         assertEquals("%U2016".getBytes(CharsetNames.US_ASCII), enc.encode(UNENC_STRING));
         assertFalse(enc.canEncode(BAD_STRING));
-        assertEquals(BAD_STRING_ENC.getBytes(CharsetNames.US_ASCII),
-                     enc.encode(BAD_STRING));
+        byte[] expected = BAD_STRING_ENC.getBytes(CharsetNames.US_ASCII);
+        ByteBuffer actual = enc.encode(BAD_STRING);
+        assertEquals(expected, actual);
     }
 
 }