You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by gg...@apache.org on 2022/06/14 22:06:16 UTC

[commons-io] branch master updated: Refactor internals for better Unicode processing

This is an automated email from the ASF dual-hosted git repository.

ggregory pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-io.git


The following commit(s) were added to refs/heads/master by this push:
     new dcd4f550 Refactor internals for better Unicode processing
dcd4f550 is described below

commit dcd4f5505e3aab7957029adaa3d2aa365ca9e612
Author: Gary Gregory <ga...@gmail.com>
AuthorDate: Tue Jun 14 18:06:10 2022 -0400

    Refactor internals for better Unicode processing
    
    Add FileSystem.getIllegalFileNameCodePoints()
---
 src/changes/changes.xml                            |   3 +
 .../java/org/apache/commons/io/FileSystem.java     | 199 +++++++++++----------
 2 files changed, 105 insertions(+), 97 deletions(-)

diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 31122869..733c273d 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -371,6 +371,9 @@ The <action> type attribute can be add,update,fix,remove.
       <action dev="ggregory" type="add" due-to="Gary Gregory">
         Add PathUtils.touch(Path).
       </action>
+      <action dev="ggregory" type="add" due-to="Gary Gregory">
+        Add FileSystem.getIllegalFileNameCodePoints().
+      </action>
       <!-- UPDATE -->
       <action dev="kinow" type="update" due-to="Dependabot, Gary Gregory">
         Bump actions/cache from 2.1.6 to 3.0.4 #307, #337.
diff --git a/src/main/java/org/apache/commons/io/FileSystem.java b/src/main/java/org/apache/commons/io/FileSystem.java
index 95f8bbe1..d42db03c 100644
--- a/src/main/java/org/apache/commons/io/FileSystem.java
+++ b/src/main/java/org/apache/commons/io/FileSystem.java
@@ -36,12 +36,12 @@ public enum FileSystem {
     /**
      * Generic file system.
      */
-    GENERIC(false, false, Integer.MAX_VALUE, Integer.MAX_VALUE, new char[] { 0 }, new String[] {}, false, false, '/'),
+    GENERIC(false, false, Integer.MAX_VALUE, Integer.MAX_VALUE, new int[] { 0 }, new String[] {}, false, false, '/'),
 
     /**
      * Linux file system.
      */
-    LINUX(true, true, 255, 4096, new char[] {
+    LINUX(true, true, 255, 4096, new int[] {
             // KEEP THIS ARRAY SORTED!
             // @formatter:off
             // ASCII NUL
@@ -53,7 +53,7 @@ public enum FileSystem {
     /**
      * MacOS file system.
      */
-    MAC_OSX(true, true, 255, 1024, new char[] {
+    MAC_OSX(true, true, 255, 1024, new int[] {
             // KEEP THIS ARRAY SORTED!
             // @formatter:off
             // ASCII NUL
@@ -77,7 +77,7 @@ public enum FileSystem {
      *      CreateFileA function - Consoles (microsoft.com)</a>
      */
     WINDOWS(false, true, 255,
-            32000, new char[] {
+            32000, new int[] {
                     // KEEP THIS ARRAY SORTED!
                     // @formatter:off
                     // ASCII NUL
@@ -193,6 +193,79 @@ public enum FileSystem {
         }
     }
 
+    /**
+     * Copied from Apache Commons Lang CharSequenceUtils.
+     *
+     * Returns the index within {@code cs} of the first occurrence of the
+     * specified character, starting the search at the specified index.
+     * <p>
+     * If a character with value {@code searchChar} occurs in the
+     * character sequence represented by the {@code cs}
+     * object at an index no smaller than {@code start}, then
+     * the index of the first such occurrence is returned. For values
+     * of {@code searchChar} in the range from 0 to 0xFFFF (inclusive),
+     * this is the smallest value <i>k</i> such that:
+     * </p>
+     * <blockquote><pre>
+     * (this.charAt(<i>k</i>) == searchChar) &amp;&amp; (<i>k</i> &gt;= start)
+     * </pre></blockquote>
+     * is true. For other values of {@code searchChar}, it is the
+     * smallest value <i>k</i> such that:
+     * <blockquote><pre>
+     * (this.codePointAt(<i>k</i>) == searchChar) &amp;&amp; (<i>k</i> &gt;= start)
+     * </pre></blockquote>
+     * <p>
+     * is true. In either case, if no such character occurs in {@code cs}
+     * at or after position {@code start}, then
+     * {@code -1} is returned.
+     * </p>
+     * <p>
+     * There is no restriction on the value of {@code start}. If it
+     * is negative, it has the same effect as if it were zero: the entire
+     * {@code CharSequence} may be searched. If it is greater than
+     * the length of {@code cs}, it has the same effect as if it were
+     * equal to the length of {@code cs}: {@code -1} is returned.
+     * </p>
+     * <p>All indices are specified in {@code char} values
+     * (Unicode code units).
+     * </p>
+     *
+     * @param cs  the {@code CharSequence} to be processed, not null
+     * @param searchChar  the char to be searched for
+     * @param start  the start index, negative starts at the string start
+     * @return the index where the search char was found, -1 if not found
+     * @since 3.6 updated to behave more like {@code String}
+     */
+    private static int indexOf(final CharSequence cs, final int searchChar, int start) {
+        if (cs instanceof String) {
+            return ((String) cs).indexOf(searchChar, start);
+        }
+        final int sz = cs.length();
+        if (start < 0) {
+            start = 0;
+        }
+        if (searchChar < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
+            for (int i = start; i < sz; i++) {
+                if (cs.charAt(i) == searchChar) {
+                    return i;
+                }
+            }
+            return -1;
+        }
+        //supplementary characters (LANG1300)
+        if (searchChar <= Character.MAX_CODE_POINT) {
+            final char[] chars = Character.toChars(searchChar);
+            for (int i = start; i < sz - 1; i++) {
+                final char high = cs.charAt(i);
+                final char low = cs.charAt(i + 1);
+                if (high == chars[0] && low == chars[1]) {
+                    return i;
+                }
+            }
+        }
+        return -1;
+    }
+
     /**
      * Decides if the operating system matches.
      * <p>
@@ -223,16 +296,16 @@ public enum FileSystem {
     private static String replace(final String path, final char oldChar, final char newChar) {
         return path == null ? null : path.replace(oldChar, newChar);
     }
-
     private final boolean casePreserving;
     private final boolean caseSensitive;
-    private final char[] illegalFileNameChars;
+    private final int[] illegalFileNameChars;
     private final int maxFileNameLength;
     private final int maxPathLength;
     private final String[] reservedFileNames;
     private final boolean reservedFileNamesExtensions;
     private final boolean supportsDriveLetter;
     private final char nameSeparator;
+
     private final char nameSeparatorOther;
 
     /**
@@ -249,7 +322,7 @@ public enum FileSystem {
      * @param nameSeparator The name separator, '\\' on Windows, '/' on Linux.
      */
     FileSystem(final boolean caseSensitive, final boolean casePreserving, final int maxFileLength,
-        final int maxPathLength, final char[] illegalFileNameChars, final String[] reservedFileNames,
+        final int maxPathLength, final int[] illegalFileNameChars, final String[] reservedFileNames,
         final boolean reservedFileNamesExtensions, final boolean supportsDriveLetter, final char nameSeparator) {
         this.maxFileNameLength = maxFileLength;
         this.maxPathLength = maxPathLength;
@@ -269,6 +342,20 @@ public enum FileSystem {
      * @return the illegal characters for this file system.
      */
     public char[] getIllegalFileNameChars() {
+        final char[] chars = new char[illegalFileNameChars.length];
+        for (int i = 0; i < illegalFileNameChars.length; i++) {
+            chars[i] = (char) illegalFileNameChars[i];
+        }
+        return chars;
+    }
+
+    /**
+     * Gets a cloned copy of the illegal code points for this file system.
+     *
+     * @return the illegal code points for this file system.
+     * @since 2.12.0
+     */
+    public int[] getIllegalFileNameCodePoints() {
         return this.illegalFileNameChars.clone();
     }
 
@@ -335,7 +422,7 @@ public enum FileSystem {
      *            the character to test
      * @return {@code true} if the given character is illegal in a file name, {@code false} otherwise.
      */
-    private boolean isIllegalFileNameChar(final char c) {
+    private boolean isIllegalFileNameChar(final int c) {
         return Arrays.binarySearch(illegalFileNameChars, c) >= 0;
     }
 
@@ -355,7 +442,7 @@ public enum FileSystem {
         if (isReservedFileName(candidate)) {
             return false;
         }
-        return candidate.chars().noneMatch(i -> isIllegalFileNameChar((char) i));
+        return candidate.chars().noneMatch(this::isIllegalFileNameChar);
     }
 
     /**
@@ -411,95 +498,13 @@ public enum FileSystem {
      */
     public String toLegalFileName(final String candidate, final char replacement) {
         if (isIllegalFileNameChar(replacement)) {
-            throw new IllegalArgumentException(
-                    String.format("The replacement character '%s' cannot be one of the %s illegal characters: %s",
-                            // %s does not work properly with NUL
-                            replacement == '\0' ? "\\0" : replacement, name(), Arrays.toString(illegalFileNameChars)));
-        }
-        final String truncated = candidate.length() > maxFileNameLength ? candidate.substring(0, maxFileNameLength)
-                : candidate;
-        boolean changed = false;
-        final char[] charArray = truncated.toCharArray();
-        for (int i = 0; i < charArray.length; i++) {
-            if (isIllegalFileNameChar(charArray[i])) {
-                charArray[i] = replacement;
-                changed = true;
-            }
-        }
-        return changed ? String.valueOf(charArray) : truncated;
-    }
-
-    /**
-     * Copied from Apache Commons Lang CharSequenceUtils.
-     *
-     * Returns the index within {@code cs} of the first occurrence of the
-     * specified character, starting the search at the specified index.
-     * <p>
-     * If a character with value {@code searchChar} occurs in the
-     * character sequence represented by the {@code cs}
-     * object at an index no smaller than {@code start}, then
-     * the index of the first such occurrence is returned. For values
-     * of {@code searchChar} in the range from 0 to 0xFFFF (inclusive),
-     * this is the smallest value <i>k</i> such that:
-     * </p>
-     * <blockquote><pre>
-     * (this.charAt(<i>k</i>) == searchChar) &amp;&amp; (<i>k</i> &gt;= start)
-     * </pre></blockquote>
-     * is true. For other values of {@code searchChar}, it is the
-     * smallest value <i>k</i> such that:
-     * <blockquote><pre>
-     * (this.codePointAt(<i>k</i>) == searchChar) &amp;&amp; (<i>k</i> &gt;= start)
-     * </pre></blockquote>
-     * <p>
-     * is true. In either case, if no such character occurs inm {@code cs}
-     * at or after position {@code start}, then
-     * {@code -1} is returned.
-     * </p>
-     * <p>
-     * There is no restriction on the value of {@code start}. If it
-     * is negative, it has the same effect as if it were zero: the entire
-     * {@code CharSequence} may be searched. If it is greater than
-     * the length of {@code cs}, it has the same effect as if it were
-     * equal to the length of {@code cs}: {@code -1} is returned.
-     * </p>
-     * <p>All indices are specified in {@code char} values
-     * (Unicode code units).
-     * </p>
-     *
-     * @param cs  the {@code CharSequence} to be processed, not null
-     * @param searchChar  the char to be searched for
-     * @param start  the start index, negative starts at the string start
-     * @return the index where the search char was found, -1 if not found
-     * @since 3.6 updated to behave more like {@code String}
-     */
-    private static int indexOf(final CharSequence cs, final int searchChar, int start) {
-        if (cs instanceof String) {
-            return ((String) cs).indexOf(searchChar, start);
+            // %s does not work properly with NUL
+            throw new IllegalArgumentException(String.format("The replacement character '%s' cannot be one of the %s illegal characters: %s",
+                replacement == '\0' ? "\\0" : replacement, name(), Arrays.toString(illegalFileNameChars)));
         }
-        final int sz = cs.length();
-        if (start < 0) {
-            start = 0;
-        }
-        if (searchChar < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
-            for (int i = start; i < sz; i++) {
-                if (cs.charAt(i) == searchChar) {
-                    return i;
-                }
-            }
-            return -1;
-        }
-        //supplementary characters (LANG1300)
-        if (searchChar <= Character.MAX_CODE_POINT) {
-            final char[] chars = Character.toChars(searchChar);
-            for (int i = start; i < sz - 1; i++) {
-                final char high = cs.charAt(i);
-                final char low = cs.charAt(i + 1);
-                if (high == chars[0] && low == chars[1]) {
-                    return i;
-                }
-            }
-        }
-        return -1;
+        final String truncated = candidate.length() > maxFileNameLength ? candidate.substring(0, maxFileNameLength) : candidate;
+        final int[] array = truncated.chars().map(i -> isIllegalFileNameChar(i) ? replacement : i).toArray();
+        return new String(array, 0, array.length);
     }
 
     CharSequence trimExtension(final CharSequence cs) {