You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by gg...@apache.org on 2022/06/14 22:06:16 UTC
[commons-io] branch master updated: Refactor internals for better Unicode processing
This is an automated email from the ASF dual-hosted git repository.
ggregory pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-io.git
The following commit(s) were added to refs/heads/master by this push:
new dcd4f550 Refactor internals for better Unicode processing
dcd4f550 is described below
commit dcd4f5505e3aab7957029adaa3d2aa365ca9e612
Author: Gary Gregory <ga...@gmail.com>
AuthorDate: Tue Jun 14 18:06:10 2022 -0400
Refactor internals for better Unicode processing
Add FileSystem.getIllegalFileNameCodePoints()
---
src/changes/changes.xml | 3 +
.../java/org/apache/commons/io/FileSystem.java | 199 +++++++++++----------
2 files changed, 105 insertions(+), 97 deletions(-)
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 31122869..733c273d 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -371,6 +371,9 @@ The <action> type attribute can be add,update,fix,remove.
<action dev="ggregory" type="add" due-to="Gary Gregory">
Add PathUtils.touch(Path).
</action>
+ <action dev="ggregory" type="add" due-to="Gary Gregory">
+ Add FileSystem.getIllegalFileNameCodePoints().
+ </action>
<!-- UPDATE -->
<action dev="kinow" type="update" due-to="Dependabot, Gary Gregory">
Bump actions/cache from 2.1.6 to 3.0.4 #307, #337.
diff --git a/src/main/java/org/apache/commons/io/FileSystem.java b/src/main/java/org/apache/commons/io/FileSystem.java
index 95f8bbe1..d42db03c 100644
--- a/src/main/java/org/apache/commons/io/FileSystem.java
+++ b/src/main/java/org/apache/commons/io/FileSystem.java
@@ -36,12 +36,12 @@ public enum FileSystem {
/**
* Generic file system.
*/
- GENERIC(false, false, Integer.MAX_VALUE, Integer.MAX_VALUE, new char[] { 0 }, new String[] {}, false, false, '/'),
+ GENERIC(false, false, Integer.MAX_VALUE, Integer.MAX_VALUE, new int[] { 0 }, new String[] {}, false, false, '/'),
/**
* Linux file system.
*/
- LINUX(true, true, 255, 4096, new char[] {
+ LINUX(true, true, 255, 4096, new int[] {
// KEEP THIS ARRAY SORTED!
// @formatter:off
// ASCII NUL
@@ -53,7 +53,7 @@ public enum FileSystem {
/**
* MacOS file system.
*/
- MAC_OSX(true, true, 255, 1024, new char[] {
+ MAC_OSX(true, true, 255, 1024, new int[] {
// KEEP THIS ARRAY SORTED!
// @formatter:off
// ASCII NUL
@@ -77,7 +77,7 @@ public enum FileSystem {
* CreateFileA function - Consoles (microsoft.com)</a>
*/
WINDOWS(false, true, 255,
- 32000, new char[] {
+ 32000, new int[] {
// KEEP THIS ARRAY SORTED!
// @formatter:off
// ASCII NUL
@@ -193,6 +193,79 @@ public enum FileSystem {
}
}
+ /**
+ * Copied from Apache Commons Lang CharSequenceUtils.
+ *
+ * Returns the index within {@code cs} of the first occurrence of the
+ * specified character, starting the search at the specified index.
+ * <p>
+ * If a character with value {@code searchChar} occurs in the
+ * character sequence represented by the {@code cs}
+ * object at an index no smaller than {@code start}, then
+ * the index of the first such occurrence is returned. For values
+ * of {@code searchChar} in the range from 0 to 0xFFFF (inclusive),
+ * this is the smallest value <i>k</i> such that:
+ * </p>
+ * <blockquote><pre>
+ * (this.charAt(<i>k</i>) == searchChar) &amp;&amp; (<i>k</i> &gt;= start)
+ * </pre></blockquote>
+ * is true. For other values of {@code searchChar}, it is the
+ * smallest value <i>k</i> such that:
+ * <blockquote><pre>
+ * (this.codePointAt(<i>k</i>) == searchChar) &amp;&amp; (<i>k</i> &gt;= start)
+ * </pre></blockquote>
+ * <p>
+ * is true. In either case, if no such character occurs in {@code cs}
+ * at or after position {@code start}, then
+ * {@code -1} is returned.
+ * </p>
+ * <p>
+ * There is no restriction on the value of {@code start}. If it
+ * is negative, it has the same effect as if it were zero: the entire
+ * {@code CharSequence} may be searched. If it is greater than
+ * the length of {@code cs}, it has the same effect as if it were
+ * equal to the length of {@code cs}: {@code -1} is returned.
+ * </p>
+ * <p>All indices are specified in {@code char} values
+ * (Unicode code units).
+ * </p>
+ *
+ * @param cs the {@code CharSequence} to be processed, not null
+ * @param searchChar the char to be searched for
+ * @param start the start index, negative starts at the string start
+ * @return the index where the search char was found, -1 if not found
+ * @since 3.6 updated to behave more like {@code String}
+ */
+ private static int indexOf(final CharSequence cs, final int searchChar, int start) {
+ if (cs instanceof String) {
+ return ((String) cs).indexOf(searchChar, start);
+ }
+ final int sz = cs.length();
+ if (start < 0) {
+ start = 0;
+ }
+ if (searchChar < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
+ for (int i = start; i < sz; i++) {
+ if (cs.charAt(i) == searchChar) {
+ return i;
+ }
+ }
+ return -1;
+ }
+ //supplementary characters (LANG1300)
+ if (searchChar <= Character.MAX_CODE_POINT) {
+ final char[] chars = Character.toChars(searchChar);
+ for (int i = start; i < sz - 1; i++) {
+ final char high = cs.charAt(i);
+ final char low = cs.charAt(i + 1);
+ if (high == chars[0] && low == chars[1]) {
+ return i;
+ }
+ }
+ }
+ return -1;
+ }
+
/**
* Decides if the operating system matches.
* <p>
@@ -223,16 +296,16 @@ public enum FileSystem {
private static String replace(final String path, final char oldChar, final char newChar) {
return path == null ? null : path.replace(oldChar, newChar);
}
-
private final boolean casePreserving;
private final boolean caseSensitive;
- private final char[] illegalFileNameChars;
+ private final int[] illegalFileNameChars;
private final int maxFileNameLength;
private final int maxPathLength;
private final String[] reservedFileNames;
private final boolean reservedFileNamesExtensions;
private final boolean supportsDriveLetter;
private final char nameSeparator;
+
private final char nameSeparatorOther;
/**
@@ -249,7 +322,7 @@ public enum FileSystem {
* @param nameSeparator The name separator, '\\' on Windows, '/' on Linux.
*/
FileSystem(final boolean caseSensitive, final boolean casePreserving, final int maxFileLength,
- final int maxPathLength, final char[] illegalFileNameChars, final String[] reservedFileNames,
+ final int maxPathLength, final int[] illegalFileNameChars, final String[] reservedFileNames,
final boolean reservedFileNamesExtensions, final boolean supportsDriveLetter, final char nameSeparator) {
this.maxFileNameLength = maxFileLength;
this.maxPathLength = maxPathLength;
@@ -269,6 +342,20 @@ public enum FileSystem {
* @return the illegal characters for this file system.
*/
public char[] getIllegalFileNameChars() {
+ final char[] chars = new char[illegalFileNameChars.length];
+ for (int i = 0; i < illegalFileNameChars.length; i++) {
+ chars[i] = (char) illegalFileNameChars[i];
+ }
+ return chars;
+ }
+
+ /**
+ * Gets a cloned copy of the illegal code points for this file system.
+ *
+ * @return the illegal code points for this file system.
+ * @since 2.12.0
+ */
+ public int[] getIllegalFileNameCodePoints() {
return this.illegalFileNameChars.clone();
}
@@ -335,7 +422,7 @@ public enum FileSystem {
* the character to test
* @return {@code true} if the given character is illegal in a file name, {@code false} otherwise.
*/
- private boolean isIllegalFileNameChar(final char c) {
+ private boolean isIllegalFileNameChar(final int c) {
return Arrays.binarySearch(illegalFileNameChars, c) >= 0;
}
@@ -355,7 +442,7 @@ public enum FileSystem {
if (isReservedFileName(candidate)) {
return false;
}
- return candidate.chars().noneMatch(i -> isIllegalFileNameChar((char) i));
+ return candidate.chars().noneMatch(this::isIllegalFileNameChar);
}
/**
@@ -411,95 +498,13 @@ public enum FileSystem {
*/
public String toLegalFileName(final String candidate, final char replacement) {
if (isIllegalFileNameChar(replacement)) {
- throw new IllegalArgumentException(
- String.format("The replacement character '%s' cannot be one of the %s illegal characters: %s",
- // %s does not work properly with NUL
- replacement == '\0' ? "\\0" : replacement, name(), Arrays.toString(illegalFileNameChars)));
- }
- final String truncated = candidate.length() > maxFileNameLength ? candidate.substring(0, maxFileNameLength)
- : candidate;
- boolean changed = false;
- final char[] charArray = truncated.toCharArray();
- for (int i = 0; i < charArray.length; i++) {
- if (isIllegalFileNameChar(charArray[i])) {
- charArray[i] = replacement;
- changed = true;
- }
- }
- return changed ? String.valueOf(charArray) : truncated;
- }
-
- /**
- * Copied from Apache Commons Lang CharSequenceUtils.
- *
- * Returns the index within {@code cs} of the first occurrence of the
- * specified character, starting the search at the specified index.
- * <p>
- * If a character with value {@code searchChar} occurs in the
- * character sequence represented by the {@code cs}
- * object at an index no smaller than {@code start}, then
- * the index of the first such occurrence is returned. For values
- * of {@code searchChar} in the range from 0 to 0xFFFF (inclusive),
- * this is the smallest value <i>k</i> such that:
- * </p>
- * <blockquote><pre>
- * (this.charAt(<i>k</i>) == searchChar) &amp;&amp; (<i>k</i> &gt;= start)
- * </pre></blockquote>
- * is true. For other values of {@code searchChar}, it is the
- * smallest value <i>k</i> such that:
- * <blockquote><pre>
- * (this.codePointAt(<i>k</i>) == searchChar) &amp;&amp; (<i>k</i> &gt;= start)
- * </pre></blockquote>
- * <p>
- * is true. In either case, if no such character occurs in {@code cs}
- * at or after position {@code start}, then
- * {@code -1} is returned.
- * </p>
- * <p>
- * There is no restriction on the value of {@code start}. If it
- * is negative, it has the same effect as if it were zero: the entire
- * {@code CharSequence} may be searched. If it is greater than
- * the length of {@code cs}, it has the same effect as if it were
- * equal to the length of {@code cs}: {@code -1} is returned.
- * </p>
- * <p>All indices are specified in {@code char} values
- * (Unicode code units).
- * </p>
- *
- * @param cs the {@code CharSequence} to be processed, not null
- * @param searchChar the char to be searched for
- * @param start the start index, negative starts at the string start
- * @return the index where the search char was found, -1 if not found
- * @since 3.6 updated to behave more like {@code String}
- */
- private static int indexOf(final CharSequence cs, final int searchChar, int start) {
- if (cs instanceof String) {
- return ((String) cs).indexOf(searchChar, start);
+ // %s does not work properly with NUL
+ throw new IllegalArgumentException(String.format("The replacement character '%s' cannot be one of the %s illegal characters: %s",
+ replacement == '\0' ? "\\0" : replacement, name(), Arrays.toString(illegalFileNameChars)));
}
- final int sz = cs.length();
- if (start < 0) {
- start = 0;
- }
- if (searchChar < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
- for (int i = start; i < sz; i++) {
- if (cs.charAt(i) == searchChar) {
- return i;
- }
- }
- return -1;
- }
- //supplementary characters (LANG1300)
- if (searchChar <= Character.MAX_CODE_POINT) {
- final char[] chars = Character.toChars(searchChar);
- for (int i = start; i < sz - 1; i++) {
- final char high = cs.charAt(i);
- final char low = cs.charAt(i + 1);
- if (high == chars[0] && low == chars[1]) {
- return i;
- }
- }
- }
- return -1;
+ final String truncated = candidate.length() > maxFileNameLength ? candidate.substring(0, maxFileNameLength) : candidate;
+ final int[] array = truncated.chars().map(i -> isIllegalFileNameChar(i) ? replacement : i).toArray();
+ return new String(array, 0, array.length);
}
CharSequence trimExtension(final CharSequence cs) {