You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by js...@apache.org on 2024/02/26 22:34:55 UTC
(solr) branch branch_9x updated: SOLR-17181: Using apache commons implementation for wildcard matching for glob patterns (#2301)
This is an automated email from the ASF dual-hosted git repository.
jsweeney pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new 78187de26b1 SOLR-17181: Using apache commons implementation for wildcard matching for glob patterns (#2301)
78187de26b1 is described below
commit 78187de26b1bdfcb0ae340645d175abc46aa457c
Author: Justin Sweeney <ju...@fullstory.com>
AuthorDate: Mon Feb 26 17:34:01 2024 -0500
SOLR-17181: Using apache commons implementation for wildcard matching for glob patterns (#2301)
* Using apache commons implementation for wildcard matching for glob patterns
* Removing commons io dependency and instead reusing applicable code from that library
* Fixing comments with Javadoc links
* Actually fixing comments with Javadoc links
* Fixing javadoc comments
---
.../apache/solr/common/util/GlobPatternUtil.java | 163 ++++++++++++++++++++-
1 file changed, 157 insertions(+), 6 deletions(-)
diff --git a/solr/solrj/src/java/org/apache/solr/common/util/GlobPatternUtil.java b/solr/solrj/src/java/org/apache/solr/common/util/GlobPatternUtil.java
index 8b26ab5a355..32badc75c1c 100644
--- a/solr/solrj/src/java/org/apache/solr/common/util/GlobPatternUtil.java
+++ b/solr/solrj/src/java/org/apache/solr/common/util/GlobPatternUtil.java
@@ -16,22 +16,173 @@
*/
package org.apache.solr.common.util;
-import java.nio.file.FileSystems;
-import java.nio.file.Paths;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Deque;
/** Provides methods for matching glob patterns against input strings. */
public class GlobPatternUtil {
/**
- * Matches an input string against a provided glob patterns. This uses Java NIO FileSystems
- * PathMatcher to match glob patterns in the same way to how glob patterns are matches for file
- * paths, rather than implementing our own glob pattern matching.
+ * Matches an input string against a provided glob pattern, where '?' matches one character and
+ * '*' matches any (possibly empty) run of characters. The algorithm is reimplemented from Apache
+ * Commons IO FilenameUtils rather than bringing in commons-io as a dependency.
*
+ * @see <a
+ * href="https://commons.apache.org/proper/commons-io/apidocs/org/apache/commons/io/FilenameUtils.html#wildcardMatch(java.lang.String,java.lang.String)">This
+ * uses code from Apache Commons IO</a>
* @param pattern the glob pattern to match against
* @param input the input string to match against a glob pattern
* @return true if the input string matches the glob pattern, false otherwise
*/
public static boolean matches(String pattern, String input) {
- return FileSystems.getDefault().getPathMatcher("glob:" + pattern).matches(Paths.get(input));
+ if (input == null && pattern == null) { // two nulls are treated as a match
+ return true;
+ }
+ if (input == null || pattern == null) { // exactly one null never matches
+ return false;
+ }
+ final String[] wcs = splitOnTokens(pattern); // literal runs plus ? and * tokens, in pattern order
+ boolean anyChars = false; // true while a preceding * lets the next literal token float
+ int textIdx = 0; // current position in input
+ int wcsIdx = 0; // current position in wcs
+ final Deque<int[]> backtrack = new ArrayDeque<>(wcs.length); // entries: {token index, text index}
+
+ // each pass is one match attempt; later passes resume from a saved alternative position
+ do {
+ if (!backtrack.isEmpty()) {
+ final int[] array = backtrack.pop();
+ wcsIdx = array[0];
+ textIdx = array[1];
+ anyChars = true; // search for that token again, starting at the saved text index
+ }
+
+ // walk the remaining tokens; a break means this attempt failed
+ while (wcsIdx < wcs.length) {
+
+ if (wcs[wcsIdx].equals("?")) {
+ // ? consumes exactly one text character
+ textIdx++;
+ if (textIdx > input.length()) { // no character was left for this ? to consume
+ break;
+ }
+ anyChars = false;
+
+ } else if (wcs[wcsIdx].equals("*")) {
+ // * lets the next literal token match at any later position
+ anyChars = true;
+ if (wcsIdx == wcs.length - 1) { // a trailing * swallows the rest of the input
+ textIdx = input.length();
+ }
+
+ } else {
+ // literal text token
+ if (anyChars) {
+ // after a *, search forward for the first occurrence of the token
+ textIdx = checkIndexOf(input, textIdx, wcs[wcsIdx]);
+ if (textIdx == -1) {
+ // token not found
+ break;
+ }
+ final int repeat = checkIndexOf(input, textIdx + 1, wcs[wcsIdx]); // next occurrence, if any
+ if (repeat >= 0) {
+ backtrack.push(new int[] {wcsIdx, repeat}); // remember it as an alternative to retry
+ }
+ } else if (!input.regionMatches(false, textIdx, wcs[wcsIdx], 0, wcs[wcsIdx].length())) {
+ // no preceding *: the token must match case-sensitively at the current position
+ // it does not, so this attempt fails
+ break;
+ }
+
+ // matched text token, move text index to end of matched token
+ textIdx += wcs[wcsIdx].length();
+ anyChars = false;
+ }
+
+ wcsIdx++;
+ }
+
+ // success only if every token and every input character was consumed
+ if (wcsIdx == wcs.length && textIdx == input.length()) {
+ return true;
+ }
+
+ } while (!backtrack.isEmpty());
+
+ return false;
+ }
+
+ /**
+ * Splits a pattern into literal-text tokens and the wildcard tokens "?" and "*", in order of
+ * appearance. Consecutive '*' characters are collapsed into a single "*" token.
+ *
+ * @see <a
+ * href="https://commons.apache.org/proper/commons-io/apidocs/org/apache/commons/io/FilenameUtils.html">This
+ * uses code from Apache Commons IO</a>
+ * @param text the text to split
+ * @return the array of tokens, never null
+ */
+ private static String[] splitOnTokens(final String text) {
+ // used by matches(String, String) to tokenize the glob pattern
+ // fast path: text without any wildcard is returned as a single literal token
+
+ if (text.indexOf('?') == -1 && text.indexOf('*') == -1) {
+ return new String[] {text};
+ }
+
+ final char[] array = text.toCharArray();
+ final ArrayList<String> list = new ArrayList<>();
+ final StringBuilder buffer = new StringBuilder(); // accumulates the current literal run
+ char prevChar = 0; // NUL sentinel: no previous character yet
+ for (final char ch : array) {
+ if (ch == '?' || ch == '*') {
+ if (buffer.length() != 0) { // a wildcard ends the pending literal run
+ list.add(buffer.toString());
+ buffer.setLength(0);
+ }
+ if (ch == '?') {
+ list.add("?");
+ } else if (prevChar != '*') { // ch == '*' here; check if previous char was '*'
+ list.add("*");
+ }
+ } else {
+ buffer.append(ch);
+ }
+ prevChar = ch;
+ }
+ if (buffer.length() != 0) { // trailing literal run after the last wildcard
+ list.add(buffer.toString());
+ }
+
+ return list.toArray(new String[] {});
+ }
+
+ /**
+ * Finds the first index at or after a given start index where one string occurs in another,
+ * comparing case-sensitively.
+ *
+ * <p>This method mimics parts of {@link String#indexOf(String, int)}; the comparison is always
+ * case-sensitive because regionMatches is called with ignoreCase set to false.
+ *
+ * @see <a
+ * href="https://commons.apache.org/proper/commons-io/apidocs/org/apache/commons/io/FilenameUtils.html">This
+ * uses code from Apache Commons IO</a>
+ * @param str the string to check, not null
+ * @param strStartIndex the index to start at in str
+ * @param search the string to search for, not null
+ * @return the first index at or after strStartIndex where search occurs in str, or -1 if there
+ * is no such index
+ * @throws NullPointerException if either string is null
+ */
+ private static int checkIndexOf(final String str, final int strStartIndex, final String search) {
+ final int endIndex = str.length() - search.length(); // last index where search still fits in str
+ if (endIndex >= strStartIndex) {
+ for (int i = strStartIndex; i <= endIndex; i++) {
+ if (str.regionMatches(false, i, search, 0, search.length())) {
+ return i;
+ }
+ }
+ }
+ return -1;
}
}