You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by js...@apache.org on 2024/02/26 22:34:55 UTC

(solr) branch branch_9x updated: SOLR-17181: Using apache commons implementation for wildcard matching for glob patterns (#2301)

This is an automated email from the ASF dual-hosted git repository.

jsweeney pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new 78187de26b1 SOLR-17181: Using apache commons implementation for wildcard matching for glob patterns (#2301)
78187de26b1 is described below

commit 78187de26b1bdfcb0ae340645d175abc46aa457c
Author: Justin Sweeney <ju...@fullstory.com>
AuthorDate: Mon Feb 26 17:34:01 2024 -0500

    SOLR-17181: Using apache commons implementation for wildcard matching for glob patterns (#2301)
    
    * Using apache commons implementation for wildcard matching for glob patterns
    
    * Removing commons io dependency and instead reusing applicable code from that library
    
    * Fixing comments with Javadoc links
    
    * Actually fixing comments with Javadoc links
    
    * Fixing javadoc comments
---
 .../apache/solr/common/util/GlobPatternUtil.java   | 163 ++++++++++++++++++++-
 1 file changed, 157 insertions(+), 6 deletions(-)

diff --git a/solr/solrj/src/java/org/apache/solr/common/util/GlobPatternUtil.java b/solr/solrj/src/java/org/apache/solr/common/util/GlobPatternUtil.java
index 8b26ab5a355..32badc75c1c 100644
--- a/solr/solrj/src/java/org/apache/solr/common/util/GlobPatternUtil.java
+++ b/solr/solrj/src/java/org/apache/solr/common/util/GlobPatternUtil.java
@@ -16,22 +16,173 @@
  */
 package org.apache.solr.common.util;
 
-import java.nio.file.FileSystems;
-import java.nio.file.Paths;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Deque;
 
 /** Provides methods for matching glob patterns against input strings. */
 public class GlobPatternUtil {
 
   /**
-   * Matches an input string against a provided glob patterns. This uses Java NIO FileSystems
-   * PathMatcher to match glob patterns in the same way to how glob patterns are matches for file
-   * paths, rather than implementing our own glob pattern matching.
+   * Matches an input string against a provided glob pattern, where '?' matches a single character
+   * and '*' matches zero or more characters. This reuses the wildcard matching code from Apache
+   * Commons IO FilenameUtils rather than bringing in commons-io as a dependency.
    *
+   * @see <a
+   *     href="https://commons.apache.org/proper/commons-io/apidocs/org/apache/commons/io/FilenameUtils.html#wildcardMatch(java.lang.String,java.lang.String)">This
+   *     uses code from Apache Commons IO</a>
    * @param pattern the glob pattern to match against
    * @param input the input string to match against a glob pattern
    * @return true if the input string matches the glob pattern, false otherwise
    */
   public static boolean matches(String pattern, String input) {
-    return FileSystems.getDefault().getPathMatcher("glob:" + pattern).matches(Paths.get(input));
+    if (input == null && pattern == null) {
+      return true;
+    }
+    if (input == null || pattern == null) {
+      return false;
+    }
+    final String[] wcs = splitOnTokens(pattern);
+    boolean anyChars = false;
+    int textIdx = 0;
+    int wcsIdx = 0;
+    final Deque<int[]> backtrack = new ArrayDeque<>(wcs.length);
+
+    // loop around a backtrack stack, to handle complex * matching
+    do {
+      if (!backtrack.isEmpty()) {
+        final int[] array = backtrack.pop();
+        wcsIdx = array[0];
+        textIdx = array[1];
+        anyChars = true;
+      }
+
+      // loop whilst tokens and text left to process
+      while (wcsIdx < wcs.length) {
+
+        if (wcs[wcsIdx].equals("?")) {
+          // ? so move to next text char
+          textIdx++;
+          if (textIdx > input.length()) {
+            break;
+          }
+          anyChars = false;
+
+        } else if (wcs[wcsIdx].equals("*")) {
+          // set any chars status
+          anyChars = true;
+          if (wcsIdx == wcs.length - 1) {
+            textIdx = input.length();
+          }
+
+        } else {
+          // matching text token
+          if (anyChars) {
+            // any chars then try to locate text token
+            textIdx = checkIndexOf(input, textIdx, wcs[wcsIdx]);
+            if (textIdx == -1) {
+              // token not found
+              break;
+            }
+            final int repeat = checkIndexOf(input, textIdx + 1, wcs[wcsIdx]);
+            if (repeat >= 0) {
+              backtrack.push(new int[] {wcsIdx, repeat});
+            }
+          } else if (!input.regionMatches(false, textIdx, wcs[wcsIdx], 0, wcs[wcsIdx].length())) {
+            // matching from current position
+            // couldn't match token
+            break;
+          }
+
+          // matched text token, move text index to end of matched token
+          textIdx += wcs[wcsIdx].length();
+          anyChars = false;
+        }
+
+        wcsIdx++;
+      }
+
+      // full match
+      if (wcsIdx == wcs.length && textIdx == input.length()) {
+        return true;
+      }
+
+    } while (!backtrack.isEmpty());
+
+    return false;
+  }
+
+  /**
+   * Splits a string into a number of tokens. The text is split by '?' and '*'. Where multiple '*'
+   * occur consecutively they are collapsed into a single '*'.
+   *
+   * @see <a
+   *     href="https://commons.apache.org/proper/commons-io/apidocs/org/apache/commons/io/FilenameUtils.html">This
+   *     uses code from Apache Commons IO</a>
+   * @param text the text to split
+   * @return the array of tokens, never null
+   */
+  private static String[] splitOnTokens(final String text) {
+    // used by matches(String, String)
+    // private helper; the caller has already rejected a null pattern
+
+    if (text.indexOf('?') == -1 && text.indexOf('*') == -1) {
+      return new String[] {text};
+    }
+
+    final char[] array = text.toCharArray();
+    final ArrayList<String> list = new ArrayList<>();
+    final StringBuilder buffer = new StringBuilder();
+    char prevChar = 0;
+    for (final char ch : array) {
+      if (ch == '?' || ch == '*') {
+        if (buffer.length() != 0) {
+          list.add(buffer.toString());
+          buffer.setLength(0);
+        }
+        if (ch == '?') {
+          list.add("?");
+        } else if (prevChar != '*') { // ch == '*' here; check if previous char was '*'
+          list.add("*");
+        }
+      } else {
+        buffer.append(ch);
+      }
+      prevChar = ch;
+    }
+    if (buffer.length() != 0) {
+      list.add(buffer.toString());
+    }
+
+    return list.toArray(new String[] {});
+  }
+
+  /**
+   * Finds the first occurrence of one string within another, starting at a specific index. The
+   * comparison is always case-sensitive.
+   *
+   * <p>This method mimics parts of {@link String#indexOf(String, int)}; the Commons IO original
+   * additionally takes a case-sensitivity rule, which is not used here.
+   *
+   * @see <a
+   *     href="https://commons.apache.org/proper/commons-io/apidocs/org/apache/commons/io/FilenameUtils.html">This
+   *     uses code from Apache Commons IO</a>
+   * @param str the string to check, not null
+   * @param strStartIndex the index to start at in str
+   * @param search the string to search for, not null
+   * @return the first index of the search String, or -1 if there is no match
+   * @throws NullPointerException if either string is null
+   * @since 2.0
+   */
+  private static int checkIndexOf(final String str, final int strStartIndex, final String search) {
+    final int endIndex = str.length() - search.length();
+    if (endIndex >= strStartIndex) {
+      for (int i = strStartIndex; i <= endIndex; i++) {
+        if (str.regionMatches(false, i, search, 0, search.length())) {
+          return i;
+        }
+      }
+    }
+    return -1;
   }
 }