You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2019/10/21 13:32:16 UTC

[GitHub] [spark] srowen commented on a change in pull request #26190: [SPARK-29532][SQL] simplify interval string parsing

srowen commented on a change in pull request #26190: [SPARK-29532][SQL] simplify interval string parsing
URL: https://github.com/apache/spark/pull/26190#discussion_r337018766
 
 

 ##########
 File path: common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java
 ##########
 @@ -32,94 +32,11 @@
   public static final long MICROS_PER_DAY = MICROS_PER_HOUR * 24;
   public static final long MICROS_PER_WEEK = MICROS_PER_DAY * 7;
 
-  /**
-   * A function to generate regex which matches interval string's unit part like "3 years".
-   *
-   * First, we can leave out some units in interval string, and we only care about the value of
-   * unit, so here we use non-capturing group to wrap the actual regex.
-   * At the beginning of the actual regex, we should match spaces before the unit part.
-   * Next is the number part, starts with an optional "-" to represent negative value. We use
-   * capturing group to wrap this part as we need the value later.
-   * Finally is the unit name, ends with an optional "s".
-   */
-  private static String unitRegex(String unit) {
-    return "(?:\\s+(-?\\d+)\\s+" + unit + "s?)?";
-  }
-
-  private static Pattern p = Pattern.compile("interval" + unitRegex("year") + unitRegex("month") +
-    unitRegex("week") + unitRegex("day") + unitRegex("hour") + unitRegex("minute") +
-    unitRegex("second") + unitRegex("millisecond") + unitRegex("microsecond"),
-    Pattern.CASE_INSENSITIVE);
-
-  private static Pattern yearMonthPattern =
-    Pattern.compile("^(?:['|\"])?([+|-])?(\\d+)-(\\d+)(?:['|\"])?$");
+  private static Pattern yearMonthPattern = Pattern.compile(
+    "^([+|-])?(\\d+)-(\\d+)$");
 
   private static Pattern dayTimePattern = Pattern.compile(
-    "^(?:['|\"])?([+|-])?((\\d+) )?((\\d+):)?(\\d+):(\\d+)(\\.(\\d+))?(?:['|\"])?$");
-
-  private static Pattern quoteTrimPattern = Pattern.compile("^(?:['|\"])?(.*?)(?:['|\"])?$");
-
-  private static long toLong(String s) {
-    if (s == null) {
-      return 0;
-    } else {
-      return Long.parseLong(s);
-    }
-  }
-
-  /**
-   * Convert a string to CalendarInterval. Return null if the input string is not a valid interval.
-   * This method is case-insensitive.
-   */
-  public static CalendarInterval fromString(String s) {
-    try {
-      return fromCaseInsensitiveString(s);
-    } catch (IllegalArgumentException e) {
-      return null;
-    }
-  }
-
-  /**
-   * Convert a string to CalendarInterval. This method can handle
-   * strings without the `interval` prefix and throws IllegalArgumentException
-   * when the input string is not a valid interval.
-   *
-   * @throws IllegalArgumentException if the string is not a valid internal.
-   */
-  public static CalendarInterval fromCaseInsensitiveString(String s) {
-    if (s == null) {
-      throw new IllegalArgumentException("Interval cannot be null");
-    }
-    String trimmed = s.trim();
-    if (trimmed.isEmpty()) {
-      throw new IllegalArgumentException("Interval cannot be blank");
-    }
-    String prefix = "interval";
-    String intervalStr = trimmed;
-    // Checks the given interval string does not start with the `interval` prefix
-    if (!intervalStr.regionMatches(true, 0, prefix, 0, prefix.length())) {
-      // Prepend `interval` if it does not present because
-      // the regular expression strictly require it.
-      intervalStr = prefix + " " + trimmed;
-    } else if (intervalStr.length() == prefix.length()) {
-      throw new IllegalArgumentException("Interval string must have time units");
-    }
-
-    Matcher m = p.matcher(intervalStr);
-    if (!m.matches()) {
-      throw new IllegalArgumentException("Invalid interval: " + s);
-    }
-
-    long months = toLong(m.group(1)) * 12 + toLong(m.group(2));
-    long microseconds = toLong(m.group(3)) * MICROS_PER_WEEK;
-    microseconds += toLong(m.group(4)) * MICROS_PER_DAY;
-    microseconds += toLong(m.group(5)) * MICROS_PER_HOUR;
-    microseconds += toLong(m.group(6)) * MICROS_PER_MINUTE;
-    microseconds += toLong(m.group(7)) * MICROS_PER_SECOND;
-    microseconds += toLong(m.group(8)) * MICROS_PER_MILLI;
-    microseconds += toLong(m.group(9));
-    return new CalendarInterval((int) months, microseconds);
-  }
+    "^([+|-])?((\\d+) )?((\\d+):)?(\\d+):(\\d+)(\\.(\\d+))?$");
 
 Review comment:
   Does this end up changing the parsing logic at all? for example I note this doesn't match quotes anymore

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org