You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mh...@apache.org on 2020/07/09 13:23:14 UTC

[lucene-solr] branch branch_8x updated: LUCENE-9386 add case insensitive RegExp matching option. (#1659)

This is an automated email from the ASF dual-hosted git repository.

mharwood pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new fec5d49  LUCENE-9386 add case insensitive RegExp matching option. (#1659)
fec5d49 is described below

commit fec5d49112ab23ce67792ddc821b1addcc8eea5d
Author: markharwood <ma...@gmail.com>
AuthorDate: Thu Jul 9 14:23:03 2020 +0100

    LUCENE-9386 add case insensitive RegExp matching option. (#1659)
    
    Backport of 887fe4c83d4114c6238265ca7f05aa491525af9d
---
 lucene/CHANGES.txt                                 |   3 +-
 .../java/org/apache/lucene/search/RegexpQuery.java |  38 +++-
 .../org/apache/lucene/util/automaton/RegExp.java   | 213 ++++++++++++++-------
 .../org/apache/lucene/search/TestRegexpQuery.java  |  11 ++
 .../apache/lucene/util/automaton/TestRegExp.java   | 150 +++++++++++++++
 5 files changed, 338 insertions(+), 77 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index fd7c95f..e20a788 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -11,7 +11,8 @@ API Changes
 
 New Features
 ---------------------
-(No changes)
+
+* LUCENE-9386: RegexpQuery: added a case insensitive matching option. (Mark Harwood)
 
 Improvements
 ---------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java b/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
index b3c8ad2..b241ac5 100644
--- a/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
@@ -83,7 +83,7 @@ public class RegexpQuery extends AutomatonQuery {
    * Constructs a query for terms matching <code>term</code>.
    * 
    * @param term regular expression.
-   * @param flags optional RegExp features from {@link RegExp}
+   * @param flags optional RegExp syntax features from {@link RegExp}
    * @param maxDeterminizedStates maximum number of states that compiling the
    *  automaton for the regexp can result in.  Set higher to allow more complex
    *  queries and lower to prevent memory exhaustion.
@@ -96,16 +96,46 @@ public class RegexpQuery extends AutomatonQuery {
    * Constructs a query for terms matching <code>term</code>.
    * 
    * @param term regular expression.
-   * @param flags optional RegExp features from {@link RegExp}
+   * @param syntax_flags optional RegExp syntax features from {@link RegExp}
+   * @param match_flags boolean 'or' of match behavior options such as case insensitivity
+   * @param maxDeterminizedStates maximum number of states that compiling the
+   *  automaton for the regexp can result in.  Set higher to allow more complex
+   *  queries and lower to prevent memory exhaustion.
+   */
+  public RegexpQuery(Term term, int syntax_flags, int match_flags, int maxDeterminizedStates) {
+    this(term, syntax_flags, match_flags, defaultProvider, maxDeterminizedStates);
+  }
+  
+  /**
+   * Constructs a query for terms matching <code>term</code>.
+   * 
+   * @param term regular expression.
+   * @param syntax_flags optional RegExp features from {@link RegExp}
+   * @param provider custom AutomatonProvider for named automata
+   * @param maxDeterminizedStates maximum number of states that compiling the
+   *  automaton for the regexp can result in.  Set higher to allow more complex
+   *  queries and lower to prevent memory exhaustion.
+   */
+  public RegexpQuery(Term term, int syntax_flags, AutomatonProvider provider,
+      int maxDeterminizedStates) {
+    this(term, syntax_flags, 0, provider, maxDeterminizedStates);
+  }
+  
+  /**
+   * Constructs a query for terms matching <code>term</code>.
+   * 
+   * @param term regular expression.
+   * @param syntax_flags optional RegExp features from {@link RegExp}
+   * @param match_flags boolean 'or' of match behavior options such as case insensitivity
    * @param provider custom AutomatonProvider for named automata
    * @param maxDeterminizedStates maximum number of states that compiling the
    *  automaton for the regexp can result in.  Set higher to allow more complex
    *  queries and lower to prevent memory exhaustion.
    */
-  public RegexpQuery(Term term, int flags, AutomatonProvider provider,
+  public RegexpQuery(Term term, int syntax_flags, int match_flags, AutomatonProvider provider,
       int maxDeterminizedStates) {
     super(term,
-          new RegExp(term.text(), flags).toAutomaton(
+          new RegExp(term.text(), syntax_flags, match_flags).toAutomaton(
                        provider, maxDeterminizedStates), maxDeterminizedStates);
   }
 
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
index 3956486..5186af1 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
@@ -32,6 +32,7 @@ package org.apache.lucene.util.automaton;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -354,6 +355,7 @@ public class RegExp {
     /** An expression for a pre-defined class e.g. \w */
   }
   
+  //-----  Syntax flags ( <= 0xff )  ------
   /**
    * Syntax flag, enables intersection (<tt>&amp;</tt>).
    */
@@ -388,13 +390,21 @@ public class RegExp {
   /**
    * Syntax flag, enables all optional regexp syntax.
    */
-  public static final int ALL = 0xffff;
+  public static final int ALL = 0xff;
   
   /**
    * Syntax flag, enables no optional regexp syntax.
    */
   public static final int NONE = 0x0000;
 
+  //-----  Matching flags ( > 0xff )  ------
+
+  /**
+   * Allows case insensitive matching of ASCII characters.
+   */
+  public static final int ASCII_CASE_INSENSITIVE = 0x0100;   
+  
+  
   //Immutable parsed state
   /**
    * The type of expression
@@ -423,7 +433,7 @@ public class RegExp {
 
   // Parser variables
   private final String originalString;
-  int flags;
+  final int flags;
   int pos;
     
   /**
@@ -448,10 +458,31 @@ public class RegExp {
    *              regular expression
    */
   public RegExp(String s, int syntax_flags) throws IllegalArgumentException {
+    this(s, syntax_flags, 0); // honor the caller's syntax flags; 0 = no match flags (case sensitive)
+  }
+  
+  /**
+   * Constructs new <code>RegExp</code> from a string.
+   * 
+   * @param s regexp string
+   * @param syntax_flags boolean 'or' of optional syntax constructs to be
+   *          enabled
+   * @param match_flags boolean 'or' of match behavior options such as case insensitivity
+   * @exception IllegalArgumentException if an error occurred while parsing the
+   *              regular expression
+   */
+  public RegExp(String s, int syntax_flags, int match_flags) throws IllegalArgumentException {
     originalString = s;
-    flags = syntax_flags;
+    if (syntax_flags >  ALL) {
+      throw new IllegalArgumentException("Illegal syntax flag");
+    }
+    
+    if (match_flags > 0 && match_flags <= ALL) {
+      throw new IllegalArgumentException("Illegal match flag");
+    }
+    flags = syntax_flags | match_flags;
     RegExp e;
-    if (s.length() == 0) e = makeString("");
+    if (s.length() == 0) e = makeString(flags, "");
     else {
       e = parseUnionExp();
       if (pos < originalString.length()) throw new IllegalArgumentException(
@@ -469,10 +500,10 @@ public class RegExp {
     to = e.to;
   }
   
-  RegExp(Kind kind, RegExp exp1, RegExp exp2, String s, int c, int min, int max, int digits, int from, int to){    
+  RegExp(int flags, Kind kind, RegExp exp1, RegExp exp2, String s, int c, int min, int max, int digits, int from, int to){
     this.originalString = null;
     this.kind = kind;
-    this.flags = 0;
+    this.flags = flags;
     this.exp1 = exp1;
     this.exp2 = exp2;
     this.s = s;
@@ -485,19 +516,19 @@ public class RegExp {
   }
 
   // Simplified construction of container nodes
-  static RegExp newContainerNode(Kind kind, RegExp exp1, RegExp exp2) {
-    return new RegExp(kind, exp1, exp2, null, 0, 0, 0, 0, 0, 0);
+  static RegExp newContainerNode(int flags, Kind kind, RegExp exp1, RegExp exp2) {
+    return new RegExp(flags, kind, exp1, exp2, null, 0, 0, 0, 0, 0, 0);
   }
 
   // Simplified construction of repeating nodes
-  static RegExp newRepeatingNode(Kind kind, RegExp exp,  int min, int max) {
-    return new RegExp(kind, exp, null, null, 0, min, max, 0, 0, 0);
+  static RegExp newRepeatingNode(int flags, Kind kind, RegExp exp,  int min, int max) {
+    return new RegExp(flags, kind, exp, null, null, 0, min, max, 0, 0, 0);
   }  
   
   
   // Simplified construction of leaf nodes
-  static RegExp newLeafNode(Kind kind, String s, int c, int min, int max, int digits, int from, int to) {
-    return new RegExp(kind, null, null, s, c, min, max, digits, from, to);
+  static RegExp newLeafNode(int flags, Kind kind, String s, int c, int min, int max, int digits, int from, int to) {
+    return new RegExp(flags, kind, null, null, s, c, min, max, digits, from, to);
   }  
 
   /**
@@ -648,7 +679,11 @@ public class RegExp {
         a = MinimizationOperations.minimize(a, maxDeterminizedStates);
         break;
       case REGEXP_CHAR:
-        a = Automata.makeChar(c);
+        if (check(ASCII_CASE_INSENSITIVE)) {
+          a = toCaseInsensitiveChar(c, maxDeterminizedStates);
+        } else {
+          a = Automata.makeChar(c);          
+        }
         break;
       case REGEXP_CHAR_RANGE:
         a = Automata.makeCharRange(from, to);
@@ -660,7 +695,11 @@ public class RegExp {
         a = Automata.makeEmpty();
         break;
       case REGEXP_STRING:
-        a = Automata.makeString(s);
+        if (check(ASCII_CASE_INSENSITIVE)) {
+          a = toCaseInsensitiveString(maxDeterminizedStates);
+        } else {
+          a = Automata.makeString(s);
+        }
         break;
       case REGEXP_ANYSTRING:
         a = Automata.makeAnyString();
@@ -689,6 +728,36 @@ public class RegExp {
     return a;
   }
   
+  
+  private Automaton toCaseInsensitiveChar(int codepoint, int maxDeterminizedStates) {
+    Automaton case1 = Automata.makeChar(codepoint);
+    // For now we only work with ASCII characters
+    if (codepoint > 128) {
+      return case1;
+    }
+    int altCase = Character.isLowerCase(codepoint) ? Character.toUpperCase(codepoint) : Character.toLowerCase(codepoint);
+    Automaton result;
+    if (altCase != codepoint) {
+      result = Operations.union(case1, Automata.makeChar(altCase));
+      result = MinimizationOperations.minimize(result, maxDeterminizedStates);          
+    } else {
+      result = case1;                      
+    }          
+    return result;
+  }
+  
+  private Automaton toCaseInsensitiveString(int maxDeterminizedStates) {
+    List<Automaton> list = new ArrayList<>();
+    
+    Iterator<Integer> iter = s.codePoints().iterator();
+    while (iter.hasNext()) {
+      list.add(toCaseInsensitiveChar(iter.next(), maxDeterminizedStates));
+    }
+    Automaton a = Operations.concatenate(list);
+    a = MinimizationOperations.minimize(a, maxDeterminizedStates);
+    return a;
+  }    
+  
   private void findLeaves(RegExp exp, Kind kind, List<Automaton> list,
       Map<String,Automaton> automata, AutomatonProvider automaton_provider,
       int maxDeterminizedStates) {
@@ -935,97 +1004,97 @@ public class RegExp {
     }
   }
   
-  static RegExp makeUnion(RegExp exp1, RegExp exp2) {
-    return newContainerNode(Kind.REGEXP_UNION, exp1, exp2);
+  static RegExp makeUnion(int flags, RegExp exp1, RegExp exp2) {
+    return newContainerNode(flags, Kind.REGEXP_UNION, exp1, exp2);
   }
   
-  static RegExp makeConcatenation(RegExp exp1, RegExp exp2) {
+  static RegExp makeConcatenation(int flags, RegExp exp1, RegExp exp2) {
     if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
         && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) return makeString(
-        exp1, exp2);
+            flags, exp1, exp2);
     RegExp rexp1, rexp2;
     if (exp1.kind == Kind.REGEXP_CONCATENATION
         && (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING)
         && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) {
       rexp1 = exp1.exp1;
-      rexp2 = makeString(exp1.exp2, exp2);
+      rexp2 = makeString(flags, exp1.exp2, exp2);
     } else if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
         && exp2.kind == Kind.REGEXP_CONCATENATION
         && (exp2.exp1.kind == Kind.REGEXP_CHAR || exp2.exp1.kind == Kind.REGEXP_STRING)) {
-      rexp1 = makeString(exp1, exp2.exp1);
+      rexp1 = makeString(flags, exp1, exp2.exp1);
       rexp2 = exp2.exp2;
     } else {
       rexp1 = exp1;
       rexp2 = exp2;
     }
-    return newContainerNode(Kind.REGEXP_CONCATENATION, rexp1, rexp2);
+    return newContainerNode(flags, Kind.REGEXP_CONCATENATION, rexp1, rexp2);
   }
   
-  static private RegExp makeString(RegExp exp1, RegExp exp2) {
+  static private RegExp makeString(int flags, RegExp exp1, RegExp exp2) {
     StringBuilder b = new StringBuilder();
     if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s);
     else b.appendCodePoint(exp1.c);
     if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s);
     else b.appendCodePoint(exp2.c);
-    return makeString(b.toString());
+    return makeString(flags, b.toString());
   }
   
-  static RegExp makeIntersection(RegExp exp1, RegExp exp2) {
-    return newContainerNode(Kind.REGEXP_INTERSECTION, exp1, exp2);
+  static RegExp makeIntersection(int flags, RegExp exp1, RegExp exp2) {
+    return newContainerNode(flags, Kind.REGEXP_INTERSECTION, exp1, exp2);
   }
   
-  static RegExp makeOptional(RegExp exp) {
-    return newContainerNode(Kind.REGEXP_OPTIONAL, exp, null);
+  static RegExp makeOptional(int flags, RegExp exp) {
+    return newContainerNode(flags, Kind.REGEXP_OPTIONAL, exp, null);
   }
   
-  static RegExp makeRepeat(RegExp exp) {
-    return newContainerNode(Kind.REGEXP_REPEAT, exp, null);
+  static RegExp makeRepeat(int flags, RegExp exp) {
+    return newContainerNode(flags, Kind.REGEXP_REPEAT, exp, null);
   }
   
-  static RegExp makeRepeat(RegExp exp, int min) {
-    return newRepeatingNode(Kind.REGEXP_REPEAT_MIN, exp, min, 0);
+  static RegExp makeRepeat(int flags, RegExp exp, int min) {
+    return newRepeatingNode(flags, Kind.REGEXP_REPEAT_MIN, exp, min, 0);
   }
   
-  static RegExp makeRepeat(RegExp exp, int min, int max) {
-    return newRepeatingNode(Kind.REGEXP_REPEAT_MINMAX, exp, min, max);
+  static RegExp makeRepeat(int flags, RegExp exp, int min, int max) {
+    return newRepeatingNode(flags, Kind.REGEXP_REPEAT_MINMAX, exp, min, max);
   }
   
-  static RegExp makeComplement(RegExp exp) {
-    return newContainerNode(Kind.REGEXP_COMPLEMENT, exp, null);
+  static RegExp makeComplement(int flags, RegExp exp) {
+    return newContainerNode(flags, Kind.REGEXP_COMPLEMENT, exp, null);
   }
   
-  static RegExp makeChar(int c) {
-    return newLeafNode(Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0);
+  static RegExp makeChar(int flags, int c) {
+    return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0);
   }
   
-  static RegExp makeCharRange(int from, int to) {
+  static RegExp makeCharRange(int flags, int from, int to) {
     if (from > to) 
       throw new IllegalArgumentException("invalid range: from (" + from + ") cannot be > to (" + to + ")");
-    return newLeafNode(Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, to);
+    return newLeafNode(flags, Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, to);
   }
   
-  static RegExp makeAnyChar() {
-    return newContainerNode(Kind.REGEXP_ANYCHAR, null, null);
+  static RegExp makeAnyChar(int flags) {
+    return newContainerNode(flags, Kind.REGEXP_ANYCHAR, null, null);
   }
   
-  static RegExp makeEmpty() {
-    return newContainerNode(Kind.REGEXP_EMPTY, null, null);
+  static RegExp makeEmpty(int flags) {
+    return newContainerNode(flags, Kind.REGEXP_EMPTY, null, null);
   }
   
-  static RegExp makeString(String s) {
-    return newLeafNode(Kind.REGEXP_STRING, s, 0, 0, 0, 0, 0, 0);
+  static RegExp makeString(int flags, String s) {
+    return newLeafNode(flags, Kind.REGEXP_STRING, s, 0, 0, 0, 0, 0, 0);
   }
   
-  static RegExp makeAnyString() {
-    return newContainerNode(Kind.REGEXP_ANYSTRING, null, null);
+  static RegExp makeAnyString(int flags) {
+    return newContainerNode(flags, Kind.REGEXP_ANYSTRING, null, null);
   }
   
-  static RegExp makeAutomaton(String s) {
-    return newLeafNode(Kind.REGEXP_AUTOMATON, s, 0, 0, 0, 0, 0, 0);
+  static RegExp makeAutomaton(int flags, String s) {
+    return newLeafNode(flags, Kind.REGEXP_AUTOMATON, s, 0, 0, 0, 0, 0, 0);
   }
   
-  static RegExp makeInterval(int min, int max, int digits) {
-  return newLeafNode(Kind.REGEXP_INTERVAL, null, 0, min, max, digits, 0, 0);
+  static RegExp makeInterval(int flags, int min, int max, int digits) {
+  return newLeafNode(flags, Kind.REGEXP_INTERVAL, null, 0, min, max, digits, 0, 0);
   }
   
   private boolean peek(String s) {
@@ -1058,13 +1127,13 @@ public class RegExp {
   
   final RegExp parseUnionExp() throws IllegalArgumentException {
     RegExp e = parseInterExp();
-    if (match('|')) e = makeUnion(e, parseUnionExp());
+    if (match('|')) e = makeUnion(flags, e, parseUnionExp());
     return e;
   }
   
   final RegExp parseInterExp() throws IllegalArgumentException {
     RegExp e = parseConcatExp();
-    if (check(INTERSECTION) && match('&')) e = makeIntersection(e,
+    if (check(INTERSECTION) && match('&')) e = makeIntersection(flags, e,
         parseInterExp());
     return e;
   }
@@ -1072,16 +1141,16 @@ public class RegExp {
   final RegExp parseConcatExp() throws IllegalArgumentException {
     RegExp e = parseRepeatExp();
     if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))) e = makeConcatenation(
-        e, parseConcatExp());
+        flags, e, parseConcatExp());
     return e;
   }
   
   final RegExp parseRepeatExp() throws IllegalArgumentException {
     RegExp e = parseComplExp();
     while (peek("?*+{")) {
-      if (match('?')) e = makeOptional(e);
-      else if (match('*')) e = makeRepeat(e);
-      else if (match('+')) e = makeRepeat(e, 1);
+      if (match('?')) e = makeOptional(flags, e);
+      else if (match('*')) e = makeRepeat(flags, e);
+      else if (match('+')) e = makeRepeat(flags, e, 1);
       else if (match('{')) {
         int start = pos;
         while (peek("0123456789"))
@@ -1099,15 +1168,15 @@ public class RegExp {
         } else m = n;
         if (!match('}')) throw new IllegalArgumentException(
             "expected '}' at position " + pos);
-        if (m == -1) e = makeRepeat(e, n);
-        else e = makeRepeat(e, n, m);
+        if (m == -1) e = makeRepeat(flags, e, n);
+        else e = makeRepeat(flags, e, n, m);
       }
     }
     return e;
   }
   
   final RegExp parseComplExp() throws IllegalArgumentException {
-    if (check(COMPLEMENT) && match('~')) return makeComplement(parseComplExp());
+    if (check(COMPLEMENT) && match('~')) return makeComplement(flags, parseComplExp());
     else return parseCharClassExp();
   }
   
@@ -1116,7 +1185,7 @@ public class RegExp {
       boolean negate = false;
       if (match('^')) negate = true;
       RegExp e = parseCharClasses();
-      if (negate) e = makeIntersection(makeAnyChar(), makeComplement(e));
+      if (negate) e = makeIntersection(flags, makeAnyChar(flags), makeComplement(flags, e));
       if (!match(']')) throw new IllegalArgumentException(
           "expected ']' at position " + pos);
       return e;
@@ -1126,29 +1195,29 @@ public class RegExp {
   final RegExp parseCharClasses() throws IllegalArgumentException {
     RegExp e = parseCharClass();
     while (more() && !peek("]"))
-      e = makeUnion(e, parseCharClass());
+      e = makeUnion(flags, e, parseCharClass());
     return e;
   }
   
   final RegExp parseCharClass() throws IllegalArgumentException {
     int c = parseCharExp();
-    if (match('-')) return makeCharRange(c, parseCharExp());
-    else return makeChar(c);
+    if (match('-')) return makeCharRange(flags, c, parseCharExp());
+    else return makeChar(flags, c);
   }
   
   final RegExp parseSimpleExp() throws IllegalArgumentException {
-    if (match('.')) return makeAnyChar();
-    else if (check(EMPTY) && match('#')) return makeEmpty();
-    else if (check(ANYSTRING) && match('@')) return makeAnyString();
+    if (match('.')) return makeAnyChar(flags);
+    else if (check(EMPTY) && match('#')) return makeEmpty(flags);
+    else if (check(ANYSTRING) && match('@')) return makeAnyString(flags);
     else if (match('"')) {
       int start = pos;
       while (more() && !peek("\""))
         next();
       if (!match('"')) throw new IllegalArgumentException(
           "expected '\"' at position " + pos);
-      return makeString(originalString.substring(start, pos - 1));
+      return makeString(flags, originalString.substring(start, pos - 1));
     } else if (match('(')) {
-      if (match(')')) return makeString("");
+      if (match(')')) return makeString(flags, "");
       RegExp e = parseUnionExp();
       if (!match(')')) throw new IllegalArgumentException(
           "expected ')' at position " + pos);
@@ -1164,7 +1233,7 @@ public class RegExp {
       if (i == -1) {
         if (!check(AUTOMATON)) throw new IllegalArgumentException(
             "interval syntax error at position " + (pos - 1));
-        return makeAutomaton(s);
+        return makeAutomaton(flags, s);
       } else {
         if (!check(INTERVAL)) throw new IllegalArgumentException(
             "illegal identifier at position " + (pos - 1));
@@ -1182,13 +1251,13 @@ public class RegExp {
             imin = imax;
             imax = t;
           }
-          return makeInterval(imin, imax, digits);
+          return makeInterval(flags, imin, imax, digits);
         } catch (NumberFormatException e) {
           throw new IllegalArgumentException(
               "interval syntax error at position " + (pos - 1));
         }
       }
-    } else return makeChar(parseCharExp());
+    } else return makeChar(flags, parseCharExp());
   }
   
   final int parseCharExp() throws IllegalArgumentException {
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
index 6a8e183..9d21c9a 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
@@ -73,6 +73,12 @@ public class TestRegexpQuery extends LuceneTestCase {
     return searcher.count(query);
   }
   
+  private long caseInsensitiveRegexQueryNrHits(String regex) throws IOException {
+    RegexpQuery query = new RegexpQuery(newTerm(regex), RegExp.ALL, RegExp.ASCII_CASE_INSENSITIVE,
+        Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+    return searcher.count(query);
+  }  
+  
   public void testRegex1() throws IOException {
     assertEquals(1, regexQueryNrHits("q.[aeiou]c.*"));
   }
@@ -90,6 +96,11 @@ public class TestRegexpQuery extends LuceneTestCase {
     assertEquals(0, regexQueryNrHits("<493433-600000>"));
   }
   
+  public void testCaseInsensitive() throws IOException {
+    assertEquals(0, regexQueryNrHits("Quick"));
+    assertEquals(1, caseInsensitiveRegexQueryNrHits("Quick"));
+  }  
+  
   public void testRegexComplement() throws IOException {
     assertEquals(1, regexQueryNrHits("4934~[3]"));
     // not the empty lang, i.e. match all docs
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
index 7d24939..51cbe53 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
@@ -17,8 +17,12 @@
 package org.apache.lucene.util.automaton;
 
 
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
 public class TestRegExp extends LuceneTestCase {
 
   /**
@@ -83,4 +87,150 @@ public class TestRegExp extends LuceneTestCase {
     a = new RegExp("#?").toAutomaton(1000);
     assertTrue(a.toString().length() > 0);
   }
+  
+  boolean caseSensitiveQuery = true;
+  
+  public void testCoreJavaParity() {
+    // Generate random doc values and random regular expressions
+    // and check for same matching behaviour as Java's Pattern class.
+    for (int i = 0; i < 1000; i++) {
+      caseSensitiveQuery = true;      
+      checkRandomExpression(randomDocValue(1 + random().nextInt(30)));
+    }        
+  }
+  
+  static String randomDocValue(int minLength) {
+    String charPalette = "AAAaaaBbbCccc123456 \t";
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < minLength; i++) {
+      sb.append(charPalette.charAt(randomInt(charPalette.length() - 1)));
+    }
+    return sb.toString();
+  }
+
+  private static int randomInt(int bound) {
+    return bound == 0 ? 0 : random().nextInt(bound);
+  }
+
+  protected String checkRandomExpression(String docValue) {
+    // Generate and test a random regular expression which should match the given docValue
+    StringBuilder result = new StringBuilder();
+    // Pick a part of the string to change
+    int substitutionPoint = randomInt(docValue.length() - 1);
+    int substitutionLength = 1 + randomInt(Math.min(10, docValue.length() - substitutionPoint));
+
+    // Add any head to the result, unchanged
+    if (substitutionPoint > 0) {
+      result.append(docValue.substring(0, substitutionPoint));
+    }
+
+    // Modify the middle...
+    String replacementPart = docValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
+    int mutation = random().nextInt(12);
+    switch (mutation) {
+      case 0:
+        // OR with random alpha of same length
+        result.append("(" + replacementPart + "|d" + randomDocValue(replacementPart.length()) + ")");
+        break;
+      case 1:
+        // OR with non-existent value
+        result.append("(" + replacementPart + "|doesnotexist)");
+        break;
+      case 2:
+        // OR with another randomised regex (used to create nested levels of expression).
+        result.append("(" + checkRandomExpression(replacementPart) + "|doesnotexist)");
+        break;
+      case 3:
+        // Star-replace all ab sequences.
+        result.append(replacementPart.replaceAll("ab", ".*"));
+        break;
+      case 4:
+        // .-replace all b chars
+        result.append(replacementPart.replaceAll("b", "."));
+        break;
+      case 5:
+        // length-limited stars {1,2}
+        result.append(".{1," + replacementPart.length() + "}");
+        break;
+      case 6:
+        // replace all chars with .
+        result.append(replacementPart.replaceAll(".", "."));
+        break;
+      case 7:
+        // OR with uppercase chars, e.g. [aA] (many of these sorts of expressions exist in the wild)
+        char[] chars = replacementPart.toCharArray();
+        for (char c : chars) {
+          result.append("[" + c + Character.toUpperCase(c) + "]");
+        }
+        break;
+      case 8:
+        // NOT a character - replace all b's with "not a"
+        result.append(replacementPart.replaceAll("b", "[^a]"));
+        break;
+      case 9:
+        // Make whole part repeatable 1 or more times
+        result.append("(" + replacementPart + ")+");
+        break;
+      case 10:
+        // Make whole part optional (repeatable 0 or 1 times)
+        result.append("(" + replacementPart + ")?");
+        break;
+      case 11:
+        // Switch case of characters
+        StringBuilder switchedCase = new StringBuilder();
+        replacementPart.codePoints().forEach(
+            p -> {
+              int switchedP = p;
+              if (Character.isLowerCase(p)) {
+                switchedP = Character.toUpperCase(p);
+              } else {
+                switchedP = Character.toLowerCase(p);                
+              }
+              switchedCase.appendCodePoint(switchedP);
+              if (p != switchedP) {
+                caseSensitiveQuery = false;
+              }
+            }
+        );        
+        result.append(switchedCase.toString());
+        break;
+      default:
+        break;
+    }
+    // add any remaining tail, unchanged
+    if (substitutionPoint + substitutionLength <= docValue.length() - 1) {
+      result.append(docValue.substring(substitutionPoint + substitutionLength));
+    }
+
+    String regexPattern = result.toString();
+    // Assert our randomly generated regex actually matches the provided raw input using java's expression matcher
+    Pattern pattern = caseSensitiveQuery ? Pattern.compile(regexPattern): 
+                                           Pattern.compile(regexPattern, Pattern.CASE_INSENSITIVE); 
+                                             ;
+    Matcher matcher = pattern.matcher(docValue);
+    assertTrue("Java regex " + regexPattern + " did not match doc value " + docValue, matcher.matches());
+
+    int matchFlags = caseSensitiveQuery ? 0 : RegExp.ASCII_CASE_INSENSITIVE;
+    RegExp regex =  new RegExp(regexPattern, RegExp.ALL, matchFlags);
+    Automaton automaton = regex.toAutomaton();
+    ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
+    BytesRef br = new BytesRef(docValue);
+    assertTrue(
+        "[" + regexPattern + "]should match [" + docValue + "]" + substitutionPoint + "-" + substitutionLength + "/"
+            + docValue.length(),
+        bytesMatcher.run(br.bytes, br.offset, br.length)
+    );
+    if (caseSensitiveQuery == false) {
+      RegExp caseSensitiveRegex = new RegExp(regexPattern);
+      Automaton csAutomaton = caseSensitiveRegex.toAutomaton();
+      ByteRunAutomaton csBytesMatcher = new ByteRunAutomaton(csAutomaton);
+      assertFalse(
+          "[" + regexPattern + "] with case sensitive setting should not match [" + docValue + "]", 
+          csBytesMatcher.run(br.bytes, br.offset, br.length)
+      );
+      
+    }
+    return regexPattern;
+  }
+  
 }