You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2010/08/19 13:20:42 UTC

svn commit: r987129 - in /lucene/dev/trunk/lucene/contrib: ./ queries/src/java/org/apache/lucene/search/regex/ queries/src/test/org/apache/lucene/search/regex/

Author: rmuir
Date: Thu Aug 19 11:20:42 2010
New Revision: 987129

URL: http://svn.apache.org/viewvc?rev=987129&view=rev
Log:
LUCENE-2606: optimize contrib/regex for flex

Modified:
    lucene/dev/trunk/lucene/contrib/CHANGES.txt
    lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java
    lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java
    lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexCapabilities.java
    lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java
    lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java
    lucene/dev/trunk/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestJakartaRegexpCapabilities.java

Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=987129&r1=987128&r2=987129&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Thu Aug 19 11:20:42 2010
@@ -23,6 +23,12 @@ New Features
   * LUCENE-2479: Added ability to provide a sort comparator for spelling suggestions along
     with two implementations.  The existing comparator (score, then frequency) is the default (Grant Ingersoll)
 
+API Changes
+
+  * LUCENE-2606: Changed RegexCapabilities interface to fix thread 
+    safety, serialization, and performance problems. If you have
+    written a custom RegexCapabilities it will need to be updated
+    to the new API.  (Robert Muir, Uwe Schindler)
 ======================= Lucene 3.x (not yet released) =======================
 
 Changes in backwards compatibility policy

Modified: lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java?rev=987129&r1=987128&r2=987129&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java (original)
+++ lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java Thu Aug 19 11:20:42 2010
@@ -17,6 +17,9 @@ package org.apache.lucene.search.regex;
  * limitations under the License.
  */
 
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.regexp.CharacterIterator;
 import org.apache.regexp.RE;
 import org.apache.regexp.REProgram;
 import java.lang.reflect.Field;
@@ -30,8 +33,6 @@ import java.lang.reflect.Method;
  * it doesn't always provide a prefix even if one would exist.
  */
 public class JakartaRegexpCapabilities implements RegexCapabilities {
-  private RE regexp;
-
   private static Field prefixField;
   private static Method getPrefixMethod;
   static {
@@ -79,45 +80,75 @@ public class JakartaRegexpCapabilities i
     this.flags = flags;
   }
   
-  public void compile(String pattern) {
-    regexp = new RE(pattern, this.flags);
+  public RegexCapabilities.RegexMatcher compile(String regex) {
+    return new JakartaRegexMatcher(regex, flags);
   }
 
-  public boolean match(String string) {
-    return regexp.match(string);
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = 1;
+    result = prime * result + flags;
+    return result;
   }
 
-  public String prefix() {
-    try {
-      final char[] prefix;
-      if (getPrefixMethod != null) {
-        prefix = (char[]) getPrefixMethod.invoke(regexp.getProgram());
-      } else if (prefixField != null) {
-        prefix = (char[]) prefixField.get(regexp.getProgram());
-      } else {
-        return null;
-      }
-      return prefix == null ? null : new String(prefix);
-    } catch (Exception e) {
-      // if we cannot get the prefix, return none
-      return null;
-    }
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) return true;
+    if (obj == null) return false;
+    if (getClass() != obj.getClass()) return false;
+    JakartaRegexpCapabilities other = (JakartaRegexpCapabilities) obj;
+    if (flags != other.flags) return false;
+    return true;
   }
 
-  @Override
-  public boolean equals(Object o) {
-    if (this == o) return true;
-    if (o == null || getClass() != o.getClass()) return false;
+  class JakartaRegexMatcher implements RegexCapabilities.RegexMatcher {
+    private RE regexp;
+    private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+    private final CharacterIterator utf16wrapper = new CharacterIterator() {
 
-    final JakartaRegexpCapabilities that = (JakartaRegexpCapabilities) o;
+      public char charAt(int pos) {
+        return utf16.result[pos];
+      }
 
-    if (regexp != null ? !regexp.equals(that.regexp) : that.regexp != null) return false;
+      public boolean isEnd(int pos) {
+        return pos >= utf16.length;
+      }
 
-    return true;
-  }
+      public String substring(int beginIndex) {
+        return substring(beginIndex, utf16.length);
+      }
 
-  @Override
-  public int hashCode() {
-    return (regexp != null ? regexp.hashCode() : 0);
+      public String substring(int beginIndex, int endIndex) {
+        return new String(utf16.result, beginIndex, endIndex - beginIndex);
+      }
+      
+    };
+    
+    public JakartaRegexMatcher(String regex, int flags) {
+      regexp = new RE(regex, flags);
+    }
+    
+    public boolean match(BytesRef term) {
+      UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
+      return regexp.match(utf16wrapper, 0);
+    }
+
+    public String prefix() {
+      try {
+        final char[] prefix;
+        if (getPrefixMethod != null) {
+          prefix = (char[]) getPrefixMethod.invoke(regexp.getProgram());
+        } else if (prefixField != null) {
+          prefix = (char[]) prefixField.get(regexp.getProgram());
+        } else {
+          return null;
+        }
+        return prefix == null ? null : new String(prefix);
+      } catch (Exception e) {
+        // if we cannot get the prefix, return none
+        return null;
+      }
+    }
   }
 }

Modified: lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java?rev=987129&r1=987128&r2=987129&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java (original)
+++ lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java Thu Aug 19 11:20:42 2010
@@ -17,8 +17,12 @@ package org.apache.lucene.search.regex;
  * limitations under the License.
  */
 
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
+
 /**
  * An implementation tying Java's built-in java.util.regex to RegexQuery.
  *
@@ -27,9 +31,8 @@ import java.util.regex.Pattern;
  * attempt to {@link #match} each term for the specified field in the index.
  */
 public class JavaUtilRegexCapabilities implements RegexCapabilities {
-  private Pattern pattern;
   private int flags = 0;
-  
+
   // Define the optional flags from Pattern that can be used.
   // Do this here to keep Pattern contained within this class.
   
@@ -66,32 +69,59 @@ public class JavaUtilRegexCapabilities i
     this.flags = flags;
   }
   
-  public void compile(String pattern) {
-    this.pattern = Pattern.compile(pattern, this.flags);
+  public RegexCapabilities.RegexMatcher compile(String regex) {
+    return new JavaUtilRegexMatcher(regex, flags);
   }
-
-  public boolean match(String string) {
-    return pattern.matcher(string).matches();
-  }
-
-  public String prefix() {
-    return null;
+  
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = 1;
+    result = prime * result + flags;
+    return result;
   }
 
   @Override
-  public boolean equals(Object o) {
-    if (this == o) return true;
-    if (o == null || getClass() != o.getClass()) return false;
-
-    final JavaUtilRegexCapabilities that = (JavaUtilRegexCapabilities) o;
-
-    if (pattern != null ? !pattern.equals(that.pattern) : that.pattern != null) return false;
-
+  public boolean equals(Object obj) {
+    if (this == obj) return true;
+    if (obj == null) return false;
+    if (getClass() != obj.getClass()) return false;
+    JavaUtilRegexCapabilities other = (JavaUtilRegexCapabilities) obj;
+    if (flags != other.flags) return false;
     return true;
   }
 
-  @Override
-  public int hashCode() {
-    return (pattern != null ? pattern.hashCode() : 0);
+  class JavaUtilRegexMatcher implements RegexCapabilities.RegexMatcher {
+    private final Pattern pattern;
+    private final Matcher matcher;
+    private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+    private final CharSequence utf16wrapper = new CharSequence() {
+
+      public int length() {
+        return utf16.length;
+      }
+
+      public char charAt(int index) {
+        return utf16.result[index];
+      }
+
+      public CharSequence subSequence(int start, int end) {
+        return new String(utf16.result, start, end - start);
+      }  
+    };
+    
+    public JavaUtilRegexMatcher(String regex, int flags) {
+      this.pattern = Pattern.compile(regex, flags);
+      this.matcher = this.pattern.matcher(utf16wrapper);
+    }
+    
+    public boolean match(BytesRef term) {
+      UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
+      return matcher.reset().matches();
+    }
+
+    public String prefix() {
+      return null;
+    }
   }
 }

Modified: lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexCapabilities.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexCapabilities.java?rev=987129&r1=987128&r2=987129&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexCapabilities.java (original)
+++ lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexCapabilities.java Thu Aug 19 11:20:42 2010
@@ -1,5 +1,9 @@
 package org.apache.lucene.search.regex;
 
+import java.io.Serializable;
+
+import org.apache.lucene.util.BytesRef;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -21,7 +25,7 @@ package org.apache.lucene.search.regex;
  * Defines basic operations needed by {@link RegexQuery} for a regular
  * expression implementation.
  */
-public interface RegexCapabilities {
+public interface RegexCapabilities extends Serializable {
   /**
    * Called by the constructor of {@link RegexTermEnum} allowing
    * implementations to cache a compiled version of the regular
@@ -29,20 +33,22 @@ public interface RegexCapabilities {
    *
    * @param pattern regular expression pattern
    */
-  void compile(String pattern);
+  public RegexMatcher compile(String pattern);
 
-  /**
-   *
-   * @param string
-   * @return true if string matches the pattern last passed to {@link #compile}.
-   */
-  boolean match(String string);
+  public interface RegexMatcher {
+    /**
+     *
+     * @param string
+     * @return true if string matches the pattern last passed to {@link #compile}.
+     */
+    public boolean match(BytesRef term);
 
-  /**
-   * A wise prefix implementation can reduce the term enumeration (and thus increase performance)
-   * of RegexQuery dramatically!
-   *
-   * @return static non-regex prefix of the pattern last passed to {@link #compile}.  May return null.
-   */
-  String prefix();
+    /**
+     * A wise prefix implementation can reduce the term enumeration (and thus increase performance)
+     * of RegexQuery dramatically!
+     *
+     * @return static non-regex prefix of the pattern last passed to {@link #compile}.  May return null.
+     */
+    public String prefix();
+  }
 }

Modified: lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java?rev=987129&r1=987128&r2=987129&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java (original)
+++ lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java Thu Aug 19 11:20:42 2010
@@ -76,23 +76,27 @@ public class RegexQuery extends MultiTer
     return buffer.toString();
   }
 
-  /* generated by IntelliJ IDEA */
-  @Override
-  public boolean equals(Object o) {
-    if (this == o) return true;
-    if (o == null || getClass() != o.getClass()) return false;
-    if (!super.equals(o)) return false;
-
-    final RegexQuery that = (RegexQuery) o;
-
-    return regexImpl.equals(that.regexImpl);
-  }
-
-  /* generated by IntelliJ IDEA */
   @Override
   public int hashCode() {
+    final int prime = 31;
     int result = super.hashCode();
-    result = 29 * result + regexImpl.hashCode();
+    result = prime * result + ((regexImpl == null) ? 0 : regexImpl.hashCode());
+    result = prime * result + ((term == null) ? 0 : term.hashCode());
     return result;
   }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) return true;
+    if (!super.equals(obj)) return false;
+    if (getClass() != obj.getClass()) return false;
+    RegexQuery other = (RegexQuery) obj;
+    if (regexImpl == null) {
+      if (other.regexImpl != null) return false;
+    } else if (!regexImpl.equals(other.regexImpl)) return false;
+    if (term == null) {
+      if (other.term != null) return false;
+    } else if (!term.equals(other.term)) return false;
+    return true;
+  }
 }

Modified: lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java?rev=987129&r1=987128&r2=987129&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java (original)
+++ lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java Thu Aug 19 11:20:42 2010
@@ -34,15 +34,13 @@ import java.io.IOException;
  */
 
 public class RegexTermsEnum extends FilteredTermsEnum {
-  private RegexCapabilities regexImpl;
+  private RegexCapabilities.RegexMatcher regexImpl;
   private final BytesRef prefixRef;
 
-  public RegexTermsEnum(IndexReader reader, Term term, RegexCapabilities regexImpl) throws IOException {
+  public RegexTermsEnum(IndexReader reader, Term term, RegexCapabilities regexCap) throws IOException {
     super(reader, term.field());
     String text = term.text();
-    this.regexImpl = regexImpl;
-
-    regexImpl.compile(text);
+    this.regexImpl = regexCap.compile(text);
 
     String pre = regexImpl.prefix();
     if (pre == null) pre = "";
@@ -55,8 +53,7 @@ public class RegexTermsEnum extends Filt
     if (term.startsWith(prefixRef)) {
       // TODO: set BoostAttr based on distance of
       // searchTerm.text() and term().text()
-      String text = term.utf8ToString();
-      return regexImpl.match(text) ? AcceptStatus.YES : AcceptStatus.NO;
+      return regexImpl.match(term) ? AcceptStatus.YES : AcceptStatus.NO;
     } else {
       return AcceptStatus.NO;
     }

Modified: lucene/dev/trunk/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestJakartaRegexpCapabilities.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestJakartaRegexpCapabilities.java?rev=987129&r1=987128&r2=987129&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestJakartaRegexpCapabilities.java (original)
+++ lucene/dev/trunk/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestJakartaRegexpCapabilities.java Thu Aug 19 11:20:42 2010
@@ -17,6 +17,7 @@ package org.apache.lucene.search.regex;
  * limitations under the License.
  */
 
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 
 /**
@@ -26,21 +27,21 @@ public class TestJakartaRegexpCapabiliti
 
   public void testGetPrefix(){
     JakartaRegexpCapabilities cap = new JakartaRegexpCapabilities();
-    cap.compile("luc[e]?");
-    assertTrue(cap.match("luce"));
-    assertEquals("luc", cap.prefix());
+    RegexCapabilities.RegexMatcher matcher = cap.compile("luc[e]?");
+    assertTrue(matcher.match(new BytesRef("luce")));
+    assertEquals("luc", matcher.prefix());
     
-    cap.compile("lucene");
-    assertTrue(cap.match("lucene"));
-    assertEquals("lucene", cap.prefix());
+    matcher = cap.compile("lucene");
+    assertTrue(matcher.match(new BytesRef("lucene")));
+    assertEquals("lucene", matcher.prefix());
   }
   
   public void testShakyPrefix(){
     JakartaRegexpCapabilities cap = new JakartaRegexpCapabilities();
-    cap.compile("(ab|ac)");
-    assertTrue(cap.match("ab"));
-    assertTrue(cap.match("ac"));
+    RegexCapabilities.RegexMatcher matcher = cap.compile("(ab|ac)");
+    assertTrue(matcher.match(new BytesRef("ab")));
+    assertTrue(matcher.match(new BytesRef("ac")));
     // why is it not a???
-    assertNull(cap.prefix());
+    assertNull(matcher.prefix());
   }
 }