You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2010/08/19 13:20:42 UTC
svn commit: r987129 - in /lucene/dev/trunk/lucene/contrib: ./
queries/src/java/org/apache/lucene/search/regex/
queries/src/test/org/apache/lucene/search/regex/
Author: rmuir
Date: Thu Aug 19 11:20:42 2010
New Revision: 987129
URL: http://svn.apache.org/viewvc?rev=987129&view=rev
Log:
LUCENE-2606: optimize contrib/regex for flex
Modified:
lucene/dev/trunk/lucene/contrib/CHANGES.txt
lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java
lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java
lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexCapabilities.java
lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java
lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java
lucene/dev/trunk/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestJakartaRegexpCapabilities.java
Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=987129&r1=987128&r2=987129&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Thu Aug 19 11:20:42 2010
@@ -23,6 +23,12 @@ New Features
* LUCENE-2479: Added ability to provide a sort comparator for spelling suggestions along
with two implementations. The existing comparator (score, then frequency) is the default (Grant Ingersoll)
+API Changes
+
+ * LUCENE-2606: Changed RegexCapabilities interface to fix thread
+ safety, serialization, and performance problems. If you have
+ written a custom RegexCapabilities it will need to be updated
+ to the new API. (Robert Muir, Uwe Schindler)
======================= Lucene 3.x (not yet released) =======================
Changes in backwards compatibility policy
Modified: lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java?rev=987129&r1=987128&r2=987129&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java (original)
+++ lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java Thu Aug 19 11:20:42 2010
@@ -17,6 +17,9 @@ package org.apache.lucene.search.regex;
* limitations under the License.
*/
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.regexp.CharacterIterator;
import org.apache.regexp.RE;
import org.apache.regexp.REProgram;
import java.lang.reflect.Field;
@@ -30,8 +33,6 @@ import java.lang.reflect.Method;
* it doesn't always provide a prefix even if one would exist.
*/
public class JakartaRegexpCapabilities implements RegexCapabilities {
- private RE regexp;
-
private static Field prefixField;
private static Method getPrefixMethod;
static {
@@ -79,45 +80,75 @@ public class JakartaRegexpCapabilities i
this.flags = flags;
}
- public void compile(String pattern) {
- regexp = new RE(pattern, this.flags);
+ public RegexCapabilities.RegexMatcher compile(String regex) {
+ return new JakartaRegexMatcher(regex, flags);
}
- public boolean match(String string) {
- return regexp.match(string);
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + flags;
+ return result;
}
- public String prefix() {
- try {
- final char[] prefix;
- if (getPrefixMethod != null) {
- prefix = (char[]) getPrefixMethod.invoke(regexp.getProgram());
- } else if (prefixField != null) {
- prefix = (char[]) prefixField.get(regexp.getProgram());
- } else {
- return null;
- }
- return prefix == null ? null : new String(prefix);
- } catch (Exception e) {
- // if we cannot get the prefix, return none
- return null;
- }
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) return true;
+ if (obj == null) return false;
+ if (getClass() != obj.getClass()) return false;
+ JakartaRegexpCapabilities other = (JakartaRegexpCapabilities) obj;
+ if (flags != other.flags) return false;
+ return true;
}
- @Override
- public boolean equals(Object o) {
- if (this == o) return true;
- if (o == null || getClass() != o.getClass()) return false;
+ class JakartaRegexMatcher implements RegexCapabilities.RegexMatcher {
+ private RE regexp;
+ private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+ private final CharacterIterator utf16wrapper = new CharacterIterator() {
- final JakartaRegexpCapabilities that = (JakartaRegexpCapabilities) o;
+ public char charAt(int pos) {
+ return utf16.result[pos];
+ }
- if (regexp != null ? !regexp.equals(that.regexp) : that.regexp != null) return false;
+ public boolean isEnd(int pos) {
+ return pos >= utf16.length;
+ }
- return true;
- }
+ public String substring(int beginIndex) {
+ return substring(beginIndex, utf16.length);
+ }
- @Override
- public int hashCode() {
- return (regexp != null ? regexp.hashCode() : 0);
+ public String substring(int beginIndex, int endIndex) {
+ return new String(utf16.result, beginIndex, endIndex - beginIndex);
+ }
+
+ };
+
+ public JakartaRegexMatcher(String regex, int flags) {
+ regexp = new RE(regex, flags);
+ }
+
+ public boolean match(BytesRef term) {
+ UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
+ return regexp.match(utf16wrapper, 0);
+ }
+
+ public String prefix() {
+ try {
+ final char[] prefix;
+ if (getPrefixMethod != null) {
+ prefix = (char[]) getPrefixMethod.invoke(regexp.getProgram());
+ } else if (prefixField != null) {
+ prefix = (char[]) prefixField.get(regexp.getProgram());
+ } else {
+ return null;
+ }
+ return prefix == null ? null : new String(prefix);
+ } catch (Exception e) {
+ // if we cannot get the prefix, return none
+ return null;
+ }
+ }
}
}
Modified: lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java?rev=987129&r1=987128&r2=987129&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java (original)
+++ lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java Thu Aug 19 11:20:42 2010
@@ -17,8 +17,12 @@ package org.apache.lucene.search.regex;
* limitations under the License.
*/
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
+
/**
* An implementation tying Java's built-in java.util.regex to RegexQuery.
*
@@ -27,9 +31,8 @@ import java.util.regex.Pattern;
* attempt to {@link #match} each term for the specified field in the index.
*/
public class JavaUtilRegexCapabilities implements RegexCapabilities {
- private Pattern pattern;
private int flags = 0;
-
+
// Define the optional flags from Pattern that can be used.
// Do this here to keep Pattern contained within this class.
@@ -66,32 +69,59 @@ public class JavaUtilRegexCapabilities i
this.flags = flags;
}
- public void compile(String pattern) {
- this.pattern = Pattern.compile(pattern, this.flags);
+ public RegexCapabilities.RegexMatcher compile(String regex) {
+ return new JavaUtilRegexMatcher(regex, flags);
}
-
- public boolean match(String string) {
- return pattern.matcher(string).matches();
- }
-
- public String prefix() {
- return null;
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + flags;
+ return result;
}
@Override
- public boolean equals(Object o) {
- if (this == o) return true;
- if (o == null || getClass() != o.getClass()) return false;
-
- final JavaUtilRegexCapabilities that = (JavaUtilRegexCapabilities) o;
-
- if (pattern != null ? !pattern.equals(that.pattern) : that.pattern != null) return false;
-
+ public boolean equals(Object obj) {
+ if (this == obj) return true;
+ if (obj == null) return false;
+ if (getClass() != obj.getClass()) return false;
+ JavaUtilRegexCapabilities other = (JavaUtilRegexCapabilities) obj;
+ if (flags != other.flags) return false;
return true;
}
- @Override
- public int hashCode() {
- return (pattern != null ? pattern.hashCode() : 0);
+ class JavaUtilRegexMatcher implements RegexCapabilities.RegexMatcher {
+ private final Pattern pattern;
+ private final Matcher matcher;
+ private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+ private final CharSequence utf16wrapper = new CharSequence() {
+
+ public int length() {
+ return utf16.length;
+ }
+
+ public char charAt(int index) {
+ return utf16.result[index];
+ }
+
+ public CharSequence subSequence(int start, int end) {
+ return new String(utf16.result, start, end - start);
+ }
+ };
+
+ public JavaUtilRegexMatcher(String regex, int flags) {
+ this.pattern = Pattern.compile(regex, flags);
+ this.matcher = this.pattern.matcher(utf16wrapper);
+ }
+
+ public boolean match(BytesRef term) {
+ UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
+ return matcher.reset().matches();
+ }
+
+ public String prefix() {
+ return null;
+ }
}
}
Modified: lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexCapabilities.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexCapabilities.java?rev=987129&r1=987128&r2=987129&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexCapabilities.java (original)
+++ lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexCapabilities.java Thu Aug 19 11:20:42 2010
@@ -1,5 +1,9 @@
package org.apache.lucene.search.regex;
+import java.io.Serializable;
+
+import org.apache.lucene.util.BytesRef;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -21,7 +25,7 @@ package org.apache.lucene.search.regex;
* Defines basic operations needed by {@link RegexQuery} for a regular
* expression implementation.
*/
-public interface RegexCapabilities {
+public interface RegexCapabilities extends Serializable {
/**
* Called by the constructor of {@link RegexTermEnum} allowing
* implementations to cache a compiled version of the regular
@@ -29,20 +33,22 @@ public interface RegexCapabilities {
*
* @param pattern regular expression pattern
*/
- void compile(String pattern);
+ public RegexMatcher compile(String pattern);
- /**
- *
- * @param string
- * @return true if string matches the pattern last passed to {@link #compile}.
- */
- boolean match(String string);
+ public interface RegexMatcher {
+ /**
+ *
+ * @param string
+ * @return true if string matches the pattern last passed to {@link #compile}.
+ */
+ public boolean match(BytesRef term);
- /**
- * A wise prefix implementation can reduce the term enumeration (and thus increase performance)
- * of RegexQuery dramatically!
- *
- * @return static non-regex prefix of the pattern last passed to {@link #compile}. May return null.
- */
- String prefix();
+ /**
+ * A wise prefix implementation can reduce the term enumeration (and thus increase performance)
+ * of RegexQuery dramatically!
+ *
+ * @return static non-regex prefix of the pattern last passed to {@link #compile}. May return null.
+ */
+ public String prefix();
+ }
}
Modified: lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java?rev=987129&r1=987128&r2=987129&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java (original)
+++ lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java Thu Aug 19 11:20:42 2010
@@ -76,23 +76,27 @@ public class RegexQuery extends MultiTer
return buffer.toString();
}
- /* generated by IntelliJ IDEA */
- @Override
- public boolean equals(Object o) {
- if (this == o) return true;
- if (o == null || getClass() != o.getClass()) return false;
- if (!super.equals(o)) return false;
-
- final RegexQuery that = (RegexQuery) o;
-
- return regexImpl.equals(that.regexImpl);
- }
-
- /* generated by IntelliJ IDEA */
@Override
public int hashCode() {
+ final int prime = 31;
int result = super.hashCode();
- result = 29 * result + regexImpl.hashCode();
+ result = prime * result + ((regexImpl == null) ? 0 : regexImpl.hashCode());
+ result = prime * result + ((term == null) ? 0 : term.hashCode());
return result;
}
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) return true;
+ if (!super.equals(obj)) return false;
+ if (getClass() != obj.getClass()) return false;
+ RegexQuery other = (RegexQuery) obj;
+ if (regexImpl == null) {
+ if (other.regexImpl != null) return false;
+ } else if (!regexImpl.equals(other.regexImpl)) return false;
+ if (term == null) {
+ if (other.term != null) return false;
+ } else if (!term.equals(other.term)) return false;
+ return true;
+ }
}
Modified: lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java?rev=987129&r1=987128&r2=987129&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java (original)
+++ lucene/dev/trunk/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java Thu Aug 19 11:20:42 2010
@@ -34,15 +34,13 @@ import java.io.IOException;
*/
public class RegexTermsEnum extends FilteredTermsEnum {
- private RegexCapabilities regexImpl;
+ private RegexCapabilities.RegexMatcher regexImpl;
private final BytesRef prefixRef;
- public RegexTermsEnum(IndexReader reader, Term term, RegexCapabilities regexImpl) throws IOException {
+ public RegexTermsEnum(IndexReader reader, Term term, RegexCapabilities regexCap) throws IOException {
super(reader, term.field());
String text = term.text();
- this.regexImpl = regexImpl;
-
- regexImpl.compile(text);
+ this.regexImpl = regexCap.compile(text);
String pre = regexImpl.prefix();
if (pre == null) pre = "";
@@ -55,8 +53,7 @@ public class RegexTermsEnum extends Filt
if (term.startsWith(prefixRef)) {
// TODO: set BoostAttr based on distance of
// searchTerm.text() and term().text()
- String text = term.utf8ToString();
- return regexImpl.match(text) ? AcceptStatus.YES : AcceptStatus.NO;
+ return regexImpl.match(term) ? AcceptStatus.YES : AcceptStatus.NO;
} else {
return AcceptStatus.NO;
}
Modified: lucene/dev/trunk/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestJakartaRegexpCapabilities.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestJakartaRegexpCapabilities.java?rev=987129&r1=987128&r2=987129&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestJakartaRegexpCapabilities.java (original)
+++ lucene/dev/trunk/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestJakartaRegexpCapabilities.java Thu Aug 19 11:20:42 2010
@@ -17,6 +17,7 @@ package org.apache.lucene.search.regex;
* limitations under the License.
*/
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
/**
@@ -26,21 +27,21 @@ public class TestJakartaRegexpCapabiliti
public void testGetPrefix(){
JakartaRegexpCapabilities cap = new JakartaRegexpCapabilities();
- cap.compile("luc[e]?");
- assertTrue(cap.match("luce"));
- assertEquals("luc", cap.prefix());
+ RegexCapabilities.RegexMatcher matcher = cap.compile("luc[e]?");
+ assertTrue(matcher.match(new BytesRef("luce")));
+ assertEquals("luc", matcher.prefix());
- cap.compile("lucene");
- assertTrue(cap.match("lucene"));
- assertEquals("lucene", cap.prefix());
+ matcher = cap.compile("lucene");
+ assertTrue(matcher.match(new BytesRef("lucene")));
+ assertEquals("lucene", matcher.prefix());
}
public void testShakyPrefix(){
JakartaRegexpCapabilities cap = new JakartaRegexpCapabilities();
- cap.compile("(ab|ac)");
- assertTrue(cap.match("ab"));
- assertTrue(cap.match("ac"));
+ RegexCapabilities.RegexMatcher matcher = cap.compile("(ab|ac)");
+ assertTrue(matcher.match(new BytesRef("ab")));
+ assertTrue(matcher.match(new BytesRef("ac")));
// why is it not a???
- assertNull(cap.prefix());
+ assertNull(matcher.prefix());
}
}