You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2013/04/24 12:32:09 UTC
svn commit: r1471352 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/analysis/
lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/
lucene/analysis/common/src/resources/META-INF/services/
lucene/analysis/common/src/test/org/apache...
Author: simonw
Date: Wed Apr 24 10:32:08 2013
New Revision: 1471352
URL: http://svn.apache.org/r1471352
Log:
LUCENE-4766: Added a PatternCaptureGroupTokenFilter that uses Java regexes to emit multiple tokens, one for each capture group in one or more patterns.
Added:
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternCaptureGroupFilterFactory.java
- copied unchanged from r1471347, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternCaptureGroupFilterFactory.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternCaptureGroupTokenFilter.java
- copied unchanged from r1471347, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternCaptureGroupTokenFilter.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternCaptureGroupTokenFilter.java
- copied unchanged from r1471347, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternCaptureGroupTokenFilter.java
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1471352&r1=1471351&r2=1471352&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed Apr 24 10:32:08 2013
@@ -23,6 +23,13 @@ Optimizations
a common divisor. In particular, this improves the compression ratio of dates
without time when they are encoded as milliseconds since Epoch. Also support
TABLE compressed numerics in the Disk codec. (Robert Muir, Adrien Grand)
+
+
+New Features
+
+* LUCENE-4766: Added a PatternCaptureGroupTokenFilter that uses Java regexes to
+ emit multiple tokens one for each capture group in one or more patterns.
+ (Simon Willnauer, Clinton Gormley)
======================= Lucene 4.3.0 =======================
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java?rev=1471352&r1=1471351&r2=1471352&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java Wed Apr 24 10:32:08 2013
@@ -38,7 +38,6 @@ import java.io.IOException;
* @see Pattern
*/
public final class PatternReplaceFilter extends TokenFilter {
- private final Pattern p;
private final String replacement;
private final boolean all;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
@@ -60,7 +59,6 @@ public final class PatternReplaceFilter
String replacement,
boolean all) {
super(in);
- this.p=p;
this.replacement = (null == replacement) ? "" : replacement;
this.all=all;
this.m = p.matcher(termAtt);
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java?rev=1471352&r1=1471351&r2=1471352&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java Wed Apr 24 10:32:08 2013
@@ -60,7 +60,6 @@ public final class PatternTokenizer exte
private final StringBuilder str = new StringBuilder();
private int index;
- private final Pattern pattern;
private final int group;
private final Matcher matcher;
@@ -72,7 +71,6 @@ public final class PatternTokenizer exte
/** creates a new PatternTokenizer returning tokens from group (-1 for split functionality) */
public PatternTokenizer(AttributeFactory factory, Reader input, Pattern pattern, int group) {
super(factory, input);
- this.pattern = pattern;
this.group = group;
// Use "" instead of str so don't consume chars
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory?rev=1471352&r1=1471351&r2=1471352&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory Wed Apr 24 10:32:08 2013
@@ -72,6 +72,7 @@ org.apache.lucene.analysis.ngram.NGramFi
org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory
org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory
+org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory
org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory
org.apache.lucene.analysis.payloads.NumericPayloadTokenFilterFactory
org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilterFactory
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java?rev=1471352&r1=1471351&r2=1471352&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java Wed Apr 24 10:32:08 2013
@@ -470,6 +470,12 @@ public class TestRandomChains extends Ba
return Pattern.compile("a");
}
});
+
+ put(Pattern[].class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ return new Pattern[] {Pattern.compile("([a-z]+)"), Pattern.compile("([0-9]+)")};
+ }
+ });
put(PayloadEncoder.class, new ArgProducer() {
@Override public Object create(Random random) {
return new IdentityEncoder(); // the other encoders will throw exceptions if tokens arent numbers?