You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2013/04/24 12:32:09 UTC

svn commit: r1471352 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/analysis/ lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/ lucene/analysis/common/src/resources/META-INF/services/ lucene/analysis/common/src/test/org/apache...

Author: simonw
Date: Wed Apr 24 10:32:08 2013
New Revision: 1471352

URL: http://svn.apache.org/r1471352
Log:
LUCENE-4766: Added a PatternCaptureGroupTokenFilter that uses Java regexes to emit multiple tokens, one for each capture group in one or more patterns.

Added:
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternCaptureGroupFilterFactory.java
      - copied unchanged from r1471347, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternCaptureGroupFilterFactory.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternCaptureGroupTokenFilter.java
      - copied unchanged from r1471347, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternCaptureGroupTokenFilter.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternCaptureGroupTokenFilter.java
      - copied unchanged from r1471347, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternCaptureGroupTokenFilter.java
Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1471352&r1=1471351&r2=1471352&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed Apr 24 10:32:08 2013
@@ -23,6 +23,13 @@ Optimizations
   a common divisor. In particular, this improves the compression ratio of dates
   without time when they are encoded as milliseconds since Epoch. Also support
   TABLE compressed numerics in the Disk codec.  (Robert Muir, Adrien Grand)
+  
+  
+New Features
+
+* LUCENE-4766: Added a PatternCaptureGroupTokenFilter that uses Java regexes to 
+  emit multiple tokens one for each capture group in one or more patterns.
+  (Simon Willnauer, Clinton Gormley)
 
 ======================= Lucene 4.3.0 =======================
 

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java?rev=1471352&r1=1471351&r2=1471352&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java Wed Apr 24 10:32:08 2013
@@ -38,7 +38,6 @@ import java.io.IOException;
  * @see Pattern
  */
 public final class PatternReplaceFilter extends TokenFilter {
-  private final Pattern p;
   private final String replacement;
   private final boolean all;
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
@@ -60,7 +59,6 @@ public final class PatternReplaceFilter 
                               String replacement,
                               boolean all) {
     super(in);
-    this.p=p;
     this.replacement = (null == replacement) ? "" : replacement;
     this.all=all;
     this.m = p.matcher(termAtt);

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java?rev=1471352&r1=1471351&r2=1471352&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java Wed Apr 24 10:32:08 2013
@@ -60,7 +60,6 @@ public final class PatternTokenizer exte
   private final StringBuilder str = new StringBuilder();
   private int index;
   
-  private final Pattern pattern;
   private final int group;
   private final Matcher matcher;
 
@@ -72,7 +71,6 @@ public final class PatternTokenizer exte
   /** creates a new PatternTokenizer returning tokens from group (-1 for split functionality) */
   public PatternTokenizer(AttributeFactory factory, Reader input, Pattern pattern, int group) {
     super(factory, input);
-    this.pattern = pattern;
     this.group = group;
 
     // Use "" instead of str so don't consume chars

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory?rev=1471352&r1=1471351&r2=1471352&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory Wed Apr 24 10:32:08 2013
@@ -72,6 +72,7 @@ org.apache.lucene.analysis.ngram.NGramFi
 org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
 org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory
 org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory
+org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory
 org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory
 org.apache.lucene.analysis.payloads.NumericPayloadTokenFilterFactory
 org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilterFactory

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java?rev=1471352&r1=1471351&r2=1471352&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java Wed Apr 24 10:32:08 2013
@@ -470,6 +470,12 @@ public class TestRandomChains extends Ba
         return Pattern.compile("a");
       }
     });
+    
+    put(Pattern[].class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        return new Pattern[] {Pattern.compile("([a-z]+)"), Pattern.compile("([0-9]+)")};
+      }
+    });
     put(PayloadEncoder.class, new ArgProducer() {
       @Override public Object create(Random random) {
         return new IdentityEncoder(); // the other encoders will throw exceptions if tokens arent numbers?