You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by ko...@apache.org on 2009/09/06 05:45:43 UTC
svn commit: r811753 - in /lucene/solr/trunk: CHANGES.txt
src/java/org/apache/solr/analysis/PatternTokenizerFactory.java
src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java
Author: koji
Date: Sun Sep 6 03:45:42 2009
New Revision: 811753
URL: http://svn.apache.org/viewvc?rev=811753&view=rev
Log:
SOLR-1398: Add offset corrections in PatternTokenizerFactory
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java
Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=811753&r1=811752&r2=811753&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Sun Sep 6 03:45:42 2009
@@ -523,6 +523,9 @@
include the "1" (ie: 2**0) bucket in the term histogram data.
(hossman)
+63. SOLR-1398: Add offset corrections in PatternTokenizerFactory.
+ (Anders Melchiorsen, koji)
+
Other Changes
----------------------
1. Upgraded to Lucene 2.4.0 (yonik)
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java?rev=811753&r1=811752&r2=811753&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java Sun Sep 6 03:45:42 2009
@@ -27,6 +27,7 @@
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.common.SolrException;
@@ -112,8 +113,8 @@
Matcher matcher = pattern.matcher( str );
tokens = (group < 0 )
- ? split( matcher, str )
- : group( matcher, str, group );
+ ? split( matcher, str, input )
+ : group( matcher, str, group, input );
iter = tokens.iterator();
}
@@ -151,12 +152,19 @@
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, ex );
}
}
+
+ /**
+ * @deprecated
+ */
+ public static List<Token> split( Matcher matcher, String input ){
+ return split(matcher,input,null);
+ }
/**
* This behaves just like String.split( ), but returns a list of Tokens
* rather then an array of strings
*/
- public static List<Token> split( Matcher matcher, String input )
+ public static List<Token> split( Matcher matcher, String input, CharStream stream )
{
int index = 0;
int lastNonEmptySize = Integer.MAX_VALUE;
@@ -165,7 +173,7 @@
// Add segments before each match found
while(matcher.find()) {
String match = input.subSequence(index, matcher.start()).toString();
- matchList.add( new Token( match, index, matcher.start()) );
+ matchList.add( newToken( stream, match, index, matcher.start()) );
index = matcher.end();
if( match.length() > 0 ) {
lastNonEmptySize = matchList.size();
@@ -174,11 +182,11 @@
// If no match is found, return the full string
if (index == 0) {
- matchList.add( new Token( input, 0, input.length()) );
+ matchList.add( newToken( stream, input, 0, input.length()) );
}
else {
String match = input.subSequence(index, input.length()).toString();
- matchList.add( new Token( match, index, input.length()) );
+ matchList.add( newToken( stream, match, index, input.length()) );
if( match.length() > 0 ) {
lastNonEmptySize = matchList.size();
}
@@ -193,13 +201,20 @@
/**
+ * @deprecated
+ */
+ public static List<Token> group( Matcher matcher, String input, int group ){
+ return group(matcher, input, group, null);
+ }
+
+ /**
* Create tokens from the matches in a matcher
*/
- public static List<Token> group( Matcher matcher, String input, int group )
+ public static List<Token> group( Matcher matcher, String input, int group, CharStream stream )
{
ArrayList<Token> matchList = new ArrayList<Token>();
while(matcher.find()) {
- Token t = new Token(
+ Token t = newToken( stream,
matcher.group(group),
matcher.start(group),
matcher.end(group) );
@@ -207,4 +222,13 @@
}
return matchList;
}
+
+ private static Token newToken( CharStream stream, String text, int start, int end ){
+ Token token = null;
+ if( stream != null )
+ token = new Token( text, stream.correctOffset( start ), stream.correctOffset( end ) );
+ else
+ token = new Token( text, start, end );
+ return token;
+ }
}
Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java?rev=811753&r1=811752&r2=811753&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java Sun Sep 6 03:45:42 2009
@@ -18,15 +18,19 @@
package org.apache.solr.analysis;
import java.io.StringReader;
+import java.util.ArrayList;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
-import junit.framework.TestCase;
-
+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.MappingCharFilter;
+import org.apache.lucene.analysis.NormalizeCharMap;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
-public class TestPatternTokenizerFactory extends AnalysisTestCase
+public class TestPatternTokenizerFactory extends BaseTokenTestCase
{
public void testSplitting() throws Exception
{
@@ -70,4 +74,27 @@
}
}
}
+
+ public void testOffsetCorrection() throws Exception {
+ final String INPUT = "G&uuml;nther G&uuml;nther is here";
+
+ // create MappingCharFilter
+ MappingCharFilterFactory cfFactory = new MappingCharFilterFactory();
+ List<String> mappingRules = new ArrayList<String>();
+ mappingRules.add( "\"&uuml;\" => \"ü\"" );
+ NormalizeCharMap normMap = new NormalizeCharMap();
+ cfFactory.parseRules( mappingRules, normMap );
+ CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
+
+ // create PatternTokenizer
+ Map<String,String> args = new HashMap<String, String>();
+ args.put( PatternTokenizerFactory.PATTERN, "[,;/\\s]+" );
+ PatternTokenizerFactory tokFactory = new PatternTokenizerFactory();
+ tokFactory.init( args );
+ TokenStream stream = tokFactory.create( charStream );
+
+ List<Token> result = getTokens( stream );
+ List<Token> expect = tokens( "Günther,1,0,12 Günther,1,13,25 is,1,26,28 here,1,29,33" );
+ assertTokEqualOff( expect, result );
+ }
}