You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by ko...@apache.org on 2009/09/18 09:37:22 UTC
svn commit: r816502 - in /lucene/solr/trunk: ./
src/java/org/apache/solr/analysis/ src/java/org/apache/solr/schema/
src/test/org/apache/solr/analysis/
Author: koji
Date: Fri Sep 18 07:37:22 2009
New Revision: 816502
URL: http://svn.apache.org/viewvc?rev=816502&view=rev
Log:
SOLR-1423: Use Tokenizer.correctOffset() instead of CharStream.correctOffset()
Added:
lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizer.java (with props)
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java
lucene/solr/trunk/src/java/org/apache/solr/schema/FieldType.java
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java
Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=816502&r1=816501&r2=816502&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Fri Sep 18 07:37:22 2009
@@ -691,6 +691,9 @@
RussianLowerCaseFilterFactory or RussianLetterTokenizerFactory.
(Robert Muir via hossman)
+48. SOLR-1423: Due to LUCENE-1906, Solr's tokenizer should use Tokenizer.correctOffset() instead of CharStream.correctOffset().
+ (Uwe Schindler via koji)
+
Build
----------------------
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizer.java?rev=816502&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizer.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizer.java Fri Sep 18 07:37:22 2009
@@ -0,0 +1,139 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.commons.io.IOUtils;
+
+/**
+ * This tokenizer uses regex pattern matching to construct distinct tokens
+ * for the input stream. It takes two arguments: "pattern" and "group".
+ * <p/>
+ * <ul>
+ * <li>"pattern" is the regular expression.</li>
+ * <li>"group" says which group to extract into tokens.</li>
+ * </ul>
+ * <p>
+ * group=-1 (the default) is equivalent to "split". In this case, the tokens will
+ * be equivalent to the output from (without empty tokens):
+ * {@link String#split(java.lang.String)}
+ * </p>
+ * <p>
+ * Using group >= 0 selects the matching group as the token. For example, if you have:<br/>
+ * <pre>
+ * pattern = \'([^\']+)\'
+ * group = 0
+ * input = aaa 'bbb' 'ccc'
+ *</pre>
+ * the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input
+ * but using group=1, the output would be: bbb and ccc (no ' marks)
+ * </p>
+ * <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
+ *
+ * @version $Id$
+ * @see Pattern
+ */
+public final class PatternTokenizer extends Tokenizer {
+
+  private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+  private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+
+  /** entire input, slurped from the Reader up front so the regex can see all of it */
+  private String str;
+  /** scan position within {@link #str}; Integer.MAX_VALUE marks the stream as exhausted */
+  private int index;
+
+  private final Pattern pattern;
+  private final int group;
+  private final Matcher matcher;
+
+  /** creates a new PatternTokenizer returning tokens from group (-1 for split functionality) */
+  public PatternTokenizer(Reader input, Pattern pattern, int group) throws IOException {
+    super(input);
+    this.pattern = pattern;
+    this.group = group;
+    str = IOUtils.toString(input);
+    matcher = pattern.matcher(str);
+    index = 0;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (index >= str.length()) return false;
+    // the TokenStream contract requires clearing attribute state before
+    // producing a new token, otherwise values from the previous token leak
+    clearAttributes();
+
+    if (group >= 0) {
+
+      // match a specific group
+      while (matcher.find()) {
+        final String match = matcher.group(group);
+        // the group may not have participated in the match at all (null),
+        // or may have matched the empty string; skip both cases
+        if (match == null || match.length() == 0) continue;
+        termAtt.setTermBuffer(match);
+        index = matcher.start(group);
+        offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.end(group)));
+        return true;
+      }
+
+      index = Integer.MAX_VALUE; // mark exhausted
+      return false;
+
+    } else {
+
+      // String.split() functionality: emit the text between matches
+      while (matcher.find()) {
+        if (matcher.start() - index > 0) {
+          // found a non-zero-length token
+          termAtt.setTermBuffer(str, index, matcher.start() - index);
+          offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.start()));
+          index = matcher.end();
+          return true;
+        }
+
+        index = matcher.end();
+      }
+
+      if (str.length() - index == 0) {
+        index = Integer.MAX_VALUE; // mark exhausted
+        return false;
+      }
+
+      // trailing text after the last match is the final token
+      termAtt.setTermBuffer(str, index, str.length() - index);
+      offsetAtt.setOffset(correctOffset(index), correctOffset(str.length()));
+      index = Integer.MAX_VALUE; // mark exhausted
+      return true;
+    }
+  }
+
+  @Override
+  public void end() throws IOException {
+    // set final offset (both start and end) to the corrected end of input
+    final int ofs = correctOffset(str.length());
+    offsetAtt.setOffset(ofs, ofs);
+  }
+
+  @Override
+  public void reset(Reader input) throws IOException {
+    super.reset(input);
+    str = IOUtils.toString(input);
+    matcher.reset(str);
+    index = 0;
+  }
+
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizer.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java?rev=816502&r1=816501&r2=816502&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java Fri Sep 18 07:37:22 2009
@@ -20,14 +20,11 @@
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
-import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.commons.io.IOUtils;
-import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.common.SolrException;
@@ -43,9 +40,8 @@
* </ul>
* <p>
* group=-1 (the default) is equivalent to "split". In this case, the tokens will
- * be equivalent to the output from:
- *
- * http://java.sun.com/j2se/1.4.2/docs/api/java/lang/String.html#split(java.lang.String)
+ * be equivalent to the output from (without empty tokens):
+ * {@link String#split(java.lang.String)}
* </p>
* <p>
* Using group >= 0 selects the matching group as the token. For example, if you have:<br/>
@@ -57,7 +53,9 @@
* the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input
* but using group=1, the output would be: bbb and ccc (no ' marks)
* </p>
+ * <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
*
+ * @see PatternTokenizer
* @since solr1.2
* @version $Id:$
*/
@@ -101,70 +99,20 @@
*/
public Tokenizer create(final Reader in) {
try {
- return new Tokenizer(in) {
- {init();}
-
- List<Token> tokens;
- Iterator<Token> iter;
-
- void init() throws IOException {
- // Read the input into a single string
- String str = IOUtils.toString( input );
-
- Matcher matcher = pattern.matcher( str );
- tokens = (group < 0 )
- ? split( matcher, str, input )
- : group( matcher, str, group, input );
- iter = tokens.iterator();
- }
-
-// @Override
-// public boolean incrementToken() throws IOException {
-// return super.incrementToken();
-// }
-
- @Override
- public void end() throws IOException {
- super.end();
- }
-
-// @Override
-// public Token next(Token reusableToken) throws IOException {
-// return super.next(reusableToken);
-// }
-
- @Override
- public void reset(Reader input) throws IOException {
- super.reset(input);
- init();
- }
-
- @Override
- public Token next() throws IOException {
- if( iter.hasNext() ) {
- return iter.next();
- }
- return null;
- }
- };
- }
- catch( IOException ex ) {
+ return new PatternTokenizer(in, pattern, group);
+ } catch( IOException ex ) {
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, ex );
}
}
-
- /**
- * @deprecated
- */
- public static List<Token> split( Matcher matcher, String input ){
- return split(matcher,input,null);
- }
/**
* This behaves just like String.split( ), but returns a list of Tokens
* rather then an array of strings
+ * NOTE: This method is not used in 1.4.
+ * @deprecated
*/
- public static List<Token> split( Matcher matcher, String input, Reader stream )
+ @Deprecated
+ public static List<Token> split( Matcher matcher, String input )
{
int index = 0;
int lastNonEmptySize = Integer.MAX_VALUE;
@@ -173,7 +121,7 @@
// Add segments before each match found
while(matcher.find()) {
String match = input.subSequence(index, matcher.start()).toString();
- matchList.add( newToken( stream, match, index, matcher.start()) );
+ matchList.add( new Token( match, index, matcher.start()) );
index = matcher.end();
if( match.length() > 0 ) {
lastNonEmptySize = matchList.size();
@@ -182,11 +130,11 @@
// If no match is found, return the full string
if (index == 0) {
- matchList.add( newToken( stream, input, 0, input.length()) );
+ matchList.add( new Token( input, 0, input.length()) );
}
else {
String match = input.subSequence(index, input.length()).toString();
- matchList.add( newToken( stream, match, index, input.length()) );
+ matchList.add( new Token( match, index, input.length()) );
if( match.length() > 0 ) {
lastNonEmptySize = matchList.size();
}
@@ -199,22 +147,17 @@
return matchList;
}
-
- /**
- * @deprecated
- */
- public static List<Token> group( Matcher matcher, String input, int group ){
- return group(matcher, input, group, null);
- }
-
/**
* Create tokens from the matches in a matcher
+ * NOTE: This method is not used in 1.4.
+ * @deprecated
*/
- public static List<Token> group( Matcher matcher, String input, int group, Reader stream )
+ @Deprecated
+ public static List<Token> group( Matcher matcher, String input, int group )
{
ArrayList<Token> matchList = new ArrayList<Token>();
while(matcher.find()) {
- Token t = newToken( stream,
+ Token t = new Token(
matcher.group(group),
matcher.start(group),
matcher.end(group) );
@@ -222,15 +165,4 @@
}
return matchList;
}
-
- private static Token newToken(Reader reader, String text, int start, int end ){
- Token token;
- if( reader instanceof CharStream) {
- CharStream stream = (CharStream)reader;
- token = new Token( text, stream.correctOffset( start ), stream.correctOffset( end ) );
- } else {
- token = new Token( text, start, end );
- }
- return token;
- }
}
Modified: lucene/solr/trunk/src/java/org/apache/solr/schema/FieldType.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/schema/FieldType.java?rev=816502&r1=816501&r2=816502&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/schema/FieldType.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/schema/FieldType.java Fri Sep 18 07:37:22 2009
@@ -311,7 +311,7 @@
if (n<=0) return false;
String s = toInternal(new String(cbuf,0,n));
termAtt.setTermBuffer(s);
- offsetAtt.setOffset(0,n);
+ offsetAtt.setOffset(correctOffset(0),correctOffset(n));
return true;
}
};
Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java?rev=816502&r1=816501&r2=816502&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java Fri Sep 18 07:37:22 2009
@@ -39,9 +39,9 @@
// group pattern input output
{ "-1", "--", "aaa--bbb--ccc", "aaa bbb ccc" },
{ "-1", ":", "aaa:bbb:ccc", "aaa bbb ccc" },
- { "-1", "\\p{Space}", "aaa bbb \t\tccc ", "aaa bbb ccc" },
+ { "-1", "\\p{Space}", "aaa bbb \t\tccc ", "aaa bbb ccc" },
{ "-1", ":", "boo:and:foo", "boo and foo" },
- { "-1", "o", "boo:and:foo", "b :and:f" },
+ { "-1", "o", "boo:and:foo", "b :and:f" },
{ "0", ":", "boo:and:foo", ": :" },
{ "0", qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" },
{ "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" }
@@ -60,10 +60,11 @@
String out = TestHyphenatedWordsFilter.tsToString( stream );
System.out.println( test[2] + " ==> " + out );
- assertEquals("pattern: "+test[2], test[3], out );
+ assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out );
// Make sure it is the same as if we called 'split'
- if( "-1".equals( test[0] ) ) {
+ // test disabled, as we remove empty tokens
+ /*if( "-1".equals( test[0] ) ) {
String[] split = test[2].split( test[1] );
stream = tokenizer.create( new StringReader( test[2] ) );
int i=0;
@@ -71,7 +72,7 @@
{
assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) );
}
- }
+ }*/
}
}
@@ -96,5 +97,16 @@
List<Token> result = getTokens( stream );
List<Token> expect = tokens( "Günther,1,0,12 Günther,1,13,25 is,1,26,28 here,1,29,33" );
assertTokEqualOff( expect, result );
+
+ charStream.reset();
+ args.put( PatternTokenizerFactory.PATTERN, "Günther" );
+ args.put( PatternTokenizerFactory.GROUP, "0" );
+ tokFactory = new PatternTokenizerFactory();
+ tokFactory.init( args );
+ stream = tokFactory.create( charStream );
+
+ result = getTokens( stream );
+ expect = tokens( "Günther,1,0,12 Günther,1,13,25" );
+ assertTokEqualOff( expect, result );
}
}