You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by ry...@apache.org on 2007/04/26 00:23:41 UTC
svn commit: r532508 - in /lucene/solr/trunk: CHANGES.txt
src/java/org/apache/solr/analysis/PatternTokenizerFactory.java
src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java
Author: ryan
Date: Wed Apr 25 15:23:40 2007
New Revision: 532508
URL: http://svn.apache.org/viewvc?view=rev&rev=532508
Log:
Adding a regex pattern TokenizerFactory.
Added:
lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java (with props)
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java (with props)
Modified:
lucene/solr/trunk/CHANGES.txt
Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?view=diff&rev=532508&r1=532507&r2=532508
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Wed Apr 25 15:23:40 2007
@@ -141,6 +141,9 @@
21. SOLR-184: add echoHandler=true to responseHeader, support echoParams=all
(Ryan McKinley via ehatcher)
+22. SOLR-211: Added a regex PatternTokenizerFactory. This extracts tokens
+ from the input string using a regex Pattern. (Ryan McKinley)
+
Changes in runtime behavior
1. Highlighting using DisMax will only pick up terms from the main
user query, not boost or filter queries (klaas).
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java?view=auto&rev=532508
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java Wed Apr 25 15:23:40 2007
@@ -0,0 +1,186 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.core.SolrException;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+
+/**
+ * This tokenizer uses regex pattern matching to construct distinct tokens
+ * for the input stream. It takes two arguments: "pattern" and "group"
+ *
+ * "pattern" is the regular expression.
+ * "group" says which group to extract into tokens.
+ *
+ * group=-1 (the default) is equivalent to "split". In this case, the tokens will
+ * be equivalent to the output from:
+ *
+ * http://java.sun.com/j2se/1.4.2/docs/api/java/lang/String.html#split(java.lang.String)
+ *
+ * Using group >= 0 selects the matching group as the token. For example, if you have:
+ *
+ * pattern = \'([^\']+)\'
+ * group = 0
+ * input = aaa 'bbb' 'ccc'
+ *
+ * the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input
+ * but using group=1, the output would be: bbb and ccc (no ' marks)
+ *
+ *
+ * @author ryan
+ * @since solr1.2
+ * @version $Id:$
+ */
+public class PatternTokenizerFactory implements TokenizerFactory
+{
+  /** Name of the required init argument holding the regular expression. */
+  public static final String PATTERN = "pattern";
+  /** Name of the optional init argument selecting which match group becomes the token. */
+  public static final String GROUP = "group";
+
+  protected Map<String,String> args;
+  protected Pattern pattern;
+  protected int group;
+
+  /**
+   * Reads the configuration: compiles the required "pattern" argument and
+   * parses the optional "group" argument (defaults to -1, meaning 'split').
+   *
+   * @param args the configured arguments; kept and later returned by getArgs()
+   * @throws SolrException (code 500) if "pattern" is missing or "group" is
+   *         not a parsable integer
+   */
+  public void init(Map<String,String> args)
+  {
+    this.args = args;
+    String regex = args.get( PATTERN );
+    if( regex == null ) {
+      throw new SolrException( 500, "missing required argument: "+PATTERN );
+    }
+    int flags = 0; // TODO? -- read flags from config CASE_INSENSITIVE, etc
+    pattern = Pattern.compile( regex, flags );
+
+    group = -1; // use 'split'
+    String g = args.get( GROUP );
+    if( g != null ) {
+      try {
+        group = Integer.parseInt( g );
+      }
+      catch( NumberFormatException ex ) {
+        // parseInt on a non-null string can only fail with NumberFormatException
+        throw new SolrException( 500, "invalid group argument: "+g );
+      }
+    }
+  }
+
+  /**
+   * The arguments passed to init()
+   */
+  public Map<String, String> getArgs() {
+    return this.args;
+  }
+
+  /**
+   * Tokenizes the input using the configured pattern.
+   *
+   * The whole reader is read into a single in-memory string before matching,
+   * and all tokens are computed up front; the returned stream simply iterates
+   * over them and returns null once they are exhausted.
+   *
+   * @param input the text to tokenize
+   * @return a stream over the tokens produced by split() (group &lt; 0) or
+   *         group() (group &gt;= 0)
+   * @throws SolrException (code 500) wrapping any IOException raised while
+   *         reading the input
+   */
+  public TokenStream create(Reader input) {
+    try {
+      // Read the input into a single string
+      String str = IOUtils.toString( input );
+
+      Matcher matcher = pattern.matcher( str );
+      List<Token> tokens = (group < 0 )
+        ? split( matcher, str )
+        : group( matcher, str, group );
+
+      final Iterator<Token> iter = tokens.iterator();
+      return new TokenStream() {
+        @Override
+        public Token next() throws IOException {
+          if( iter.hasNext() ) {
+            return iter.next();
+          }
+          return null;
+        }
+      };
+    }
+    catch( IOException ex ) {
+      throw new SolrException( 500, ex );
+    }
+  }
+
+  /**
+   * This behaves just like String.split( ), but returns a list of Tokens
+   * rather than an array of strings. Each token carries the start and end
+   * offsets of its segment within the input.
+   *
+   * As with String.split( ): if the pattern never consumes any input the
+   * result is the whole input as a single token, and trailing empty segments
+   * are dropped (so an input consisting only of delimiters yields no tokens).
+   *
+   * @param matcher a fresh matcher over <code>input</code>
+   * @param input the string the matcher was created for
+   * @return the segments between matches, in order
+   */
+  public static List<Token> split( Matcher matcher, String input )
+  {
+    int index = 0;
+    // Number of leading list entries to keep, i.e. the list size right after
+    // the last non-empty segment was added. Starts at 0 so that a result made
+    // up solely of empty segments is trimmed away entirely.
+    int lastNonEmptySize = 0;
+    List<Token> matchList = new ArrayList<Token>();
+
+    // Add segments before each match found
+    while( matcher.find() ) {
+      int start = matcher.start();
+      matchList.add( new Token( input.substring( index, start ), index, start ) );
+      if( start > index ) {
+        lastNonEmptySize = matchList.size();
+      }
+      index = matcher.end();
+    }
+
+    // Nothing was consumed: either no match at all, or only a zero-width match
+    // at offset 0. Return just the full string, discarding any empty segment
+    // collected above.
+    if( index == 0 ) {
+      List<Token> whole = new ArrayList<Token>( 1 );
+      whole.add( new Token( input, 0, input.length() ) );
+      return whole;
+    }
+
+    // Add the remaining segment after the last match
+    matchList.add( new Token( input.substring( index ), index, input.length() ) );
+    if( index < input.length() ) {
+      lastNonEmptySize = matchList.size();
+    }
+
+    // Don't use trailing empty strings. This behavior matches String.split();
+    return matchList.subList( 0, lastNonEmptySize );
+  }
+
+
+  /**
+   * Create tokens from the matches in a matcher: one token per match, whose
+   * text and offsets are those of the requested group.
+   *
+   * Matches in which the requested group did not participate (for example an
+   * optional group that was skipped) produce no token.
+   *
+   * @param matcher a fresh matcher over <code>input</code>
+   * @param input the string the matcher was created for (not read here)
+   * @param group the index of the capturing group to extract; 0 is the whole match
+   * @return the extracted tokens, in match order
+   */
+  public static List<Token> group( Matcher matcher, String input, int group )
+  {
+    List<Token> matchList = new ArrayList<Token>();
+    while( matcher.find() ) {
+      int start = matcher.start( group );
+      if( start < 0 ) {
+        // The group took no part in this match: there is no text to emit
+        continue;
+      }
+      matchList.add( new Token( matcher.group( group ), start, matcher.end( group ) ) );
+    }
+    return matchList;
+  }
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java?view=auto&rev=532508
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java Wed Apr 25 15:23:40 2007
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+public class TestPatternTokenizerFactory extends TestCase
+{
+  /**
+   * Runs a table of (group, pattern, input, expected output) cases through
+   * PatternTokenizerFactory. For the group=-1 cases it additionally checks
+   * that the tokens are exactly the strings String.split( ) returns for the
+   * same pattern and input: same text, same order and same count.
+   */
+  public void testSplitting() throws Exception
+  {
+    String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
+    String[][] tests = {
+      // group pattern input output
+      { "-1", "--", "aaa--bbb--ccc", "aaa bbb ccc" },
+      { "-1", ":", "aaa:bbb:ccc", "aaa bbb ccc" },
+      { "-1", "\\p{Space}", "aaa bbb \t\tccc ", "aaa bbb ccc" },
+      { "-1", ":", "boo:and:foo", "boo and foo" },
+      { "-1", "o", "boo:and:foo", "b :and:f" },
+      { "0", ":", "boo:and:foo", ": :" },
+      { "0", qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" },
+      { "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" }
+    };
+
+
+    Map<String,String> args = new HashMap<String, String>();
+    for( String[] test : tests ) {
+      args.put( PatternTokenizerFactory.GROUP, test[0] );
+      args.put( PatternTokenizerFactory.PATTERN, test[1] );
+
+      PatternTokenizerFactory tokenizer = new PatternTokenizerFactory();
+      tokenizer.init( args );
+
+      TokenStream stream = tokenizer.create( new StringReader( test[2] ) );
+      String out = TestHyphenatedWordsFilter.tsToString( stream );
+      System.out.println( test[2] + " ==> " + out );
+
+      assertEquals("pattern: "+test[2], test[3], out );
+
+      // Make sure it is the same as if we called 'split'
+      if( "-1".equals( test[0] ) ) {
+        String[] split = test[2].split( test[1] );
+        stream = tokenizer.create( new StringReader( test[2] ) );
+        int i=0;
+        for( Token t = stream.next(); null != t; t = stream.next() )
+        {
+          // Fail with a readable message rather than an index error if the
+          // tokenizer emits more tokens than String.split( ) does
+          assertTrue( "split: "+test[1] + " produced more than "+split.length+" tokens", i < split.length );
+          assertEquals( "split: "+test[1] + " "+i, split[i++], t.termText() );
+        }
+        // The loop above cannot notice missing tokens, so compare the counts
+        assertEquals( "split count: "+test[1], split.length, i );
+      }
+    }
+  }
+}
Propchange: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java
------------------------------------------------------------------------------
svn:eol-style = native