You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by ko...@apache.org on 2008/05/19 15:38:39 UTC

svn commit: r657829 - in /lucene/solr/trunk: CHANGES.txt src/java/org/apache/solr/analysis/SynonymFilterFactory.java src/test/org/apache/solr/analysis/TestSynonymMap.java

Author: koji
Date: Mon May 19 06:38:38 2008
New Revision: 657829

URL: http://svn.apache.org/viewvc?rev=657829&view=rev
Log:
SOLR-319: Changed SynonymFilterFactory to "tokenize" synonyms file.

Added:
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestSynonymMap.java
Modified:
    lucene/solr/trunk/CHANGES.txt
    lucene/solr/trunk/src/java/org/apache/solr/analysis/SynonymFilterFactory.java

Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=657829&r1=657828&r2=657829&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Mon May 19 06:38:38 2008
@@ -257,6 +257,15 @@
 
 48. SOLR-537: Use of hl.maxAlternateFieldLength parameter from solr-ruby
     (koji)
+
+49. SOLR-319: Changed SynonymFilterFactory to "tokenize" synonyms file.
+    To use a tokenizer, specify "tokenizerFactory" attribute in <filter>.
+    For example:
+    <tokenizer class="solr.CJKTokenizerFactory"/>
+    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" expand="true"
+      ignoreCase="true" tokenizerFactory="solr.CJKTokenizerFactory"/>
+    (koji)
+
     
 Changes in runtime behavior
  1. SOLR-559: use Lucene updateDocument, deleteDocuments methods.  This

Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/SynonymFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/SynonymFilterFactory.java?rev=657829&r1=657828&r2=657829&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/SynonymFilterFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/SynonymFilterFactory.java Mon May 19 06:38:38 2008
@@ -17,6 +17,7 @@
 
 package org.apache.solr.analysis;
 
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.common.util.StrUtils;
@@ -24,8 +25,11 @@
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 
 import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
 
 /**
  * @version $Id$
@@ -38,6 +42,12 @@
     boolean ignoreCase = getBoolean("ignoreCase", false);
     boolean expand = getBoolean("expand", true);
 
+    String tf = args.get("tokenizerFactory");
+    TokenizerFactory tokFactory = null;
+    if( tf != null ){
+      tokFactory = loadTokenizerFactory( loader, tf, args );
+    }
+
     if (synonyms != null) {
       List<String> wlist=null;
       try {
@@ -46,7 +56,7 @@
         throw new RuntimeException(e);
       }
       synMap = new SynonymMap(ignoreCase);
-      parseRules(wlist, synMap, "=>", ",", expand);
+      parseRules(wlist, synMap, "=>", ",", expand,tokFactory);
       if (wlist.size()<=20) {
         SolrCore.log.fine("SynonymMap "+synonyms +":"+synMap);
       }
@@ -55,7 +65,8 @@
 
   private SynonymMap synMap;
 
-  private static void parseRules(List<String> rules, SynonymMap map, String mappingSep, String synSep, boolean expansion) {
+  static void parseRules(List<String> rules, SynonymMap map, String mappingSep,
+    String synSep, boolean expansion, TokenizerFactory tokFactory) {
     int count=0;
     for (String rule : rules) {
       // To use regexes, we need an expression that specifies an odd number of chars.
@@ -71,10 +82,10 @@
       if (mapping.size() > 2) {
         throw new RuntimeException("Invalid Synonym Rule:" + rule);
       } else if (mapping.size()==2) {
-        source = getSynList(mapping.get(0), synSep);
-        target = getSynList(mapping.get(1), synSep);
+        source = getSynList(mapping.get(0), synSep, tokFactory);
+        target = getSynList(mapping.get(1), synSep, tokFactory);
       } else {
-        source = getSynList(mapping.get(0), synSep);
+        source = getSynList(mapping.get(0), synSep, tokFactory);
         if (expansion) {
           // expand to all arguments
           target = source;
@@ -100,21 +111,48 @@
   }
 
   // a , b c , d e f => [[a],[b,c],[d,e,f]]
-  private static List<List<String>> getSynList(String str, String separator) {
+  private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
     List<String> strList = StrUtils.splitSmart(str, separator, false);
     // now split on whitespace to get a list of token strings
     List<List<String>> synList = new ArrayList<List<String>>();
     for (String toks : strList) {
-      List<String> tokList = StrUtils.splitWS(toks, true);
+      List<String> tokList = tokFactory == null ?
+        StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
       synList.add(tokList);
     }
     return synList;
   }
+  
+  private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory){
+    StringReader reader = new StringReader( source );
+    TokenStream ts = loadTokenizer(tokFactory, reader);
+    List<String> tokList = new ArrayList<String>();
+    try {
+      for( Token token = ts.next(); token != null; token = ts.next() ){
+        String text = token.termText();
+        if( text.length() > 0 )
+          tokList.add( text );
+      }
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+    finally{
+      reader.close();
+    }
+    return tokList;
+  }
 
+  private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args){
+    TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname );
+    tokFactory.init( args );
+    return tokFactory;
+  }
+  
+  private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){
+    return tokFactory.create( reader );
+  }
 
   public SynonymFilter create(TokenStream input) {
     return new SynonymFilter(input,synMap);
   }
-
-
 }

Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestSynonymMap.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestSynonymMap.java?rev=657829&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestSynonymMap.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestSynonymMap.java Mon May 19 06:38:38 2008
@@ -0,0 +1,271 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Token;
+
+public class TestSynonymMap extends AnalysisTestCase {
+
+  public void testInvalidMappingRules() throws Exception {
+    SynonymMap synMap = new SynonymMap( true );
+    List<String> rules = new ArrayList<String>( 1 );
+    rules.add( "a=>b=>c" );
+    try{
+        SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+        fail( "RuntimeException must be thrown." );
+    }
+    catch( RuntimeException expected ){}
+  }
+  
+  public void testReadMappingRules() throws Exception {
+	SynonymMap synMap;
+
+    // (a)->[b]
+    List<String> rules = new ArrayList<String>();
+    rules.add( "a=>b" );
+    synMap = new SynonymMap( true );
+    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    assertEquals( 1, synMap.submap.size() );
+    assertTokIncludes( synMap, "a", "b" );
+
+    // (a)->[c]
+    // (b)->[c]
+    rules.clear();
+    rules.add( "a,b=>c" );
+    synMap = new SynonymMap( true );
+    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    assertEquals( 2, synMap.submap.size() );
+    assertTokIncludes( synMap, "a", "c" );
+    assertTokIncludes( synMap, "b", "c" );
+
+    // (a)->[b][c]
+    rules.clear();
+    rules.add( "a=>b,c" );
+    synMap = new SynonymMap( true );
+    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    assertEquals( 1, synMap.submap.size() );
+    assertTokIncludes( synMap, "a", "b" );
+    assertTokIncludes( synMap, "a", "c" );
+
+    // (a)->(b)->[a2]
+    //      [a1]
+    rules.clear();
+    rules.add( "a=>a1" );
+    rules.add( "a b=>a2" );
+    synMap = new SynonymMap( true );
+    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    assertEquals( 1, synMap.submap.size() );
+    assertTokIncludes( synMap, "a", "a1" );
+    assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() );
+    assertTokIncludes( getSubSynonymMap( synMap, "a" ), "b", "a2" );
+
+    // (a)->(b)->[a2]
+    //      (c)->[a3]
+    //      [a1]
+    rules.clear();
+    rules.add( "a=>a1" );
+    rules.add( "a b=>a2" );
+    rules.add( "a c=>a3" );
+    synMap = new SynonymMap( true );
+    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    assertEquals( 1, synMap.submap.size() );
+    assertTokIncludes( synMap, "a", "a1" );
+    assertEquals( 2, getSubSynonymMap( synMap, "a" ).submap.size() );
+    assertTokIncludes( getSubSynonymMap( synMap, "a" ), "b", "a2" );
+    assertTokIncludes( getSubSynonymMap( synMap, "a" ), "c", "a3" );
+
+    // (a)->(b)->[a2]
+    //      [a1]
+    // (b)->(c)->[b2]
+    //      [b1]
+    rules.clear();
+    rules.add( "a=>a1" );
+    rules.add( "a b=>a2" );
+    rules.add( "b=>b1" );
+    rules.add( "b c=>b2" );
+    synMap = new SynonymMap( true );
+    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    assertEquals( 2, synMap.submap.size() );
+    assertTokIncludes( synMap, "a", "a1" );
+    assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() );
+    assertTokIncludes( getSubSynonymMap( synMap, "a" ), "b", "a2" );
+    assertTokIncludes( synMap, "b", "b1" );
+    assertEquals( 1, getSubSynonymMap( synMap, "b" ).submap.size() );
+    assertTokIncludes( getSubSynonymMap( synMap, "b" ), "c", "b2" );
+  }
+  
+  public void testRead1waySynonymRules() throws Exception {
+    SynonymMap synMap;
+
+    // (a)->[a]
+    // (b)->[a]
+    List<String> rules = new ArrayList<String>();
+    rules.add( "a,b" );
+    synMap = new SynonymMap( true );
+    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+    assertEquals( 2, synMap.submap.size() );
+    assertTokIncludes( synMap, "a", "a" );
+    assertTokIncludes( synMap, "b", "a" );
+
+    // (a)->[a]
+    // (b)->[a]
+    // (c)->[a]
+    rules.clear();
+    rules.add( "a,b,c" );
+    synMap = new SynonymMap( true );
+    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+    assertEquals( 3, synMap.submap.size() );
+    assertTokIncludes( synMap, "a", "a" );
+    assertTokIncludes( synMap, "b", "a" );
+    assertTokIncludes( synMap, "c", "a" );
+
+    // (a)->[a]
+    // (b1)->(b2)->[a]
+    rules.clear();
+    rules.add( "a,b1 b2" );
+    synMap = new SynonymMap( true );
+    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+    assertEquals( 2, synMap.submap.size() );
+    assertTokIncludes( synMap, "a", "a" );
+    assertEquals( 1, getSubSynonymMap( synMap, "b1" ).submap.size() );
+    assertTokIncludes( getSubSynonymMap( synMap, "b1" ), "b2", "a" );
+
+    // (a1)->(a2)->[a1][a2]
+    // (b)->[a1][a2]
+    rules.clear();
+    rules.add( "a1 a2,b" );
+    synMap = new SynonymMap( true );
+    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+    assertEquals( 2, synMap.submap.size() );
+    assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() );
+    assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" );
+    assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a2" );
+    assertTokIncludes( synMap, "b", "a1" );
+    assertTokIncludes( synMap, "b", "a2" );
+  }
+  
+  public void testRead2waySynonymRules() throws Exception {
+    SynonymMap synMap;
+
+    // (a)->[a][b]
+    // (b)->[a][b]
+    List<String> rules = new ArrayList<String>();
+    rules.add( "a,b" );
+    synMap = new SynonymMap( true );
+    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    assertEquals( 2, synMap.submap.size() );
+    assertTokIncludes( synMap, "a", "a" );
+    assertTokIncludes( synMap, "a", "b" );
+    assertTokIncludes( synMap, "b", "a" );
+    assertTokIncludes( synMap, "b", "b" );
+
+    // (a)->[a][b][c]
+    // (b)->[a][b][c]
+    // (c)->[a][b][c]
+    rules.clear();
+    rules.add( "a,b,c" );
+    synMap = new SynonymMap( true );
+    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    assertEquals( 3, synMap.submap.size() );
+    assertTokIncludes( synMap, "a", "a" );
+    assertTokIncludes( synMap, "a", "b" );
+    assertTokIncludes( synMap, "a", "c" );
+    assertTokIncludes( synMap, "b", "a" );
+    assertTokIncludes( synMap, "b", "b" );
+    assertTokIncludes( synMap, "b", "c" );
+    assertTokIncludes( synMap, "c", "a" );
+    assertTokIncludes( synMap, "c", "b" );
+    assertTokIncludes( synMap, "c", "c" );
+
+    // (a)->[a]
+    //      [b1][b2]
+    // (b1)->(b2)->[a]
+    //             [b1][b2]
+    rules.clear();
+    rules.add( "a,b1 b2" );
+    synMap = new SynonymMap( true );
+    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    assertEquals( 2, synMap.submap.size() );
+    assertTokIncludes( synMap, "a", "a" );
+    assertTokIncludes( synMap, "a", "b1" );
+    assertTokIncludes( synMap, "a", "b2" );
+    assertEquals( 1, getSubSynonymMap( synMap, "b1" ).submap.size() );
+    assertTokIncludes( getSubSynonymMap( synMap, "b1" ), "b2", "a" );
+    assertTokIncludes( getSubSynonymMap( synMap, "b1" ), "b2", "b1" );
+    assertTokIncludes( getSubSynonymMap( synMap, "b1" ), "b2", "b2" );
+
+    // (a1)->(a2)->[a1][a2]
+    //             [b]
+    // (b)->[a1][a2]
+    //      [b]
+    rules.clear();
+    rules.add( "a1 a2,b" );
+    synMap = new SynonymMap( true );
+    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    assertEquals( 2, synMap.submap.size() );
+    assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() );
+    assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" );
+    assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a2" );
+    assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "b" );
+    assertTokIncludes( synMap, "b", "a1" );
+    assertTokIncludes( synMap, "b", "a2" );
+    assertTokIncludes( synMap, "b", "b" );
+  }
+  
+  public void testBigramTokenizer() throws Exception {
+	SynonymMap synMap;
+	
+	// prepare bi-gram tokenizer factory
+	BaseTokenizerFactory tf = new NGramTokenizerFactory();
+	Map<String, String> args = new HashMap<String, String>();
+	args.put("minGramSize","2");
+	args.put("maxGramSize","2");
+	tf.init( args );
+
+    // (ab)->(bc)->(cd)->[ef][fg][gh]
+    List<String> rules = new ArrayList<String>();
+    rules.add( "abcd=>efgh" );
+    synMap = new SynonymMap( true );
+    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
+    assertEquals( 1, synMap.submap.size() );
+    assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
+    assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
+    assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "ef" );
+    assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "fg" );
+    assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "gh" );
+  }
+  
+  private void assertTokIncludes( SynonymMap map, String src, String exp ) throws Exception {
+    Token[] tokens = ((SynonymMap)map.submap.get( src )).synonyms;
+    boolean inc = false;
+    for( Token token : tokens ){
+      if( exp.equals( token.termText() ) )
+        inc = true;
+    }
+    assertTrue( inc );
+  }
+  
+  private SynonymMap getSubSynonymMap( SynonymMap map, String src ){
+    return (SynonymMap)map.submap.get( src );
+  }
+}