You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2014/03/24 14:47:28 UTC

svn commit: r1580853 - in /lucene/dev/trunk/lucene: ./ analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/ analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/

Author: dweiss
Date: Mon Mar 24 13:47:28 2014
New Revision: 1580853

URL: http://svn.apache.org/r1580853
Log:
LUCENE-5356: Morfologik filter can accept custom dictionary resources.

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
    lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
    lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java
    lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
    lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1580853&r1=1580852&r2=1580853&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Mon Mar 24 13:47:28 2014
@@ -76,6 +76,9 @@ Changes in Runtime Behavior
 
 New Features
 
+* LUCENE-5356: Morfologik filter can accept custom dictionary resources.
+  (Michal Hlavac, Dawid Weiss)
+
 * LUCENE-5454: Add SortedSetSortField to lucene/sandbox, to allow sorting
   on multi-valued field. (Robert Muir)
 

Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java?rev=1580853&r1=1580852&r2=1580853&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java Mon Mar 24 13:47:28 2014
@@ -31,26 +31,34 @@ import org.apache.lucene.util.Version;
  * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
  */
 public class MorfologikAnalyzer extends Analyzer {
+  private final String dictionary;
   private final Version version;
 
   /**
    * Builds an analyzer with the default Morfologik's dictionary (polimorf).
    * 
-   * @param version
-   *          Lucene compatibility version
+   * @param version Lucene compatibility version
+   * @param dictionaryResource A constant specifying which dictionary to choose. The
+   * dictionary resource must be named <code>morfologik/dictionaries/{dictionaryResource}.dict</code>
+   * and have an associated <code>.info</code> metadata file. See the Morfologik project
+   * for details.
+   * 
+   * @see "http://morfologik.blogspot.com/"
    */
-  public MorfologikAnalyzer(final Version version) {
+  public MorfologikAnalyzer(final Version version, final String dictionaryResource) {
     this.version = version;
+      this.dictionary = dictionaryResource;
+  }
+  public MorfologikAnalyzer(final Version version) {
+    this(version, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
   }
-
   /**
    * Creates a
    * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    * which tokenizes all the text in the provided {@link Reader}.
    * 
    * @param field ignored field name
-   * @return A
-   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
+   * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    *         built from an {@link StandardTokenizer} filtered with
    *         {@link StandardFilter} and {@link MorfologikFilter}.
    */
@@ -60,6 +68,6 @@ public class MorfologikAnalyzer extends 
     
     return new TokenStreamComponents(
         src, 
-        new MorfologikFilter(new StandardFilter(this.version, src), this.version));
+        new MorfologikFilter(new StandardFilter(this.version, src), dictionary, this.version));
   }
 }

Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java?rev=1580853&r1=1580852&r2=1580853&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java Mon Mar 24 13:47:28 2014
@@ -61,20 +61,29 @@ public class MorfologikFilter extends To
   private int lemmaListIndex;
 
   /**
-   * Creates MorfologikFilter
-   * @param in   input token stream
-   * @param version Lucene version compatibility for lowercasing.
+   * Creates a filter with the default (Polish) dictionary.
    */
   public MorfologikFilter(final TokenStream in, final Version version) {
+    this(in, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE, version);
+  }
+
+  /**
+   * Creates a filter with a given dictionary resource.
+   *
+   * @param in input token stream.
+   * @param dict Dictionary resource from classpath.
+   * @param version Lucene version compatibility for lowercasing.
+   */
+  public MorfologikFilter(final TokenStream in, final String dict, final Version version) {
     super(in);
     this.input = in;
-    
+
     // SOLR-4007: temporarily substitute context class loader to allow finding dictionary resources.
     Thread me = Thread.currentThread();
     ClassLoader cl = me.getContextClassLoader();
     try {
-      me.setContextClassLoader(PolishStemmer.class.getClassLoader());
-      this.stemmer = new PolishStemmer();
+      me.setContextClassLoader(morfologik.stemming.Dictionary.class.getClassLoader());
+      this.stemmer = new DictionaryLookup(morfologik.stemming.Dictionary.getForLanguage(dict));
       this.charUtils = CharacterUtils.getInstance(version);
       this.lemmaList = Collections.emptyList();
     } finally {

Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java?rev=1580853&r1=1580852&r2=1580853&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java Mon Mar 24 13:47:28 2014
@@ -23,22 +23,37 @@ import org.apache.lucene.analysis.TokenS
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 
 /**
- * Filter factory for {@link MorfologikFilter}.
+ * Filter factory for {@link MorfologikFilter}. For backward compatibility polish
+ * dictionary is used as default. You can change dictionary resource 
+ * by dictionary-resource parameter.
  * <pre class="prettyprint">
  * &lt;fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100"&gt;
  *   &lt;analyzer&gt;
  *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
- *     &lt;filter class="solr.MorfologikFilterFactory" /&gt;
+ *     &lt;filter class="solr.MorfologikFilterFactory" dictionary-resource="pl" /&gt;
  *   &lt;/analyzer&gt;
  * &lt;/fieldType&gt;</pre>
  * 
  * @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
  */
 public class MorfologikFilterFactory extends TokenFilterFactory {
+  /**
+   * The default dictionary resource (for Polish). 
+   */
+  public static final String DEFAULT_DICTIONARY_RESOURCE = "pl";
+
+  /**
+   * Stemming dictionary resource. See {@link MorfologikAnalyzer} for more details. 
+   */
+  private final String dictionaryResource;
+
   /** Schema attribute. */
   @Deprecated
   public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary";
 
+  /** Dictionary resource */
+  public static final String DICTIONARY_RESOURCE_ATTRIBUTE = "dictionary-resource";
+
   /** Creates a new MorfologikFilterFactory */
   public MorfologikFilterFactory(Map<String,String> args) {
     super(args);
@@ -47,9 +62,12 @@ public class MorfologikFilterFactory ext
     String dictionaryName = get(args, DICTIONARY_SCHEMA_ATTRIBUTE);
     if (dictionaryName != null && !dictionaryName.isEmpty()) {
       throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute is no "
-          + "longer supported (Morfologik has one dictionary): " + dictionaryName);
+          + "longer supported (Morfologik now offers one unified Polish dictionary): " + dictionaryName
+          + ". Perhaps you wanted to use 'dictionary-resource' attribute instead?");
     }
 
+    dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
+    
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -57,6 +75,6 @@ public class MorfologikFilterFactory ext
 
   @Override
   public TokenStream create(TokenStream ts) {
-    return new MorfologikFilter(ts, luceneMatchVersion);
+    return new MorfologikFilter(ts, dictionaryResource, luceneMatchVersion);
   }
 }

Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java?rev=1580853&r1=1580852&r2=1580853&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java Mon Mar 24 13:47:28 2014
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.morfo
  */
 
 import java.io.IOException;
-import java.io.Reader;
 import java.util.TreeSet;
 
 import org.apache.lucene.analysis.Analyzer;

Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java?rev=1580853&r1=1580852&r2=1580853&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java Mon Mar 24 13:47:28 2014
@@ -22,7 +22,6 @@ import java.util.Collections;
 import java.util.HashMap;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 
 /**
@@ -40,9 +39,9 @@ public class TestMorfologikFilterFactory
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
     try {
-      new MorfologikFilterFactory(new HashMap<String,String>() {{
-        put("bogusArg", "bogusValue");
-      }});
+      HashMap<String,String> map = new HashMap<String,String>();
+      map.put("bogusArg", "bogusValue");
+      new MorfologikFilterFactory(map);
       fail();
     } catch (IllegalArgumentException expected) {
       assertTrue(expected.getMessage().contains("Unknown parameters"));