You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2015/09/09 23:52:49 UTC

svn commit: r1702119 - in /lucene/dev/branches/branch_5x: ./ lucene/ lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/ lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/

Author: uschindler
Date: Wed Sep  9 21:52:48 2015
New Revision: 1702119

URL: http://svn.apache.org/r1702119
Log:
Merged revision(s) 1702118 from lucene/dev/trunk:
LUCENE-6775: Improved MorfologikFilterFactory to allow loading of custom dictionaries from ResourceLoader

Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/lucene/   (props changed)
    lucene/dev/branches/branch_5x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java
    lucene/dev/branches/branch_5x/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java

Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1702119&r1=1702118&r2=1702119&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Wed Sep  9 21:52:48 2015
@@ -94,6 +94,9 @@ Other
 * LUCENE-6761: MatchAllDocsQuery's Scorers do not expose approximations
   anymore. (Adrien Grand)
 
+* LUCENE-6775: Improved MorfologikFilterFactory to allow loading of
+  custom dictionaries from ResourceLoader.  (Uwe Schindler)
+
 Build
 
 * LUCENE-6732: Improve checker for invalid source patterns to also

Modified: lucene/dev/branches/branch_5x/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java?rev=1702119&r1=1702118&r2=1702119&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java Wed Sep  9 21:52:48 2015
@@ -17,15 +17,23 @@ package org.apache.lucene.analysis.morfo
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Locale;
 import java.util.Map;
+import java.util.Objects;
+
+import morfologik.stemming.Dictionary;
 
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 
 /**
  * Filter factory for {@link MorfologikFilter}. For backward compatibility polish
  * dictionary is used as default. You can change dictionary resource 
- * by dictionary-resource parameter.
+ * by dictionary-resource parameter:
  * <pre class="prettyprint">
  * &lt;fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100"&gt;
  *   &lt;analyzer&gt;
@@ -34,19 +42,24 @@ import org.apache.lucene.analysis.util.T
  *   &lt;/analyzer&gt;
  * &lt;/fieldType&gt;</pre>
  * 
+ * <p>Alternatively, you can pass in the filenames of the FSA ({@code ".dict"}) and features ({@code ".info"}) files
+ * (if the features file is not given, its name is derived from the FSA file):
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.MorfologikFilterFactory" dictionary-fsa-file="mylang.dict" dictionary-features-file="mylang.info" /&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * 
  * @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
  */
-public class MorfologikFilterFactory extends TokenFilterFactory {
+public class MorfologikFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
   /**
    * The default dictionary resource (for Polish). 
    */
   public static final String DEFAULT_DICTIONARY_RESOURCE = "pl";
 
-  /**
-   * Stemming dictionary resource. See {@link MorfologikAnalyzer} for more details. 
-   */
-  private final String dictionaryResource;
-
   /** Schema attribute. */
   @Deprecated
   public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary";
@@ -54,6 +67,18 @@ public class MorfologikFilterFactory ext
   /** Dictionary resource */
   public static final String DICTIONARY_RESOURCE_ATTRIBUTE = "dictionary-resource";
 
+  /** Dictionary FSA file (should have {@code ".dict"} suffix), loaded from {@link ResourceLoader}. */
+  public static final String DICTIONARY_FSA_FILE_ATTRIBUTE = "dictionary-fsa-file";
+
+  /** Dictionary features/properties file, loaded from {@link ResourceLoader}. If not given, this
+   * loads the file with the same name as {@link #DICTIONARY_FSA_FILE_ATTRIBUTE}, but with
+   * the {@code ".info"} suffix.
+   */
+  public static final String DICTIONARY_FEATURES_FILE_ATTRIBUTE = "dictionary-features-file";
+
+  private final String dictionaryFsaFile, dictionaryFeaturesFile, dictionaryResource;
+  private Dictionary dictionary; // initialized on inform()
+
   /** Creates a new MorfologikFilterFactory */
   public MorfologikFilterFactory(Map<String,String> args) {
     super(args);
@@ -66,7 +91,26 @@ public class MorfologikFilterFactory ext
           + ". Perhaps you wanted to use 'dictionary-resource' attribute instead?");
     }
 
-    dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
+    // first check FSA and features (at least FSA must be given, features name is guessed):
+    dictionaryFsaFile = get(args, DICTIONARY_FSA_FILE_ATTRIBUTE);
+    dictionaryFeaturesFile = get(args, DICTIONARY_FEATURES_FILE_ATTRIBUTE,
+        (dictionaryFsaFile == null) ? null : Dictionary.getExpectedFeaturesName(dictionaryFsaFile));
+    
+    if (dictionaryFsaFile == null && dictionaryFeaturesFile == null) {
+      // if we have no FSA/features combination, we resolve the classpath resource:
+      dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
+    } else if (dictionaryFsaFile == null || dictionaryFeaturesFile == null) {
+      // if we have incomplete FSA/features tuple in args
+      throw new IllegalArgumentException(String.format(Locale.ENGLISH, "Missing '%s' or '%s' attribute.",
+          DICTIONARY_FSA_FILE_ATTRIBUTE, DICTIONARY_FEATURES_FILE_ATTRIBUTE));      
+    } else {
+      dictionaryResource = null;
+      if (get(args, DICTIONARY_RESOURCE_ATTRIBUTE) != null) {
+        // fail if both are given: FSA/features files + classpath resource
+        throw new IllegalArgumentException(String.format(Locale.ENGLISH, "Cannot give '%s' and '%s'/'%s' at the same time.",
+            DICTIONARY_RESOURCE_ATTRIBUTE, DICTIONARY_FSA_FILE_ATTRIBUTE, DICTIONARY_FEATURES_FILE_ATTRIBUTE));
+      }
+    }
     
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
@@ -74,7 +118,22 @@ public class MorfologikFilterFactory ext
   }
 
   @Override
+  public void inform(ResourceLoader loader) throws IOException {
+    if (dictionaryFsaFile != null) {
+      assert dictionaryFeaturesFile != null;
+      assert dictionaryResource == null;
+      try (final InputStream dictIn = loader.openResource(dictionaryFsaFile);
+          final InputStream metaIn = loader.openResource(dictionaryFeaturesFile)) {
+        this.dictionary = Dictionary.readAndClose(dictIn, metaIn);
+      }
+    } else {
+      assert dictionaryResource != null;
+      this.dictionary = MorfologikFilter.loadDictionaryResource(dictionaryResource);
+    }
+  }
+
+  @Override
   public TokenStream create(TokenStream ts) {
-    return new MorfologikFilter(ts, dictionaryResource);
+    return new MorfologikFilter(ts, Objects.requireNonNull(dictionary, "MorfologikFilterFactory was not fully initialized."));
   }
 }

Modified: lucene/dev/branches/branch_5x/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java?rev=1702119&r1=1702118&r2=1702119&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java Wed Sep  9 21:52:48 2015
@@ -20,17 +20,57 @@ package org.apache.lucene.analysis.morfo
 import java.io.StringReader;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.Map;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoader;
 
 /**
  * Test for {@link MorfologikFilterFactory}.
  */
 public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
-  public void testCreateDictionary() throws Exception {
+  final ResourceLoader loader = new ClasspathResourceLoader(getClass());
+
+  public void testDefaultDictionary() throws Exception {
     StringReader reader = new StringReader("rowery bilety");
     MorfologikFilterFactory factory = new MorfologikFilterFactory(Collections.<String,String>emptyMap());
+    factory.inform(loader);
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = factory.create(stream);
+    assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
+  }
+  
+  public void testResourceDictionary() throws Exception {
+    StringReader reader = new StringReader("rowery bilety");
+    Map<String,String> params = new HashMap<>();
+    params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
+    MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
+    factory.inform(loader);
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = factory.create(stream);
+    assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
+  }
+  
+  public void testResourceLoaderDictionary1() throws Exception {
+    StringReader reader = new StringReader("rowery bilety");
+    Map<String,String> params = new HashMap<>();
+    params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
+    MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
+    factory.inform(loader);
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = factory.create(stream);
+    assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
+  }
+  
+  public void testResourceLoaderDictionary2() throws Exception {
+    StringReader reader = new StringReader("rowery bilety");
+    Map<String,String> params = new HashMap<>();
+    params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
+    params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
+    MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
+    factory.inform(loader);
     TokenStream stream = whitespaceMockTokenizer(reader);
     stream = factory.create(stream);
     assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
@@ -39,12 +79,48 @@ public class TestMorfologikFilterFactory
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
     try {
-      HashMap<String,String> map = new HashMap<String,String>();
-      map.put("bogusArg", "bogusValue");
-      new MorfologikFilterFactory(map);
+      HashMap<String,String> params = new HashMap<String,String>();
+      params.put("bogusArg", "bogusValue");
+      new MorfologikFilterFactory(params);
       fail();
     } catch (IllegalArgumentException expected) {
       assertTrue(expected.getMessage().contains("Unknown parameters"));
     }
   }
+  
+  public void testIncompatibleArgs1() throws Exception {
+    try {
+      HashMap<String,String> params = new HashMap<String,String>();
+      params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
+      params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
+      new MorfologikFilterFactory(params);
+      fail();
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("at the same time"));
+    }
+  }
+  
+  public void testIncompatibleArgs2() throws Exception {
+    try {
+      HashMap<String,String> params = new HashMap<String,String>();
+      params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
+      params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
+      params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
+      new MorfologikFilterFactory(params);
+      fail();
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("at the same time"));
+    }
+  }
+  
+  public void testMissingArgs1() throws Exception {
+    try {
+      HashMap<String,String> params = new HashMap<String,String>();
+      params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
+      new MorfologikFilterFactory(params);
+      fail();
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("Missing"));
+    }
+  }
 }