You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2015/09/09 23:48:38 UTC
svn commit: r1702118 - in /lucene/dev/trunk/lucene: ./
analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/
analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/
Author: uschindler
Date: Wed Sep 9 21:48:38 2015
New Revision: 1702118
URL: http://svn.apache.org/r1702118
Log:
LUCENE-6775: Improved MorfologikFilterFactory to allow loading of custom dictionaries from ResourceLoader
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java
lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1702118&r1=1702117&r2=1702118&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed Sep 9 21:48:38 2015
@@ -144,6 +144,9 @@ Other
* LUCENE-6761: MatchAllDocsQuery's Scorers do not expose approximations
anymore. (Adrien Grand)
+* LUCENE-6775: Improved MorfologikFilterFactory to allow loading of
+ custom dictionaries from ResourceLoader. (Uwe Schindler)
+
Build
* LUCENE-6732: Improve checker for invalid source patterns to also
Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java?rev=1702118&r1=1702117&r2=1702118&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java Wed Sep 9 21:48:38 2015
@@ -17,15 +17,23 @@ package org.apache.lucene.analysis.morfo
* limitations under the License.
*/
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Locale;
import java.util.Map;
+import java.util.Objects;
+
+import morfologik.stemming.Dictionary;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Filter factory for {@link MorfologikFilter}. For backward compatibility polish
* dictionary is used as default. You can change dictionary resource
- * by dictionary-resource parameter.
+ * by dictionary-resource parameter:
* <pre class="prettyprint">
* <fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -34,27 +42,63 @@ import org.apache.lucene.analysis.util.T
* </analyzer>
* </fieldType></pre>
*
+ * <p>Alternatively, you can pass in the filenames of the FSA ({@code ".dict"}) and features ({@code ".info"}) files
+ * (if the features file is not given, its name is derived from the FSA file):
+ * <pre class="prettyprint">
+ * <fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer>
+ * <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ * <filter class="solr.MorfologikFilterFactory" dictionary-fsa-file="mylang.dict" dictionary-features-file="mylang.info" />
+ * </analyzer>
+ * </fieldType></pre>
+ *
* @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
*/
-public class MorfologikFilterFactory extends TokenFilterFactory {
+public class MorfologikFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
/**
* The default dictionary resource (for Polish).
*/
public static final String DEFAULT_DICTIONARY_RESOURCE = "pl";
- /**
- * Stemming dictionary resource. See {@link MorfologikAnalyzer} for more details.
- */
- private final String dictionaryResource;
-
/** Dictionary resource */
public static final String DICTIONARY_RESOURCE_ATTRIBUTE = "dictionary-resource";
+ /** Dictionary FSA file (should have {@code ".dict"} suffix), loaded from {@link ResourceLoader}. */
+ public static final String DICTIONARY_FSA_FILE_ATTRIBUTE = "dictionary-fsa-file";
+
+ /** Dictionary features/properties file, loaded from {@link ResourceLoader}. If not given, this
+ * loads the file with the same name as {@link #DICTIONARY_FSA_FILE_ATTRIBUTE}, but with
+ * {@code ".info"} suffix.
+ */
+ public static final String DICTIONARY_FEATURES_FILE_ATTRIBUTE = "dictionary-features-file";
+
+ private final String dictionaryFsaFile, dictionaryFeaturesFile, dictionaryResource;
+ private Dictionary dictionary; // initialized on inform()
+
/** Creates a new MorfologikFilterFactory */
public MorfologikFilterFactory(Map<String,String> args) {
super(args);
- dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
+ // first check FSA and features (at least FSA must be given, features name is guessed):
+ dictionaryFsaFile = get(args, DICTIONARY_FSA_FILE_ATTRIBUTE);
+ dictionaryFeaturesFile = get(args, DICTIONARY_FEATURES_FILE_ATTRIBUTE,
+ (dictionaryFsaFile == null) ? null : Dictionary.getExpectedFeaturesName(dictionaryFsaFile));
+
+ if (dictionaryFsaFile == null && dictionaryFeaturesFile == null) {
+ // if we have no FSA/features combination, we resolve the classpath resource:
+ dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
+ } else if (dictionaryFsaFile == null || dictionaryFeaturesFile == null) {
+ // if we have incomplete FSA/features tuple in args
+ throw new IllegalArgumentException(String.format(Locale.ENGLISH, "Missing '%s' or '%s' attribute.",
+ DICTIONARY_FSA_FILE_ATTRIBUTE, DICTIONARY_FEATURES_FILE_ATTRIBUTE));
+ } else {
+ dictionaryResource = null;
+ if (get(args, DICTIONARY_RESOURCE_ATTRIBUTE) != null) {
+ // fail if both are given: FSA/features files + classpath resource
+ throw new IllegalArgumentException(String.format(Locale.ENGLISH, "Cannot give '%s' and '%s'/'%s' at the same time.",
+ DICTIONARY_RESOURCE_ATTRIBUTE, DICTIONARY_FSA_FILE_ATTRIBUTE, DICTIONARY_FEATURES_FILE_ATTRIBUTE));
+ }
+ }
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
@@ -62,7 +106,22 @@ public class MorfologikFilterFactory ext
}
@Override
+ public void inform(ResourceLoader loader) throws IOException {
+ if (dictionaryFsaFile != null) {
+ assert dictionaryFeaturesFile != null;
+ assert dictionaryResource == null;
+ try (final InputStream dictIn = loader.openResource(dictionaryFsaFile);
+ final InputStream metaIn = loader.openResource(dictionaryFeaturesFile)) {
+ this.dictionary = Dictionary.readAndClose(dictIn, metaIn);
+ }
+ } else {
+ assert dictionaryResource != null;
+ this.dictionary = MorfologikFilter.loadDictionaryResource(dictionaryResource);
+ }
+ }
+
+ @Override
public TokenStream create(TokenStream ts) {
- return new MorfologikFilter(ts, dictionaryResource);
+ return new MorfologikFilter(ts, Objects.requireNonNull(dictionary, "MorfologikFilterFactory was not fully initialized."));
}
}
Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java?rev=1702118&r1=1702117&r2=1702118&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java Wed Sep 9 21:48:38 2015
@@ -20,17 +20,57 @@ package org.apache.lucene.analysis.morfo
import java.io.StringReader;
import java.util.Collections;
import java.util.HashMap;
+import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoader;
/**
* Test for {@link MorfologikFilterFactory}.
*/
public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
- public void testCreateDictionary() throws Exception {
+ final ResourceLoader loader = new ClasspathResourceLoader(getClass());
+
+ public void testDefaultDictionary() throws Exception {
StringReader reader = new StringReader("rowery bilety");
MorfologikFilterFactory factory = new MorfologikFilterFactory(Collections.<String,String>emptyMap());
+ factory.inform(loader);
+ TokenStream stream = whitespaceMockTokenizer(reader);
+ stream = factory.create(stream);
+ assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
+ }
+
+ public void testResourceDictionary() throws Exception {
+ StringReader reader = new StringReader("rowery bilety");
+ Map<String,String> params = new HashMap<>();
+ params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
+ MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
+ factory.inform(loader);
+ TokenStream stream = whitespaceMockTokenizer(reader);
+ stream = factory.create(stream);
+ assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
+ }
+
+ public void testResourceLoaderDictionary1() throws Exception {
+ StringReader reader = new StringReader("rowery bilety");
+ Map<String,String> params = new HashMap<>();
+ params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
+ MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
+ factory.inform(loader);
+ TokenStream stream = whitespaceMockTokenizer(reader);
+ stream = factory.create(stream);
+ assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
+ }
+
+ public void testResourceLoaderDictionary2() throws Exception {
+ StringReader reader = new StringReader("rowery bilety");
+ Map<String,String> params = new HashMap<>();
+ params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
+ params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
+ MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
+ factory.inform(loader);
TokenStream stream = whitespaceMockTokenizer(reader);
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
@@ -39,12 +79,48 @@ public class TestMorfologikFilterFactory
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {
- HashMap<String,String> map = new HashMap<String,String>();
- map.put("bogusArg", "bogusValue");
- new MorfologikFilterFactory(map);
+ HashMap<String,String> params = new HashMap<String,String>();
+ params.put("bogusArg", "bogusValue");
+ new MorfologikFilterFactory(params);
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
+
+ public void testIncompatibleArgs1() throws Exception {
+ try {
+ HashMap<String,String> params = new HashMap<String,String>();
+ params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
+ params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
+ new MorfologikFilterFactory(params);
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("at the same time"));
+ }
+ }
+
+ public void testIncompatibleArgs2() throws Exception {
+ try {
+ HashMap<String,String> params = new HashMap<String,String>();
+ params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
+ params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
+ params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
+ new MorfologikFilterFactory(params);
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("at the same time"));
+ }
+ }
+
+ public void testMissingArgs1() throws Exception {
+ try {
+ HashMap<String,String> params = new HashMap<String,String>();
+ params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
+ new MorfologikFilterFactory(params);
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("Missing"));
+ }
+ }
}