You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2014/03/24 14:47:28 UTC
svn commit: r1580853 - in /lucene/dev/trunk/lucene: ./
analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/
analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/
Author: dweiss
Date: Mon Mar 24 13:47:28 2014
New Revision: 1580853
URL: http://svn.apache.org/r1580853
Log:
LUCENE-5356: Morfologik filter can accept custom dictionary resources.
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java
lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1580853&r1=1580852&r2=1580853&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Mon Mar 24 13:47:28 2014
@@ -76,6 +76,9 @@ Changes in Runtime Behavior
New Features
+* LUCENE-5356: Morfologik filter can accept custom dictionary resources.
+ (Michal Hlavac, Dawid Weiss)
+
* LUCENE-5454: Add SortedSetSortField to lucene/sandbox, to allow sorting
on multi-valued field. (Robert Muir)
Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java?rev=1580853&r1=1580852&r2=1580853&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java Mon Mar 24 13:47:28 2014
@@ -31,26 +31,34 @@ import org.apache.lucene.util.Version;
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
*/
public class MorfologikAnalyzer extends Analyzer {
+ private final String dictionary;
private final Version version;
/**
* Builds an analyzer with the default Morfologik's dictionary (polimorf).
*
- * @param version
- * Lucene compatibility version
+ * @param version Lucene compatibility version
+ * @param dictionaryResource A constant specifying which dictionary to choose. The
+ * dictionary resource must be named <code>morfologik/dictionaries/{dictionaryResource}.dict</code>
+ * and have an associated <code>.info</code> metadata file. See the Morfologik project
+ * for details.
+ *
+ * @see "http://morfologik.blogspot.com/"
*/
- public MorfologikAnalyzer(final Version version) {
+ public MorfologikAnalyzer(final Version version, final String dictionaryResource) {
this.version = version;
+ this.dictionary = dictionaryResource;
+ }
+ public MorfologikAnalyzer(final Version version) {
+ this(version, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
}
-
/**
* Creates a
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* which tokenizes all the text in the provided {@link Reader}.
*
* @param field ignored field name
- * @return A
- * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
+ * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter} and {@link MorfologikFilter}.
*/
@@ -60,6 +68,6 @@ public class MorfologikAnalyzer extends
return new TokenStreamComponents(
src,
- new MorfologikFilter(new StandardFilter(this.version, src), this.version));
+ new MorfologikFilter(new StandardFilter(this.version, src), dictionary, this.version));
}
}
Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java?rev=1580853&r1=1580852&r2=1580853&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java Mon Mar 24 13:47:28 2014
@@ -61,20 +61,29 @@ public class MorfologikFilter extends To
private int lemmaListIndex;
/**
- * Creates MorfologikFilter
- * @param in input token stream
- * @param version Lucene version compatibility for lowercasing.
+ * Creates a filter with the default (Polish) dictionary.
*/
public MorfologikFilter(final TokenStream in, final Version version) {
+ this(in, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE, version);
+ }
+
+ /**
+ * Creates a filter with a given dictionary resource.
+ *
+ * @param in input token stream.
+ * @param dict Dictionary resource from classpath.
+ * @param version Lucene version compatibility for lowercasing.
+ */
+ public MorfologikFilter(final TokenStream in, final String dict, final Version version) {
super(in);
this.input = in;
-
+
// SOLR-4007: temporarily substitute context class loader to allow finding dictionary resources.
Thread me = Thread.currentThread();
ClassLoader cl = me.getContextClassLoader();
try {
- me.setContextClassLoader(PolishStemmer.class.getClassLoader());
- this.stemmer = new PolishStemmer();
+ me.setContextClassLoader(morfologik.stemming.Dictionary.class.getClassLoader());
+ this.stemmer = new DictionaryLookup(morfologik.stemming.Dictionary.getForLanguage(dict));
this.charUtils = CharacterUtils.getInstance(version);
this.lemmaList = Collections.emptyList();
} finally {
Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java?rev=1580853&r1=1580852&r2=1580853&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java Mon Mar 24 13:47:28 2014
@@ -23,22 +23,37 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
- * Filter factory for {@link MorfologikFilter}.
+ * Filter factory for {@link MorfologikFilter}. For backward compatibility, the Polish
+ * dictionary is used by default. You can select a different dictionary resource
+ * with the dictionary-resource parameter.
* <pre class="prettyprint">
* <fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- * <filter class="solr.MorfologikFilterFactory" />
+ * <filter class="solr.MorfologikFilterFactory" dictionary-resource="pl" />
* </analyzer>
* </fieldType></pre>
*
* @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
*/
public class MorfologikFilterFactory extends TokenFilterFactory {
+ /**
+ * The default dictionary resource (for Polish).
+ */
+ public static final String DEFAULT_DICTIONARY_RESOURCE = "pl";
+
+ /**
+ * Stemming dictionary resource. See {@link MorfologikAnalyzer} for more details.
+ */
+ private final String dictionaryResource;
+
/** Schema attribute. */
@Deprecated
public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary";
+ /** Schema attribute specifying the dictionary resource. */
+ public static final String DICTIONARY_RESOURCE_ATTRIBUTE = "dictionary-resource";
+
/** Creates a new MorfologikFilterFactory */
public MorfologikFilterFactory(Map<String,String> args) {
super(args);
@@ -47,9 +62,12 @@ public class MorfologikFilterFactory ext
String dictionaryName = get(args, DICTIONARY_SCHEMA_ATTRIBUTE);
if (dictionaryName != null && !dictionaryName.isEmpty()) {
throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute is no "
- + "longer supported (Morfologik has one dictionary): " + dictionaryName);
+ + "longer supported (Morfologik now offers one unified Polish dictionary): " + dictionaryName
+ + ". Perhaps you wanted to use 'dictionary-resource' attribute instead?");
}
+ dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
+
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -57,6 +75,6 @@ public class MorfologikFilterFactory ext
@Override
public TokenStream create(TokenStream ts) {
- return new MorfologikFilter(ts, luceneMatchVersion);
+ return new MorfologikFilter(ts, dictionaryResource, luceneMatchVersion);
}
}
Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java?rev=1580853&r1=1580852&r2=1580853&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java Mon Mar 24 13:47:28 2014
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.morfo
*/
import java.io.IOException;
-import java.io.Reader;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java?rev=1580853&r1=1580852&r2=1580853&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java Mon Mar 24 13:47:28 2014
@@ -22,7 +22,6 @@ import java.util.Collections;
import java.util.HashMap;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
/**
@@ -40,9 +39,9 @@ public class TestMorfologikFilterFactory
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {
- new MorfologikFilterFactory(new HashMap<String,String>() {{
- put("bogusArg", "bogusValue");
- }});
+ HashMap<String,String> map = new HashMap<String,String>();
+ map.put("bogusArg", "bogusValue");
+ new MorfologikFilterFactory(map);
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("Unknown parameters"));