You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2012/03/28 09:33:55 UTC

svn commit: r1306198 - in /lucene/dev/trunk: dev-tools/maven/solr/core/ lucene/contrib/ solr/ solr/contrib/analysis-extras/ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/

Author: dweiss
Date: Wed Mar 28 07:33:55 2012
New Revision: 1306198

URL: http://svn.apache.org/viewvc?rev=1306198&view=rev
Log:
SOLR-3272: Solr filter factory for MorfologikFilter (Polish lemmatisation).
(Rafał Kuć via Dawid Weiss, Steven Rowe, Uwe Schindler).

Added:
    lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/MorfologikFilterFactory.java
    lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java
Modified:
    lucene/dev/trunk/dev-tools/maven/solr/core/pom.xml.template
    lucene/dev/trunk/lucene/contrib/contrib-build.xml
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/common-build.xml
    lucene/dev/trunk/solr/contrib/analysis-extras/build.xml

Modified: lucene/dev/trunk/dev-tools/maven/solr/core/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/dev-tools/maven/solr/core/pom.xml.template?rev=1306198&r1=1306197&r2=1306198&view=diff
==============================================================================
--- lucene/dev/trunk/dev-tools/maven/solr/core/pom.xml.template (original)
+++ lucene/dev/trunk/dev-tools/maven/solr/core/pom.xml.template Wed Mar 28 07:33:55 2012
@@ -88,6 +88,11 @@
     </dependency>
     <dependency>
       <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-analyzers-morfologik</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
       <artifactId>lucene-analyzers-phonetic</artifactId>
       <version>${project.version}</version>
     </dependency>

Modified: lucene/dev/trunk/lucene/contrib/contrib-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/contrib-build.xml?rev=1306198&r1=1306197&r2=1306198&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/contrib-build.xml (original)
+++ lucene/dev/trunk/lucene/contrib/contrib-build.xml Wed Mar 28 07:33:55 2012
@@ -163,16 +163,31 @@
   </target>
 
   <property name="analyzers-uima.jar" value="${common.dir}/../modules/analysis/build/uima/lucene-analyzers-uima-${version}.jar"/>
-    <target name="check-analyzers-uima-uptodate" unless="analyzers-uima.uptodate">
-      <module-uptodate name="analysis/uima" jarfile="${analyzers-uima.jar}" property="analyzers-uima.uptodate"/>
-    </target>
-    <target name="jar-analyzers-uima" unless="analyzers-uima.uptodate" depends="check-analyzers-uima-uptodate">
-    	<ant dir="${common.dir}/../modules/analysis/uima" target="jar-core" inheritAll="false">
-        <propertyset refid="uptodate.and.compiled.properties"/>
-      </ant>
+  <target name="check-analyzers-uima-uptodate" unless="analyzers-uima.uptodate">
+    <module-uptodate name="analysis/uima" jarfile="${analyzers-uima.jar}" property="analyzers-uima.uptodate"/>
+  </target>
+  <target name="jar-analyzers-uima" unless="analyzers-uima.uptodate" depends="check-analyzers-uima-uptodate">
+    <ant dir="${common.dir}/../modules/analysis/uima" target="jar-core" inheritAll="false">
+      <propertyset refid="uptodate.and.compiled.properties"/>
+    </ant>
     <property name="analyzers-uima.uptodate" value="true"/>
   </target>
 
+  <property name="analyzers-morfologik.jar" value="${common.dir}/../modules/analysis/build/morfologik/lucene-analyzers-morfologik-${version}.jar"/>
+  <fileset id="analyzers-morfologik.fileset" dir="${common.dir}/../modules/analysis">
+    <include name="build/morfologik/lucene-analyzers-morfologik-${version}.jar" />
+    <include name="morfologik/lib/*.jar" />
+  </fileset>
+  <target name="check-analyzers-morfologik-uptodate" unless="analyzers-morfologik.uptodate">
+    <module-uptodate name="analysis/morfologik" jarfile="${analyzers-morfologik.jar}" property="analyzers-morfologik.uptodate"/>
+  </target>
+  <target name="jar-analyzers-morfologik" unless="analyzers-morfologik.uptodate" depends="check-analyzers-morfologik-uptodate">
+    <ant dir="${common.dir}/../modules/analysis/morfologik" target="jar-core" inheritAll="false">
+      <propertyset refid="uptodate.and.compiled.properties"/>
+    </ant>
+    <property name="analyzers-morfologik.uptodate" value="true"/>
+  </target>
+
   <property name="grouping.jar" value="${common.dir}/../modules/grouping/build/lucene-grouping-${version}.jar"/>
   <target name="check-grouping-uptodate" unless="grouping.uptodate">
     <module-uptodate name="grouping" jarfile="${grouping.jar}" property="grouping.uptodate"/>

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1306198&r1=1306197&r2=1306198&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Wed Mar 28 07:33:55 2012
@@ -1,4 +1,4 @@
-                      Apache Solr Release Notes
+                      Apache Solr Release Notes
 
 Introduction
 ------------
@@ -74,6 +74,9 @@ Detailed Change List
 New Features
 ----------------------
 
+* SOLR-3272: Solr filter factory for MorfologikFilter (Polish lemmatisation).
+  (Rafał Kuć via Dawid Weiss, Steven Rowe, Uwe Schindler).
+
 * SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now 
   supports "percentages" which get evaluated  relative the current size of 
   the cache when warming happens. 

Modified: lucene/dev/trunk/solr/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/common-build.xml?rev=1306198&r1=1306197&r2=1306198&view=diff
==============================================================================
--- lucene/dev/trunk/solr/common-build.xml (original)
+++ lucene/dev/trunk/solr/common-build.xml Wed Mar 28 07:33:55 2012
@@ -133,7 +133,7 @@
   </target>
 
   <target name="prep-lucene-jars" 
-  	      depends="jar-lucene-core, jar-analyzers-phonetic, jar-analyzers-kuromoji, jar-suggest, jar-highlighter, jar-memory,
+  	      depends="jar-lucene-core, jar-analyzers-phonetic, jar-analyzers-kuromoji, jar-analyzers-morfologik, jar-suggest, jar-highlighter, jar-memory,
   	               jar-misc, jar-spatial, jar-grouping, jar-queries, jar-queryparser">
   	  <property name="solr.deps.compiled" value="true"/>
   </target>
@@ -157,6 +157,7 @@
       <fileset file="${memory.jar}" />
       <fileset file="${misc.jar}" />
       <fileset file="${spatial.jar}" />
+      <fileset refid="analyzers-morfologik.fileset" />
     </copy>
     </sequential>
   </target>

Modified: lucene/dev/trunk/solr/contrib/analysis-extras/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/build.xml?rev=1306198&r1=1306197&r2=1306198&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/build.xml (original)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/build.xml Wed Mar 28 07:33:55 2012
@@ -29,19 +29,21 @@
   	<pathelement path="${analyzers-icu.jar}"/>
   	<pathelement path="${analyzers-smartcn.jar}"/>
   	<pathelement path="${analyzers-stempel.jar}"/>
+  	<fileset refid="analyzers-morfologik.fileset" />
     <path refid="solr.base.classpath"/>
   </path>
 
   <target name="module-jars-to-solr"
-          depends="jar-analyzers-icu, jar-analyzers-smartcn, jar-analyzers-stempel">
+          depends="jar-analyzers-icu, jar-analyzers-smartcn, jar-analyzers-stempel, jar-analyzers-morfologik">
     <mkdir dir="${build.dir}/lucene-libs"/>
     <copy todir="${build.dir}/lucene-libs" preservelastmodified="true" flatten="true" failonerror="true" overwrite="true">
       <fileset file="${analyzers-icu.jar}"/>
       <fileset file="${analyzers-smartcn.jar}"/>
       <fileset file="${analyzers-stempel.jar}"/>
+      <fileset refid="analyzers-morfologik.fileset" />
     </copy>
   </target>
 
-  <target name="compile-core" depends="jar-analyzers-icu, jar-analyzers-smartcn, jar-analyzers-stempel, solr-contrib-build.compile-core"/>
+  <target name="compile-core" depends="jar-analyzers-icu, jar-analyzers-smartcn, jar-analyzers-stempel, jar-analyzers-morfologik, solr-contrib-build.compile-core"/>
   <target name="dist" depends="module-jars-to-solr, common-solr.dist"/>
 </project>

Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/MorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/MorfologikFilterFactory.java?rev=1306198&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/MorfologikFilterFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/MorfologikFilterFactory.java Wed Mar 28 07:33:55 2012
@@ -0,0 +1,81 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.util.Arrays;
+import java.util.Locale;
+import java.util.Map;
+
+import morfologik.stemming.PolishStemmer.DICTIONARY;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.morfologik.MorfologikFilter;
+
+/**
+ * Filter factory for {@link MorfologikFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" /&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * 
+ * <p>Any of Morfologik dictionaries can be used, these are at the moment:
+ * <code>MORFOLOGIK</code> (Morfologik's original dictionary),
+ * <code>MORFEUSZ</code> (Morfeusz-SIAT),
+ * <code>COMBINED</code> (both of the dictionaries above, combined).
+ * 
+ * @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
+ */
+public class MorfologikFilterFactory extends BaseTokenFilterFactory {
+  /** Dictionary. */
+  private DICTIONARY dictionary = DICTIONARY.MORFOLOGIK;
+  
+  /** Schema attribute. */
+  public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary";
+  
+  /**
+   * {@inheritDoc}
+   */
+  @Override
+  public TokenStream create(TokenStream ts) {
+    return new MorfologikFilter(ts, dictionary, luceneMatchVersion);
+  }
+
+  /**
+   * {@inheritDoc}
+   */
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    String dictionaryName = args.get(DICTIONARY_SCHEMA_ATTRIBUTE);
+    if (dictionaryName != null && !dictionaryName.isEmpty()) {
+      try {
+        DICTIONARY dictionary = DICTIONARY.valueOf(dictionaryName.toUpperCase(Locale.ENGLISH));
+        assert dictionary != null;
+        this.dictionary = dictionary;
+      } catch (IllegalArgumentException e) {
+        throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute accepts the "
+            + "following constants: " + Arrays.toString(DICTIONARY.values()) + ", this value is invalid: "  
+            + dictionaryName);
+      }
+    }
+  }
+}

Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java?rev=1306198&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java Wed Mar 28 07:33:55 2012
@@ -0,0 +1,45 @@
+package org.apache.solr.analysis;
+
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.solr.schema.IndexSchema;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Test for {@link MorfologikFilterFactory}.
+ */
+public class TestMorfologikFilterFactory extends BaseTokenTestCase {
+  public void testCreateDictionary() throws Exception {
+    StringReader reader = new StringReader("rowery bilety");
+    Map<String,String> initParams = new HashMap<String,String>();
+    initParams.put(IndexSchema.LUCENE_MATCH_VERSION_PARAM,
+        DEFAULT_VERSION.toString());
+    initParams.put(MorfologikFilterFactory.DICTIONARY_SCHEMA_ATTRIBUTE,
+        "morfologik");
+    MorfologikFilterFactory factory = new MorfologikFilterFactory();
+    factory.init(initParams);
+    TokenStream ts = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION,
+        reader));
+    assertTokenStreamContents(ts, new String[] {"rower", "bilet"});
+  }
+}