You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2010/11/02 13:03:20 UTC
svn commit: r1030012 - in /lucene/dev/trunk/solr: ./
contrib/analysis-extras/ contrib/analysis-extras/lib/
contrib/analysis-extras/src/ contrib/analysis-extras/src/java/
contrib/analysis-extras/src/java/org/
contrib/analysis-extras/src/java/org/apache/...
Author: rmuir
Date: Tue Nov 2 12:03:18 2010
New Revision: 1030012
URL: http://svn.apache.org/viewvc?rev=1030012&view=rev
Log:
SOLR-2210: add factories for icu analyzers
Added:
lucene/dev/trunk/solr/contrib/analysis-extras/
lucene/dev/trunk/solr/contrib/analysis-extras/README.txt (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/build.xml (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/lib/
lucene/dev/trunk/solr/contrib/analysis-extras/lib/icu4j-4_4_2.jar (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/test-files/
Modified:
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/build.xml
lucene/dev/trunk/solr/common-build.xml
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1030012&r1=1030011&r2=1030012&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Tue Nov 2 12:03:18 2010
@@ -297,6 +297,8 @@ New Features
built-in load balancing, and infrastructure for future SolrCloud work.
(yonik, Mark Miller)
+* SOLR-2210: Add icu-based tokenizer and filters to contrib/analysis-extras (rmuir)
+
Optimizations
----------------------
Modified: lucene/dev/trunk/solr/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/build.xml?rev=1030012&r1=1030011&r2=1030012&view=diff
==============================================================================
--- lucene/dev/trunk/solr/build.xml (original)
+++ lucene/dev/trunk/solr/build.xml Tue Nov 2 12:03:18 2010
@@ -34,9 +34,6 @@
<property name="clover.db.dir" location="${dest}/tests/clover/db"/>
<property name="clover.report.dir" location="${dest}/tests/clover/reports"/>
- <!-- change this together with the default and test's solrconfig.xml after starting a new development branch: -->
- <property name="tests.luceneMatchVersion" value="4.0"/>
-
<available
property="clover.present"
classname="com.cenqua.clover.tasks.CloverReportTask"
@@ -221,6 +218,7 @@
<packageset dir="contrib/dataimporthandler/src/main/java" />
<!--<packageset dir="contrib/clustering/src/main/java" />-->
<packageset dir="contrib/extraction/src/main/java" />
+ <packageset dir="contrib/analysis-extras/src/java" />
<group title="Core" packages="org.apache.*" />
<group title="Common" packages="org.apache.solr.common.*" />
<group title="SolrJ" packages="org.apache.solr.client.solrj*" />
@@ -509,6 +507,7 @@
<fileset dir="contrib/dataimporthandler/src/main/java" />
<fileset dir="contrib/clustering/src/main/java" />
<fileset dir="contrib/extraction/src/main/java" />
+ <fileset dir="contrib/analysis-extras/src/java" />
</clover-setup>
</target>
@@ -609,6 +608,8 @@
basedir="contrib/extraction/src" />
<!--<solr-jar destfile="${dist}/apache-solr-clustering-src-${version}.jar"
basedir="contrib/clustering/src" />-->
+ <solr-jar destfile="${dist}/apache-solr-analysis-extras-src-${version}.jar"
+ basedir="contrib/analysis-extras/src" />
</target>
<target name="dist-javadoc" description="Creates the Solr javadoc distribution files"
@@ -625,6 +626,8 @@
basedir="${build.javadoc}/contrib-solr-clustering" />-->
<solr-jar destfile="${dist}/apache-solr-cell-docs-${version}.jar"
basedir="${build.javadoc}/contrib-solr-cell" />
+ <solr-jar destfile="${dist}/apache-solr-analysis-extras-docs-${version}.jar"
+ basedir="${build.javadoc}/contrib-solr-analysis-extras" />
</target>
<!-- Creates the solr jar. -->
@@ -721,7 +724,7 @@
<tarfileset dir="."
prefix="${fullnamever}"
includes="LICENSE.txt NOTICE.txt *.txt *.xml lucene-libs/** lib/** src/** example/** client/** contrib/"
- excludes="lib/README.committers.txt **/data/ **/logs/* **/classes/ **/*.sh **/bin/ src/scripts/ src/site/build/ **/target/ client/ruby/flare/ client/python contrib/**/build/ **/*.iml **/*.ipr **/*.iws contrib/clustering/example/lib/** contrib/clustering/lib/downloads/**" />
+ excludes="lib/README.committers.txt **/data/ **/logs/* **/classes/ **/*.sh **/bin/ src/scripts/ src/site/build/ **/target/ client/ruby/flare/ client/python contrib/**/build/ **/*.iml **/*.ipr **/*.iws contrib/clustering/example/lib/** contrib/clustering/lib/downloads/** contrib/analysis-extras/lib/**" />
<tarfileset dir="."
prefix="${fullnamever}"
includes="src/test/test-files/solr/lib/classes/empty-file-main-lib.txt" />
@@ -952,6 +955,8 @@
<fileset dir="contrib/clustering/src/test/java"/>
<fileset dir="contrib/extraction/src/main/java"/>
<fileset dir="contrib/extraction/src/test/java"/>
+ <fileset dir="contrib/analysis-extras/src/java"/>
+ <fileset dir="contrib/analysis-extras/src/test"/>
</rat:report>
</target>
Modified: lucene/dev/trunk/solr/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/common-build.xml?rev=1030012&r1=1030011&r2=1030012&view=diff
==============================================================================
--- lucene/dev/trunk/solr/common-build.xml (original)
+++ lucene/dev/trunk/solr/common-build.xml Tue Nov 2 12:03:18 2010
@@ -23,6 +23,9 @@
<dirname file="${ant.file.common-solr}" property="common-solr.dir"/>
+ <!-- change this together with the default and test's solrconfig.xml after starting a new development branch: -->
+ <property name="tests.luceneMatchVersion" value="4.0"/>
+
<!-- Initialize property values: allow easy customization via build.properties -->
<property file="build.properties" />
Added: lucene/dev/trunk/solr/contrib/analysis-extras/README.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/README.txt?rev=1030012&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/README.txt (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/README.txt Tue Nov 2 12:03:18 2010
@@ -0,0 +1,16 @@
+The analysis-extras plugin provides additional analyzers that rely
+upon large dependencies/dictionaries.
+
+It includes integration with ICU for multilingual support, and
+analyzers for Chinese and Polish.
+
+Relies upon the following lucene components (in lucene-libs/):
+
+ * lucene-analyzers-icu-X.Y.jar
+ * lucene-analyzers-smartcn-X.Y.jar
+ * lucene-analyzers-stempel-X.Y.jar
+
+And the ICU library (in lib/):
+
+ * icu4j-X.Y.jar
+
\ No newline at end of file
Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/README.txt
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/solr/contrib/analysis-extras/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/build.xml?rev=1030012&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/build.xml (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/build.xml Tue Nov 2 12:03:18 2010
@@ -0,0 +1,203 @@
+<?xml version="1.0"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ -->
+
+<project name="solr-extraAnalyzers" default="build">
+
+ <property name="solr-path" value="../.."/>
+
+ <import file="../../common-build.xml"/>
+
+ <description>
+ Additional analysis components
+ </description>
+
+ <property name="example.local" value="example"/>
+
+ <!-- support for the additional analyzers modules -->
+ <path id="modules.classpath">
+ <pathelement location="${common-solr.dir}/../modules/analysis/build/icu/classes/java" />
+ <pathelement location="${common-solr.dir}/../modules/analysis/build/smartcn/classes/java" />
+ <pathelement location="${common-solr.dir}/../modules/analysis/build/stempel/classes/java" />
+ </path>
+
+ <target name="prep-module-jars">
+ <subant target="jar" inheritall="false" failonerror="true">
+ <fileset dir="${common-solr.dir}/../modules/analysis/icu" includes="build.xml" />
+ <fileset dir="${common-solr.dir}/../modules/analysis/smartcn" includes="build.xml" />
+ <fileset dir="${common-solr.dir}/../modules/analysis/stempel" includes="build.xml" />
+ </subant>
+ </target>
+
+ <target name="module-jars-to-solr" depends="prep-module-jars">
+ <mkdir dir="${lucene-libs}"/>
+ <copy todir="${lucene-libs}" preservelastmodified="true" flatten="true" failonerror="true" overwrite="true">
+ <fileset dir="${common-solr.dir}/../modules/analysis/build/icu">
+ <include name="lucene-analyzers-icu-${version}.jar" />
+ </fileset>
+ <fileset dir="${common-solr.dir}/../modules/analysis/build/smartcn">
+ <include name="lucene-analyzers-smartcn-${version}.jar" />
+ </fileset>
+ <fileset dir="${common-solr.dir}/../modules/analysis/build/stempel">
+ <include name="lucene-analyzers-stempel-${version}.jar" />
+ </fileset>
+ </copy>
+ </target>
+
+ <path id="common.classpath">
+ <fileset dir="lib"/>
+ <pathelement location="${solr-path}/build/solr"/>
+ <pathelement location="${solr-path}/build/solrj"/>
+ <path refid="lucene.classpath"/>
+ <path refid="modules.classpath"/>
+ <fileset dir="${solr-path}/lib" includes="*.jar"/>
+ </path>
+
+ <path id="test.classpath">
+ <pathelement path="${dest}/classes"/>
+ <pathelement path="${dest}/test-classes"/>
+ <pathelement path="${java.class.path}"/>
+ <pathelement location="${common-solr.dir}/build/tests"/> <!-- include solr test code -->
+ <pathelement location="${common-solr.dir}/../lucene/build/classes/test" /> <!-- include some lucene test code -->
+ <path refid="common.classpath"/>
+ </path>
+
+ <target name="clean">
+ <delete failonerror="false" dir="${dest}"/>
+
+ <!-- example doesn't create this anymore, but clean it up
+ if it's still there from an old build
+ -->
+ <delete dir="example/lib" />
+ <delete dir="${lucene-libs}" />
+ </target>
+
+
+ <target name="init" depends="module-jars-to-solr">
+ <mkdir dir="${dest}/classes"/>
+
+ <mkdir dir="${build.javadoc}"/>
+ <subant target="compileTests">
+ <fileset dir="${solr-path}" includes="build.xml"/>
+ </subant>
+ <subant target="make-manifest">
+ <fileset dir="${solr-path}" includes="build.xml"/>
+ </subant>
+ </target>
+
+
+ <target name="compile" depends="init">
+ <solr-javac destdir="${dest}/classes"
+ classpathref="common.classpath">
+ <src path="src/java"/>
+ </solr-javac>
+ </target>
+
+ <target name="build" depends="compile">
+ <solr-jar destfile="${dest}/${fullnamever}.jar" basedir="${dest}/classes"
+ manifest="../../${dest}/META-INF/MANIFEST.MF"/>
+ </target>
+
+ <target name="compileTests" depends="compile">
+ <solr-javac destdir="${dest}/test-classes"
+ classpathref="test.classpath">
+ <src path="src/test"/>
+ </solr-javac>
+ </target>
+
+ <target name="example" depends="build,dist">
+ <!-- this task used to copy libs, but that's no longer needed because
+ ../lib and ../lib/downloads are now included explicitly by
+ example/conf/solrconfig.xml
+ -->
+ </target>
+
+
+ <target name="test" depends="compileTests">
+ <mkdir dir="${junit.output.dir}"/>
+
+ <junit printsummary="no"
+ haltonfailure="no"
+ maxmemory="512M"
+ errorProperty="tests.failed"
+ failureProperty="tests.failed"
+ dir="src/test/test-files/"
+ tempdir="${junit.output.dir}"
+ forkmode="perBatch"
+ >
+ <sysproperty key="java.util.logging.config.file" value="${common-solr.dir}/testlogging.properties"/>
+ <sysproperty key="tests.luceneMatchVersion" value="${tests.luceneMatchVersion}"/>
+ <sysproperty key="tests.codec" value="${tests.codec}"/>
+ <sysproperty key="tests.locale" value="${tests.locale}"/>
+ <sysproperty key="tests.timezone" value="${tests.timezone}"/>
+ <sysproperty key="tests.multiplier" value="${tests.multiplier}"/>
+ <sysproperty key="tests.seed" value="${tests.seed}"/>
+ <sysproperty key="tests.iter" value="${tests.iter}"/>
+ <sysproperty key="jetty.testMode" value="1"/>
+ <sysproperty key="tempDir" file="${junit.output.dir}"/>
+ <sysproperty key="testmethod" value="${testmethod}"/>
+ <jvmarg line="${args}"/>
+ <formatter classname="${junit.details.formatter}" usefile="false" if="junit.details"/>
+ <classpath refid="test.classpath"/>
+ <assertions>
+ <enable package="org.apache.lucene"/>
+ <enable package="org.apache.solr"/>
+ </assertions>
+ <formatter type="${junit.formatter}"/>
+ <batchtest fork="yes" todir="${junit.output.dir}" unless="testcase">
+ <fileset dir="src/test" includes="${junit.includes}"/>
+ </batchtest>
+ <batchtest fork="yes" todir="${junit.output.dir}" if="testcase">
+ <fileset dir="src/test" includes="**/${testcase}.java"/>
+ </batchtest>
+ </junit>
+
+ <fail if="tests.failed">Tests failed!</fail>
+ </target>
+
+ <target name="dist" depends="build">
+ <!--
+ <copy file="${dest}/${fullnamever}.jar" todir="${solr-path}/build/web/WEB-INF/lib"/>
+ <copy todir="${solr-path}/build/web/WEB-INF/lib" flatten="true">
+ <fileset dir="lib">
+ <include name="**/*.jar"/>
+ </fileset>
+ </copy>
+ -->
+ <copy file="${dest}/${fullnamever}.jar" todir="${solr-path}/dist"/>
+ </target>
+
+ <target name="javadoc">
+ <sequential>
+ <mkdir dir="${build.javadoc}/contrib-${name}"/>
+
+ <path id="javadoc.classpath">
+ <path refid="common.classpath"/>
+ </path>
+
+ <invoke-javadoc
+ destdir="${build.javadoc}/contrib-${name}"
+ title="${Name} ${version} contrib-${fullnamever} API">
+ <sources>
+ <packageset dir="src/java"/>
+ </sources>
+ </invoke-javadoc>
+ </sequential>
+ </target>
+
+</project>
Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/build.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/solr/contrib/analysis-extras/lib/icu4j-4_4_2.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/lib/icu4j-4_4_2.jar?rev=1030012&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/lib/icu4j-4_4_2.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java?rev=1030012&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java Tue Nov 2 12:03:18 2010
@@ -0,0 +1,142 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.InputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.collation.ICUCollationKeyFilter;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Factory for {@link ICUCollationKeyFilter}.
+ * <p>
+ * This factory can be created in two ways:
+ * <ul>
+ * <li>Based upon a system collator associated with a Locale.
+ * <li>Based upon a tailored ruleset.
+ * </ul>
+ * <p>
+ * Using a System collator:
+ * <ul>
+ * <li>locale: RFC 3066 locale ID (mandatory)
+ * <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
+ * <li>decomposition: 'no', or 'canonical' (optional)
+ * </ul>
+ * <p>
+ * Using a Tailored ruleset:
+ * <ul>
+ * <li>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)
+ * <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
+ * <li>decomposition: 'no' or 'canonical' (optional)
+ * </ul>
+ *
+ * @see Collator
+ * @see ULocale
+ * @see RuleBasedCollator
+ */
+public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+ private Collator collator; // assigned in inform(), handed to every filter built by create()
+
+ public void inform(ResourceLoader loader) {
+ String custom = args.get("custom");
+ String localeID = args.get("locale");
+ String strength = args.get("strength");
+ String decomposition = args.get("decomposition");
+
+ if (custom == null && localeID == null)
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
+
+ if (custom != null && localeID != null)
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both locale and custom. "
+ + "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. "
+ + "Then save the entire customized ruleset to a file, and use with the custom parameter");
+
+ if (localeID != null) {
+ // "locale" was given: use the collator ICU provides for that ULocale.
+ collator = createFromLocale(localeID);
+ } else {
+ // otherwise "custom" names a rules file, read through the resource loader.
+ collator = createFromRules(custom, loader);
+ }
+
+ // optional strength, matched case-insensitively; when absent the collator keeps its own default.
+ if (strength != null) {
+ if (strength.equalsIgnoreCase("primary"))
+ collator.setStrength(Collator.PRIMARY);
+ else if (strength.equalsIgnoreCase("secondary"))
+ collator.setStrength(Collator.SECONDARY);
+ else if (strength.equalsIgnoreCase("tertiary"))
+ collator.setStrength(Collator.TERTIARY);
+ else if (strength.equalsIgnoreCase("quaternary"))
+ collator.setStrength(Collator.QUATERNARY);
+ else if (strength.equalsIgnoreCase("identical"))
+ collator.setStrength(Collator.IDENTICAL);
+ else
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strength);
+ }
+
+ // optional decomposition, matched case-insensitively; when absent the collator keeps its own default.
+ if (decomposition != null) {
+ if (decomposition.equalsIgnoreCase("no"))
+ collator.setDecomposition(Collator.NO_DECOMPOSITION);
+ else if (decomposition.equalsIgnoreCase("canonical"))
+ collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
+ else
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
+ }
+ }
+
+ public TokenStream create(TokenStream input) {
+ return new ICUCollationKeyFilter(input, collator);
+ }
+
+ /*
+ * Build a ULocale from localeID and return the Collator
+ * that Collator.getInstance supplies for it.
+ */
+ private Collator createFromLocale(String localeID) {
+ return Collator.getInstance(new ULocale(localeID));
+ }
+
+ /*
+ * Read the whole resource as UTF-8 text and build a RuleBasedCollator from it.
+ * The file cannot support comments, as # might be in the rules!
+ */
+ private Collator createFromRules(String fileName, ResourceLoader loader) {
+ InputStream input = null;
+ try {
+ input = loader.openResource(fileName);
+ String rules = IOUtils.toString(input, "UTF-8");
+ return new RuleBasedCollator(rules);
+ } catch (Exception e) {
+ // io error or invalid rules: rethrown unchecked with the original exception as cause
+ throw new RuntimeException(e);
+ } finally {
+ IOUtils.closeQuietly(input);
+ }
+ }
+}
Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java?rev=1030012&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java Tue Nov 2 12:03:18 2010
@@ -0,0 +1,30 @@
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.icu.ICUFoldingFilter;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Factory for {@link ICUFoldingFilter}; reads no configuration arguments and wraps each input stream in a new filter. */
+public class ICUFoldingFilterFactory extends BaseTokenFilterFactory {
+
+ @Override
+ public TokenStream create(TokenStream input) {
+ return new ICUFoldingFilter(input);
+ }
+}
Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java?rev=1030012&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java Tue Nov 2 12:03:18 2010
@@ -0,0 +1,81 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+
+import com.ibm.icu.text.FilteredNormalizer2;
+import com.ibm.icu.text.Normalizer2;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * Factory for {@link ICUNormalizer2Filter}
+ * <p>
+ * Supports the following attributes:
+ * <ul>
+ * <li>name: A <a href="http://unicode.org/reports/tr15/">Unicode Normalization Form</a>,
+ * one of 'nfc','nfkc', 'nfkc_cf'. Default is nfkc_cf.
+ * <li>mode: Either 'compose' or 'decompose'. Default is compose. Use "decompose" with nfc
+ * or nfkc, to get nfd or nfkd, respectively.
+ * <li>filter: A {@link UnicodeSet} pattern. Codepoints outside the set are
+ * always left unchanged. Default is [] (the null set, no filtering).
+ * </ul>
+ * @see ICUNormalizer2Filter
+ * @see Normalizer2
+ * @see FilteredNormalizer2
+ */
+public class ICUNormalizer2FilterFactory extends BaseTokenFilterFactory {
+ private Normalizer2 normalizer; // built once in init(), reused by every create() call
+
+ // TODO: support custom normalization
+ @Override
+ public void init(Map<String,String> args) {
+ super.init(args);
+ String name = args.get("name");
+ if (name == null)
+ name = "nfkc_cf"; // default normalization form
+ String mode = args.get("mode");
+ if (mode == null)
+ mode = "compose"; // default mode
+
+ if (mode.equalsIgnoreCase("compose")) // mode is matched case-insensitively
+ normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE);
+ else if (mode.equalsIgnoreCase("decompose"))
+ normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.DECOMPOSE);
+ else
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid mode: " + mode);
+
+ String filter = args.get("filter");
+ if (filter != null) {
+ UnicodeSet set = new UnicodeSet(filter);
+ if (!set.isEmpty()) { // an empty set leaves the unfiltered normalizer in place
+ set.freeze();
+ normalizer = new FilteredNormalizer2(normalizer, set);
+ }
+ }
+ }
+
+ public TokenStream create(TokenStream input) {
+ return new ICUNormalizer2Filter(input, normalizer);
+ }
+}
Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java?rev=1030012&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java Tue Nov 2 12:03:18 2010
@@ -0,0 +1,32 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
+
+/** Factory for {@link ICUTokenizer}; reads no configuration arguments and wraps each Reader in a new tokenizer. */
+public class ICUTokenizerFactory extends BaseTokenizerFactory {
+ // TODO: add support for custom configs
+ @Override
+ public Tokenizer create(Reader input) {
+ return new ICUTokenizer(input);
+ }
+}
Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java?rev=1030012&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java Tue Nov 2 12:03:18 2010
@@ -0,0 +1,67 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.icu.ICUTransformFilter;
+import org.apache.solr.analysis.BaseTokenFilterFactory;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+
+import com.ibm.icu.text.Transliterator;
+
+/**
+ * Factory for {@link ICUTransformFilter}.
+ * <p>
+ * Recognized attributes:
+ * <ul>
+ *  <li>id (mandatory): a Transliterator ID, one from {@link Transliterator#getAvailableIDs()}
+ *  <li>direction (optional): either 'forward' or 'reverse' (matched case-insensitively).
+ *      When absent, forward is used.
+ * </ul>
+ * The {@link Transliterator} is resolved once in {@link #init(Map)} and the same
+ * instance is handed to every filter this factory creates.
+ * @see Transliterator
+ */
+public class ICUTransformFilterFactory extends BaseTokenFilterFactory {
+  private Transliterator transliterator;
+
+  // TODO: add support for custom rules
+  /**
+   * Reads the "id" and "direction" attributes and resolves the transliterator.
+   *
+   * @param args factory attributes; must contain "id"
+   * @throws SolrException if "id" is missing, or "direction" is present but is
+   *         neither "forward" nor "reverse"
+   */
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+
+    final String id = args.get("id");
+    if (id == null) {
+      throw new SolrException(ErrorCode.SERVER_ERROR, "id is required.");
+    }
+
+    final String direction = args.get("direction");
+    // A missing direction is treated exactly like an explicit "forward".
+    final boolean forward = direction == null || direction.equalsIgnoreCase("forward");
+    if (!forward && !direction.equalsIgnoreCase("reverse")) {
+      throw new SolrException(ErrorCode.SERVER_ERROR, "invalid direction: " + direction);
+    }
+
+    final int dir = forward ? Transliterator.FORWARD : Transliterator.REVERSE;
+    transliterator = Transliterator.getInstance(id, dir);
+  }
+
+  /**
+   * Wraps {@code input} in an {@link ICUTransformFilter} that uses the
+   * transliterator resolved by {@link #init(Map)}.
+   *
+   * @param input the token stream to transform
+   * @return the transforming filter
+   */
+  public TokenStream create(TokenStream input) {
+    return new ICUTransformFilter(input, transliterator);
+  }
+}
Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java?rev=1030012&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java Tue Nov 2 12:03:18 2010
@@ -0,0 +1,170 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.solr.common.ResourceLoader;
+
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Tests for {@link ICUCollationKeyFilterFactory}.
+ * <p>
+ * Every test configures a factory, runs two different input strings through
+ * filters created by it (each over a {@link KeywordTokenizer}), and asserts that
+ * both streams emit the same single term.
+ */
+public class TestICUCollationKeyFilterFactory extends BaseTokenTestCase {
+
+  /**
+   * Turkish has some funny casing.
+   * This test shows how you can solve this kind of thing easily with collation.
+   * Instead of using LowerCaseFilter, use a turkish collator with primary strength.
+   * Then things will sort and match correctly.
+   */
+  public void testBasicUsage() throws IOException {
+    String turkishUpperCase = "I WİLL USE TURKİSH CASING";
+    String turkishLowerCase = "ı will use turkish casıng";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("locale", "tr");
+    args.put("strength", "primary");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsUpper = factory.create(
+        new KeywordTokenizer(new StringReader(turkishUpperCase)));
+    TokenStream tsLower = factory.create(
+        new KeywordTokenizer(new StringReader(turkishLowerCase)));
+    assertCollatesToSame(tsUpper, tsLower);
+  }
+
+  /**
+   * Test usage of the decomposition option for unicode normalization.
+   * The upper-case input spells one dotted capital I in decomposed form
+   * (U+0049 followed by U+0307) and the other as a single character.
+   */
+  public void testNormalization() throws IOException {
+    String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
+    String turkishLowerCase = "ı will use turkish casıng";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("locale", "tr");
+    args.put("strength", "primary");
+    args.put("decomposition", "canonical");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsUpper = factory.create(
+        new KeywordTokenizer(new StringReader(turkishUpperCase)));
+    TokenStream tsLower = factory.create(
+        new KeywordTokenizer(new StringReader(turkishLowerCase)));
+    assertCollatesToSame(tsUpper, tsLower);
+  }
+
+  /**
+   * Test secondary strength, for english case is not significant.
+   */
+  public void testSecondaryStrength() throws IOException {
+    String upperCase = "TESTING";
+    String lowerCase = "testing";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("locale", "en");
+    args.put("strength", "secondary");
+    args.put("decomposition", "no");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsUpper = factory.create(
+        new KeywordTokenizer(new StringReader(upperCase)));
+    TokenStream tsLower = factory.create(
+        new KeywordTokenizer(new StringReader(lowerCase)));
+    assertCollatesToSame(tsUpper, tsLower);
+  }
+
+  /**
+   * For german, you might want oe to sort and match with o umlaut.
+   * This is not the default, but you can make a customized ruleset to do this.
+   *
+   * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
+   * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
+   */
+  public void testCustomRules() throws Exception {
+    RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
+
+    // Each line pairs the two-letter spelling with the umlaut, once per case.
+    String DIN5007_2_tailorings =
+      "& ae , a\u0308 & AE , A\u0308"+
+      "& oe , o\u0308 & OE , O\u0308"+
+      "& ue , u\u0308 & UE , U\u0308";
+
+    RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
+    String tailoredRules = tailoredCollator.getRules();
+    //
+    // at this point, you would save these tailoredRules to a file,
+    // and use the custom parameter.
+    //
+    String germanUmlaut = "Töne";
+    String germanOE = "Toene";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("custom", "rules.txt");
+    args.put("strength", "primary");
+    factory.init(args);
+    // The mock loader ignores the resource name and serves the tailored rules.
+    factory.inform(new StringMockSolrResourceLoader(tailoredRules));
+    TokenStream tsUmlaut = factory.create(
+        new KeywordTokenizer(new StringReader(germanUmlaut)));
+    TokenStream tsOE = factory.create(
+        new KeywordTokenizer(new StringReader(germanOE)));
+
+    assertCollatesToSame(tsUmlaut, tsOE);
+  }
+
+  /**
+   * Minimal {@link ResourceLoader} stub: {@link #openResource(String)} always
+   * returns the configured text encoded as UTF-8, whatever name is requested;
+   * the other methods return null.
+   */
+  private static class StringMockSolrResourceLoader implements ResourceLoader {
+    String text;
+
+    StringMockSolrResourceLoader(String text) {
+      this.text = text;
+    }
+
+    public List<String> getLines(String resource) throws IOException {
+      return null;
+    }
+
+    public Object newInstance(String cname, String... subpackages) {
+      return null;
+    }
+
+    public InputStream openResource(String resource) throws IOException {
+      return new ByteArrayInputStream(text.getBytes("UTF-8"));
+    }
+  }
+
+  /**
+   * Asserts that both streams produce exactly one token each and that the two
+   * tokens have equal term text.
+   *
+   * @param stream1 first stream to consume
+   * @param stream2 second stream to consume
+   * @throws IOException if advancing either stream fails
+   */
+  private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
+      throws IOException {
+    CharTermAttribute term1 = stream1
+        .addAttribute(CharTermAttribute.class);
+    CharTermAttribute term2 = stream2
+        .addAttribute(CharTermAttribute.class);
+    assertTrue(stream1.incrementToken());
+    assertTrue(stream2.incrementToken());
+    assertEquals(term1.toString(), term2.toString());
+    assertFalse(stream1.incrementToken());
+    assertFalse(stream2.incrementToken());
+  }
+}
Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java?rev=1030012&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java Tue Nov 2 12:03:18 2010
@@ -0,0 +1,39 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+
+/** basic tests for {@link ICUFoldingFilterFactory} */
+public class TestICUFoldingFilterFactory extends BaseTokenTestCase {
+
+  /**
+   * basic tests to ensure the folding is working: an accented, capitalized
+   * word is expected to come out as the unaccented lower-case token "resume".
+   */
+  public void test() throws Exception {
+    Reader reader = new StringReader("Résumé");
+    ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory();
+    factory.init(DEFAULT_VERSION_PARAM);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "resume" });
+  }
+}
Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java?rev=1030012&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java Tue Nov 2 12:03:18 2010
@@ -0,0 +1,41 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+
+/** basic tests for {@link ICUNormalizer2FilterFactory} */
+public class TestICUNormalizer2FilterFactory extends BaseTokenTestCase {
+
+  /**
+   * Test nfkc_cf defaults: mixed-case ASCII words and a word written in
+   * fullwidth letters are expected to come out as plain lower-case tokens.
+   */
+  public void testDefaults() throws Exception {
+    Reader reader = new StringReader("This is a Ｔｅｓｔ");
+    ICUNormalizer2FilterFactory factory = new ICUNormalizer2FilterFactory();
+    factory.init(DEFAULT_VERSION_PARAM);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "this", "is", "a", "test" });
+  }
+
+  // TODO: add tests for different forms
+}
Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java?rev=1030012&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java Tue Nov 2 12:03:18 2010
@@ -0,0 +1,35 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+
+/** basic tests for {@link ICUTokenizerFactory} **/
+public class TestICUTokenizerFactory extends BaseTokenTestCase {
+
+  /**
+   * Tokenizes one string mixing Thai, English and Lao text and checks the
+   * resulting tokens. The Thai and Lao runs contain no spaces, so the expected
+   * tokens within them are not whitespace-delimited.
+   */
+  public void testMixedText() throws Exception {
+    Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ");
+    ICUTokenizerFactory factory = new ICUTokenizerFactory();
+    TokenStream stream = factory.create(reader);
+    assertTokenStreamContents(stream,
+        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี",
+        "This", "is", "a", "test", "ກວ່າ", "ດອກ"});
+  }
+}
Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java?rev=1030012&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java Tue Nov 2 12:03:18 2010
@@ -0,0 +1,64 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+
+/** basic tests for {@link ICUTransformFilterFactory} */
+public class TestICUTransformFilterFactory extends BaseTokenTestCase {
+
+  /** ensure the transform is working, using the Traditional-Simplified id with no direction given */
+  public void test() throws Exception {
+    Reader reader = new StringReader("簡化字");
+    ICUTransformFilterFactory factory = new ICUTransformFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("id", "Traditional-Simplified");
+    factory.init(args);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "简化字" });
+  }
+
+  /**
+   * test forward and reverse direction: the same factory is initialized twice
+   * with the Cyrillic-Latin id, first without a direction and then with
+   * direction=reverse, and the second run is expected to undo the first.
+   */
+  public void testDirection() throws Exception {
+    // forward
+    Reader reader = new StringReader("Российская Федерация");
+    ICUTransformFilterFactory factory = new ICUTransformFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("id", "Cyrillic-Latin");
+    factory.init(args);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "Rossijskaâ", "Federaciâ" });
+
+    // backward (invokes Latin-Cyrillic)
+    reader = new StringReader("Rossijskaâ Federaciâ");
+    args.put("direction", "reverse");
+    factory.init(args);
+    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "Российская", "Федерация" });
+  }
+}
Propchange: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java
------------------------------------------------------------------------------
svn:eol-style = native