You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/03 07:30:10 UTC
svn commit: r1226670 - in
/lucene/dev/branches/lucene3305/modules/analysis/kuromoji: ./
src/java/org/apache/lucene/analysis/kuromoji/util/ src/tools/
src/tools/java/ src/tools/java/org/ src/tools/java/org/apache/
src/tools/java/org/apache/lucene/ src/t...
Author: rmuir
Date: Tue Jan 3 06:30:09 2012
New Revision: 1226670
URL: http://svn.apache.org/viewvc?rev=1226670&view=rev
Log:
LUCENE-3305: move dictionary building to tools and use ICU for NFKC normalization
Added:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java
- copied unchanged from r1226637, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
- copied unchanged from r1226637, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java
- copied unchanged from r1226637, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
- copied, changed from r1226640, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
- copied unchanged from r1226637, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
Removed:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
Modified:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml?rev=1226670&r1=1226669&r2=1226670&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml Tue Jan 3 06:30:09 2012
@@ -50,11 +50,17 @@
<untar src="${build.dir}/${ipadic.version}.tar" dest="${build.dir}"/>
</target>
- <target name="build-dict" depends="compile-core, download-dict">
+ <path id="tools.dependencies">
+ <fileset dir="../icu/lib" includes="icu4j-*.jar"/>
+ </path>
+
+ <target name="build-dict" depends="compile-tools, download-dict">
<java fork="true" failonerror="true" maxmemory="512m" classname="org.apache.lucene.analysis.kuromoji.util.DictionaryBuilder">
<classpath>
<pathelement path="${classpath}"/>
<pathelement path="${build.dir}/classes/java"/>
+ <pathelement path="${build.dir}/classes/tools"/>
+ <path refid="tools.dependencies"/>
</classpath>
<arg value="${dict.format}"/>
<arg value="${dict.src.dir}"/>
@@ -63,4 +69,16 @@
<arg value="${dict.normalize}"/>
</java>
</target>
+
+ <target name="compile-tools" depends="compile-core, common.compile-tools">
+ <compile
+ srcdir="src/tools/java"
+ destdir="${build.dir}/classes/tools">
+ <classpath>
+ <pathelement path="${classpath}"/>
+ <pathelement path="${build.dir}/classes/java"/>
+ <path refid="tools.dependencies"/>
+ </classpath>
+ </compile>
+ </target>
</project>
Copied: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (from r1226640, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?p2=lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java&p1=lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java&r1=1226640&r2=1226670&rev=1226670&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Tue Jan 3 06:30:09 2012
@@ -23,7 +23,6 @@ import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
-import java.text.Normalizer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
@@ -33,6 +32,8 @@ import java.util.TreeMap;
import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.kuromoji.util.DictionaryBuilder.DictionaryFormat;
+import com.ibm.icu.text.Normalizer2;
+
/**
*/
@@ -46,6 +47,7 @@ public class TokenInfoDictionaryBuilder
private String encoding = "euc-jp";
private boolean normalizeEntries = false;
+ private Normalizer2 normalizer;
private DictionaryFormat format = DictionaryFormat.IPADIC;
@@ -57,6 +59,7 @@ public class TokenInfoDictionaryBuilder
this.encoding = encoding;
this.dictionaryEntries = new TreeMap<Integer, String>();
this.normalizeEntries = normalizeEntries;
+ this.normalizer = normalizeEntries ? Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE) : null;
}
public TokenInfoDictionary build(String dirname) throws IOException {
@@ -100,12 +103,12 @@ public class TokenInfoDictionaryBuilder
// NFKC normalize dictionary entry
if (normalizeEntries) {
- if (entry[0].equals(Normalizer.normalize(entry[0], Normalizer.Form.NFKC))){
+ if (normalizer.isNormalized(entry[0])){
continue;
}
String[] normalizedEntry = new String[entry.length];
for (int i = 0; i < entry.length; i++) {
- normalizedEntry[i] = Normalizer.normalize(entry[i], Normalizer.Form.NFKC);
+ normalizedEntry[i] = normalizer.normalize(entry[i]);
}
next = dictionary.put(formatEntry(normalizedEntry));