You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/03 07:30:10 UTC

svn commit: r1226670 - in /lucene/dev/branches/lucene3305/modules/analysis/kuromoji: ./ src/java/org/apache/lucene/analysis/kuromoji/util/ src/tools/ src/tools/java/ src/tools/java/org/ src/tools/java/org/apache/ src/tools/java/org/apache/lucene/ src/t...

Author: rmuir
Date: Tue Jan  3 06:30:09 2012
New Revision: 1226670

URL: http://svn.apache.org/viewvc?rev=1226670&view=rev
Log:
LUCENE-3305: move dictionary building to tools and use ICU for NFKC normalization

Added:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java
      - copied unchanged from r1226637, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
      - copied unchanged from r1226637, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java
      - copied unchanged from r1226637, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
      - copied, changed from r1226640, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
      - copied unchanged from r1226637, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
Removed:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
Modified:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml?rev=1226670&r1=1226669&r2=1226670&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml Tue Jan  3 06:30:09 2012
@@ -50,11 +50,17 @@
      <untar src="${build.dir}/${ipadic.version}.tar" dest="${build.dir}"/>
   </target>
 
-  <target name="build-dict" depends="compile-core, download-dict">
+  <path id="tools.dependencies">
+    <fileset dir="../icu/lib" includes="icu4j-*.jar"/>
+  </path>
+
+  <target name="build-dict" depends="compile-tools, download-dict">
     <java fork="true" failonerror="true" maxmemory="512m" classname="org.apache.lucene.analysis.kuromoji.util.DictionaryBuilder">
       <classpath>
         <pathelement path="${classpath}"/>
         <pathelement path="${build.dir}/classes/java"/>
+        <pathelement path="${build.dir}/classes/tools"/>
+        <path refid="tools.dependencies"/>
       </classpath>
       <arg value="${dict.format}"/>
       <arg value="${dict.src.dir}"/>
@@ -63,4 +69,16 @@
       <arg value="${dict.normalize}"/>
     </java>
   </target>
+
+    <target name="compile-tools" depends="compile-core, common.compile-tools">
+    <compile
+      srcdir="src/tools/java"
+      destdir="${build.dir}/classes/tools">
+      <classpath>
+        <pathelement path="${classpath}"/>
+        <pathelement path="${build.dir}/classes/java"/>
+        <path refid="tools.dependencies"/>
+      </classpath>
+    </compile>
+  </target>
 </project>

Copied: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (from r1226640, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?p2=lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java&p1=lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java&r1=1226640&r2=1226670&rev=1226670&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Tue Jan  3 06:30:09 2012
@@ -23,7 +23,6 @@ import java.io.FileInputStream;
 import java.io.FilenameFilter;
 import java.io.IOException;
 import java.io.InputStreamReader;
-import java.text.Normalizer;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map.Entry;
@@ -33,6 +32,8 @@ import java.util.TreeMap;
 import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
 import org.apache.lucene.analysis.kuromoji.util.DictionaryBuilder.DictionaryFormat;
 
+import com.ibm.icu.text.Normalizer2;
+
 
 /**
  */
@@ -46,6 +47,7 @@ public class TokenInfoDictionaryBuilder 
   private String encoding = "euc-jp";
   
   private boolean normalizeEntries = false;
+  private Normalizer2 normalizer;
   
   private DictionaryFormat format = DictionaryFormat.IPADIC;
   
@@ -57,6 +59,7 @@ public class TokenInfoDictionaryBuilder 
     this.encoding = encoding;
     this.dictionaryEntries = new TreeMap<Integer, String>();		
     this.normalizeEntries = normalizeEntries;
+    this.normalizer = normalizeEntries ? Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE) : null;
   }
   
   public TokenInfoDictionary build(String dirname) throws IOException {
@@ -100,12 +103,12 @@ public class TokenInfoDictionaryBuilder 
         
         // NFKC normalize dictionary entry
         if (normalizeEntries) {
-          if (entry[0].equals(Normalizer.normalize(entry[0], Normalizer.Form.NFKC))){
+          if (normalizer.isNormalized(entry[0])){
             continue;
           }
           String[] normalizedEntry = new String[entry.length];
           for (int i = 0; i < entry.length; i++) {
-            normalizedEntry[i] = Normalizer.normalize(entry[i], Normalizer.Form.NFKC);
+            normalizedEntry[i] = normalizer.normalize(entry[i]);
           }
           
           next = dictionary.put(formatEntry(normalizedEntry));